linux/mm/page_alloc.c
<<
>>
Prefs
   1/*
   2 *  linux/mm/page_alloc.c
   3 *
   4 *  Manages the free list, the system allocates free pages here.
   5 *  Note that kmalloc() lives in slab.c
   6 *
   7 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   8 *  Swap reorganised 29.12.95, Stephen Tweedie
   9 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
  10 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
  11 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
  12 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
  13 *  Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
  14 *          (lots of bits borrowed from Ingo Molnar & Andrew Morton)
  15 */
  16
  17#include <linux/stddef.h>
  18#include <linux/mm.h>
  19#include <linux/swap.h>
  20#include <linux/interrupt.h>
  21#include <linux/pagemap.h>
  22#include <linux/jiffies.h>
  23#include <linux/bootmem.h>
  24#include <linux/memblock.h>
  25#include <linux/compiler.h>
  26#include <linux/kernel.h>
  27#include <linux/kmemcheck.h>
  28#include <linux/kasan.h>
  29#include <linux/module.h>
  30#include <linux/suspend.h>
  31#include <linux/pagevec.h>
  32#include <linux/blkdev.h>
  33#include <linux/slab.h>
  34#include <linux/ratelimit.h>
  35#include <linux/oom.h>
  36#include <linux/notifier.h>
  37#include <linux/topology.h>
  38#include <linux/sysctl.h>
  39#include <linux/cpu.h>
  40#include <linux/cpuset.h>
  41#include <linux/memory_hotplug.h>
  42#include <linux/nodemask.h>
  43#include <linux/vmalloc.h>
  44#include <linux/vmstat.h>
  45#include <linux/mempolicy.h>
  46#include <linux/memremap.h>
  47#include <linux/stop_machine.h>
  48#include <linux/sort.h>
  49#include <linux/pfn.h>
  50#include <linux/backing-dev.h>
  51#include <linux/fault-inject.h>
  52#include <linux/page-isolation.h>
  53#include <linux/page_ext.h>
  54#include <linux/debugobjects.h>
  55#include <linux/kmemleak.h>
  56#include <linux/compaction.h>
  57#include <trace/events/kmem.h>
  58#include <trace/events/oom.h>
  59#include <linux/prefetch.h>
  60#include <linux/mm_inline.h>
  61#include <linux/migrate.h>
  62#include <linux/hugetlb.h>
  63#include <linux/sched/rt.h>
  64#include <linux/sched/mm.h>
  65#include <linux/page_owner.h>
  66#include <linux/kthread.h>
  67#include <linux/memcontrol.h>
  68#include <linux/ftrace.h>
  69#include <linux/lockdep.h>
  70#include <linux/nmi.h>
  71
  72#include <asm/sections.h>
  73#include <asm/tlbflush.h>
  74#include <asm/div64.h>
  75#include "internal.h"
  76
  77/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
  78static DEFINE_MUTEX(pcp_batch_high_lock);
  79#define MIN_PERCPU_PAGELIST_FRACTION    (8)
  80
  81#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
  82DEFINE_PER_CPU(int, numa_node);
  83EXPORT_PER_CPU_SYMBOL(numa_node);
  84#endif
  85
  86#ifdef CONFIG_HAVE_MEMORYLESS_NODES
  87/*
  88 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
  89 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
  90 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
  91 * defined in <linux/topology.h>.
  92 */
  93DEFINE_PER_CPU(int, _numa_mem_);                /* Kernel "local memory" node */
  94EXPORT_PER_CPU_SYMBOL(_numa_mem_);
  95int _node_numa_mem_[MAX_NUMNODES];
  96#endif
  97
  98/* work_structs for global per-cpu drains */
  99DEFINE_MUTEX(pcpu_drain_mutex);
 100DEFINE_PER_CPU(struct work_struct, pcpu_drain);
 101
 102#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
 103volatile unsigned long latent_entropy __latent_entropy;
 104EXPORT_SYMBOL(latent_entropy);
 105#endif
 106
 107/*
 108 * Array of node states.
 109 */
 110nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
 111        [N_POSSIBLE] = NODE_MASK_ALL,
 112        [N_ONLINE] = { { [0] = 1UL } },
 113#ifndef CONFIG_NUMA
 114        [N_NORMAL_MEMORY] = { { [0] = 1UL } },
 115#ifdef CONFIG_HIGHMEM
 116        [N_HIGH_MEMORY] = { { [0] = 1UL } },
 117#endif
 118        [N_MEMORY] = { { [0] = 1UL } },
 119        [N_CPU] = { { [0] = 1UL } },
 120#endif  /* NUMA */
 121};
 122EXPORT_SYMBOL(node_states);
 123
 124/* Protect totalram_pages and zone->managed_pages */
 125static DEFINE_SPINLOCK(managed_page_count_lock);
 126
 127unsigned long totalram_pages __read_mostly;
 128unsigned long totalreserve_pages __read_mostly;
 129unsigned long totalcma_pages __read_mostly;
 130
 131int percpu_pagelist_fraction;
 132gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 133
 134/*
 135 * A cached value of the page's pageblock's migratetype, used when the page is
 136 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 137 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 138 * Also the migratetype set in the page does not necessarily match the pcplist
 139 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 140 * other index - this ensures that it will be put on the correct CMA freelist.
 141 */
 142static inline int get_pcppage_migratetype(struct page *page)
 143{
 144        return page->index;
 145}
 146
 147static inline void set_pcppage_migratetype(struct page *page, int migratetype)
 148{
 149        page->index = migratetype;
 150}
 151
 152#ifdef CONFIG_PM_SLEEP
 153/*
 154 * The following functions are used by the suspend/hibernate code to temporarily
 155 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 156 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 157 * they should always be called with pm_mutex held (gfp_allowed_mask also should
 158 * only be modified with pm_mutex held, unless the suspend/hibernate code is
 159 * guaranteed not to run in parallel with that modification).
 160 */
 161
 162static gfp_t saved_gfp_mask;
 163
 164void pm_restore_gfp_mask(void)
 165{
 166        WARN_ON(!mutex_is_locked(&pm_mutex));
 167        if (saved_gfp_mask) {
 168                gfp_allowed_mask = saved_gfp_mask;
 169                saved_gfp_mask = 0;
 170        }
 171}
 172
 173void pm_restrict_gfp_mask(void)
 174{
 175        WARN_ON(!mutex_is_locked(&pm_mutex));
 176        WARN_ON(saved_gfp_mask);
 177        saved_gfp_mask = gfp_allowed_mask;
 178        gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS);
 179}
 180
 181bool pm_suspended_storage(void)
 182{
 183        if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
 184                return false;
 185        return true;
 186}
 187#endif /* CONFIG_PM_SLEEP */
 188
 189#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
 190unsigned int pageblock_order __read_mostly;
 191#endif
 192
 193static void __free_pages_ok(struct page *page, unsigned int order);
 194
 195/*
 196 * results with 256, 32 in the lowmem_reserve sysctl:
 197 *      1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 198 *      1G machine -> (16M dma, 784M normal, 224M high)
 199 *      NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 200 *      HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 201 *      HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 202 *
 203 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 204 * don't need any ZONE_NORMAL reservation
 205 */
 206int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = {
 207#ifdef CONFIG_ZONE_DMA
 208         256,
 209#endif
 210#ifdef CONFIG_ZONE_DMA32
 211         256,
 212#endif
 213#ifdef CONFIG_HIGHMEM
 214         32,
 215#endif
 216         32,
 217};
 218
 219EXPORT_SYMBOL(totalram_pages);
 220
 221static char * const zone_names[MAX_NR_ZONES] = {
 222#ifdef CONFIG_ZONE_DMA
 223         "DMA",
 224#endif
 225#ifdef CONFIG_ZONE_DMA32
 226         "DMA32",
 227#endif
 228         "Normal",
 229#ifdef CONFIG_HIGHMEM
 230         "HighMem",
 231#endif
 232         "Movable",
 233#ifdef CONFIG_ZONE_DEVICE
 234         "Device",
 235#endif
 236};
 237
 238char * const migratetype_names[MIGRATE_TYPES] = {
 239        "Unmovable",
 240        "Movable",
 241        "Reclaimable",
 242        "HighAtomic",
 243#ifdef CONFIG_CMA
 244        "CMA",
 245#endif
 246#ifdef CONFIG_MEMORY_ISOLATION
 247        "Isolate",
 248#endif
 249};
 250
 251compound_page_dtor * const compound_page_dtors[] = {
 252        NULL,
 253        free_compound_page,
 254#ifdef CONFIG_HUGETLB_PAGE
 255        free_huge_page,
 256#endif
 257#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 258        free_transhuge_page,
 259#endif
 260};
 261
 262int min_free_kbytes = 1024;
 263int user_min_free_kbytes = -1;
 264int watermark_scale_factor = 10;
 265
 266static unsigned long __meminitdata nr_kernel_pages;
 267static unsigned long __meminitdata nr_all_pages;
 268static unsigned long __meminitdata dma_reserve;
 269
 270#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
 271static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
 272static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
 273static unsigned long __initdata required_kernelcore;
 274static unsigned long __initdata required_movablecore;
 275static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
 276static bool mirrored_kernelcore;
 277
 278/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
 279int movable_zone;
 280EXPORT_SYMBOL(movable_zone);
 281#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
 282
 283#if MAX_NUMNODES > 1
 284int nr_node_ids __read_mostly = MAX_NUMNODES;
 285int nr_online_nodes __read_mostly = 1;
 286EXPORT_SYMBOL(nr_node_ids);
 287EXPORT_SYMBOL(nr_online_nodes);
 288#endif
 289
 290int page_group_by_mobility_disabled __read_mostly;
 291
 292#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
 293static inline void reset_deferred_meminit(pg_data_t *pgdat)
 294{
 295        unsigned long max_initialise;
 296        unsigned long reserved_lowmem;
 297
 298        /*
 299         * Initialise at least 2G of a node but also take into account that
 300         * two large system hashes that can take up 1GB for 0.25TB/node.
 301         */
 302        max_initialise = max(2UL << (30 - PAGE_SHIFT),
 303                (pgdat->node_spanned_pages >> 8));
 304
 305        /*
 306         * Compensate the all the memblock reservations (e.g. crash kernel)
 307         * from the initial estimation to make sure we will initialize enough
 308         * memory to boot.
 309         */
 310        reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn,
 311                        pgdat->node_start_pfn + max_initialise);
 312        max_initialise += reserved_lowmem;
 313
 314        pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages);
 315        pgdat->first_deferred_pfn = ULONG_MAX;
 316}
 317
 318/* Returns true if the struct page for the pfn is uninitialised */
 319static inline bool __meminit early_page_uninitialised(unsigned long pfn)
 320{
 321        int nid = early_pfn_to_nid(pfn);
 322
 323        if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn)
 324                return true;
 325
 326        return false;
 327}
 328
 329/*
 330 * Returns false when the remaining initialisation should be deferred until
 331 * later in the boot cycle when it can be parallelised.
 332 */
 333static inline bool update_defer_init(pg_data_t *pgdat,
 334                                unsigned long pfn, unsigned long zone_end,
 335                                unsigned long *nr_initialised)
 336{
 337        /* Always populate low zones for address-contrained allocations */
 338        if (zone_end < pgdat_end_pfn(pgdat))
 339                return true;
 340        (*nr_initialised)++;
 341        if ((*nr_initialised > pgdat->static_init_size) &&
 342            (pfn & (PAGES_PER_SECTION - 1)) == 0) {
 343                pgdat->first_deferred_pfn = pfn;
 344                return false;
 345        }
 346
 347        return true;
 348}
 349#else
 350static inline void reset_deferred_meminit(pg_data_t *pgdat)
 351{
 352}
 353
 354static inline bool early_page_uninitialised(unsigned long pfn)
 355{
 356        return false;
 357}
 358
 359static inline bool update_defer_init(pg_data_t *pgdat,
 360                                unsigned long pfn, unsigned long zone_end,
 361                                unsigned long *nr_initialised)
 362{
 363        return true;
 364}
 365#endif
 366
 367/* Return a pointer to the bitmap storing bits affecting a block of pages */
 368static inline unsigned long *get_pageblock_bitmap(struct page *page,
 369                                                        unsigned long pfn)
 370{
 371#ifdef CONFIG_SPARSEMEM
 372        return __pfn_to_section(pfn)->pageblock_flags;
 373#else
 374        return page_zone(page)->pageblock_flags;
 375#endif /* CONFIG_SPARSEMEM */
 376}
 377
 378static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
 379{
 380#ifdef CONFIG_SPARSEMEM
 381        pfn &= (PAGES_PER_SECTION-1);
 382        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 383#else
 384        pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
 385        return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
 386#endif /* CONFIG_SPARSEMEM */
 387}
 388
 389/**
 390 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 391 * @page: The page within the block of interest
 392 * @pfn: The target page frame number
 393 * @end_bitidx: The last bit of interest to retrieve
 394 * @mask: mask of bits that the caller is interested in
 395 *
 396 * Return: pageblock_bits flags
 397 */
 398static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
 399                                        unsigned long pfn,
 400                                        unsigned long end_bitidx,
 401                                        unsigned long mask)
 402{
 403        unsigned long *bitmap;
 404        unsigned long bitidx, word_bitidx;
 405        unsigned long word;
 406
 407        bitmap = get_pageblock_bitmap(page, pfn);
 408        bitidx = pfn_to_bitidx(page, pfn);
 409        word_bitidx = bitidx / BITS_PER_LONG;
 410        bitidx &= (BITS_PER_LONG-1);
 411
 412        word = bitmap[word_bitidx];
 413        bitidx += end_bitidx;
 414        return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
 415}
 416
 417unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
 418                                        unsigned long end_bitidx,
 419                                        unsigned long mask)
 420{
 421        return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
 422}
 423
 424static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
 425{
 426        return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
 427}
 428
 429/**
 430 * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages
 431 * @page: The page within the block of interest
 432 * @flags: The flags to set
 433 * @pfn: The target page frame number
 434 * @end_bitidx: The last bit of interest
 435 * @mask: mask of bits that the caller is interested in
 436 */
 437void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
 438                                        unsigned long pfn,
 439                                        unsigned long end_bitidx,
 440                                        unsigned long mask)
 441{
 442        unsigned long *bitmap;
 443        unsigned long bitidx, word_bitidx;
 444        unsigned long old_word, word;
 445
 446        BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
 447
 448        bitmap = get_pageblock_bitmap(page, pfn);
 449        bitidx = pfn_to_bitidx(page, pfn);
 450        word_bitidx = bitidx / BITS_PER_LONG;
 451        bitidx &= (BITS_PER_LONG-1);
 452
 453        VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
 454
 455        bitidx += end_bitidx;
 456        mask <<= (BITS_PER_LONG - bitidx - 1);
 457        flags <<= (BITS_PER_LONG - bitidx - 1);
 458
 459        word = READ_ONCE(bitmap[word_bitidx]);
 460        for (;;) {
 461                old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags);
 462                if (word == old_word)
 463                        break;
 464                word = old_word;
 465        }
 466}
 467
 468void set_pageblock_migratetype(struct page *page, int migratetype)
 469{
 470        if (unlikely(page_group_by_mobility_disabled &&
 471                     migratetype < MIGRATE_PCPTYPES))
 472                migratetype = MIGRATE_UNMOVABLE;
 473
 474        set_pageblock_flags_group(page, (unsigned long)migratetype,
 475                                        PB_migrate, PB_migrate_end);
 476}
 477
 478#ifdef CONFIG_DEBUG_VM
 479static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
 480{
 481        int ret = 0;
 482        unsigned seq;
 483        unsigned long pfn = page_to_pfn(page);
 484        unsigned long sp, start_pfn;
 485
 486        do {
 487                seq = zone_span_seqbegin(zone);
 488                start_pfn = zone->zone_start_pfn;
 489                sp = zone->spanned_pages;
 490                if (!zone_spans_pfn(zone, pfn))
 491                        ret = 1;
 492        } while (zone_span_seqretry(zone, seq));
 493
 494        if (ret)
 495                pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n",
 496                        pfn, zone_to_nid(zone), zone->name,
 497                        start_pfn, start_pfn + sp);
 498
 499        return ret;
 500}
 501
 502static int page_is_consistent(struct zone *zone, struct page *page)
 503{
 504        if (!pfn_valid_within(page_to_pfn(page)))
 505                return 0;
 506        if (zone != page_zone(page))
 507                return 0;
 508
 509        return 1;
 510}
 511/*
 512 * Temporary debugging check for pages not lying within a given zone.
 513 */
 514static int __maybe_unused bad_range(struct zone *zone, struct page *page)
 515{
 516        if (page_outside_zone_boundaries(zone, page))
 517                return 1;
 518        if (!page_is_consistent(zone, page))
 519                return 1;
 520
 521        return 0;
 522}
 523#else
 524static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
 525{
 526        return 0;
 527}
 528#endif
 529
 530static void bad_page(struct page *page, const char *reason,
 531                unsigned long bad_flags)
 532{
 533        static unsigned long resume;
 534        static unsigned long nr_shown;
 535        static unsigned long nr_unshown;
 536
 537        /*
 538         * Allow a burst of 60 reports, then keep quiet for that minute;
 539         * or allow a steady drip of one report per second.
 540         */
 541        if (nr_shown == 60) {
 542                if (time_before(jiffies, resume)) {
 543                        nr_unshown++;
 544                        goto out;
 545                }
 546                if (nr_unshown) {
 547                        pr_alert(
 548                              "BUG: Bad page state: %lu messages suppressed\n",
 549                                nr_unshown);
 550                        nr_unshown = 0;
 551                }
 552                nr_shown = 0;
 553        }
 554        if (nr_shown++ == 0)
 555                resume = jiffies + 60 * HZ;
 556
 557        pr_alert("BUG: Bad page state in process %s  pfn:%05lx\n",
 558                current->comm, page_to_pfn(page));
 559        __dump_page(page, reason);
 560        bad_flags &= page->flags;
 561        if (bad_flags)
 562                pr_alert("bad because of flags: %#lx(%pGp)\n",
 563                                                bad_flags, &bad_flags);
 564        dump_page_owner(page);
 565
 566        print_modules();
 567        dump_stack();
 568out:
 569        /* Leave bad fields for debug, except PageBuddy could make trouble */
 570        page_mapcount_reset(page); /* remove PageBuddy */
 571        add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
 572}
 573
 574/*
 575 * Higher-order pages are called "compound pages".  They are structured thusly:
 576 *
 577 * The first PAGE_SIZE page is called the "head page" and have PG_head set.
 578 *
 579 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 580 * in bit 0 of page->compound_head. The rest of bits is pointer to head page.
 581 *
 582 * The first tail page's ->compound_dtor holds the offset in array of compound
 583 * page destructors. See compound_page_dtors.
 584 *
 585 * The first tail page's ->compound_order holds the order of allocation.
 586 * This usage means that zero-order pages may not be compound.
 587 */
 588
 589void free_compound_page(struct page *page)
 590{
 591        __free_pages_ok(page, compound_order(page));
 592}
 593
 594void prep_compound_page(struct page *page, unsigned int order)
 595{
 596        int i;
 597        int nr_pages = 1 << order;
 598
 599        set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
 600        set_compound_order(page, order);
 601        __SetPageHead(page);
 602        for (i = 1; i < nr_pages; i++) {
 603                struct page *p = page + i;
 604                set_page_count(p, 0);
 605                p->mapping = TAIL_MAPPING;
 606                set_compound_head(p, page);
 607        }
 608        atomic_set(compound_mapcount_ptr(page), -1);
 609}
 610
 611#ifdef CONFIG_DEBUG_PAGEALLOC
 612unsigned int _debug_guardpage_minorder;
 613bool _debug_pagealloc_enabled __read_mostly
 614                        = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
 615EXPORT_SYMBOL(_debug_pagealloc_enabled);
 616bool _debug_guardpage_enabled __read_mostly;
 617
 618static int __init early_debug_pagealloc(char *buf)
 619{
 620        if (!buf)
 621                return -EINVAL;
 622        return kstrtobool(buf, &_debug_pagealloc_enabled);
 623}
 624early_param("debug_pagealloc", early_debug_pagealloc);
 625
 626static bool need_debug_guardpage(void)
 627{
 628        /* If we don't use debug_pagealloc, we don't need guard page */
 629        if (!debug_pagealloc_enabled())
 630                return false;
 631
 632        if (!debug_guardpage_minorder())
 633                return false;
 634
 635        return true;
 636}
 637
 638static void init_debug_guardpage(void)
 639{
 640        if (!debug_pagealloc_enabled())
 641                return;
 642
 643        if (!debug_guardpage_minorder())
 644                return;
 645
 646        _debug_guardpage_enabled = true;
 647}
 648
 649struct page_ext_operations debug_guardpage_ops = {
 650        .need = need_debug_guardpage,
 651        .init = init_debug_guardpage,
 652};
 653
 654static int __init debug_guardpage_minorder_setup(char *buf)
 655{
 656        unsigned long res;
 657
 658        if (kstrtoul(buf, 10, &res) < 0 ||  res > MAX_ORDER / 2) {
 659                pr_err("Bad debug_guardpage_minorder value\n");
 660                return 0;
 661        }
 662        _debug_guardpage_minorder = res;
 663        pr_info("Setting debug_guardpage_minorder to %lu\n", res);
 664        return 0;
 665}
 666early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup);
 667
 668static inline bool set_page_guard(struct zone *zone, struct page *page,
 669                                unsigned int order, int migratetype)
 670{
 671        struct page_ext *page_ext;
 672
 673        if (!debug_guardpage_enabled())
 674                return false;
 675
 676        if (order >= debug_guardpage_minorder())
 677                return false;
 678
 679        page_ext = lookup_page_ext(page);
 680        if (unlikely(!page_ext))
 681                return false;
 682
 683        __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 684
 685        INIT_LIST_HEAD(&page->lru);
 686        set_page_private(page, order);
 687        /* Guard pages are not available for any usage */
 688        __mod_zone_freepage_state(zone, -(1 << order), migratetype);
 689
 690        return true;
 691}
 692
 693static inline void clear_page_guard(struct zone *zone, struct page *page,
 694                                unsigned int order, int migratetype)
 695{
 696        struct page_ext *page_ext;
 697
 698        if (!debug_guardpage_enabled())
 699                return;
 700
 701        page_ext = lookup_page_ext(page);
 702        if (unlikely(!page_ext))
 703                return;
 704
 705        __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
 706
 707        set_page_private(page, 0);
 708        if (!is_migrate_isolate(migratetype))
 709                __mod_zone_freepage_state(zone, (1 << order), migratetype);
 710}
 711#else
 712struct page_ext_operations debug_guardpage_ops;
 713static inline bool set_page_guard(struct zone *zone, struct page *page,
 714                        unsigned int order, int migratetype) { return false; }
 715static inline void clear_page_guard(struct zone *zone, struct page *page,
 716                                unsigned int order, int migratetype) {}
 717#endif
 718
 719static inline void set_page_order(struct page *page, unsigned int order)
 720{
 721        set_page_private(page, order);
 722        __SetPageBuddy(page);
 723}
 724
 725static inline void rmv_page_order(struct page *page)
 726{
 727        __ClearPageBuddy(page);
 728        set_page_private(page, 0);
 729}
 730
 731/*
 732 * This function checks whether a page is free && is the buddy
 733 * we can do coalesce a page and its buddy if
 734 * (a) the buddy is not in a hole (check before calling!) &&
 735 * (b) the buddy is in the buddy system &&
 736 * (c) a page and its buddy have the same order &&
 737 * (d) a page and its buddy are in the same zone.
 738 *
 739 * For recording whether a page is in the buddy system, we set ->_mapcount
 740 * PAGE_BUDDY_MAPCOUNT_VALUE.
 741 * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is
 742 * serialized by zone->lock.
 743 *
 744 * For recording page's order, we use page_private(page).
 745 */
 746static inline int page_is_buddy(struct page *page, struct page *buddy,
 747                                                        unsigned int order)
 748{
 749        if (page_is_guard(buddy) && page_order(buddy) == order) {
 750                if (page_zone_id(page) != page_zone_id(buddy))
 751                        return 0;
 752
 753                VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 754
 755                return 1;
 756        }
 757
 758        if (PageBuddy(buddy) && page_order(buddy) == order) {
 759                /*
 760                 * zone check is done late to avoid uselessly
 761                 * calculating zone/node ids for pages that could
 762                 * never merge.
 763                 */
 764                if (page_zone_id(page) != page_zone_id(buddy))
 765                        return 0;
 766
 767                VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
 768
 769                return 1;
 770        }
 771        return 0;
 772}
 773
 774/*
 775 * Freeing function for a buddy system allocator.
 776 *
 777 * The concept of a buddy system is to maintain direct-mapped table
 778 * (containing bit values) for memory blocks of various "orders".
 779 * The bottom level table contains the map for the smallest allocatable
 780 * units of memory (here, pages), and each level above it describes
 781 * pairs of units from the levels below, hence, "buddies".
 782 * At a high level, all that happens here is marking the table entry
 783 * at the bottom level available, and propagating the changes upward
 784 * as necessary, plus some accounting needed to play nicely with other
 785 * parts of the VM system.
 786 * At each level, we keep a list of pages, which are heads of continuous
 787 * free pages of length of (1 << order) and marked with _mapcount
 788 * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page)
 789 * field.
 790 * So when we are allocating or freeing one, we can derive the state of the
 791 * other.  That is, if we allocate a small block, and both were
 792 * free, the remainder of the region must be split into blocks.
 793 * If a block is freed, and its buddy is also free, then this
 794 * triggers coalescing into a block of larger size.
 795 *
 796 * -- nyc
 797 */
 798
 799static inline void __free_one_page(struct page *page,
 800                unsigned long pfn,
 801                struct zone *zone, unsigned int order,
 802                int migratetype)
 803{
 804        unsigned long combined_pfn;
 805        unsigned long uninitialized_var(buddy_pfn);
 806        struct page *buddy;
 807        unsigned int max_order;
 808
 809        max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
 810
 811        VM_BUG_ON(!zone_is_initialized(zone));
 812        VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
 813
 814        VM_BUG_ON(migratetype == -1);
 815        if (likely(!is_migrate_isolate(migratetype)))
 816                __mod_zone_freepage_state(zone, 1 << order, migratetype);
 817
 818        VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page);
 819        VM_BUG_ON_PAGE(bad_range(zone, page), page);
 820
 821continue_merging:
 822        while (order < max_order - 1) {
 823                buddy_pfn = __find_buddy_pfn(pfn, order);
 824                buddy = page + (buddy_pfn - pfn);
 825
 826                if (!pfn_valid_within(buddy_pfn))
 827                        goto done_merging;
 828                if (!page_is_buddy(page, buddy, order))
 829                        goto done_merging;
 830                /*
 831                 * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
 832                 * merge with it and move up one order.
 833                 */
 834                if (page_is_guard(buddy)) {
 835                        clear_page_guard(zone, buddy, order, migratetype);
 836                } else {
 837                        list_del(&buddy->lru);
 838                        zone->free_area[order].nr_free--;
 839                        rmv_page_order(buddy);
 840                }
 841                combined_pfn = buddy_pfn & pfn;
 842                page = page + (combined_pfn - pfn);
 843                pfn = combined_pfn;
 844                order++;
 845        }
 846        if (max_order < MAX_ORDER) {
 847                /* If we are here, it means order is >= pageblock_order.
 848                 * We want to prevent merge between freepages on isolate
 849                 * pageblock and normal pageblock. Without this, pageblock
 850                 * isolation could cause incorrect freepage or CMA accounting.
 851                 *
 852                 * We don't want to hit this code for the more frequent
 853                 * low-order merging.
 854                 */
 855                if (unlikely(has_isolate_pageblock(zone))) {
 856                        int buddy_mt;
 857
 858                        buddy_pfn = __find_buddy_pfn(pfn, order);
 859                        buddy = page + (buddy_pfn - pfn);
 860                        buddy_mt = get_pageblock_migratetype(buddy);
 861
 862                        if (migratetype != buddy_mt
 863                                        && (is_migrate_isolate(migratetype) ||
 864                                                is_migrate_isolate(buddy_mt)))
 865                                goto done_merging;
 866                }
 867                max_order++;
 868                goto continue_merging;
 869        }
 870
 871done_merging:
 872        set_page_order(page, order);
 873
 874        /*
 875         * If this is not the largest possible page, check if the buddy
 876         * of the next-highest order is free. If it is, it's possible
 877         * that pages are being freed that will coalesce soon. In case,
 878         * that is happening, add the free page to the tail of the list
 879         * so it's less likely to be used soon and more likely to be merged
 880         * as a higher order page
 881         */
 882        if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
 883                struct page *higher_page, *higher_buddy;
 884                combined_pfn = buddy_pfn & pfn;
 885                higher_page = page + (combined_pfn - pfn);
 886                buddy_pfn = __find_buddy_pfn(combined_pfn, order + 1);
 887                higher_buddy = higher_page + (buddy_pfn - combined_pfn);
 888                if (pfn_valid_within(buddy_pfn) &&
 889                    page_is_buddy(higher_page, higher_buddy, order + 1)) {
 890                        list_add_tail(&page->lru,
 891                                &zone->free_area[order].free_list[migratetype]);
 892                        goto out;
 893                }
 894        }
 895
 896        list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
 897out:
 898        zone->free_area[order].nr_free++;
 899}
 900
 901/*
 902 * A bad page could be due to a number of fields. Instead of multiple branches,
 903 * try and check multiple fields with one check. The caller must do a detailed
 904 * check if necessary.
 905 */
 906static inline bool page_expected_state(struct page *page,
 907                                        unsigned long check_flags)
 908{
 909        if (unlikely(atomic_read(&page->_mapcount) != -1))
 910                return false;
 911
 912        if (unlikely((unsigned long)page->mapping |
 913                        page_ref_count(page) |
 914#ifdef CONFIG_MEMCG
 915                        (unsigned long)page->mem_cgroup |
 916#endif
 917                        (page->flags & check_flags)))
 918                return false;
 919
 920        return true;
 921}
 922
 923static void free_pages_check_bad(struct page *page)
 924{
 925        const char *bad_reason;
 926        unsigned long bad_flags;
 927
 928        bad_reason = NULL;
 929        bad_flags = 0;
 930
 931        if (unlikely(atomic_read(&page->_mapcount) != -1))
 932                bad_reason = "nonzero mapcount";
 933        if (unlikely(page->mapping != NULL))
 934                bad_reason = "non-NULL mapping";
 935        if (unlikely(page_ref_count(page) != 0))
 936                bad_reason = "nonzero _refcount";
 937        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
 938                bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
 939                bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
 940        }
 941#ifdef CONFIG_MEMCG
 942        if (unlikely(page->mem_cgroup))
 943                bad_reason = "page still charged to cgroup";
 944#endif
 945        bad_page(page, bad_reason, bad_flags);
 946}
 947
 948static inline int free_pages_check(struct page *page)
 949{
 950        if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
 951                return 0;
 952
 953        /* Something has gone sideways, find it */
 954        free_pages_check_bad(page);
 955        return 1;
 956}
 957
 958static int free_tail_pages_check(struct page *head_page, struct page *page)
 959{
 960        int ret = 1;
 961
 962        /*
 963         * We rely page->lru.next never has bit 0 set, unless the page
 964         * is PageTail(). Let's make sure that's true even for poisoned ->lru.
 965         */
 966        BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1);
 967
 968        if (!IS_ENABLED(CONFIG_DEBUG_VM)) {
 969                ret = 0;
 970                goto out;
 971        }
 972        switch (page - head_page) {
 973        case 1:
 974                /* the first tail page: ->mapping is compound_mapcount() */
 975                if (unlikely(compound_mapcount(page))) {
 976                        bad_page(page, "nonzero compound_mapcount", 0);
 977                        goto out;
 978                }
 979                break;
 980        case 2:
 981                /*
 982                 * the second tail page: ->mapping is
 983                 * page_deferred_list().next -- ignore value.
 984                 */
 985                break;
 986        default:
 987                if (page->mapping != TAIL_MAPPING) {
 988                        bad_page(page, "corrupted mapping in tail page", 0);
 989                        goto out;
 990                }
 991                break;
 992        }
 993        if (unlikely(!PageTail(page))) {
 994                bad_page(page, "PageTail not set", 0);
 995                goto out;
 996        }
 997        if (unlikely(compound_head(page) != head_page)) {
 998                bad_page(page, "compound_head not consistent", 0);
 999                goto out;
1000        }
1001        ret = 0;
1002out:
1003        page->mapping = NULL;
1004        clear_compound_head(page);
1005        return ret;
1006}
1007
1008static __always_inline bool free_pages_prepare(struct page *page,
1009                                        unsigned int order, bool check_free)
1010{
1011        int bad = 0;
1012
1013        VM_BUG_ON_PAGE(PageTail(page), page);
1014
1015        trace_mm_page_free(page, order);
1016        kmemcheck_free_shadow(page, order);
1017
1018        /*
1019         * Check tail pages before head page information is cleared to
1020         * avoid checking PageCompound for order-0 pages.
1021         */
1022        if (unlikely(order)) {
1023                bool compound = PageCompound(page);
1024                int i;
1025
1026                VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
1027
1028                if (compound)
1029                        ClearPageDoubleMap(page);
1030                for (i = 1; i < (1 << order); i++) {
1031                        if (compound)
1032                                bad += free_tail_pages_check(page, page + i);
1033                        if (unlikely(free_pages_check(page + i))) {
1034                                bad++;
1035                                continue;
1036                        }
1037                        (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1038                }
1039        }
1040        if (PageMappingFlags(page))
1041                page->mapping = NULL;
1042        if (memcg_kmem_enabled() && PageKmemcg(page))
1043                memcg_kmem_uncharge(page, order);
1044        if (check_free)
1045                bad += free_pages_check(page);
1046        if (bad)
1047                return false;
1048
1049        page_cpupid_reset_last(page);
1050        page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
1051        reset_page_owner(page, order);
1052
1053        if (!PageHighMem(page)) {
1054                debug_check_no_locks_freed(page_address(page),
1055                                           PAGE_SIZE << order);
1056                debug_check_no_obj_freed(page_address(page),
1057                                           PAGE_SIZE << order);
1058        }
1059        arch_free_page(page, order);
1060        kernel_poison_pages(page, 1 << order, 0);
1061        kernel_map_pages(page, 1 << order, 0);
1062        kasan_free_pages(page, order);
1063
1064        return true;
1065}
1066
1067#ifdef CONFIG_DEBUG_VM
1068static inline bool free_pcp_prepare(struct page *page)
1069{
1070        return free_pages_prepare(page, 0, true);
1071}
1072
1073static inline bool bulkfree_pcp_prepare(struct page *page)
1074{
1075        return false;
1076}
1077#else
1078static bool free_pcp_prepare(struct page *page)
1079{
1080        return free_pages_prepare(page, 0, false);
1081}
1082
1083static bool bulkfree_pcp_prepare(struct page *page)
1084{
1085        return free_pages_check(page);
1086}
1087#endif /* CONFIG_DEBUG_VM */
1088
1089/*
1090 * Frees a number of pages from the PCP lists
1091 * Assumes all pages on list are in same zone, and of same order.
1092 * count is the number of pages to free.
1093 *
1094 * If the zone was previously in an "all pages pinned" state then look to
1095 * see if this freeing clears that state.
1096 *
1097 * And clear the zone's pages_scanned counter, to hold off the "all pages are
1098 * pinned" detection logic.
1099 */
1100static void free_pcppages_bulk(struct zone *zone, int count,
1101                                        struct per_cpu_pages *pcp)
1102{
1103        int migratetype = 0;
1104        int batch_free = 0;
1105        bool isolated_pageblocks;
1106
1107        spin_lock(&zone->lock);
1108        isolated_pageblocks = has_isolate_pageblock(zone);
1109
1110        while (count) {
1111                struct page *page;
1112                struct list_head *list;
1113
1114                /*
1115                 * Remove pages from lists in a round-robin fashion. A
1116                 * batch_free count is maintained that is incremented when an
1117                 * empty list is encountered.  This is so more pages are freed
1118                 * off fuller lists instead of spinning excessively around empty
1119                 * lists
1120                 */
1121                do {
1122                        batch_free++;
1123                        if (++migratetype == MIGRATE_PCPTYPES)
1124                                migratetype = 0;
1125                        list = &pcp->lists[migratetype];
1126                } while (list_empty(list));
1127
1128                /* This is the only non-empty list. Free them all. */
1129                if (batch_free == MIGRATE_PCPTYPES)
1130                        batch_free = count;
1131
1132                do {
1133                        int mt; /* migratetype of the to-be-freed page */
1134
1135                        page = list_last_entry(list, struct page, lru);
1136                        /* must delete as __free_one_page list manipulates */
1137                        list_del(&page->lru);
1138
1139                        mt = get_pcppage_migratetype(page);
1140                        /* MIGRATE_ISOLATE page should not go to pcplists */
1141                        VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
1142                        /* Pageblock could have been isolated meanwhile */
1143                        if (unlikely(isolated_pageblocks))
1144                                mt = get_pageblock_migratetype(page);
1145
1146                        if (bulkfree_pcp_prepare(page))
1147                                continue;
1148
1149                        __free_one_page(page, page_to_pfn(page), zone, 0, mt);
1150                        trace_mm_page_pcpu_drain(page, 0, mt);
1151                } while (--count && --batch_free && !list_empty(list));
1152        }
1153        spin_unlock(&zone->lock);
1154}
1155
1156static void free_one_page(struct zone *zone,
1157                                struct page *page, unsigned long pfn,
1158                                unsigned int order,
1159                                int migratetype)
1160{
1161        spin_lock(&zone->lock);
1162        if (unlikely(has_isolate_pageblock(zone) ||
1163                is_migrate_isolate(migratetype))) {
1164                migratetype = get_pfnblock_migratetype(page, pfn);
1165        }
1166        __free_one_page(page, pfn, zone, order, migratetype);
1167        spin_unlock(&zone->lock);
1168}
1169
1170static void __meminit __init_single_page(struct page *page, unsigned long pfn,
1171                                unsigned long zone, int nid)
1172{
1173        set_page_links(page, zone, nid, pfn);
1174        init_page_count(page);
1175        page_mapcount_reset(page);
1176        page_cpupid_reset_last(page);
1177
1178        INIT_LIST_HEAD(&page->lru);
1179#ifdef WANT_PAGE_VIRTUAL
1180        /* The shift won't overflow because ZONE_NORMAL is below 4G. */
1181        if (!is_highmem_idx(zone))
1182                set_page_address(page, __va(pfn << PAGE_SHIFT));
1183#endif
1184}
1185
1186static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
1187                                        int nid)
1188{
1189        return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
1190}
1191
1192#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1193static void __meminit init_reserved_page(unsigned long pfn)
1194{
1195        pg_data_t *pgdat;
1196        int nid, zid;
1197
1198        if (!early_page_uninitialised(pfn))
1199                return;
1200
1201        nid = early_pfn_to_nid(pfn);
1202        pgdat = NODE_DATA(nid);
1203
1204        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1205                struct zone *zone = &pgdat->node_zones[zid];
1206
1207                if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
1208                        break;
1209        }
1210        __init_single_pfn(pfn, zid, nid);
1211}
1212#else
1213static inline void init_reserved_page(unsigned long pfn)
1214{
1215}
1216#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1217
1218/*
1219 * Initialised pages do not have PageReserved set. This function is
1220 * called for each range allocated by the bootmem allocator and
1221 * marks the pages PageReserved. The remaining valid pages are later
1222 * sent to the buddy page allocator.
1223 */
1224void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
1225{
1226        unsigned long start_pfn = PFN_DOWN(start);
1227        unsigned long end_pfn = PFN_UP(end);
1228
1229        for (; start_pfn < end_pfn; start_pfn++) {
1230                if (pfn_valid(start_pfn)) {
1231                        struct page *page = pfn_to_page(start_pfn);
1232
1233                        init_reserved_page(start_pfn);
1234
1235                        /* Avoid false-positive PageTail() */
1236                        INIT_LIST_HEAD(&page->lru);
1237
1238                        SetPageReserved(page);
1239                }
1240        }
1241}
1242
1243static void __free_pages_ok(struct page *page, unsigned int order)
1244{
1245        unsigned long flags;
1246        int migratetype;
1247        unsigned long pfn = page_to_pfn(page);
1248
1249        if (!free_pages_prepare(page, order, true))
1250                return;
1251
1252        migratetype = get_pfnblock_migratetype(page, pfn);
1253        local_irq_save(flags);
1254        __count_vm_events(PGFREE, 1 << order);
1255        free_one_page(page_zone(page), page, pfn, order, migratetype);
1256        local_irq_restore(flags);
1257}
1258
1259static void __init __free_pages_boot_core(struct page *page, unsigned int order)
1260{
1261        unsigned int nr_pages = 1 << order;
1262        struct page *p = page;
1263        unsigned int loop;
1264
1265        prefetchw(p);
1266        for (loop = 0; loop < (nr_pages - 1); loop++, p++) {
1267                prefetchw(p + 1);
1268                __ClearPageReserved(p);
1269                set_page_count(p, 0);
1270        }
1271        __ClearPageReserved(p);
1272        set_page_count(p, 0);
1273
1274        page_zone(page)->managed_pages += nr_pages;
1275        set_page_refcounted(page);
1276        __free_pages(page, order);
1277}
1278
1279#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
1280        defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
1281
1282static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
1283
1284int __meminit early_pfn_to_nid(unsigned long pfn)
1285{
1286        static DEFINE_SPINLOCK(early_pfn_lock);
1287        int nid;
1288
1289        spin_lock(&early_pfn_lock);
1290        nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
1291        if (nid < 0)
1292                nid = first_online_node;
1293        spin_unlock(&early_pfn_lock);
1294
1295        return nid;
1296}
1297#endif
1298
1299#ifdef CONFIG_NODES_SPAN_OTHER_NODES
1300static inline bool __meminit __maybe_unused
1301meminit_pfn_in_nid(unsigned long pfn, int node,
1302                   struct mminit_pfnnid_cache *state)
1303{
1304        int nid;
1305
1306        nid = __early_pfn_to_nid(pfn, state);
1307        if (nid >= 0 && nid != node)
1308                return false;
1309        return true;
1310}
1311
1312/* Only safe to use early in boot when initialisation is single-threaded */
1313static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1314{
1315        return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
1316}
1317
1318#else
1319
1320static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
1321{
1322        return true;
1323}
1324static inline bool __meminit  __maybe_unused
1325meminit_pfn_in_nid(unsigned long pfn, int node,
1326                   struct mminit_pfnnid_cache *state)
1327{
1328        return true;
1329}
1330#endif
1331
1332
1333void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
1334                                                        unsigned int order)
1335{
1336        if (early_page_uninitialised(pfn))
1337                return;
1338        return __free_pages_boot_core(page, order);
1339}
1340
1341/*
1342 * Check that the whole (or subset of) a pageblock given by the interval of
1343 * [start_pfn, end_pfn) is valid and within the same zone, before scanning it
1344 * with the migration of free compaction scanner. The scanners then need to
1345 * use only pfn_valid_within() check for arches that allow holes within
1346 * pageblocks.
1347 *
1348 * Return struct page pointer of start_pfn, or NULL if checks were not passed.
1349 *
1350 * It's possible on some configurations to have a setup like node0 node1 node0
1351 * i.e. it's possible that all pages within a zones range of pages do not
1352 * belong to a single zone. We assume that a border between node0 and node1
1353 * can occur within a single pageblock, but not a node0 node1 node0
1354 * interleaving within a single pageblock. It is therefore sufficient to check
1355 * the first and last page of a pageblock and avoid checking each individual
1356 * page in a pageblock.
1357 */
1358struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
1359                                     unsigned long end_pfn, struct zone *zone)
1360{
1361        struct page *start_page;
1362        struct page *end_page;
1363
1364        /* end_pfn is one past the range we are checking */
1365        end_pfn--;
1366
1367        if (!pfn_valid(start_pfn) || !pfn_valid(end_pfn))
1368                return NULL;
1369
1370        start_page = pfn_to_online_page(start_pfn);
1371        if (!start_page)
1372                return NULL;
1373
1374        if (page_zone(start_page) != zone)
1375                return NULL;
1376
1377        end_page = pfn_to_page(end_pfn);
1378
1379        /* This gives a shorter code than deriving page_zone(end_page) */
1380        if (page_zone_id(start_page) != page_zone_id(end_page))
1381                return NULL;
1382
1383        return start_page;
1384}
1385
1386void set_zone_contiguous(struct zone *zone)
1387{
1388        unsigned long block_start_pfn = zone->zone_start_pfn;
1389        unsigned long block_end_pfn;
1390
1391        block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages);
1392        for (; block_start_pfn < zone_end_pfn(zone);
1393                        block_start_pfn = block_end_pfn,
1394                         block_end_pfn += pageblock_nr_pages) {
1395
1396                block_end_pfn = min(block_end_pfn, zone_end_pfn(zone));
1397
1398                if (!__pageblock_pfn_to_page(block_start_pfn,
1399                                             block_end_pfn, zone))
1400                        return;
1401        }
1402
1403        /* We confirm that there is no hole */
1404        zone->contiguous = true;
1405}
1406
1407void clear_zone_contiguous(struct zone *zone)
1408{
1409        zone->contiguous = false;
1410}
1411
1412#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1413static void __init deferred_free_range(struct page *page,
1414                                        unsigned long pfn, int nr_pages)
1415{
1416        int i;
1417
1418        if (!page)
1419                return;
1420
1421        /* Free a large naturally-aligned chunk if possible */
1422        if (nr_pages == pageblock_nr_pages &&
1423            (pfn & (pageblock_nr_pages - 1)) == 0) {
1424                set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1425                __free_pages_boot_core(page, pageblock_order);
1426                return;
1427        }
1428
1429        for (i = 0; i < nr_pages; i++, page++, pfn++) {
1430                if ((pfn & (pageblock_nr_pages - 1)) == 0)
1431                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
1432                __free_pages_boot_core(page, 0);
1433        }
1434}
1435
1436/* Completion tracking for deferred_init_memmap() threads */
1437static atomic_t pgdat_init_n_undone __initdata;
1438static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
1439
1440static inline void __init pgdat_init_report_one_done(void)
1441{
1442        if (atomic_dec_and_test(&pgdat_init_n_undone))
1443                complete(&pgdat_init_all_done_comp);
1444}
1445
1446/* Initialise remaining memory on a node */
1447static int __init deferred_init_memmap(void *data)
1448{
1449        pg_data_t *pgdat = data;
1450        int nid = pgdat->node_id;
1451        struct mminit_pfnnid_cache nid_init_state = { };
1452        unsigned long start = jiffies;
1453        unsigned long nr_pages = 0;
1454        unsigned long walk_start, walk_end;
1455        int i, zid;
1456        struct zone *zone;
1457        unsigned long first_init_pfn = pgdat->first_deferred_pfn;
1458        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1459
1460        if (first_init_pfn == ULONG_MAX) {
1461                pgdat_init_report_one_done();
1462                return 0;
1463        }
1464
1465        /* Bind memory initialisation thread to a local node if possible */
1466        if (!cpumask_empty(cpumask))
1467                set_cpus_allowed_ptr(current, cpumask);
1468
1469        /* Sanity check boundaries */
1470        BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
1471        BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
1472        pgdat->first_deferred_pfn = ULONG_MAX;
1473
1474        /* Only the highest zone is deferred so find it */
1475        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1476                zone = pgdat->node_zones + zid;
1477                if (first_init_pfn < zone_end_pfn(zone))
1478                        break;
1479        }
1480
1481        for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
1482                unsigned long pfn, end_pfn;
1483                struct page *page = NULL;
1484                struct page *free_base_page = NULL;
1485                unsigned long free_base_pfn = 0;
1486                int nr_to_free = 0;
1487
1488                end_pfn = min(walk_end, zone_end_pfn(zone));
1489                pfn = first_init_pfn;
1490                if (pfn < walk_start)
1491                        pfn = walk_start;
1492                if (pfn < zone->zone_start_pfn)
1493                        pfn = zone->zone_start_pfn;
1494
1495                for (; pfn < end_pfn; pfn++) {
1496                        if (!pfn_valid_within(pfn))
1497                                goto free_range;
1498
1499                        /*
1500                         * Ensure pfn_valid is checked every
1501                         * pageblock_nr_pages for memory holes
1502                         */
1503                        if ((pfn & (pageblock_nr_pages - 1)) == 0) {
1504                                if (!pfn_valid(pfn)) {
1505                                        page = NULL;
1506                                        goto free_range;
1507                                }
1508                        }
1509
1510                        if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) {
1511                                page = NULL;
1512                                goto free_range;
1513                        }
1514
1515                        /* Minimise pfn page lookups and scheduler checks */
1516                        if (page && (pfn & (pageblock_nr_pages - 1)) != 0) {
1517                                page++;
1518                        } else {
1519                                nr_pages += nr_to_free;
1520                                deferred_free_range(free_base_page,
1521                                                free_base_pfn, nr_to_free);
1522                                free_base_page = NULL;
1523                                free_base_pfn = nr_to_free = 0;
1524
1525                                page = pfn_to_page(pfn);
1526                                cond_resched();
1527                        }
1528
1529                        if (page->flags) {
1530                                VM_BUG_ON(page_zone(page) != zone);
1531                                goto free_range;
1532                        }
1533
1534                        __init_single_page(page, pfn, zid, nid);
1535                        if (!free_base_page) {
1536                                free_base_page = page;
1537                                free_base_pfn = pfn;
1538                                nr_to_free = 0;
1539                        }
1540                        nr_to_free++;
1541
1542                        /* Where possible, batch up pages for a single free */
1543                        continue;
1544free_range:
1545                        /* Free the current block of pages to allocator */
1546                        nr_pages += nr_to_free;
1547                        deferred_free_range(free_base_page, free_base_pfn,
1548                                                                nr_to_free);
1549                        free_base_page = NULL;
1550                        free_base_pfn = nr_to_free = 0;
1551                }
1552                /* Free the last block of pages to allocator */
1553                nr_pages += nr_to_free;
1554                deferred_free_range(free_base_page, free_base_pfn, nr_to_free);
1555
1556                first_init_pfn = max(end_pfn, first_init_pfn);
1557        }
1558
1559        /* Sanity check that the next zone really is unpopulated */
1560        WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
1561
1562        pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
1563                                        jiffies_to_msecs(jiffies - start));
1564
1565        pgdat_init_report_one_done();
1566        return 0;
1567}
1568#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
1569
1570void __init page_alloc_init_late(void)
1571{
1572        struct zone *zone;
1573
1574#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
1575        int nid;
1576
1577        /* There will be num_node_state(N_MEMORY) threads */
1578        atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
1579        for_each_node_state(nid, N_MEMORY) {
1580                kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
1581        }
1582
1583        /* Block until all are initialised */
1584        wait_for_completion(&pgdat_init_all_done_comp);
1585
1586        /* Reinit limits that are based on free pages after the kernel is up */
1587        files_maxfiles_init();
1588#endif
1589#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
1590        /* Discard memblock private memory */
1591        memblock_discard();
1592#endif
1593
1594        for_each_populated_zone(zone)
1595                set_zone_contiguous(zone);
1596}
1597
1598#ifdef CONFIG_CMA
1599/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
1600void __init init_cma_reserved_pageblock(struct page *page)
1601{
1602        unsigned i = pageblock_nr_pages;
1603        struct page *p = page;
1604
1605        do {
1606                __ClearPageReserved(p);
1607                set_page_count(p, 0);
1608        } while (++p, --i);
1609
1610        set_pageblock_migratetype(page, MIGRATE_CMA);
1611
1612        if (pageblock_order >= MAX_ORDER) {
1613                i = pageblock_nr_pages;
1614                p = page;
1615                do {
1616                        set_page_refcounted(p);
1617                        __free_pages(p, MAX_ORDER - 1);
1618                        p += MAX_ORDER_NR_PAGES;
1619                } while (i -= MAX_ORDER_NR_PAGES);
1620        } else {
1621                set_page_refcounted(page);
1622                __free_pages(page, pageblock_order);
1623        }
1624
1625        adjust_managed_page_count(page, pageblock_nr_pages);
1626}
1627#endif
1628
1629/*
1630 * The order of subdivision here is critical for the IO subsystem.
1631 * Please do not alter this order without good reasons and regression
1632 * testing. Specifically, as large blocks of memory are subdivided,
1633 * the order in which smaller blocks are delivered depends on the order
1634 * they're subdivided in this function. This is the primary factor
1635 * influencing the order in which pages are delivered to the IO
1636 * subsystem according to empirical testing, and this is also justified
1637 * by considering the behavior of a buddy system containing a single
1638 * large block of memory acted on by a series of small allocations.
1639 * This behavior is a critical factor in sglist merging's success.
1640 *
1641 * -- nyc
1642 */
1643static inline void expand(struct zone *zone, struct page *page,
1644        int low, int high, struct free_area *area,
1645        int migratetype)
1646{
1647        unsigned long size = 1 << high;
1648
1649        while (high > low) {
1650                area--;
1651                high--;
1652                size >>= 1;
1653                VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]);
1654
1655                /*
1656                 * Mark as guard pages (or page), that will allow to
1657                 * merge back to allocator when buddy will be freed.
1658                 * Corresponding page table entries will not be touched,
1659                 * pages will stay not present in virtual address space
1660                 */
1661                if (set_page_guard(zone, &page[size], high, migratetype))
1662                        continue;
1663
1664                list_add(&page[size].lru, &area->free_list[migratetype]);
1665                area->nr_free++;
1666                set_page_order(&page[size], high);
1667        }
1668}
1669
1670static void check_new_page_bad(struct page *page)
1671{
1672        const char *bad_reason = NULL;
1673        unsigned long bad_flags = 0;
1674
1675        if (unlikely(atomic_read(&page->_mapcount) != -1))
1676                bad_reason = "nonzero mapcount";
1677        if (unlikely(page->mapping != NULL))
1678                bad_reason = "non-NULL mapping";
1679        if (unlikely(page_ref_count(page) != 0))
1680                bad_reason = "nonzero _count";
1681        if (unlikely(page->flags & __PG_HWPOISON)) {
1682                bad_reason = "HWPoisoned (hardware-corrupted)";
1683                bad_flags = __PG_HWPOISON;
1684                /* Don't complain about hwpoisoned pages */
1685                page_mapcount_reset(page); /* remove PageBuddy */
1686                return;
1687        }
1688        if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
1689                bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
1690                bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
1691        }
1692#ifdef CONFIG_MEMCG
1693        if (unlikely(page->mem_cgroup))
1694                bad_reason = "page still charged to cgroup";
1695#endif
1696        bad_page(page, bad_reason, bad_flags);
1697}
1698
1699/*
1700 * This page is about to be returned from the page allocator
1701 */
1702static inline int check_new_page(struct page *page)
1703{
1704        if (likely(page_expected_state(page,
1705                                PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON)))
1706                return 0;
1707
1708        check_new_page_bad(page);
1709        return 1;
1710}
1711
1712static inline bool free_pages_prezeroed(void)
1713{
1714        return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
1715                page_poisoning_enabled();
1716}
1717
1718#ifdef CONFIG_DEBUG_VM
1719static bool check_pcp_refill(struct page *page)
1720{
1721        return false;
1722}
1723
1724static bool check_new_pcp(struct page *page)
1725{
1726        return check_new_page(page);
1727}
1728#else
1729static bool check_pcp_refill(struct page *page)
1730{
1731        return check_new_page(page);
1732}
1733static bool check_new_pcp(struct page *page)
1734{
1735        return false;
1736}
1737#endif /* CONFIG_DEBUG_VM */
1738
1739static bool check_new_pages(struct page *page, unsigned int order)
1740{
1741        int i;
1742        for (i = 0; i < (1 << order); i++) {
1743                struct page *p = page + i;
1744
1745                if (unlikely(check_new_page(p)))
1746                        return true;
1747        }
1748
1749        return false;
1750}
1751
1752inline void post_alloc_hook(struct page *page, unsigned int order,
1753                                gfp_t gfp_flags)
1754{
1755        set_page_private(page, 0);
1756        set_page_refcounted(page);
1757
1758        arch_alloc_page(page, order);
1759        kernel_map_pages(page, 1 << order, 1);
1760        kernel_poison_pages(page, 1 << order, 1);
1761        kasan_alloc_pages(page, order);
1762        set_page_owner(page, order, gfp_flags);
1763}
1764
1765static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
1766                                                        unsigned int alloc_flags)
1767{
1768        int i;
1769
1770        post_alloc_hook(page, order, gfp_flags);
1771
1772        if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
1773                for (i = 0; i < (1 << order); i++)
1774                        clear_highpage(page + i);
1775
1776        if (order && (gfp_flags & __GFP_COMP))
1777                prep_compound_page(page, order);
1778
1779        /*
1780         * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to
1781         * allocate the page. The expectation is that the caller is taking
1782         * steps that will free more memory. The caller should avoid the page
1783         * being used for !PFMEMALLOC purposes.
1784         */
1785        if (alloc_flags & ALLOC_NO_WATERMARKS)
1786                set_page_pfmemalloc(page);
1787        else
1788                clear_page_pfmemalloc(page);
1789}
1790
1791/*
1792 * Go through the free lists for the given migratetype and remove
1793 * the smallest available page from the freelists
1794 */
1795static inline
1796struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
1797                                                int migratetype)
1798{
1799        unsigned int current_order;
1800        struct free_area *area;
1801        struct page *page;
1802
1803        /* Find a page of the appropriate size in the preferred list */
1804        for (current_order = order; current_order < MAX_ORDER; ++current_order) {
1805                area = &(zone->free_area[current_order]);
1806                page = list_first_entry_or_null(&area->free_list[migratetype],
1807                                                        struct page, lru);
1808                if (!page)
1809                        continue;
1810                list_del(&page->lru);
1811                rmv_page_order(page);
1812                area->nr_free--;
1813                expand(zone, page, order, current_order, area, migratetype);
1814                set_pcppage_migratetype(page, migratetype);
1815                return page;
1816        }
1817
1818        return NULL;
1819}
1820
1821
1822/*
1823 * This array describes the order lists are fallen back to when
1824 * the free lists for the desirable migrate type are depleted
1825 */
1826static int fallbacks[MIGRATE_TYPES][4] = {
1827        [MIGRATE_UNMOVABLE]   = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE,   MIGRATE_TYPES },
1828        [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE,   MIGRATE_MOVABLE,   MIGRATE_TYPES },
1829        [MIGRATE_MOVABLE]     = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
1830#ifdef CONFIG_CMA
1831        [MIGRATE_CMA]         = { MIGRATE_TYPES }, /* Never used */
1832#endif
1833#ifdef CONFIG_MEMORY_ISOLATION
1834        [MIGRATE_ISOLATE]     = { MIGRATE_TYPES }, /* Never used */
1835#endif
1836};
1837
1838#ifdef CONFIG_CMA
1839static struct page *__rmqueue_cma_fallback(struct zone *zone,
1840                                        unsigned int order)
1841{
1842        return __rmqueue_smallest(zone, order, MIGRATE_CMA);
1843}
1844#else
1845static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
1846                                        unsigned int order) { return NULL; }
1847#endif
1848
1849/*
1850 * Move the free pages in a range to the free lists of the requested type.
1851 * Note that start_page and end_pages are not aligned on a pageblock
1852 * boundary. If alignment is required, use move_freepages_block()
1853 */
1854static int move_freepages(struct zone *zone,
1855                          struct page *start_page, struct page *end_page,
1856                          int migratetype, int *num_movable)
1857{
1858        struct page *page;
1859        unsigned int order;
1860        int pages_moved = 0;
1861
1862#ifndef CONFIG_HOLES_IN_ZONE
1863        /*
1864         * page_zone is not safe to call in this context when
1865         * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
1866         * anyway as we check zone boundaries in move_freepages_block().
1867         * Remove at a later date when no bug reports exist related to
1868         * grouping pages by mobility
1869         */
1870        VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
1871#endif
1872
1873        if (num_movable)
1874                *num_movable = 0;
1875
1876        for (page = start_page; page <= end_page;) {
1877                if (!pfn_valid_within(page_to_pfn(page))) {
1878                        page++;
1879                        continue;
1880                }
1881
1882                /* Make sure we are not inadvertently changing nodes */
1883                VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
1884
1885                if (!PageBuddy(page)) {
1886                        /*
1887                         * We assume that pages that could be isolated for
1888                         * migration are movable. But we don't actually try
1889                         * isolating, as that would be expensive.
1890                         */
1891                        if (num_movable &&
1892                                        (PageLRU(page) || __PageMovable(page)))
1893                                (*num_movable)++;
1894
1895                        page++;
1896                        continue;
1897                }
1898
1899                order = page_order(page);
1900                list_move(&page->lru,
1901                          &zone->free_area[order].free_list[migratetype]);
1902                page += 1 << order;
1903                pages_moved += 1 << order;
1904        }
1905
1906        return pages_moved;
1907}
1908
1909int move_freepages_block(struct zone *zone, struct page *page,
1910                                int migratetype, int *num_movable)
1911{
1912        unsigned long start_pfn, end_pfn;
1913        struct page *start_page, *end_page;
1914
1915        start_pfn = page_to_pfn(page);
1916        start_pfn = start_pfn & ~(pageblock_nr_pages-1);
1917        start_page = pfn_to_page(start_pfn);
1918        end_page = start_page + pageblock_nr_pages - 1;
1919        end_pfn = start_pfn + pageblock_nr_pages - 1;
1920
1921        /* Do not cross zone boundaries */
1922        if (!zone_spans_pfn(zone, start_pfn))
1923                start_page = page;
1924        if (!zone_spans_pfn(zone, end_pfn))
1925                return 0;
1926
1927        return move_freepages(zone, start_page, end_page, migratetype,
1928                                                                num_movable);
1929}
1930
1931static void change_pageblock_range(struct page *pageblock_page,
1932                                        int start_order, int migratetype)
1933{
1934        int nr_pageblocks = 1 << (start_order - pageblock_order);
1935
1936        while (nr_pageblocks--) {
1937                set_pageblock_migratetype(pageblock_page, migratetype);
1938                pageblock_page += pageblock_nr_pages;
1939        }
1940}
1941
1942/*
1943 * When we are falling back to another migratetype during allocation, try to
1944 * steal extra free pages from the same pageblocks to satisfy further
1945 * allocations, instead of polluting multiple pageblocks.
1946 *
1947 * If we are stealing a relatively large buddy page, it is likely there will
1948 * be more free pages in the pageblock, so try to steal them all. For
1949 * reclaimable and unmovable allocations, we steal regardless of page size,
1950 * as fragmentation caused by those allocations polluting movable pageblocks
1951 * is worse than movable allocations stealing from unmovable and reclaimable
1952 * pageblocks.
1953 */
1954static bool can_steal_fallback(unsigned int order, int start_mt)
1955{
1956        /*
1957         * Leaving this order check is intended, although there is
1958         * relaxed order check in next check. The reason is that
1959         * we can actually steal whole pageblock if this condition met,
1960         * but, below check doesn't guarantee it and that is just heuristic
1961         * so could be changed anytime.
1962         */
1963        if (order >= pageblock_order)
1964                return true;
1965
1966        if (order >= pageblock_order / 2 ||
1967                start_mt == MIGRATE_RECLAIMABLE ||
1968                start_mt == MIGRATE_UNMOVABLE ||
1969                page_group_by_mobility_disabled)
1970                return true;
1971
1972        return false;
1973}
1974
1975/*
1976 * This function implements actual steal behaviour. If order is large enough,
1977 * we can steal whole pageblock. If not, we first move freepages in this
1978 * pageblock to our migratetype and determine how many already-allocated pages
1979 * are there in the pageblock with a compatible migratetype. If at least half
1980 * of pages are free or compatible, we can change migratetype of the pageblock
1981 * itself, so pages freed in the future will be put on the correct free list.
1982 */
1983static void steal_suitable_fallback(struct zone *zone, struct page *page,
1984                                        int start_type, bool whole_block)
1985{
1986        unsigned int current_order = page_order(page);
1987        struct free_area *area;
1988        int free_pages, movable_pages, alike_pages;
1989        int old_block_type;
1990
1991        old_block_type = get_pageblock_migratetype(page);
1992
1993        /*
1994         * This can happen due to races and we want to prevent broken
1995         * highatomic accounting.
1996         */
1997        if (is_migrate_highatomic(old_block_type))
1998                goto single_page;
1999
2000        /* Take ownership for orders >= pageblock_order */
2001        if (current_order >= pageblock_order) {
2002                change_pageblock_range(page, current_order, start_type);
2003                goto single_page;
2004        }
2005
2006        /* We are not allowed to try stealing from the whole block */
2007        if (!whole_block)
2008                goto single_page;
2009
2010        free_pages = move_freepages_block(zone, page, start_type,
2011                                                &movable_pages);
2012        /*
2013         * Determine how many pages are compatible with our allocation.
2014         * For movable allocation, it's the number of movable pages which
2015         * we just obtained. For other types it's a bit more tricky.
2016         */
2017        if (start_type == MIGRATE_MOVABLE) {
2018                alike_pages = movable_pages;
2019        } else {
2020                /*
2021                 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
2022                 * to MOVABLE pageblock, consider all non-movable pages as
2023                 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
2024                 * vice versa, be conservative since we can't distinguish the
2025                 * exact migratetype of non-movable pages.
2026                 */
2027                if (old_block_type == MIGRATE_MOVABLE)
2028                        alike_pages = pageblock_nr_pages
2029                                                - (free_pages + movable_pages);
2030                else
2031                        alike_pages = 0;
2032        }
2033
2034        /* moving whole block can fail due to zone boundary conditions */
2035        if (!free_pages)
2036                goto single_page;
2037
2038        /*
2039         * If a sufficient number of pages in the block are either free or of
2040         * comparable migratability as our allocation, claim the whole block.
2041         */
2042        if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
2043                        page_group_by_mobility_disabled)
2044                set_pageblock_migratetype(page, start_type);
2045
2046        return;
2047
2048single_page:
2049        area = &zone->free_area[current_order];
2050        list_move(&page->lru, &area->free_list[start_type]);
2051}
2052
2053/*
2054 * Check whether there is a suitable fallback freepage with requested order.
2055 * If only_stealable is true, this function returns fallback_mt only if
2056 * we can steal other freepages all together. This would help to reduce
2057 * fragmentation due to mixed migratetype pages in one pageblock.
2058 */
2059int find_suitable_fallback(struct free_area *area, unsigned int order,
2060                        int migratetype, bool only_stealable, bool *can_steal)
2061{
2062        int i;
2063        int fallback_mt;
2064
2065        if (area->nr_free == 0)
2066                return -1;
2067
2068        *can_steal = false;
2069        for (i = 0;; i++) {
2070                fallback_mt = fallbacks[migratetype][i];
2071                if (fallback_mt == MIGRATE_TYPES)
2072                        break;
2073
2074                if (list_empty(&area->free_list[fallback_mt]))
2075                        continue;
2076
2077                if (can_steal_fallback(order, migratetype))
2078                        *can_steal = true;
2079
2080                if (!only_stealable)
2081                        return fallback_mt;
2082
2083                if (*can_steal)
2084                        return fallback_mt;
2085        }
2086
2087        return -1;
2088}
2089
2090/*
2091 * Reserve a pageblock for exclusive use of high-order atomic allocations if
2092 * there are no empty page blocks that contain a page with a suitable order
2093 */
2094static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
2095                                unsigned int alloc_order)
2096{
2097        int mt;
2098        unsigned long max_managed, flags;
2099
2100        /*
2101         * Limit the number reserved to 1 pageblock or roughly 1% of a zone.
2102         * Check is race-prone but harmless.
2103         */
2104        max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
2105        if (zone->nr_reserved_highatomic >= max_managed)
2106                return;
2107
2108        spin_lock_irqsave(&zone->lock, flags);
2109
2110        /* Recheck the nr_reserved_highatomic limit under the lock */
2111        if (zone->nr_reserved_highatomic >= max_managed)
2112                goto out_unlock;
2113
2114        /* Yoink! */
2115        mt = get_pageblock_migratetype(page);
2116        if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
2117            && !is_migrate_cma(mt)) {
2118                zone->nr_reserved_highatomic += pageblock_nr_pages;
2119                set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
2120                move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
2121        }
2122
2123out_unlock:
2124        spin_unlock_irqrestore(&zone->lock, flags);
2125}
2126
2127/*
2128 * Used when an allocation is about to fail under memory pressure. This
2129 * potentially hurts the reliability of high-order allocations when under
2130 * intense memory pressure but failed atomic allocations should be easier
2131 * to recover from than an OOM.
2132 *
2133 * If @force is true, try to unreserve a pageblock even though highatomic
2134 * pageblock is exhausted.
2135 */
2136static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
2137                                                bool force)
2138{
2139        struct zonelist *zonelist = ac->zonelist;
2140        unsigned long flags;
2141        struct zoneref *z;
2142        struct zone *zone;
2143        struct page *page;
2144        int order;
2145        bool ret;
2146
2147        for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
2148                                                                ac->nodemask) {
2149                /*
2150                 * Preserve at least one pageblock unless memory pressure
2151                 * is really high.
2152                 */
2153                if (!force && zone->nr_reserved_highatomic <=
2154                                        pageblock_nr_pages)
2155                        continue;
2156
2157                spin_lock_irqsave(&zone->lock, flags);
2158                for (order = 0; order < MAX_ORDER; order++) {
2159                        struct free_area *area = &(zone->free_area[order]);
2160
2161                        page = list_first_entry_or_null(
2162                                        &area->free_list[MIGRATE_HIGHATOMIC],
2163                                        struct page, lru);
2164                        if (!page)
2165                                continue;
2166
2167                        /*
2168                         * In page freeing path, migratetype change is racy so
2169                         * we can counter several free pages in a pageblock
2170                         * in this loop althoug we changed the pageblock type
2171                         * from highatomic to ac->migratetype. So we should
2172                         * adjust the count once.
2173                         */
2174                        if (is_migrate_highatomic_page(page)) {
2175                                /*
2176                                 * It should never happen but changes to
2177                                 * locking could inadvertently allow a per-cpu
2178                                 * drain to add pages to MIGRATE_HIGHATOMIC
2179                                 * while unreserving so be safe and watch for
2180                                 * underflows.
2181                                 */
2182                                zone->nr_reserved_highatomic -= min(
2183                                                pageblock_nr_pages,
2184                                                zone->nr_reserved_highatomic);
2185                        }
2186
2187                        /*
2188                         * Convert to ac->migratetype and avoid the normal
2189                         * pageblock stealing heuristics. Minimally, the caller
2190                         * is doing the work and needs the pages. More
2191                         * importantly, if the block was always converted to
2192                         * MIGRATE_UNMOVABLE or another type then the number
2193                         * of pageblocks that cannot be completely freed
2194                         * may increase.
2195                         */
2196                        set_pageblock_migratetype(page, ac->migratetype);
2197                        ret = move_freepages_block(zone, page, ac->migratetype,
2198                                                                        NULL);
2199                        if (ret) {
2200                                spin_unlock_irqrestore(&zone->lock, flags);
2201                                return ret;
2202                        }
2203                }
2204                spin_unlock_irqrestore(&zone->lock, flags);
2205        }
2206
2207        return false;
2208}
2209
2210/*
2211 * Try finding a free buddy page on the fallback list and put it on the free
2212 * list of requested migratetype, possibly along with other pages from the same
2213 * block, depending on fragmentation avoidance heuristics. Returns true if
2214 * fallback was found so that __rmqueue_smallest() can grab it.
2215 *
2216 * The use of signed ints for order and current_order is a deliberate
2217 * deviation from the rest of this file, to make the for loop
2218 * condition simpler.
2219 */
2220static inline bool
2221__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
2222{
2223        struct free_area *area;
2224        int current_order;
2225        struct page *page;
2226        int fallback_mt;
2227        bool can_steal;
2228
2229        /*
2230         * Find the largest available free page in the other list. This roughly
2231         * approximates finding the pageblock with the most free pages, which
2232         * would be too costly to do exactly.
2233         */
2234        for (current_order = MAX_ORDER - 1; current_order >= order;
2235                                --current_order) {
2236                area = &(zone->free_area[current_order]);
2237                fallback_mt = find_suitable_fallback(area, current_order,
2238                                start_migratetype, false, &can_steal);
2239                if (fallback_mt == -1)
2240                        continue;
2241
2242                /*
2243                 * We cannot steal all free pages from the pageblock and the
2244                 * requested migratetype is movable. In that case it's better to
2245                 * steal and split the smallest available page instead of the
2246                 * largest available page, because even if the next movable
2247                 * allocation falls back into a different pageblock than this
2248                 * one, it won't cause permanent fragmentation.
2249                 */
2250                if (!can_steal && start_migratetype == MIGRATE_MOVABLE
2251                                        && current_order > order)
2252                        goto find_smallest;
2253
2254                goto do_steal;
2255        }
2256
2257        return false;
2258
2259find_smallest:
2260        for (current_order = order; current_order < MAX_ORDER;
2261                                                        current_order++) {
2262                area = &(zone->free_area[current_order]);
2263                fallback_mt = find_suitable_fallback(area, current_order,
2264                                start_migratetype, false, &can_steal);
2265                if (fallback_mt != -1)
2266                        break;
2267        }
2268
2269        /*
2270         * This should not happen - we already found a suitable fallback
2271         * when looking for the largest page.
2272         */
2273        VM_BUG_ON(current_order == MAX_ORDER);
2274
2275do_steal:
2276        page = list_first_entry(&area->free_list[fallback_mt],
2277                                                        struct page, lru);
2278
2279        steal_suitable_fallback(zone, page, start_migratetype, can_steal);
2280
2281        trace_mm_page_alloc_extfrag(page, order, current_order,
2282                start_migratetype, fallback_mt);
2283
2284        return true;
2285
2286}
2287
2288/*
2289 * Do the hard work of removing an element from the buddy allocator.
2290 * Call me with the zone->lock already held.
2291 */
2292static struct page *__rmqueue(struct zone *zone, unsigned int order,
2293                                int migratetype)
2294{
2295        struct page *page;
2296
2297retry:
2298        page = __rmqueue_smallest(zone, order, migratetype);
2299        if (unlikely(!page)) {
2300                if (migratetype == MIGRATE_MOVABLE)
2301                        page = __rmqueue_cma_fallback(zone, order);
2302
2303                if (!page && __rmqueue_fallback(zone, order, migratetype))
2304                        goto retry;
2305        }
2306
2307        trace_mm_page_alloc_zone_locked(page, order, migratetype);
2308        return page;
2309}
2310
2311/*
2312 * Obtain a specified number of elements from the buddy allocator, all under
2313 * a single hold of the lock, for efficiency.  Add them to the supplied list.
2314 * Returns the number of new pages which were placed at *list.
2315 */
2316static int rmqueue_bulk(struct zone *zone, unsigned int order,
2317                        unsigned long count, struct list_head *list,
2318                        int migratetype, bool cold)
2319{
2320        int i, alloced = 0;
2321
2322        spin_lock(&zone->lock);
2323        for (i = 0; i < count; ++i) {
2324                struct page *page = __rmqueue(zone, order, migratetype);
2325                if (unlikely(page == NULL))
2326                        break;
2327
2328                if (unlikely(check_pcp_refill(page)))
2329                        continue;
2330
2331                /*
2332                 * Split buddy pages returned by expand() are received here
2333                 * in physical page order. The page is added to the callers and
2334                 * list and the list head then moves forward. From the callers
2335                 * perspective, the linked list is ordered by page number in
2336                 * some conditions. This is useful for IO devices that can
2337                 * merge IO requests if the physical pages are ordered
2338                 * properly.
2339                 */
2340                if (likely(!cold))
2341                        list_add(&page->lru, list);
2342                else
2343                        list_add_tail(&page->lru, list);
2344                list = &page->lru;
2345                alloced++;
2346                if (is_migrate_cma(get_pcppage_migratetype(page)))
2347                        __mod_zone_page_state(zone, NR_FREE_CMA_PAGES,
2348                                              -(1 << order));
2349        }
2350
2351        /*
2352         * i pages were removed from the buddy list even if some leak due
2353         * to check_pcp_refill failing so adjust NR_FREE_PAGES based
2354         * on i. Do not confuse with 'alloced' which is the number of
2355         * pages added to the pcp list.
2356         */
2357        __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order));
2358        spin_unlock(&zone->lock);
2359        return alloced;
2360}
2361
2362#ifdef CONFIG_NUMA
2363/*
2364 * Called from the vmstat counter updater to drain pagesets of this
2365 * currently executing processor on remote nodes after they have
2366 * expired.
2367 *
2368 * Note that this function must be called with the thread pinned to
2369 * a single processor.
2370 */
2371void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp)
2372{
2373        unsigned long flags;
2374        int to_drain, batch;
2375
2376        local_irq_save(flags);
2377        batch = READ_ONCE(pcp->batch);
2378        to_drain = min(pcp->count, batch);
2379        if (to_drain > 0) {
2380                free_pcppages_bulk(zone, to_drain, pcp);
2381                pcp->count -= to_drain;
2382        }
2383        local_irq_restore(flags);
2384}
2385#endif
2386
2387/*
2388 * Drain pcplists of the indicated processor and zone.
2389 *
2390 * The processor must either be the current processor and the
2391 * thread pinned to the current processor or a processor that
2392 * is not online.
2393 */
2394static void drain_pages_zone(unsigned int cpu, struct zone *zone)
2395{
2396        unsigned long flags;
2397        struct per_cpu_pageset *pset;
2398        struct per_cpu_pages *pcp;
2399
2400        local_irq_save(flags);
2401        pset = per_cpu_ptr(zone->pageset, cpu);
2402
2403        pcp = &pset->pcp;
2404        if (pcp->count) {
2405                free_pcppages_bulk(zone, pcp->count, pcp);
2406                pcp->count = 0;
2407        }
2408        local_irq_restore(flags);
2409}
2410
2411/*
2412 * Drain pcplists of all zones on the indicated processor.
2413 *
2414 * The processor must either be the current processor and the
2415 * thread pinned to the current processor or a processor that
2416 * is not online.
2417 */
2418static void drain_pages(unsigned int cpu)
2419{
2420        struct zone *zone;
2421
2422        for_each_populated_zone(zone) {
2423                drain_pages_zone(cpu, zone);
2424        }
2425}
2426
2427/*
2428 * Spill all of this CPU's per-cpu pages back into the buddy allocator.
2429 *
2430 * The CPU has to be pinned. When zone parameter is non-NULL, spill just
2431 * the single zone's pages.
2432 */
2433void drain_local_pages(struct zone *zone)
2434{
2435        int cpu = smp_processor_id();
2436
2437        if (zone)
2438                drain_pages_zone(cpu, zone);
2439        else
2440                drain_pages(cpu);
2441}
2442
2443static void drain_local_pages_wq(struct work_struct *work)
2444{
2445        /*
2446         * drain_all_pages doesn't use proper cpu hotplug protection so
2447         * we can race with cpu offline when the WQ can move this from
2448         * a cpu pinned worker to an unbound one. We can operate on a different
2449         * cpu which is allright but we also have to make sure to not move to
2450         * a different one.
2451         */
2452        preempt_disable();
2453        drain_local_pages(NULL);
2454        preempt_enable();
2455}
2456
2457/*
2458 * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
2459 *
2460 * When zone parameter is non-NULL, spill just the single zone's pages.
2461 *
2462 * Note that this can be extremely slow as the draining happens in a workqueue.
2463 */
2464void drain_all_pages(struct zone *zone)
2465{
2466        int cpu;
2467
2468        /*
2469         * Allocate in the BSS so we wont require allocation in
2470         * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y
2471         */
2472        static cpumask_t cpus_with_pcps;
2473
2474        /*
2475         * Make sure nobody triggers this path before mm_percpu_wq is fully
2476         * initialized.
2477         */
2478        if (WARN_ON_ONCE(!mm_percpu_wq))
2479                return;
2480
2481        /* Workqueues cannot recurse */
2482        if (current->flags & PF_WQ_WORKER)
2483                return;
2484
2485        /*
2486         * Do not drain if one is already in progress unless it's specific to
2487         * a zone. Such callers are primarily CMA and memory hotplug and need
2488         * the drain to be complete when the call returns.
2489         */
2490        if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) {
2491                if (!zone)
2492                        return;
2493                mutex_lock(&pcpu_drain_mutex);
2494        }
2495
2496        /*
2497         * We don't care about racing with CPU hotplug event
2498         * as offline notification will cause the notified
2499         * cpu to drain that CPU pcps and on_each_cpu_mask
2500         * disables preemption as part of its processing
2501         */
2502        for_each_online_cpu(cpu) {
2503                struct per_cpu_pageset *pcp;
2504                struct zone *z;
2505                bool has_pcps = false;
2506
2507                if (zone) {
2508                        pcp = per_cpu_ptr(zone->pageset, cpu);
2509                        if (pcp->pcp.count)
2510                                has_pcps = true;
2511                } else {
2512                        for_each_populated_zone(z) {
2513                                pcp = per_cpu_ptr(z->pageset, cpu);
2514                                if (pcp->pcp.count) {
2515                                        has_pcps = true;
2516                                        break;
2517                                }
2518                        }
2519                }
2520
2521                if (has_pcps)
2522                        cpumask_set_cpu(cpu, &cpus_with_pcps);
2523                else
2524                        cpumask_clear_cpu(cpu, &cpus_with_pcps);
2525        }
2526
2527        for_each_cpu(cpu, &cpus_with_pcps) {
2528                struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
2529                INIT_WORK(work, drain_local_pages_wq);
2530                queue_work_on(cpu, mm_percpu_wq, work);
2531        }
2532        for_each_cpu(cpu, &cpus_with_pcps)
2533                flush_work(per_cpu_ptr(&pcpu_drain, cpu));
2534
2535        mutex_unlock(&pcpu_drain_mutex);
2536}
2537
2538#ifdef CONFIG_HIBERNATION
2539
2540/*
2541 * Touch the watchdog for every WD_PAGE_COUNT pages.
2542 */
2543#define WD_PAGE_COUNT   (128*1024)
2544
2545void mark_free_pages(struct zone *zone)
2546{
2547        unsigned long pfn, max_zone_pfn, page_count = WD_PAGE_COUNT;
2548        unsigned long flags;
2549        unsigned int order, t;
2550        struct page *page;
2551
2552        if (zone_is_empty(zone))
2553                return;
2554
2555        spin_lock_irqsave(&zone->lock, flags);
2556
2557        max_zone_pfn = zone_end_pfn(zone);
2558        for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
2559                if (pfn_valid(pfn)) {
2560                        page = pfn_to_page(pfn);
2561
2562                        if (!--page_count) {
2563                                touch_nmi_watchdog();
2564                                page_count = WD_PAGE_COUNT;
2565                        }
2566
2567                        if (page_zone(page) != zone)
2568                                continue;
2569
2570                        if (!swsusp_page_is_forbidden(page))
2571                                swsusp_unset_page_free(page);
2572                }
2573
2574        for_each_migratetype_order(order, t) {
2575                list_for_each_entry(page,
2576                                &zone->free_area[order].free_list[t], lru) {
2577                        unsigned long i;
2578
2579                        pfn = page_to_pfn(page);
2580                        for (i = 0; i < (1UL << order); i++) {
2581                                if (!--page_count) {
2582                                        touch_nmi_watchdog();
2583                                        page_count = WD_PAGE_COUNT;
2584                                }
2585                                swsusp_set_page_free(pfn_to_page(pfn + i));
2586                        }
2587                }
2588        }
2589        spin_unlock_irqrestore(&zone->lock, flags);
2590}
2591#endif /* CONFIG_PM */
2592
2593/*
2594 * Free a 0-order page
2595 * cold == true ? free a cold page : free a hot page
2596 */
2597void free_hot_cold_page(struct page *page, bool cold)
2598{
2599        struct zone *zone = page_zone(page);
2600        struct per_cpu_pages *pcp;
2601        unsigned long flags;
2602        unsigned long pfn = page_to_pfn(page);
2603        int migratetype;
2604
2605        if (!free_pcp_prepare(page))
2606                return;
2607
2608        migratetype = get_pfnblock_migratetype(page, pfn);
2609        set_pcppage_migratetype(page, migratetype);
2610        local_irq_save(flags);
2611        __count_vm_event(PGFREE);
2612
2613        /*
2614         * We only track unmovable, reclaimable and movable on pcp lists.
2615         * Free ISOLATE pages back to the allocator because they are being
2616         * offlined but treat HIGHATOMIC as movable pages so we can get those
2617         * areas back if necessary. Otherwise, we may have to free
2618         * excessively into the page allocator
2619         */
2620        if (migratetype >= MIGRATE_PCPTYPES) {
2621                if (unlikely(is_migrate_isolate(migratetype))) {
2622                        free_one_page(zone, page, pfn, 0, migratetype);
2623                        goto out;
2624                }
2625                migratetype = MIGRATE_MOVABLE;
2626        }
2627
2628        pcp = &this_cpu_ptr(zone->pageset)->pcp;
2629        if (!cold)
2630                list_add(&page->lru, &pcp->lists[migratetype]);
2631        else
2632                list_add_tail(&page->lru, &pcp->lists[migratetype]);
2633        pcp->count++;
2634        if (pcp->count >= pcp->high) {
2635                unsigned long batch = READ_ONCE(pcp->batch);
2636                free_pcppages_bulk(zone, batch, pcp);
2637                pcp->count -= batch;
2638        }
2639
2640out:
2641        local_irq_restore(flags);
2642}
2643
2644/*
2645 * Free a list of 0-order pages
2646 */
2647void free_hot_cold_page_list(struct list_head *list, bool cold)
2648{
2649        struct page *page, *next;
2650
2651        list_for_each_entry_safe(page, next, list, lru) {
2652                trace_mm_page_free_batched(page, cold);
2653                free_hot_cold_page(page, cold);
2654        }
2655}
2656
2657/*
2658 * split_page takes a non-compound higher-order page, and splits it into
2659 * n (1<<order) sub-pages: page[0..n]
2660 * Each sub-page must be freed individually.
2661 *
2662 * Note: this is probably too low level an operation for use in drivers.
2663 * Please consult with lkml before using this in your driver.
2664 */
2665void split_page(struct page *page, unsigned int order)
2666{
2667        int i;
2668
2669        VM_BUG_ON_PAGE(PageCompound(page), page);
2670        VM_BUG_ON_PAGE(!page_count(page), page);
2671
2672#ifdef CONFIG_KMEMCHECK
2673        /*
2674         * Split shadow pages too, because free(page[0]) would
2675         * otherwise free the whole shadow.
2676         */
2677        if (kmemcheck_page_is_tracked(page))
2678                split_page(virt_to_page(page[0].shadow), order);
2679#endif
2680
2681        for (i = 1; i < (1 << order); i++)
2682                set_page_refcounted(page + i);
2683        split_page_owner(page, order);
2684}
2685EXPORT_SYMBOL_GPL(split_page);
2686
2687int __isolate_free_page(struct page *page, unsigned int order)
2688{
2689        unsigned long watermark;
2690        struct zone *zone;
2691        int mt;
2692
2693        BUG_ON(!PageBuddy(page));
2694
2695        zone = page_zone(page);
2696        mt = get_pageblock_migratetype(page);
2697
2698        if (!is_migrate_isolate(mt)) {
2699                /*
2700                 * Obey watermarks as if the page was being allocated. We can
2701                 * emulate a high-order watermark check with a raised order-0
2702                 * watermark, because we already know our high-order page
2703                 * exists.
2704                 */
2705                watermark = min_wmark_pages(zone) + (1UL << order);
2706                if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
2707                        return 0;
2708
2709                __mod_zone_freepage_state(zone, -(1UL << order), mt);
2710        }
2711
2712        /* Remove page from free list */
2713        list_del(&page->lru);
2714        zone->free_area[order].nr_free--;
2715        rmv_page_order(page);
2716
2717        /*
2718         * Set the pageblock if the isolated page is at least half of a
2719         * pageblock
2720         */
2721        if (order >= pageblock_order - 1) {
2722                struct page *endpage = page + (1 << order) - 1;
2723                for (; page < endpage; page += pageblock_nr_pages) {
2724                        int mt = get_pageblock_migratetype(page);
2725                        if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)
2726                            && !is_migrate_highatomic(mt))
2727                                set_pageblock_migratetype(page,
2728                                                          MIGRATE_MOVABLE);
2729                }
2730        }
2731
2732
2733        return 1UL << order;
2734}
2735
2736/*
2737 * Update NUMA hit/miss statistics
2738 *
2739 * Must be called with interrupts disabled.
2740 */
2741static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
2742{
2743#ifdef CONFIG_NUMA
2744        enum numa_stat_item local_stat = NUMA_LOCAL;
2745
2746        if (z->node != numa_node_id())
2747                local_stat = NUMA_OTHER;
2748
2749        if (z->node == preferred_zone->node)
2750                __inc_numa_state(z, NUMA_HIT);
2751        else {
2752                __inc_numa_state(z, NUMA_MISS);
2753                __inc_numa_state(preferred_zone, NUMA_FOREIGN);
2754        }
2755        __inc_numa_state(z, local_stat);
2756#endif
2757}
2758
2759/* Remove page from the per-cpu list, caller must protect the list */
2760static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
2761                        bool cold, struct per_cpu_pages *pcp,
2762                        struct list_head *list)
2763{
2764        struct page *page;
2765
2766        do {
2767                if (list_empty(list)) {
2768                        pcp->count += rmqueue_bulk(zone, 0,
2769                                        pcp->batch, list,
2770                                        migratetype, cold);
2771                        if (unlikely(list_empty(list)))
2772                                return NULL;
2773                }
2774
2775                if (cold)
2776                        page = list_last_entry(list, struct page, lru);
2777                else
2778                        page = list_first_entry(list, struct page, lru);
2779
2780                list_del(&page->lru);
2781                pcp->count--;
2782        } while (check_new_pcp(page));
2783
2784        return page;
2785}
2786
2787/* Lock and remove page from the per-cpu list */
2788static struct page *rmqueue_pcplist(struct zone *preferred_zone,
2789                        struct zone *zone, unsigned int order,
2790                        gfp_t gfp_flags, int migratetype)
2791{
2792        struct per_cpu_pages *pcp;
2793        struct list_head *list;
2794        bool cold = ((gfp_flags & __GFP_COLD) != 0);
2795        struct page *page;
2796        unsigned long flags;
2797
2798        local_irq_save(flags);
2799        pcp = &this_cpu_ptr(zone->pageset)->pcp;
2800        list = &pcp->lists[migratetype];
2801        page = __rmqueue_pcplist(zone,  migratetype, cold, pcp, list);
2802        if (page) {
2803                __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2804                zone_statistics(preferred_zone, zone);
2805        }
2806        local_irq_restore(flags);
2807        return page;
2808}
2809
2810/*
2811 * Allocate a page from the given zone. Use pcplists for order-0 allocations.
2812 */
2813static inline
2814struct page *rmqueue(struct zone *preferred_zone,
2815                        struct zone *zone, unsigned int order,
2816                        gfp_t gfp_flags, unsigned int alloc_flags,
2817                        int migratetype)
2818{
2819        unsigned long flags;
2820        struct page *page;
2821
2822        if (likely(order == 0)) {
2823                page = rmqueue_pcplist(preferred_zone, zone, order,
2824                                gfp_flags, migratetype);
2825                goto out;
2826        }
2827
2828        /*
2829         * We most definitely don't want callers attempting to
2830         * allocate greater than order-1 page units with __GFP_NOFAIL.
2831         */
2832        WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
2833        spin_lock_irqsave(&zone->lock, flags);
2834
2835        do {
2836                page = NULL;
2837                if (alloc_flags & ALLOC_HARDER) {
2838                        page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
2839                        if (page)
2840                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
2841                }
2842                if (!page)
2843                        page = __rmqueue(zone, order, migratetype);
2844        } while (page && check_new_pages(page, order));
2845        spin_unlock(&zone->lock);
2846        if (!page)
2847                goto failed;
2848        __mod_zone_freepage_state(zone, -(1 << order),
2849                                  get_pcppage_migratetype(page));
2850
2851        __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
2852        zone_statistics(preferred_zone, zone);
2853        local_irq_restore(flags);
2854
2855out:
2856        VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
2857        return page;
2858
2859failed:
2860        local_irq_restore(flags);
2861        return NULL;
2862}
2863
2864#ifdef CONFIG_FAIL_PAGE_ALLOC
2865
2866static struct {
2867        struct fault_attr attr;
2868
2869        bool ignore_gfp_highmem;
2870        bool ignore_gfp_reclaim;
2871        u32 min_order;
2872} fail_page_alloc = {
2873        .attr = FAULT_ATTR_INITIALIZER,
2874        .ignore_gfp_reclaim = true,
2875        .ignore_gfp_highmem = true,
2876        .min_order = 1,
2877};
2878
2879static int __init setup_fail_page_alloc(char *str)
2880{
2881        return setup_fault_attr(&fail_page_alloc.attr, str);
2882}
2883__setup("fail_page_alloc=", setup_fail_page_alloc);
2884
2885static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2886{
2887        if (order < fail_page_alloc.min_order)
2888                return false;
2889        if (gfp_mask & __GFP_NOFAIL)
2890                return false;
2891        if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
2892                return false;
2893        if (fail_page_alloc.ignore_gfp_reclaim &&
2894                        (gfp_mask & __GFP_DIRECT_RECLAIM))
2895                return false;
2896
2897        return should_fail(&fail_page_alloc.attr, 1 << order);
2898}
2899
2900#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
2901
2902static int __init fail_page_alloc_debugfs(void)
2903{
2904        umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
2905        struct dentry *dir;
2906
2907        dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
2908                                        &fail_page_alloc.attr);
2909        if (IS_ERR(dir))
2910                return PTR_ERR(dir);
2911
2912        if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
2913                                &fail_page_alloc.ignore_gfp_reclaim))
2914                goto fail;
2915        if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
2916                                &fail_page_alloc.ignore_gfp_highmem))
2917                goto fail;
2918        if (!debugfs_create_u32("min-order", mode, dir,
2919                                &fail_page_alloc.min_order))
2920                goto fail;
2921
2922        return 0;
2923fail:
2924        debugfs_remove_recursive(dir);
2925
2926        return -ENOMEM;
2927}
2928
2929late_initcall(fail_page_alloc_debugfs);
2930
2931#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
2932
2933#else /* CONFIG_FAIL_PAGE_ALLOC */
2934
2935static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
2936{
2937        return false;
2938}
2939
2940#endif /* CONFIG_FAIL_PAGE_ALLOC */
2941
2942/*
2943 * Return true if free base pages are above 'mark'. For high-order checks it
2944 * will return true of the order-0 watermark is reached and there is at least
2945 * one free page of a suitable size. Checking now avoids taking the zone lock
2946 * to check in the allocation paths if no pages are free.
2947 */
2948bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
2949                         int classzone_idx, unsigned int alloc_flags,
2950                         long free_pages)
2951{
2952        long min = mark;
2953        int o;
2954        const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
2955
2956        /* free_pages may go negative - that's OK */
2957        free_pages -= (1 << order) - 1;
2958
2959        if (alloc_flags & ALLOC_HIGH)
2960                min -= min / 2;
2961
2962        /*
2963         * If the caller does not have rights to ALLOC_HARDER then subtract
2964         * the high-atomic reserves. This will over-estimate the size of the
2965         * atomic reserve but it avoids a search.
2966         */
2967        if (likely(!alloc_harder)) {
2968                free_pages -= z->nr_reserved_highatomic;
2969        } else {
2970                /*
2971                 * OOM victims can try even harder than normal ALLOC_HARDER
2972                 * users on the grounds that it's definitely going to be in
2973                 * the exit path shortly and free memory. Any allocation it
2974                 * makes during the free path will be small and short-lived.
2975                 */
2976                if (alloc_flags & ALLOC_OOM)
2977                        min -= min / 2;
2978                else
2979                        min -= min / 4;
2980        }
2981
2982
2983#ifdef CONFIG_CMA
2984        /* If allocation can't use CMA areas don't use free CMA pages */
2985        if (!(alloc_flags & ALLOC_CMA))
2986                free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
2987#endif
2988
2989        /*
2990         * Check watermarks for an order-0 allocation request. If these
2991         * are not met, then a high-order request also cannot go ahead
2992         * even if a suitable page happened to be free.
2993         */
2994        if (free_pages <= min + z->lowmem_reserve[classzone_idx])
2995                return false;
2996
2997        /* If this is an order-0 request then the watermark is fine */
2998        if (!order)
2999                return true;
3000
3001        /* For a high-order request, check at least one suitable page is free */
3002        for (o = order; o < MAX_ORDER; o++) {
3003                struct free_area *area = &z->free_area[o];
3004                int mt;
3005
3006                if (!area->nr_free)
3007                        continue;
3008
3009                if (alloc_harder)
3010                        return true;
3011
3012                for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
3013                        if (!list_empty(&area->free_list[mt]))
3014                                return true;
3015                }
3016
3017#ifdef CONFIG_CMA
3018                if ((alloc_flags & ALLOC_CMA) &&
3019                    !list_empty(&area->free_list[MIGRATE_CMA])) {
3020                        return true;
3021                }
3022#endif
3023        }
3024        return false;
3025}
3026
3027bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
3028                      int classzone_idx, unsigned int alloc_flags)
3029{
3030        return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3031                                        zone_page_state(z, NR_FREE_PAGES));
3032}
3033
3034static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
3035                unsigned long mark, int classzone_idx, unsigned int alloc_flags)
3036{
3037        long free_pages = zone_page_state(z, NR_FREE_PAGES);
3038        long cma_pages = 0;
3039
3040#ifdef CONFIG_CMA
3041        /* If allocation can't use CMA areas don't use free CMA pages */
3042        if (!(alloc_flags & ALLOC_CMA))
3043                cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
3044#endif
3045
3046        /*
3047         * Fast check for order-0 only. If this fails then the reserves
3048         * need to be calculated. There is a corner case where the check
3049         * passes but only the high-order atomic reserve are free. If
3050         * the caller is !atomic then it'll uselessly search the free
3051         * list. That corner case is then slower but it is harmless.
3052         */
3053        if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
3054                return true;
3055
3056        return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
3057                                        free_pages);
3058}
3059
3060bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
3061                        unsigned long mark, int classzone_idx)
3062{
3063        long free_pages = zone_page_state(z, NR_FREE_PAGES);
3064
3065        if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
3066                free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
3067
3068        return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
3069                                                                free_pages);
3070}
3071
3072#ifdef CONFIG_NUMA
3073static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3074{
3075        return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
3076                                RECLAIM_DISTANCE;
3077}
3078#else   /* CONFIG_NUMA */
3079static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
3080{
3081        return true;
3082}
3083#endif  /* CONFIG_NUMA */
3084
3085/*
3086 * get_page_from_freelist goes through the zonelist trying to allocate
3087 * a page.
3088 */
3089static struct page *
3090get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
3091                                                const struct alloc_context *ac)
3092{
3093        struct zoneref *z = ac->preferred_zoneref;
3094        struct zone *zone;
3095        struct pglist_data *last_pgdat_dirty_limit = NULL;
3096
3097        /*
3098         * Scan zonelist, looking for a zone with enough free.
3099         * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
3100         */
3101        for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3102                                                                ac->nodemask) {
3103                struct page *page;
3104                unsigned long mark;
3105
3106                if (cpusets_enabled() &&
3107                        (alloc_flags & ALLOC_CPUSET) &&
3108                        !__cpuset_zone_allowed(zone, gfp_mask))
3109                                continue;
3110                /*
3111                 * When allocating a page cache page for writing, we
3112                 * want to get it from a node that is within its dirty
3113                 * limit, such that no single node holds more than its
3114                 * proportional share of globally allowed dirty pages.
3115                 * The dirty limits take into account the node's
3116                 * lowmem reserves and high watermark so that kswapd
3117                 * should be able to balance it without having to
3118                 * write pages from its LRU list.
3119                 *
3120                 * XXX: For now, allow allocations to potentially
3121                 * exceed the per-node dirty limit in the slowpath
3122                 * (spread_dirty_pages unset) before going into reclaim,
3123                 * which is important when on a NUMA setup the allowed
3124                 * nodes are together not big enough to reach the
3125                 * global limit.  The proper fix for these situations
3126                 * will require awareness of nodes in the
3127                 * dirty-throttling and the flusher threads.
3128                 */
3129                if (ac->spread_dirty_pages) {
3130                        if (last_pgdat_dirty_limit == zone->zone_pgdat)
3131                                continue;
3132
3133                        if (!node_dirty_ok(zone->zone_pgdat)) {
3134                                last_pgdat_dirty_limit = zone->zone_pgdat;
3135                                continue;
3136                        }
3137                }
3138
3139                mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
3140                if (!zone_watermark_fast(zone, order, mark,
3141                                       ac_classzone_idx(ac), alloc_flags)) {
3142                        int ret;
3143
3144                        /* Checked here to keep the fast path fast */
3145                        BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
3146                        if (alloc_flags & ALLOC_NO_WATERMARKS)
3147                                goto try_this_zone;
3148
3149                        if (node_reclaim_mode == 0 ||
3150                            !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
3151                                continue;
3152
3153                        ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
3154                        switch (ret) {
3155                        case NODE_RECLAIM_NOSCAN:
3156                                /* did not scan */
3157                                continue;
3158                        case NODE_RECLAIM_FULL:
3159                                /* scanned but unreclaimable */
3160                                continue;
3161                        default:
3162                                /* did we reclaim enough */
3163                                if (zone_watermark_ok(zone, order, mark,
3164                                                ac_classzone_idx(ac), alloc_flags))
3165                                        goto try_this_zone;
3166
3167                                continue;
3168                        }
3169                }
3170
3171try_this_zone:
3172                page = rmqueue(ac->preferred_zoneref->zone, zone, order,
3173                                gfp_mask, alloc_flags, ac->migratetype);
3174                if (page) {
3175                        prep_new_page(page, order, gfp_mask, alloc_flags);
3176
3177                        /*
3178                         * If this is a high-order atomic allocation then check
3179                         * if the pageblock should be reserved for the future
3180                         */
3181                        if (unlikely(order && (alloc_flags & ALLOC_HARDER)))
3182                                reserve_highatomic_pageblock(page, zone, order);
3183
3184                        return page;
3185                }
3186        }
3187
3188        return NULL;
3189}
3190
3191/*
3192 * Large machines with many possible nodes should not always dump per-node
3193 * meminfo in irq context.
3194 */
3195static inline bool should_suppress_show_mem(void)
3196{
3197        bool ret = false;
3198
3199#if NODES_SHIFT > 8
3200        ret = in_interrupt();
3201#endif
3202        return ret;
3203}
3204
3205static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
3206{
3207        unsigned int filter = SHOW_MEM_FILTER_NODES;
3208        static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
3209
3210        if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
3211                return;
3212
3213        /*
3214         * This documents exceptions given to allocations in certain
3215         * contexts that are allowed to allocate outside current's set
3216         * of allowed nodes.
3217         */
3218        if (!(gfp_mask & __GFP_NOMEMALLOC))
3219                if (tsk_is_oom_victim(current) ||
3220                    (current->flags & (PF_MEMALLOC | PF_EXITING)))
3221                        filter &= ~SHOW_MEM_FILTER_NODES;
3222        if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
3223                filter &= ~SHOW_MEM_FILTER_NODES;
3224
3225        show_mem(filter, nodemask);
3226}
3227
3228void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
3229{
3230        struct va_format vaf;
3231        va_list args;
3232        static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
3233                                      DEFAULT_RATELIMIT_BURST);
3234
3235        if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
3236                return;
3237
3238        pr_warn("%s: ", current->comm);
3239
3240        va_start(args, fmt);
3241        vaf.fmt = fmt;
3242        vaf.va = &args;
3243        pr_cont("%pV", &vaf);
3244        va_end(args);
3245
3246        pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask);
3247        if (nodemask)
3248                pr_cont("%*pbl\n", nodemask_pr_args(nodemask));
3249        else
3250                pr_cont("(null)\n");
3251
3252        cpuset_print_current_mems_allowed();
3253
3254        dump_stack();
3255        warn_alloc_show_mem(gfp_mask, nodemask);
3256}
3257
3258static inline struct page *
3259__alloc_pages_cpuset_fallback(gfp_t gfp_mask, unsigned int order,
3260                              unsigned int alloc_flags,
3261                              const struct alloc_context *ac)
3262{
3263        struct page *page;
3264
3265        page = get_page_from_freelist(gfp_mask, order,
3266                        alloc_flags|ALLOC_CPUSET, ac);
3267        /*
3268         * fallback to ignore cpuset restriction if our nodes
3269         * are depleted
3270         */
3271        if (!page)
3272                page = get_page_from_freelist(gfp_mask, order,
3273                                alloc_flags, ac);
3274
3275        return page;
3276}
3277
3278static inline struct page *
3279__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
3280        const struct alloc_context *ac, unsigned long *did_some_progress)
3281{
3282        struct oom_control oc = {
3283                .zonelist = ac->zonelist,
3284                .nodemask = ac->nodemask,
3285                .memcg = NULL,
3286                .gfp_mask = gfp_mask,
3287                .order = order,
3288        };
3289        struct page *page;
3290
3291        *did_some_progress = 0;
3292
3293        /*
3294         * Acquire the oom lock.  If that fails, somebody else is
3295         * making progress for us.
3296         */
3297        if (!mutex_trylock(&oom_lock)) {
3298                *did_some_progress = 1;
3299                schedule_timeout_uninterruptible(1);
3300                return NULL;
3301        }
3302
3303        /*
3304         * Go through the zonelist yet one more time, keep very high watermark
3305         * here, this is only to catch a parallel oom killing, we must fail if
3306         * we're still under heavy pressure. But make sure that this reclaim
3307         * attempt shall not depend on __GFP_DIRECT_RECLAIM && !__GFP_NORETRY
3308         * allocation which will never fail due to oom_lock already held.
3309         */
3310        page = get_page_from_freelist((gfp_mask | __GFP_HARDWALL) &
3311                                      ~__GFP_DIRECT_RECLAIM, order,
3312                                      ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
3313        if (page)
3314                goto out;
3315
3316        /* Coredumps can quickly deplete all memory reserves */
3317        if (current->flags & PF_DUMPCORE)
3318                goto out;
3319        /* The OOM killer will not help higher order allocs */
3320        if (order > PAGE_ALLOC_COSTLY_ORDER)
3321                goto out;
3322        /*
3323         * We have already exhausted all our reclaim opportunities without any
3324         * success so it is time to admit defeat. We will skip the OOM killer
3325         * because it is very likely that the caller has a more reasonable
3326         * fallback than shooting a random task.
3327         */
3328        if (gfp_mask & __GFP_RETRY_MAYFAIL)
3329                goto out;
3330        /* The OOM killer does not needlessly kill tasks for lowmem */
3331        if (ac->high_zoneidx < ZONE_NORMAL)
3332                goto out;
3333        if (pm_suspended_storage())
3334                goto out;
3335        /*
3336         * XXX: GFP_NOFS allocations should rather fail than rely on
3337         * other request to make a forward progress.
3338         * We are in an unfortunate situation where out_of_memory cannot
3339         * do much for this context but let's try it to at least get
3340         * access to memory reserved if the current task is killed (see
3341         * out_of_memory). Once filesystems are ready to handle allocation
3342         * failures more gracefully we should just bail out here.
3343         */
3344
3345        /* The OOM killer may not free memory on a specific node */
3346        if (gfp_mask & __GFP_THISNODE)
3347                goto out;
3348
3349        /* Exhausted what can be done so it's blamo time */
3350        if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
3351                *did_some_progress = 1;
3352
3353                /*
3354                 * Help non-failing allocations by giving them access to memory
3355                 * reserves
3356                 */
3357                if (gfp_mask & __GFP_NOFAIL)
3358                        page = __alloc_pages_cpuset_fallback(gfp_mask, order,
3359                                        ALLOC_NO_WATERMARKS, ac);
3360        }
3361out:
3362        mutex_unlock(&oom_lock);
3363        return page;
3364}
3365
3366/*
3367 * Maximum number of compaction retries wit a progress before OOM
3368 * killer is consider as the only way to move forward.
3369 */
3370#define MAX_COMPACT_RETRIES 16
3371
3372#ifdef CONFIG_COMPACTION
3373/* Try memory compaction for high-order allocations before reclaim */
3374static struct page *
3375__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3376                unsigned int alloc_flags, const struct alloc_context *ac,
3377                enum compact_priority prio, enum compact_result *compact_result)
3378{
3379        struct page *page;
3380        unsigned int noreclaim_flag;
3381
3382        if (!order)
3383                return NULL;
3384
3385        noreclaim_flag = memalloc_noreclaim_save();
3386        *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
3387                                                                        prio);
3388        memalloc_noreclaim_restore(noreclaim_flag);
3389
3390        if (*compact_result <= COMPACT_INACTIVE)
3391                return NULL;
3392
3393        /*
3394         * At least in one zone compaction wasn't deferred or skipped, so let's
3395         * count a compaction stall
3396         */
3397        count_vm_event(COMPACTSTALL);
3398
3399        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3400
3401        if (page) {
3402                struct zone *zone = page_zone(page);
3403
3404                zone->compact_blockskip_flush = false;
3405                compaction_defer_reset(zone, order, true);
3406                count_vm_event(COMPACTSUCCESS);
3407                return page;
3408        }
3409
3410        /*
3411         * It's bad if compaction run occurs and fails. The most likely reason
3412         * is that pages exist, but not enough to satisfy watermarks.
3413         */
3414        count_vm_event(COMPACTFAIL);
3415
3416        cond_resched();
3417
3418        return NULL;
3419}
3420
3421static inline bool
3422should_compact_retry(struct alloc_context *ac, int order, int alloc_flags,
3423                     enum compact_result compact_result,
3424                     enum compact_priority *compact_priority,
3425                     int *compaction_retries)
3426{
3427        int max_retries = MAX_COMPACT_RETRIES;
3428        int min_priority;
3429        bool ret = false;
3430        int retries = *compaction_retries;
3431        enum compact_priority priority = *compact_priority;
3432
3433        if (!order)
3434                return false;
3435
3436        if (compaction_made_progress(compact_result))
3437                (*compaction_retries)++;
3438
3439        /*
3440         * compaction considers all the zone as desperately out of memory
3441         * so it doesn't really make much sense to retry except when the
3442         * failure could be caused by insufficient priority
3443         */
3444        if (compaction_failed(compact_result))
3445                goto check_priority;
3446
3447        /*
3448         * make sure the compaction wasn't deferred or didn't bail out early
3449         * due to locks contention before we declare that we should give up.
3450         * But do not retry if the given zonelist is not suitable for
3451         * compaction.
3452         */
3453        if (compaction_withdrawn(compact_result)) {
3454                ret = compaction_zonelist_suitable(ac, order, alloc_flags);
3455                goto out;
3456        }
3457
3458        /*
3459         * !costly requests are much more important than __GFP_RETRY_MAYFAIL
3460         * costly ones because they are de facto nofail and invoke OOM
3461         * killer to move on while costly can fail and users are ready
3462         * to cope with that. 1/4 retries is rather arbitrary but we
3463         * would need much more detailed feedback from compaction to
3464         * make a better decision.
3465         */
3466        if (order > PAGE_ALLOC_COSTLY_ORDER)
3467                max_retries /= 4;
3468        if (*compaction_retries <= max_retries) {
3469                ret = true;
3470                goto out;
3471        }
3472
3473        /*
3474         * Make sure there are attempts at the highest priority if we exhausted
3475         * all retries or failed at the lower priorities.
3476         */
3477check_priority:
3478        min_priority = (order > PAGE_ALLOC_COSTLY_ORDER) ?
3479                        MIN_COMPACT_COSTLY_PRIORITY : MIN_COMPACT_PRIORITY;
3480
3481        if (*compact_priority > min_priority) {
3482                (*compact_priority)--;
3483                *compaction_retries = 0;
3484                ret = true;
3485        }
3486out:
3487        trace_compact_retry(order, priority, compact_result, retries, max_retries, ret);
3488        return ret;
3489}
3490#else
3491static inline struct page *
3492__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
3493                unsigned int alloc_flags, const struct alloc_context *ac,
3494                enum compact_priority prio, enum compact_result *compact_result)
3495{
3496        *compact_result = COMPACT_SKIPPED;
3497        return NULL;
3498}
3499
3500static inline bool
3501should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_flags,
3502                     enum compact_result compact_result,
3503                     enum compact_priority *compact_priority,
3504                     int *compaction_retries)
3505{
3506        struct zone *zone;
3507        struct zoneref *z;
3508
3509        if (!order || order > PAGE_ALLOC_COSTLY_ORDER)
3510                return false;
3511
3512        /*
3513         * There are setups with compaction disabled which would prefer to loop
3514         * inside the allocator rather than hit the oom killer prematurely.
3515         * Let's give them a good hope and keep retrying while the order-0
3516         * watermarks are OK.
3517         */
3518        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3519                                        ac->nodemask) {
3520                if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
3521                                        ac_classzone_idx(ac), alloc_flags))
3522                        return true;
3523        }
3524        return false;
3525}
3526#endif /* CONFIG_COMPACTION */
3527
3528#ifdef CONFIG_LOCKDEP
3529struct lockdep_map __fs_reclaim_map =
3530        STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
3531
3532static bool __need_fs_reclaim(gfp_t gfp_mask)
3533{
3534        gfp_mask = current_gfp_context(gfp_mask);
3535
3536        /* no reclaim without waiting on it */
3537        if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
3538                return false;
3539
3540        /* this guy won't enter reclaim */
3541        if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
3542                return false;
3543
3544        /* We're only interested __GFP_FS allocations for now */
3545        if (!(gfp_mask & __GFP_FS))
3546                return false;
3547
3548        if (gfp_mask & __GFP_NOLOCKDEP)
3549                return false;
3550
3551        return true;
3552}
3553
3554void fs_reclaim_acquire(gfp_t gfp_mask)
3555{
3556        if (__need_fs_reclaim(gfp_mask))
3557                lock_map_acquire(&__fs_reclaim_map);
3558}
3559EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
3560
3561void fs_reclaim_release(gfp_t gfp_mask)
3562{
3563        if (__need_fs_reclaim(gfp_mask))
3564                lock_map_release(&__fs_reclaim_map);
3565}
3566EXPORT_SYMBOL_GPL(fs_reclaim_release);
3567#endif
3568
3569/* Perform direct synchronous page reclaim */
3570static int
3571__perform_reclaim(gfp_t gfp_mask, unsigned int order,
3572                                        const struct alloc_context *ac)
3573{
3574        struct reclaim_state reclaim_state;
3575        int progress;
3576        unsigned int noreclaim_flag;
3577
3578        cond_resched();
3579
3580        /* We now go into synchronous reclaim */
3581        cpuset_memory_pressure_bump();
3582        noreclaim_flag = memalloc_noreclaim_save();
3583        fs_reclaim_acquire(gfp_mask);
3584        reclaim_state.reclaimed_slab = 0;
3585        current->reclaim_state = &reclaim_state;
3586
3587        progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
3588                                                                ac->nodemask);
3589
3590        current->reclaim_state = NULL;
3591        fs_reclaim_release(gfp_mask);
3592        memalloc_noreclaim_restore(noreclaim_flag);
3593
3594        cond_resched();
3595
3596        return progress;
3597}
3598
3599/* The really slow allocator path where we enter direct reclaim */
3600static inline struct page *
3601__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
3602                unsigned int alloc_flags, const struct alloc_context *ac,
3603                unsigned long *did_some_progress)
3604{
3605        struct page *page = NULL;
3606        bool drained = false;
3607
3608        *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
3609        if (unlikely(!(*did_some_progress)))
3610                return NULL;
3611
3612retry:
3613        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3614
3615        /*
3616         * If an allocation failed after direct reclaim, it could be because
3617         * pages are pinned on the per-cpu lists or in high alloc reserves.
3618         * Shrink them them and try again
3619         */
3620        if (!page && !drained) {
3621                unreserve_highatomic_pageblock(ac, false);
3622                drain_all_pages(NULL);
3623                drained = true;
3624                goto retry;
3625        }
3626
3627        return page;
3628}
3629
3630static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
3631{
3632        struct zoneref *z;
3633        struct zone *zone;
3634        pg_data_t *last_pgdat = NULL;
3635
3636        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
3637                                        ac->high_zoneidx, ac->nodemask) {
3638                if (last_pgdat != zone->zone_pgdat)
3639                        wakeup_kswapd(zone, order, ac->high_zoneidx);
3640                last_pgdat = zone->zone_pgdat;
3641        }
3642}
3643
3644static inline unsigned int
3645gfp_to_alloc_flags(gfp_t gfp_mask)
3646{
3647        unsigned int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
3648
3649        /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */
3650        BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH);
3651
3652        /*
3653         * The caller may dip into page reserves a bit more if the caller
3654         * cannot run direct reclaim, or if the caller has realtime scheduling
3655         * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
3656         * set both ALLOC_HARDER (__GFP_ATOMIC) and ALLOC_HIGH (__GFP_HIGH).
3657         */
3658        alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH);
3659
3660        if (gfp_mask & __GFP_ATOMIC) {
3661                /*
3662                 * Not worth trying to allocate harder for __GFP_NOMEMALLOC even
3663                 * if it can't schedule.
3664                 */
3665                if (!(gfp_mask & __GFP_NOMEMALLOC))
3666                        alloc_flags |= ALLOC_HARDER;
3667                /*
3668                 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
3669                 * comment for __cpuset_node_allowed().
3670                 */
3671                alloc_flags &= ~ALLOC_CPUSET;
3672        } else if (unlikely(rt_task(current)) && !in_interrupt())
3673                alloc_flags |= ALLOC_HARDER;
3674
3675#ifdef CONFIG_CMA
3676        if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
3677                alloc_flags |= ALLOC_CMA;
3678#endif
3679        return alloc_flags;
3680}
3681
3682static bool oom_reserves_allowed(struct task_struct *tsk)
3683{
3684        if (!tsk_is_oom_victim(tsk))
3685                return false;
3686
3687        /*
3688         * !MMU doesn't have oom reaper so give access to memory reserves
3689         * only to the thread with TIF_MEMDIE set
3690         */
3691        if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
3692                return false;
3693
3694        return true;
3695}
3696
3697/*
3698 * Distinguish requests which really need access to full memory
3699 * reserves from oom victims which can live with a portion of it
3700 */
3701static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
3702{
3703        if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
3704                return 0;
3705        if (gfp_mask & __GFP_MEMALLOC)
3706                return ALLOC_NO_WATERMARKS;
3707        if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
3708                return ALLOC_NO_WATERMARKS;
3709        if (!in_interrupt()) {
3710                if (current->flags & PF_MEMALLOC)
3711                        return ALLOC_NO_WATERMARKS;
3712                else if (oom_reserves_allowed(current))
3713                        return ALLOC_OOM;
3714        }
3715
3716        return 0;
3717}
3718
3719bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
3720{
3721        return !!__gfp_pfmemalloc_flags(gfp_mask);
3722}
3723
3724/*
3725 * Checks whether it makes sense to retry the reclaim to make a forward progress
3726 * for the given allocation request.
3727 *
3728 * We give up when we either have tried MAX_RECLAIM_RETRIES in a row
3729 * without success, or when we couldn't even meet the watermark if we
3730 * reclaimed all remaining pages on the LRU lists.
3731 *
3732 * Returns true if a retry is viable or false to enter the oom path.
3733 */
3734static inline bool
3735should_reclaim_retry(gfp_t gfp_mask, unsigned order,
3736                     struct alloc_context *ac, int alloc_flags,
3737                     bool did_some_progress, int *no_progress_loops)
3738{
3739        struct zone *zone;
3740        struct zoneref *z;
3741
3742        /*
3743         * Costly allocations might have made a progress but this doesn't mean
3744         * their order will become available due to high fragmentation so
3745         * always increment the no progress counter for them
3746         */
3747        if (did_some_progress && order <= PAGE_ALLOC_COSTLY_ORDER)
3748                *no_progress_loops = 0;
3749        else
3750                (*no_progress_loops)++;
3751
3752        /*
3753         * Make sure we converge to OOM if we cannot make any progress
3754         * several times in the row.
3755         */
3756        if (*no_progress_loops > MAX_RECLAIM_RETRIES) {
3757                /* Before OOM, exhaust highatomic_reserve */
3758                return unreserve_highatomic_pageblock(ac, true);
3759        }
3760
3761        /*
3762         * Keep reclaiming pages while there is a chance this will lead
3763         * somewhere.  If none of the target zones can satisfy our allocation
3764         * request even if all reclaimable pages are considered then we are
3765         * screwed and have to go OOM.
3766         */
3767        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
3768                                        ac->nodemask) {
3769                unsigned long available;
3770                unsigned long reclaimable;
3771                unsigned long min_wmark = min_wmark_pages(zone);
3772                bool wmark;
3773
3774                available = reclaimable = zone_reclaimable_pages(zone);
3775                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
3776
3777                /*
3778                 * Would the allocation succeed if we reclaimed all
3779                 * reclaimable pages?
3780                 */
3781                wmark = __zone_watermark_ok(zone, order, min_wmark,
3782                                ac_classzone_idx(ac), alloc_flags, available);
3783                trace_reclaim_retry_zone(z, order, reclaimable,
3784                                available, min_wmark, *no_progress_loops, wmark);
3785                if (wmark) {
3786                        /*
3787                         * If we didn't make any progress and have a lot of
3788                         * dirty + writeback pages then we should wait for
3789                         * an IO to complete to slow down the reclaim and
3790                         * prevent from pre mature OOM
3791                         */
3792                        if (!did_some_progress) {
3793                                unsigned long write_pending;
3794
3795                                write_pending = zone_page_state_snapshot(zone,
3796                                                        NR_ZONE_WRITE_PENDING);
3797
3798                                if (2 * write_pending > reclaimable) {
3799                                        congestion_wait(BLK_RW_ASYNC, HZ/10);
3800                                        return true;
3801                                }
3802                        }
3803
3804                        /*
3805                         * Memory allocation/reclaim might be called from a WQ
3806                         * context and the current implementation of the WQ
3807                         * concurrency control doesn't recognize that
3808                         * a particular WQ is congested if the worker thread is
3809                         * looping without ever sleeping. Therefore we have to
3810                         * do a short sleep here rather than calling
3811                         * cond_resched().
3812                         */
3813                        if (current->flags & PF_WQ_WORKER)
3814                                schedule_timeout_uninterruptible(1);
3815                        else
3816                                cond_resched();
3817
3818                        return true;
3819                }
3820        }
3821
3822        return false;
3823}
3824
3825static inline bool
3826check_retry_cpuset(int cpuset_mems_cookie, struct alloc_context *ac)
3827{
3828        /*
3829         * It's possible that cpuset's mems_allowed and the nodemask from
3830         * mempolicy don't intersect. This should be normally dealt with by
3831         * policy_nodemask(), but it's possible to race with cpuset update in
3832         * such a way the check therein was true, and then it became false
3833         * before we got our cpuset_mems_cookie here.
3834         * This assumes that for all allocations, ac->nodemask can come only
3835         * from MPOL_BIND mempolicy (whose documented semantics is to be ignored
3836         * when it does not intersect with the cpuset restrictions) or the
3837         * caller can deal with a violated nodemask.
3838         */
3839        if (cpusets_enabled() && ac->nodemask &&
3840                        !cpuset_nodemask_valid_mems_allowed(ac->nodemask)) {
3841                ac->nodemask = NULL;
3842                return true;
3843        }
3844
3845        /*
3846         * When updating a task's mems_allowed or mempolicy nodemask, it is
3847         * possible to race with parallel threads in such a way that our
3848         * allocation can fail while the mask is being updated. If we are about
3849         * to fail, check if the cpuset changed during allocation and if so,
3850         * retry.
3851         */
3852        if (read_mems_allowed_retry(cpuset_mems_cookie))
3853                return true;
3854
3855        return false;
3856}
3857
3858static inline struct page *
3859__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
3860                                                struct alloc_context *ac)
3861{
3862        bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
3863        const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
3864        struct page *page = NULL;
3865        unsigned int alloc_flags;
3866        unsigned long did_some_progress;
3867        enum compact_priority compact_priority;
3868        enum compact_result compact_result;
3869        int compaction_retries;
3870        int no_progress_loops;
3871        unsigned long alloc_start = jiffies;
3872        unsigned int stall_timeout = 10 * HZ;
3873        unsigned int cpuset_mems_cookie;
3874        int reserve_flags;
3875
3876        /*
3877         * In the slowpath, we sanity check order to avoid ever trying to
3878         * reclaim >= MAX_ORDER areas which will never succeed. Callers may
3879         * be using allocators in order of preference for an area that is
3880         * too large.
3881         */
3882        if (order >= MAX_ORDER) {
3883                WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
3884                return NULL;
3885        }
3886
3887        /*
3888         * We also sanity check to catch abuse of atomic reserves being used by
3889         * callers that are not in atomic context.
3890         */
3891        if (WARN_ON_ONCE((gfp_mask & (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)) ==
3892                                (__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
3893                gfp_mask &= ~__GFP_ATOMIC;
3894
3895retry_cpuset:
3896        compaction_retries = 0;
3897        no_progress_loops = 0;
3898        compact_priority = DEF_COMPACT_PRIORITY;
3899        cpuset_mems_cookie = read_mems_allowed_begin();
3900
3901        /*
3902         * The fast path uses conservative alloc_flags to succeed only until
3903         * kswapd needs to be woken up, and to avoid the cost of setting up
3904         * alloc_flags precisely. So we do that now.
3905         */
3906        alloc_flags = gfp_to_alloc_flags(gfp_mask);
3907
3908        /*
3909         * We need to recalculate the starting point for the zonelist iterator
3910         * because we might have used different nodemask in the fast path, or
3911         * there was a cpuset modification and we are retrying - otherwise we
3912         * could end up iterating over non-eligible zones endlessly.
3913         */
3914        ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3915                                        ac->high_zoneidx, ac->nodemask);
3916        if (!ac->preferred_zoneref->zone)
3917                goto nopage;
3918
3919        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3920                wake_all_kswapds(order, ac);
3921
3922        /*
3923         * The adjusted alloc_flags might result in immediate success, so try
3924         * that first
3925         */
3926        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3927        if (page)
3928                goto got_pg;
3929
3930        /*
3931         * For costly allocations, try direct compaction first, as it's likely
3932         * that we have enough base pages and don't need to reclaim. For non-
3933         * movable high-order allocations, do that as well, as compaction will
3934         * try prevent permanent fragmentation by migrating from blocks of the
3935         * same migratetype.
3936         * Don't try this for allocations that are allowed to ignore
3937         * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen.
3938         */
3939        if (can_direct_reclaim &&
3940                        (costly_order ||
3941                           (order > 0 && ac->migratetype != MIGRATE_MOVABLE))
3942                        && !gfp_pfmemalloc_allowed(gfp_mask)) {
3943                page = __alloc_pages_direct_compact(gfp_mask, order,
3944                                                alloc_flags, ac,
3945                                                INIT_COMPACT_PRIORITY,
3946                                                &compact_result);
3947                if (page)
3948                        goto got_pg;
3949
3950                /*
3951                 * Checks for costly allocations with __GFP_NORETRY, which
3952                 * includes THP page fault allocations
3953                 */
3954                if (costly_order && (gfp_mask & __GFP_NORETRY)) {
3955                        /*
3956                         * If compaction is deferred for high-order allocations,
3957                         * it is because sync compaction recently failed. If
3958                         * this is the case and the caller requested a THP
3959                         * allocation, we do not want to heavily disrupt the
3960                         * system, so we fail the allocation instead of entering
3961                         * direct reclaim.
3962                         */
3963                        if (compact_result == COMPACT_DEFERRED)
3964                                goto nopage;
3965
3966                        /*
3967                         * Looks like reclaim/compaction is worth trying, but
3968                         * sync compaction could be very expensive, so keep
3969                         * using async compaction.
3970                         */
3971                        compact_priority = INIT_COMPACT_PRIORITY;
3972                }
3973        }
3974
3975retry:
3976        /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
3977        if (gfp_mask & __GFP_KSWAPD_RECLAIM)
3978                wake_all_kswapds(order, ac);
3979
3980        reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
3981        if (reserve_flags)
3982                alloc_flags = reserve_flags;
3983
3984        /*
3985         * Reset the zonelist iterators if memory policies can be ignored.
3986         * These allocations are high priority and system rather than user
3987         * orientated.
3988         */
3989        if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
3990                ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
3991                ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
3992                                        ac->high_zoneidx, ac->nodemask);
3993        }
3994
3995        /* Attempt with potentially adjusted zonelist and alloc_flags */
3996        page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
3997        if (page)
3998                goto got_pg;
3999
4000        /* Caller is not willing to reclaim, we can't balance anything */
4001        if (!can_direct_reclaim)
4002                goto nopage;
4003
4004        /* Make sure we know about allocations which stall for too long */
4005        if (time_after(jiffies, alloc_start + stall_timeout)) {
4006                warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask,
4007                        "page allocation stalls for %ums, order:%u",
4008                        jiffies_to_msecs(jiffies-alloc_start), order);
4009                stall_timeout += 10 * HZ;
4010        }
4011
4012        /* Avoid recursion of direct reclaim */
4013        if (current->flags & PF_MEMALLOC)
4014                goto nopage;
4015
4016        /* Try direct reclaim and then allocating */
4017        page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
4018                                                        &did_some_progress);
4019        if (page)
4020                goto got_pg;
4021
4022        /* Try direct compaction and then allocating */
4023        page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
4024                                        compact_priority, &compact_result);
4025        if (page)
4026                goto got_pg;
4027
4028        /* Do not loop if specifically requested */
4029        if (gfp_mask & __GFP_NORETRY)
4030                goto nopage;
4031
4032        /*
4033         * Do not retry costly high order allocations unless they are
4034         * __GFP_RETRY_MAYFAIL
4035         */
4036        if (costly_order && !(gfp_mask & __GFP_RETRY_MAYFAIL))
4037                goto nopage;
4038
4039        if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags,
4040                                 did_some_progress > 0, &no_progress_loops))
4041                goto retry;
4042
4043        /*
4044         * It doesn't make any sense to retry for the compaction if the order-0
4045         * reclaim is not able to make any progress because the current
4046         * implementation of the compaction depends on the sufficient amount
4047         * of free memory (see __compaction_suitable)
4048         */
4049        if (did_some_progress > 0 &&
4050                        should_compact_retry(ac, order, alloc_flags,
4051                                compact_result, &compact_priority,
4052                                &compaction_retries))
4053                goto retry;
4054
4055
4056        /* Deal with possible cpuset update races before we start OOM killing */
4057        if (check_retry_cpuset(cpuset_mems_cookie, ac))
4058                goto retry_cpuset;
4059
4060        /* Reclaim has failed us, start killing things */
4061        page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
4062        if (page)
4063                goto got_pg;
4064
4065        /* Avoid allocations with no watermarks from looping endlessly */
4066        if (tsk_is_oom_victim(current) &&
4067            (alloc_flags == ALLOC_OOM ||
4068             (gfp_mask & __GFP_NOMEMALLOC)))
4069                goto nopage;
4070
4071        /* Retry as long as the OOM killer is making progress */
4072        if (did_some_progress) {
4073                no_progress_loops = 0;
4074                goto retry;
4075        }
4076
4077nopage:
4078        /* Deal with possible cpuset update races before we fail */
4079        if (check_retry_cpuset(cpuset_mems_cookie, ac))
4080                goto retry_cpuset;
4081
4082        /*
4083         * Make sure that __GFP_NOFAIL request doesn't leak out and make sure
4084         * we always retry
4085         */
4086        if (gfp_mask & __GFP_NOFAIL) {
4087                /*
4088                 * All existing users of the __GFP_NOFAIL are blockable, so warn
4089                 * of any new users that actually require GFP_NOWAIT
4090                 */
4091                if (WARN_ON_ONCE(!can_direct_reclaim))
4092                        goto fail;
4093
4094                /*
4095                 * PF_MEMALLOC request from this context is rather bizarre
4096                 * because we cannot reclaim anything and only can loop waiting
4097                 * for somebody to do a work for us
4098                 */
4099                WARN_ON_ONCE(current->flags & PF_MEMALLOC);
4100
4101                /*
4102                 * non failing costly orders are a hard requirement which we
4103                 * are not prepared for much so let's warn about these users
4104                 * so that we can identify them and convert them to something
4105                 * else.
4106                 */
4107                WARN_ON_ONCE(order > PAGE_ALLOC_COSTLY_ORDER);
4108
4109                /*
4110                 * Help non-failing allocations by giving them access to memory
4111                 * reserves but do not use ALLOC_NO_WATERMARKS because this
4112                 * could deplete whole memory reserves which would just make
4113                 * the situation worse
4114                 */
4115                page = __alloc_pages_cpuset_fallback(gfp_mask, order, ALLOC_HARDER, ac);
4116                if (page)
4117                        goto got_pg;
4118
4119                cond_resched();
4120                goto retry;
4121        }
4122fail:
4123        warn_alloc(gfp_mask, ac->nodemask,
4124                        "page allocation failure: order:%u", order);
4125got_pg:
4126        return page;
4127}
4128
4129static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
4130                int preferred_nid, nodemask_t *nodemask,
4131                struct alloc_context *ac, gfp_t *alloc_mask,
4132                unsigned int *alloc_flags)
4133{
4134        ac->high_zoneidx = gfp_zone(gfp_mask);
4135        ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
4136        ac->nodemask = nodemask;
4137        ac->migratetype = gfpflags_to_migratetype(gfp_mask);
4138
4139        if (cpusets_enabled()) {
4140                *alloc_mask |= __GFP_HARDWALL;
4141                if (!ac->nodemask)
4142                        ac->nodemask = &cpuset_current_mems_allowed;
4143                else
4144                        *alloc_flags |= ALLOC_CPUSET;
4145        }
4146
4147        fs_reclaim_acquire(gfp_mask);
4148        fs_reclaim_release(gfp_mask);
4149
4150        might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
4151
4152        if (should_fail_alloc_page(gfp_mask, order))
4153                return false;
4154
4155        if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
4156                *alloc_flags |= ALLOC_CMA;
4157
4158        return true;
4159}
4160
4161/* Determine whether to spread dirty pages and what the first usable zone */
4162static inline void finalise_ac(gfp_t gfp_mask,
4163                unsigned int order, struct alloc_context *ac)
4164{
4165        /* Dirty zone balancing only done in the fast path */
4166        ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE);
4167
4168        /*
4169         * The preferred zone is used for statistics but crucially it is
4170         * also used as the starting point for the zonelist iterator. It
4171         * may get reset for allocations that ignore memory policies.
4172         */
4173        ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
4174                                        ac->high_zoneidx, ac->nodemask);
4175}
4176
4177/*
4178 * This is the 'heart' of the zoned buddy allocator.
4179 */
4180struct page *
4181__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
4182                                                        nodemask_t *nodemask)
4183{
4184        struct page *page;
4185        unsigned int alloc_flags = ALLOC_WMARK_LOW;
4186        gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
4187        struct alloc_context ac = { };
4188
4189        gfp_mask &= gfp_allowed_mask;
4190        alloc_mask = gfp_mask;
4191        if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
4192                return NULL;
4193
4194        finalise_ac(gfp_mask, order, &ac);
4195
4196        /* First allocation attempt */
4197        page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
4198        if (likely(page))
4199                goto out;
4200
4201        /*
4202         * Apply scoped allocation constraints. This is mainly about GFP_NOFS
4203         * resp. GFP_NOIO which has to be inherited for all allocation requests
4204         * from a particular context which has been marked by
4205         * memalloc_no{fs,io}_{save,restore}.
4206         */
4207        alloc_mask = current_gfp_context(gfp_mask);
4208        ac.spread_dirty_pages = false;
4209
4210        /*
4211         * Restore the original nodemask if it was potentially replaced with
4212         * &cpuset_current_mems_allowed to optimize the fast-path attempt.
4213         */
4214        if (unlikely(ac.nodemask != nodemask))
4215                ac.nodemask = nodemask;
4216
4217        page = __alloc_pages_slowpath(alloc_mask, order, &ac);
4218
4219out:
4220        if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
4221            unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
4222                __free_pages(page, order);
4223                page = NULL;
4224        }
4225
4226        if (kmemcheck_enabled && page)
4227                kmemcheck_pagealloc_alloc(page, order, gfp_mask);
4228
4229        trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
4230
4231        return page;
4232}
4233EXPORT_SYMBOL(__alloc_pages_nodemask);
4234
4235/*
4236 * Common helper functions.
4237 */
4238unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
4239{
4240        struct page *page;
4241
4242        /*
4243         * __get_free_pages() returns a 32-bit address, which cannot represent
4244         * a highmem page
4245         */
4246        VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
4247
4248        page = alloc_pages(gfp_mask, order);
4249        if (!page)
4250                return 0;
4251        return (unsigned long) page_address(page);
4252}
4253EXPORT_SYMBOL(__get_free_pages);
4254
4255unsigned long get_zeroed_page(gfp_t gfp_mask)
4256{
4257        return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
4258}
4259EXPORT_SYMBOL(get_zeroed_page);
4260
4261void __free_pages(struct page *page, unsigned int order)
4262{
4263        if (put_page_testzero(page)) {
4264                if (order == 0)
4265                        free_hot_cold_page(page, false);
4266                else
4267                        __free_pages_ok(page, order);
4268        }
4269}
4270
4271EXPORT_SYMBOL(__free_pages);
4272
4273void free_pages(unsigned long addr, unsigned int order)
4274{
4275        if (addr != 0) {
4276                VM_BUG_ON(!virt_addr_valid((void *)addr));
4277                __free_pages(virt_to_page((void *)addr), order);
4278        }
4279}
4280
4281EXPORT_SYMBOL(free_pages);
4282
4283/*
4284 * Page Fragment:
4285 *  An arbitrary-length arbitrary-offset area of memory which resides
4286 *  within a 0 or higher order page.  Multiple fragments within that page
4287 *  are individually refcounted, in the page's reference counter.
4288 *
4289 * The page_frag functions below provide a simple allocation framework for
4290 * page fragments.  This is used by the network stack and network device
4291 * drivers to provide a backing region of memory for use as either an
4292 * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
4293 */
4294static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
4295                                             gfp_t gfp_mask)
4296{
4297        struct page *page = NULL;
4298        gfp_t gfp = gfp_mask;
4299
4300#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4301        gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
4302                    __GFP_NOMEMALLOC;
4303        page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
4304                                PAGE_FRAG_CACHE_MAX_ORDER);
4305        nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
4306#endif
4307        if (unlikely(!page))
4308                page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
4309
4310        nc->va = page ? page_address(page) : NULL;
4311
4312        return page;
4313}
4314
4315void __page_frag_cache_drain(struct page *page, unsigned int count)
4316{
4317        VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
4318
4319        if (page_ref_sub_and_test(page, count)) {
4320                unsigned int order = compound_order(page);
4321
4322                if (order == 0)
4323                        free_hot_cold_page(page, false);
4324                else
4325                        __free_pages_ok(page, order);
4326        }
4327}
4328EXPORT_SYMBOL(__page_frag_cache_drain);
4329
4330void *page_frag_alloc(struct page_frag_cache *nc,
4331                      unsigned int fragsz, gfp_t gfp_mask)
4332{
4333        unsigned int size = PAGE_SIZE;
4334        struct page *page;
4335        int offset;
4336
4337        if (unlikely(!nc->va)) {
4338refill:
4339                page = __page_frag_cache_refill(nc, gfp_mask);
4340                if (!page)
4341                        return NULL;
4342
4343#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4344                /* if size can vary use size else just use PAGE_SIZE */
4345                size = nc->size;
4346#endif
4347                /* Even if we own the page, we do not use atomic_set().
4348                 * This would break get_page_unless_zero() users.
4349                 */
4350                page_ref_add(page, size - 1);
4351
4352                /* reset page count bias and offset to start of new frag */
4353                nc->pfmemalloc = page_is_pfmemalloc(page);
4354                nc->pagecnt_bias = size;
4355                nc->offset = size;
4356        }
4357
4358        offset = nc->offset - fragsz;
4359        if (unlikely(offset < 0)) {
4360                page = virt_to_page(nc->va);
4361
4362                if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
4363                        goto refill;
4364
4365#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
4366                /* if size can vary use size else just use PAGE_SIZE */
4367                size = nc->size;
4368#endif
4369                /* OK, page count is 0, we can safely set it */
4370                set_page_count(page, size);
4371
4372                /* reset page count bias and offset to start of new frag */
4373                nc->pagecnt_bias = size;
4374                offset = size - fragsz;
4375        }
4376
4377        nc->pagecnt_bias--;
4378        nc->offset = offset;
4379
4380        return nc->va + offset;
4381}
4382EXPORT_SYMBOL(page_frag_alloc);
4383
4384/*
4385 * Frees a page fragment allocated out of either a compound or order 0 page.
4386 */
4387void page_frag_free(void *addr)
4388{
4389        struct page *page = virt_to_head_page(addr);
4390
4391        if (unlikely(put_page_testzero(page)))
4392                __free_pages_ok(page, compound_order(page));
4393}
4394EXPORT_SYMBOL(page_frag_free);
4395
4396static void *make_alloc_exact(unsigned long addr, unsigned int order,
4397                size_t size)
4398{
4399        if (addr) {
4400                unsigned long alloc_end = addr + (PAGE_SIZE << order);
4401                unsigned long used = addr + PAGE_ALIGN(size);
4402
4403                split_page(virt_to_page((void *)addr), order);
4404                while (used < alloc_end) {
4405                        free_page(used);
4406                        used += PAGE_SIZE;
4407                }
4408        }
4409        return (void *)addr;
4410}
4411
4412/**
4413 * alloc_pages_exact - allocate an exact number physically-contiguous pages.
4414 * @size: the number of bytes to allocate
4415 * @gfp_mask: GFP flags for the allocation
4416 *
4417 * This function is similar to alloc_pages(), except that it allocates the
4418 * minimum number of pages to satisfy the request.  alloc_pages() can only
4419 * allocate memory in power-of-two pages.
4420 *
4421 * This function is also limited by MAX_ORDER.
4422 *
4423 * Memory allocated by this function must be released by free_pages_exact().
4424 */
4425void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
4426{
4427        unsigned int order = get_order(size);
4428        unsigned long addr;
4429
4430        addr = __get_free_pages(gfp_mask, order);
4431        return make_alloc_exact(addr, order, size);
4432}
4433EXPORT_SYMBOL(alloc_pages_exact);
4434
4435/**
4436 * alloc_pages_exact_nid - allocate an exact number of physically-contiguous
4437 *                         pages on a node.
4438 * @nid: the preferred node ID where memory should be allocated
4439 * @size: the number of bytes to allocate
4440 * @gfp_mask: GFP flags for the allocation
4441 *
4442 * Like alloc_pages_exact(), but try to allocate on node nid first before falling
4443 * back.
4444 */
4445void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
4446{
4447        unsigned int order = get_order(size);
4448        struct page *p = alloc_pages_node(nid, gfp_mask, order);
4449        if (!p)
4450                return NULL;
4451        return make_alloc_exact((unsigned long)page_address(p), order, size);
4452}
4453
4454/**
4455 * free_pages_exact - release memory allocated via alloc_pages_exact()
4456 * @virt: the value returned by alloc_pages_exact.
4457 * @size: size of allocation, same value as passed to alloc_pages_exact().
4458 *
4459 * Release the memory allocated by a previous call to alloc_pages_exact.
4460 */
4461void free_pages_exact(void *virt, size_t size)
4462{
4463        unsigned long addr = (unsigned long)virt;
4464        unsigned long end = addr + PAGE_ALIGN(size);
4465
4466        while (addr < end) {
4467                free_page(addr);
4468                addr += PAGE_SIZE;
4469        }
4470}
4471EXPORT_SYMBOL(free_pages_exact);
4472
4473/**
4474 * nr_free_zone_pages - count number of pages beyond high watermark
4475 * @offset: The zone index of the highest zone
4476 *
4477 * nr_free_zone_pages() counts the number of counts pages which are beyond the
4478 * high watermark within all zones at or below a given zone index.  For each
4479 * zone, the number of pages is calculated as:
4480 *
4481 *     nr_free_zone_pages = managed_pages - high_pages
4482 */
4483static unsigned long nr_free_zone_pages(int offset)
4484{
4485        struct zoneref *z;
4486        struct zone *zone;
4487
4488        /* Just pick one node, since fallback list is circular */
4489        unsigned long sum = 0;
4490
4491        struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
4492
4493        for_each_zone_zonelist(zone, z, zonelist, offset) {
4494                unsigned long size = zone->managed_pages;
4495                unsigned long high = high_wmark_pages(zone);
4496                if (size > high)
4497                        sum += size - high;
4498        }
4499
4500        return sum;
4501}
4502
4503/**
4504 * nr_free_buffer_pages - count number of pages beyond high watermark
4505 *
4506 * nr_free_buffer_pages() counts the number of pages which are beyond the high
4507 * watermark within ZONE_DMA and ZONE_NORMAL.
4508 */
4509unsigned long nr_free_buffer_pages(void)
4510{
4511        return nr_free_zone_pages(gfp_zone(GFP_USER));
4512}
4513EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
4514
4515/**
4516 * nr_free_pagecache_pages - count number of pages beyond high watermark
4517 *
4518 * nr_free_pagecache_pages() counts the number of pages which are beyond the
4519 * high watermark within all zones.
4520 */
4521unsigned long nr_free_pagecache_pages(void)
4522{
4523        return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
4524}
4525
4526static inline void show_node(struct zone *zone)
4527{
4528        if (IS_ENABLED(CONFIG_NUMA))
4529                printk("Node %d ", zone_to_nid(zone));
4530}
4531
4532long si_mem_available(void)
4533{
4534        long available;
4535        unsigned long pagecache;
4536        unsigned long wmark_low = 0;
4537        unsigned long pages[NR_LRU_LISTS];
4538        struct zone *zone;
4539        int lru;
4540
4541        for (lru = LRU_BASE; lru < NR_LRU_LISTS; lru++)
4542                pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
4543
4544        for_each_zone(zone)
4545                wmark_low += zone->watermark[WMARK_LOW];
4546
4547        /*
4548         * Estimate the amount of memory available for userspace allocations,
4549         * without causing swapping.
4550         */
4551        available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
4552
4553        /*
4554         * Not all the page cache can be freed, otherwise the system will
4555         * start swapping. Assume at least half of the page cache, or the
4556         * low watermark worth of cache, needs to stay.
4557         */
4558        pagecache = pages[LRU_ACTIVE_FILE] + pages[LRU_INACTIVE_FILE];
4559        pagecache -= min(pagecache / 2, wmark_low);
4560        available += pagecache;
4561
4562        /*
4563         * Part of the reclaimable slab consists of items that are in use,
4564         * and cannot be freed. Cap this estimate at the low watermark.
4565         */
4566        available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
4567                     min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
4568                         wmark_low);
4569
4570        if (available < 0)
4571                available = 0;
4572        return available;
4573}
4574EXPORT_SYMBOL_GPL(si_mem_available);
4575
4576void si_meminfo(struct sysinfo *val)
4577{
4578        val->totalram = totalram_pages;
4579        val->sharedram = global_node_page_state(NR_SHMEM);
4580        val->freeram = global_zone_page_state(NR_FREE_PAGES);
4581        val->bufferram = nr_blockdev_pages();
4582        val->totalhigh = totalhigh_pages;
4583        val->freehigh = nr_free_highpages();
4584        val->mem_unit = PAGE_SIZE;
4585}
4586
4587EXPORT_SYMBOL(si_meminfo);
4588
4589#ifdef CONFIG_NUMA
4590void si_meminfo_node(struct sysinfo *val, int nid)
4591{
4592        int zone_type;          /* needs to be signed */
4593        unsigned long managed_pages = 0;
4594        unsigned long managed_highpages = 0;
4595        unsigned long free_highpages = 0;
4596        pg_data_t *pgdat = NODE_DATA(nid);
4597
4598        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
4599                managed_pages += pgdat->node_zones[zone_type].managed_pages;
4600        val->totalram = managed_pages;
4601        val->sharedram = node_page_state(pgdat, NR_SHMEM);
4602        val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
4603#ifdef CONFIG_HIGHMEM
4604        for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
4605                struct zone *zone = &pgdat->node_zones[zone_type];
4606
4607                if (is_highmem(zone)) {
4608                        managed_highpages += zone->managed_pages;
4609                        free_highpages += zone_page_state(zone, NR_FREE_PAGES);
4610                }
4611        }
4612        val->totalhigh = managed_highpages;
4613        val->freehigh = free_highpages;
4614#else
4615        val->totalhigh = managed_highpages;
4616        val->freehigh = free_highpages;
4617#endif
4618        val->mem_unit = PAGE_SIZE;
4619}
4620#endif
4621
4622/*
4623 * Determine whether the node should be displayed or not, depending on whether
4624 * SHOW_MEM_FILTER_NODES was passed to show_free_areas().
4625 */
4626static bool show_mem_node_skip(unsigned int flags, int nid, nodemask_t *nodemask)
4627{
4628        if (!(flags & SHOW_MEM_FILTER_NODES))
4629                return false;
4630
4631        /*
4632         * no node mask - aka implicit memory numa policy. Do not bother with
4633         * the synchronization - read_mems_allowed_begin - because we do not
4634         * have to be precise here.
4635         */
4636        if (!nodemask)
4637                nodemask = &cpuset_current_mems_allowed;
4638
4639        return !node_isset(nid, *nodemask);
4640}
4641
4642#define K(x) ((x) << (PAGE_SHIFT-10))
4643
4644static void show_migration_types(unsigned char type)
4645{
4646        static const char types[MIGRATE_TYPES] = {
4647                [MIGRATE_UNMOVABLE]     = 'U',
4648                [MIGRATE_MOVABLE]       = 'M',
4649                [MIGRATE_RECLAIMABLE]   = 'E',
4650                [MIGRATE_HIGHATOMIC]    = 'H',
4651#ifdef CONFIG_CMA
4652                [MIGRATE_CMA]           = 'C',
4653#endif
4654#ifdef CONFIG_MEMORY_ISOLATION
4655                [MIGRATE_ISOLATE]       = 'I',
4656#endif
4657        };
4658        char tmp[MIGRATE_TYPES + 1];
4659        char *p = tmp;
4660        int i;
4661
4662        for (i = 0; i < MIGRATE_TYPES; i++) {
4663                if (type & (1 << i))
4664                        *p++ = types[i];
4665        }
4666
4667        *p = '\0';
4668        printk(KERN_CONT "(%s) ", tmp);
4669}
4670
4671/*
4672 * Show free area list (used inside shift_scroll-lock stuff)
4673 * We also calculate the percentage fragmentation. We do this by counting the
4674 * memory on each free list with the exception of the first item on the list.
4675 *
4676 * Bits in @filter:
4677 * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's
4678 *   cpuset.
4679 */
4680void show_free_areas(unsigned int filter, nodemask_t *nodemask)
4681{
4682        unsigned long free_pcp = 0;
4683        int cpu;
4684        struct zone *zone;
4685        pg_data_t *pgdat;
4686
4687        for_each_populated_zone(zone) {
4688                if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4689                        continue;
4690
4691                for_each_online_cpu(cpu)
4692                        free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4693        }
4694
4695        printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
4696                " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
4697                " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
4698                " slab_reclaimable:%lu slab_unreclaimable:%lu\n"
4699                " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
4700                " free:%lu free_pcp:%lu free_cma:%lu\n",
4701                global_node_page_state(NR_ACTIVE_ANON),
4702                global_node_page_state(NR_INACTIVE_ANON),
4703                global_node_page_state(NR_ISOLATED_ANON),
4704                global_node_page_state(NR_ACTIVE_FILE),
4705                global_node_page_state(NR_INACTIVE_FILE),
4706                global_node_page_state(NR_ISOLATED_FILE),
4707                global_node_page_state(NR_UNEVICTABLE),
4708                global_node_page_state(NR_FILE_DIRTY),
4709                global_node_page_state(NR_WRITEBACK),
4710                global_node_page_state(NR_UNSTABLE_NFS),
4711                global_node_page_state(NR_SLAB_RECLAIMABLE),
4712                global_node_page_state(NR_SLAB_UNRECLAIMABLE),
4713                global_node_page_state(NR_FILE_MAPPED),
4714                global_node_page_state(NR_SHMEM),
4715                global_zone_page_state(NR_PAGETABLE),
4716                global_zone_page_state(NR_BOUNCE),
4717                global_zone_page_state(NR_FREE_PAGES),
4718                free_pcp,
4719                global_zone_page_state(NR_FREE_CMA_PAGES));
4720
4721        for_each_online_pgdat(pgdat) {
4722                if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
4723                        continue;
4724
4725                printk("Node %d"
4726                        " active_anon:%lukB"
4727                        " inactive_anon:%lukB"
4728                        " active_file:%lukB"
4729                        " inactive_file:%lukB"
4730                        " unevictable:%lukB"
4731                        " isolated(anon):%lukB"
4732                        " isolated(file):%lukB"
4733                        " mapped:%lukB"
4734                        " dirty:%lukB"
4735                        " writeback:%lukB"
4736                        " shmem:%lukB"
4737#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4738                        " shmem_thp: %lukB"
4739                        " shmem_pmdmapped: %lukB"
4740                        " anon_thp: %lukB"
4741#endif
4742                        " writeback_tmp:%lukB"
4743                        " unstable:%lukB"
4744                        " all_unreclaimable? %s"
4745                        "\n",
4746                        pgdat->node_id,
4747                        K(node_page_state(pgdat, NR_ACTIVE_ANON)),
4748                        K(node_page_state(pgdat, NR_INACTIVE_ANON)),
4749                        K(node_page_state(pgdat, NR_ACTIVE_FILE)),
4750                        K(node_page_state(pgdat, NR_INACTIVE_FILE)),
4751                        K(node_page_state(pgdat, NR_UNEVICTABLE)),
4752                        K(node_page_state(pgdat, NR_ISOLATED_ANON)),
4753                        K(node_page_state(pgdat, NR_ISOLATED_FILE)),
4754                        K(node_page_state(pgdat, NR_FILE_MAPPED)),
4755                        K(node_page_state(pgdat, NR_FILE_DIRTY)),
4756                        K(node_page_state(pgdat, NR_WRITEBACK)),
4757                        K(node_page_state(pgdat, NR_SHMEM)),
4758#ifdef CONFIG_TRANSPARENT_HUGEPAGE
4759                        K(node_page_state(pgdat, NR_SHMEM_THPS) * HPAGE_PMD_NR),
4760                        K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED)
4761                                        * HPAGE_PMD_NR),
4762                        K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
4763#endif
4764                        K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
4765                        K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
4766                        pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
4767                                "yes" : "no");
4768        }
4769
4770        for_each_populated_zone(zone) {
4771                int i;
4772
4773                if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4774                        continue;
4775
4776                free_pcp = 0;
4777                for_each_online_cpu(cpu)
4778                        free_pcp += per_cpu_ptr(zone->pageset, cpu)->pcp.count;
4779
4780                show_node(zone);
4781                printk(KERN_CONT
4782                        "%s"
4783                        " free:%lukB"
4784                        " min:%lukB"
4785                        " low:%lukB"
4786                        " high:%lukB"
4787                        " active_anon:%lukB"
4788                        " inactive_anon:%lukB"
4789                        " active_file:%lukB"
4790                        " inactive_file:%lukB"
4791                        " unevictable:%lukB"
4792                        " writepending:%lukB"
4793                        " present:%lukB"
4794                        " managed:%lukB"
4795                        " mlocked:%lukB"
4796                        " kernel_stack:%lukB"
4797                        " pagetables:%lukB"
4798                        " bounce:%lukB"
4799                        " free_pcp:%lukB"
4800                        " local_pcp:%ukB"
4801                        " free_cma:%lukB"
4802                        "\n",
4803                        zone->name,
4804                        K(zone_page_state(zone, NR_FREE_PAGES)),
4805                        K(min_wmark_pages(zone)),
4806                        K(low_wmark_pages(zone)),
4807                        K(high_wmark_pages(zone)),
4808                        K(zone_page_state(zone, NR_ZONE_ACTIVE_ANON)),
4809                        K(zone_page_state(zone, NR_ZONE_INACTIVE_ANON)),
4810                        K(zone_page_state(zone, NR_ZONE_ACTIVE_FILE)),
4811                        K(zone_page_state(zone, NR_ZONE_INACTIVE_FILE)),
4812                        K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
4813                        K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
4814                        K(zone->present_pages),
4815                        K(zone->managed_pages),
4816                        K(zone_page_state(zone, NR_MLOCK)),
4817                        zone_page_state(zone, NR_KERNEL_STACK_KB),
4818                        K(zone_page_state(zone, NR_PAGETABLE)),
4819                        K(zone_page_state(zone, NR_BOUNCE)),
4820                        K(free_pcp),
4821                        K(this_cpu_read(zone->pageset->pcp.count)),
4822                        K(zone_page_state(zone, NR_FREE_CMA_PAGES)));
4823                printk("lowmem_reserve[]:");
4824                for (i = 0; i < MAX_NR_ZONES; i++)
4825                        printk(KERN_CONT " %ld", zone->lowmem_reserve[i]);
4826                printk(KERN_CONT "\n");
4827        }
4828
4829        for_each_populated_zone(zone) {
4830                unsigned int order;
4831                unsigned long nr[MAX_ORDER], flags, total = 0;
4832                unsigned char types[MAX_ORDER];
4833
4834                if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask))
4835                        continue;
4836                show_node(zone);
4837                printk(KERN_CONT "%s: ", zone->name);
4838
4839                spin_lock_irqsave(&zone->lock, flags);
4840                for (order = 0; order < MAX_ORDER; order++) {
4841                        struct free_area *area = &zone->free_area[order];
4842                        int type;
4843
4844                        nr[order] = area->nr_free;
4845                        total += nr[order] << order;
4846
4847                        types[order] = 0;
4848                        for (type = 0; type < MIGRATE_TYPES; type++) {
4849                                if (!list_empty(&area->free_list[type]))
4850                                        types[order] |= 1 << type;
4851                        }
4852                }
4853                spin_unlock_irqrestore(&zone->lock, flags);
4854                for (order = 0; order < MAX_ORDER; order++) {
4855                        printk(KERN_CONT "%lu*%lukB ",
4856                               nr[order], K(1UL) << order);
4857                        if (nr[order])
4858                                show_migration_types(types[order]);
4859                }
4860                printk(KERN_CONT "= %lukB\n", K(total));
4861        }
4862
4863        hugetlb_show_meminfo();
4864
4865        printk("%ld total pagecache pages\n", global_node_page_state(NR_FILE_PAGES));
4866
4867        show_swap_cache_info();
4868}
4869
4870static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
4871{
4872        zoneref->zone = zone;
4873        zoneref->zone_idx = zone_idx(zone);
4874}
4875
4876/*
4877 * Builds allocation fallback zone lists.
4878 *
4879 * Add all populated zones of a node to the zonelist.
4880 */
4881static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
4882{
4883        struct zone *zone;
4884        enum zone_type zone_type = MAX_NR_ZONES;
4885        int nr_zones = 0;
4886
4887        do {
4888                zone_type--;
4889                zone = pgdat->node_zones + zone_type;
4890                if (managed_zone(zone)) {
4891                        zoneref_set_zone(zone, &zonerefs[nr_zones++]);
4892                        check_highest_zone(zone_type);
4893                }
4894        } while (zone_type);
4895
4896        return nr_zones;
4897}
4898
4899#ifdef CONFIG_NUMA
4900
4901static int __parse_numa_zonelist_order(char *s)
4902{
4903        /*
4904         * We used to support different zonlists modes but they turned
4905         * out to be just not useful. Let's keep the warning in place
4906         * if somebody still use the cmd line parameter so that we do
4907         * not fail it silently
4908         */
4909        if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
4910                pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
4911                return -EINVAL;
4912        }
4913        return 0;
4914}
4915
4916static __init int setup_numa_zonelist_order(char *s)
4917{
4918        if (!s)
4919                return 0;
4920
4921        return __parse_numa_zonelist_order(s);
4922}
4923early_param("numa_zonelist_order", setup_numa_zonelist_order);
4924
4925char numa_zonelist_order[] = "Node";
4926
4927/*
4928 * sysctl handler for numa_zonelist_order
4929 */
4930int numa_zonelist_order_handler(struct ctl_table *table, int write,
4931                void __user *buffer, size_t *length,
4932                loff_t *ppos)
4933{
4934        char *str;
4935        int ret;
4936
4937        if (!write)
4938                return proc_dostring(table, write, buffer, length, ppos);
4939        str = memdup_user_nul(buffer, 16);
4940        if (IS_ERR(str))
4941                return PTR_ERR(str);
4942
4943        ret = __parse_numa_zonelist_order(str);
4944        kfree(str);
4945        return ret;
4946}
4947
4948
4949#define MAX_NODE_LOAD (nr_online_nodes)
4950static int node_load[MAX_NUMNODES];
4951
4952/**
4953 * find_next_best_node - find the next node that should appear in a given node's fallback list
4954 * @node: node whose fallback list we're appending
4955 * @used_node_mask: nodemask_t of already used nodes
4956 *
4957 * We use a number of factors to determine which is the next node that should
4958 * appear on a given node's fallback list.  The node should not have appeared
4959 * already in @node's fallback list, and it should be the next closest node
4960 * according to the distance array (which contains arbitrary distance values
4961 * from each node to each node in the system), and should also prefer nodes
4962 * with no CPUs, since presumably they'll have very little allocation pressure
4963 * on them otherwise.
4964 * It returns -1 if no node is found.
4965 */
4966static int find_next_best_node(int node, nodemask_t *used_node_mask)
4967{
4968        int n, val;
4969        int min_val = INT_MAX;
4970        int best_node = NUMA_NO_NODE;
4971        const struct cpumask *tmp = cpumask_of_node(0);
4972
4973        /* Use the local node if we haven't already */
4974        if (!node_isset(node, *used_node_mask)) {
4975                node_set(node, *used_node_mask);
4976                return node;
4977        }
4978
4979        for_each_node_state(n, N_MEMORY) {
4980
4981                /* Don't want a node to appear more than once */
4982                if (node_isset(n, *used_node_mask))
4983                        continue;
4984
4985                /* Use the distance array to find the distance */
4986                val = node_distance(node, n);
4987
4988                /* Penalize nodes under us ("prefer the next node") */
4989                val += (n < node);
4990
4991                /* Give preference to headless and unused nodes */
4992                tmp = cpumask_of_node(n);
4993                if (!cpumask_empty(tmp))
4994                        val += PENALTY_FOR_NODE_WITH_CPUS;
4995
4996                /* Slight preference for less loaded node */
4997                val *= (MAX_NODE_LOAD*MAX_NUMNODES);
4998                val += node_load[n];
4999
5000                if (val < min_val) {
5001                        min_val = val;
5002                        best_node = n;
5003                }
5004        }
5005
5006        if (best_node >= 0)
5007                node_set(best_node, *used_node_mask);
5008
5009        return best_node;
5010}
5011
5012
5013/*
5014 * Build zonelists ordered by node and zones within node.
5015 * This results in maximum locality--normal zone overflows into local
5016 * DMA zone, if any--but risks exhausting DMA zone.
5017 */
5018static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
5019                unsigned nr_nodes)
5020{
5021        struct zoneref *zonerefs;
5022        int i;
5023
5024        zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5025
5026        for (i = 0; i < nr_nodes; i++) {
5027                int nr_zones;
5028
5029                pg_data_t *node = NODE_DATA(node_order[i]);
5030
5031                nr_zones = build_zonerefs_node(node, zonerefs);
5032                zonerefs += nr_zones;
5033        }
5034        zonerefs->zone = NULL;
5035        zonerefs->zone_idx = 0;
5036}
5037
5038/*
5039 * Build gfp_thisnode zonelists
5040 */
5041static void build_thisnode_zonelists(pg_data_t *pgdat)
5042{
5043        struct zoneref *zonerefs;
5044        int nr_zones;
5045
5046        zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
5047        nr_zones = build_zonerefs_node(pgdat, zonerefs);
5048        zonerefs += nr_zones;
5049        zonerefs->zone = NULL;
5050        zonerefs->zone_idx = 0;
5051}
5052
5053/*
5054 * Build zonelists ordered by zone and nodes within zones.
5055 * This results in conserving DMA zone[s] until all Normal memory is
5056 * exhausted, but results in overflowing to remote node while memory
5057 * may still exist in local DMA zone.
5058 */
5059
5060static void build_zonelists(pg_data_t *pgdat)
5061{
5062        static int node_order[MAX_NUMNODES];
5063        int node, load, nr_nodes = 0;
5064        nodemask_t used_mask;
5065        int local_node, prev_node;
5066
5067        /* NUMA-aware ordering of nodes */
5068        local_node = pgdat->node_id;
5069        load = nr_online_nodes;
5070        prev_node = local_node;
5071        nodes_clear(used_mask);
5072
5073        memset(node_order, 0, sizeof(node_order));
5074        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
5075                /*
5076                 * We don't want to pressure a particular node.
5077                 * So adding penalty to the first node in same
5078                 * distance group to make it round-robin.
5079                 */
5080                if (node_distance(local_node, node) !=
5081                    node_distance(local_node, prev_node))
5082                        node_load[node] = load;
5083
5084                node_order[nr_nodes++] = node;
5085                prev_node = node;
5086                load--;
5087        }
5088
5089        build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
5090        build_thisnode_zonelists(pgdat);
5091}
5092
5093#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5094/*
5095 * Return node id of node used for "local" allocations.
5096 * I.e., first node id of first zone in arg node's generic zonelist.
5097 * Used for initializing percpu 'numa_mem', which is used primarily
5098 * for kernel allocations, so use GFP_KERNEL flags to locate zonelist.
5099 */
5100int local_memory_node(int node)
5101{
5102        struct zoneref *z;
5103
5104        z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
5105                                   gfp_zone(GFP_KERNEL),
5106                                   NULL);
5107        return z->zone->node;
5108}
5109#endif
5110
5111static void setup_min_unmapped_ratio(void);
5112static void setup_min_slab_ratio(void);
5113#else   /* CONFIG_NUMA */
5114
5115static void build_zonelists(pg_data_t *pgdat)
5116{
5117        int node, local_node;
5118        struct zoneref *zonerefs;
5119        int nr_zones;
5120
5121        local_node = pgdat->node_id;
5122
5123        zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
5124        nr_zones = build_zonerefs_node(pgdat, zonerefs);
5125        zonerefs += nr_zones;
5126
5127        /*
5128         * Now we build the zonelist so that it contains the zones
5129         * of all the other nodes.
5130         * We don't want to pressure a particular node, so when
5131         * building the zones for node N, we make sure that the
5132         * zones coming right after the local ones are those from
5133         * node N+1 (modulo N)
5134         */
5135        for (node = local_node + 1; node < MAX_NUMNODES; node++) {
5136                if (!node_online(node))
5137                        continue;
5138                nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5139                zonerefs += nr_zones;
5140        }
5141        for (node = 0; node < local_node; node++) {
5142                if (!node_online(node))
5143                        continue;
5144                nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
5145                zonerefs += nr_zones;
5146        }
5147
5148        zonerefs->zone = NULL;
5149        zonerefs->zone_idx = 0;
5150}
5151
5152#endif  /* CONFIG_NUMA */
5153
5154/*
5155 * Boot pageset table. One per cpu which is going to be used for all
5156 * zones and all nodes. The parameters will be set in such a way
5157 * that an item put on a list will immediately be handed over to
5158 * the buddy list. This is safe since pageset manipulation is done
5159 * with interrupts disabled.
5160 *
5161 * The boot_pagesets must be kept even after bootup is complete for
5162 * unused processors and/or zones. They do play a role for bootstrapping
5163 * hotplugged processors.
5164 *
5165 * zoneinfo_show() and maybe other functions do
5166 * not check if the processor is online before following the pageset pointer.
5167 * Other parts of the kernel may not check if the zone is available.
5168 */
5169static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
5170static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
5171static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
5172
5173static void __build_all_zonelists(void *data)
5174{
5175        int nid;
5176        int __maybe_unused cpu;
5177        pg_data_t *self = data;
5178        static DEFINE_SPINLOCK(lock);
5179
5180        spin_lock(&lock);
5181
5182#ifdef CONFIG_NUMA
5183        memset(node_load, 0, sizeof(node_load));
5184#endif
5185
5186        /*
5187         * This node is hotadded and no memory is yet present.   So just
5188         * building zonelists is fine - no need to touch other nodes.
5189         */
5190        if (self && !node_online(self->node_id)) {
5191                build_zonelists(self);
5192        } else {
5193                for_each_online_node(nid) {
5194                        pg_data_t *pgdat = NODE_DATA(nid);
5195
5196                        build_zonelists(pgdat);
5197                }
5198
5199#ifdef CONFIG_HAVE_MEMORYLESS_NODES
5200                /*
5201                 * We now know the "local memory node" for each node--
5202                 * i.e., the node of the first zone in the generic zonelist.
5203                 * Set up numa_mem percpu variable for on-line cpus.  During
5204                 * boot, only the boot cpu should be on-line;  we'll init the
5205                 * secondary cpus' numa_mem as they come on-line.  During
5206                 * node/memory hotplug, we'll fixup all on-line cpus.
5207                 */
5208                for_each_online_cpu(cpu)
5209                        set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
5210#endif
5211        }
5212
5213        spin_unlock(&lock);
5214}
5215
5216static noinline void __init
5217build_all_zonelists_init(void)
5218{
5219        int cpu;
5220
5221        __build_all_zonelists(NULL);
5222
5223        /*
5224         * Initialize the boot_pagesets that are going to be used
5225         * for bootstrapping processors. The real pagesets for
5226         * each zone will be allocated later when the per cpu
5227         * allocator is available.
5228         *
5229         * boot_pagesets are used also for bootstrapping offline
5230         * cpus if the system is already booted because the pagesets
5231         * are needed to initialize allocators on a specific cpu too.
5232         * F.e. the percpu allocator needs the page allocator which
5233         * needs the percpu allocator in order to allocate its pagesets
5234         * (a chicken-egg dilemma).
5235         */
5236        for_each_possible_cpu(cpu)
5237                setup_pageset(&per_cpu(boot_pageset, cpu), 0);
5238
5239        mminit_verify_zonelist();
5240        cpuset_init_current_mems_allowed();
5241}
5242
5243/*
5244 * unless system_state == SYSTEM_BOOTING.
5245 *
5246 * __ref due to call of __init annotated helper build_all_zonelists_init
5247 * [protected by SYSTEM_BOOTING].
5248 */
5249void __ref build_all_zonelists(pg_data_t *pgdat)
5250{
5251        if (system_state == SYSTEM_BOOTING) {
5252                build_all_zonelists_init();
5253        } else {
5254                __build_all_zonelists(pgdat);
5255                /* cpuset refresh routine should be here */
5256        }
5257        vm_total_pages = nr_free_pagecache_pages();
5258        /*
5259         * Disable grouping by mobility if the number of pages in the
5260         * system is too low to allow the mechanism to work. It would be
5261         * more accurate, but expensive to check per-zone. This check is
5262         * made on memory-hotadd so a system can start with mobility
5263         * disabled and enable it later
5264         */
5265        if (vm_total_pages < (pageblock_nr_pages * MIGRATE_TYPES))
5266                page_group_by_mobility_disabled = 1;
5267        else
5268                page_group_by_mobility_disabled = 0;
5269
5270        pr_info("Built %i zonelists, mobility grouping %s.  Total pages: %ld\n",
5271                nr_online_nodes,
5272                page_group_by_mobility_disabled ? "off" : "on",
5273                vm_total_pages);
5274#ifdef CONFIG_NUMA
5275        pr_info("Policy zone: %s\n", zone_names[policy_zone]);
5276#endif
5277}
5278
5279/*
5280 * Initially all pages are reserved - free ones are freed
5281 * up by free_all_bootmem() once the early boot process is
5282 * done. Non-atomic initialization, single-pass.
5283 */
5284void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
5285                unsigned long start_pfn, enum memmap_context context)
5286{
5287        struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
5288        unsigned long end_pfn = start_pfn + size;
5289        pg_data_t *pgdat = NODE_DATA(nid);
5290        unsigned long pfn;
5291        unsigned long nr_initialised = 0;
5292#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5293        struct memblock_region *r = NULL, *tmp;
5294#endif
5295
5296        if (highest_memmap_pfn < end_pfn - 1)
5297                highest_memmap_pfn = end_pfn - 1;
5298
5299        /*
5300         * Honor reservation requested by the driver for this ZONE_DEVICE
5301         * memory
5302         */
5303        if (altmap && start_pfn == altmap->base_pfn)
5304                start_pfn += altmap->reserve;
5305
5306        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
5307                /*
5308                 * There can be holes in boot-time mem_map[]s handed to this
5309                 * function.  They do not exist on hotplugged memory.
5310                 */
5311                if (context != MEMMAP_EARLY)
5312                        goto not_early;
5313
5314                if (!early_pfn_valid(pfn)) {
5315#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5316                        /*
5317                         * Skip to the pfn preceding the next valid one (or
5318                         * end_pfn), such that we hit a valid pfn (or end_pfn)
5319                         * on our next iteration of the loop.
5320                         */
5321                        pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1;
5322#endif
5323                        continue;
5324                }
5325                if (!early_pfn_in_nid(pfn, nid))
5326                        continue;
5327                if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
5328                        break;
5329
5330#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5331                /*
5332                 * Check given memblock attribute by firmware which can affect
5333                 * kernel memory layout.  If zone==ZONE_MOVABLE but memory is
5334                 * mirrored, it's an overlapped memmap init. skip it.
5335                 */
5336                if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
5337                        if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
5338                                for_each_memblock(memory, tmp)
5339                                        if (pfn < memblock_region_memory_end_pfn(tmp))
5340                                                break;
5341                                r = tmp;
5342                        }
5343                        if (pfn >= memblock_region_memory_base_pfn(r) &&
5344                            memblock_is_mirror(r)) {
5345                                /* already initialized as NORMAL */
5346                                pfn = memblock_region_memory_end_pfn(r);
5347                                continue;
5348                        }
5349                }
5350#endif
5351
5352not_early:
5353                /*
5354                 * Mark the block movable so that blocks are reserved for
5355                 * movable at startup. This will force kernel allocations
5356                 * to reserve their blocks rather than leaking throughout
5357                 * the address space during boot when many long-lived
5358                 * kernel allocations are made.
5359                 *
5360                 * bitmap is created for zone's valid pfn range. but memmap
5361                 * can be created for invalid pages (for alignment)
5362                 * check here not to call set_pageblock_migratetype() against
5363                 * pfn out of zone.
5364                 */
5365                if (!(pfn & (pageblock_nr_pages - 1))) {
5366                        struct page *page = pfn_to_page(pfn);
5367
5368                        __init_single_page(page, pfn, zone, nid);
5369                        set_pageblock_migratetype(page, MIGRATE_MOVABLE);
5370                        cond_resched();
5371                } else {
5372                        __init_single_pfn(pfn, zone, nid);
5373                }
5374        }
5375}
5376
5377static void __meminit zone_init_free_lists(struct zone *zone)
5378{
5379        unsigned int order, t;
5380        for_each_migratetype_order(order, t) {
5381                INIT_LIST_HEAD(&zone->free_area[order].free_list[t]);
5382                zone->free_area[order].nr_free = 0;
5383        }
5384}
5385
5386#ifndef __HAVE_ARCH_MEMMAP_INIT
5387#define memmap_init(size, nid, zone, start_pfn) \
5388        memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
5389#endif
5390
5391static int zone_batchsize(struct zone *zone)
5392{
5393#ifdef CONFIG_MMU
5394        int batch;
5395
5396        /*
5397         * The per-cpu-pages pools are set to around 1000th of the
5398         * size of the zone.  But no more than 1/2 of a meg.
5399         *
5400         * OK, so we don't know how big the cache is.  So guess.
5401         */
5402        batch = zone->managed_pages / 1024;
5403        if (batch * PAGE_SIZE > 512 * 1024)
5404                batch = (512 * 1024) / PAGE_SIZE;
5405        batch /= 4;             /* We effectively *= 4 below */
5406        if (batch < 1)
5407                batch = 1;
5408
5409        /*
5410         * Clamp the batch to a 2^n - 1 value. Having a power
5411         * of 2 value was found to be more likely to have
5412         * suboptimal cache aliasing properties in some cases.
5413         *
5414         * For example if 2 tasks are alternately allocating
5415         * batches of pages, one task can end up with a lot
5416         * of pages of one half of the possible page colors
5417         * and the other with pages of the other colors.
5418         */
5419        batch = rounddown_pow_of_two(batch + batch/2) - 1;
5420
5421        return batch;
5422
5423#else
5424        /* The deferral and batching of frees should be suppressed under NOMMU
5425         * conditions.
5426         *
5427         * The problem is that NOMMU needs to be able to allocate large chunks
5428         * of contiguous memory as there's no hardware page translation to
5429         * assemble apparent contiguous memory from discontiguous pages.
5430         *
5431         * Queueing large contiguous runs of pages for batching, however,
5432         * causes the pages to actually be freed in smaller chunks.  As there
5433         * can be a significant delay between the individual batches being
5434         * recycled, this leads to the once large chunks of space being
5435         * fragmented and becoming unavailable for high-order allocations.
5436         */
5437        return 0;
5438#endif
5439}
5440
5441/*
5442 * pcp->high and pcp->batch values are related and dependent on one another:
5443 * ->batch must never be higher then ->high.
5444 * The following function updates them in a safe manner without read side
5445 * locking.
5446 *
5447 * Any new users of pcp->batch and pcp->high should ensure they can cope with
5448 * those fields changing asynchronously (acording the the above rule).
5449 *
5450 * mutex_is_locked(&pcp_batch_high_lock) required when calling this function
5451 * outside of boot time (or some other assurance that no concurrent updaters
5452 * exist).
5453 */
5454static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
5455                unsigned long batch)
5456{
5457       /* start with a fail safe value for batch */
5458        pcp->batch = 1;
5459        smp_wmb();
5460
5461       /* Update high, then batch, in order */
5462        pcp->high = high;
5463        smp_wmb();
5464
5465        pcp->batch = batch;
5466}
5467
5468/* a companion to pageset_set_high() */
5469static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
5470{
5471        pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
5472}
5473
5474static void pageset_init(struct per_cpu_pageset *p)
5475{
5476        struct per_cpu_pages *pcp;
5477        int migratetype;
5478
5479        memset(p, 0, sizeof(*p));
5480
5481        pcp = &p->pcp;
5482        pcp->count = 0;
5483        for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
5484                INIT_LIST_HEAD(&pcp->lists[migratetype]);
5485}
5486
5487static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
5488{
5489        pageset_init(p);
5490        pageset_set_batch(p, batch);
5491}
5492
5493/*
5494 * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
5495 * to the value high for the pageset p.
5496 */
5497static void pageset_set_high(struct per_cpu_pageset *p,
5498                                unsigned long high)
5499{
5500        unsigned long batch = max(1UL, high / 4);
5501        if ((high / 4) > (PAGE_SHIFT * 8))
5502                batch = PAGE_SHIFT * 8;
5503
5504        pageset_update(&p->pcp, high, batch);
5505}
5506
5507static void pageset_set_high_and_batch(struct zone *zone,
5508                                       struct per_cpu_pageset *pcp)
5509{
5510        if (percpu_pagelist_fraction)
5511                pageset_set_high(pcp,
5512                        (zone->managed_pages /
5513                                percpu_pagelist_fraction));
5514        else
5515                pageset_set_batch(pcp, zone_batchsize(zone));
5516}
5517
5518static void __meminit zone_pageset_init(struct zone *zone, int cpu)
5519{
5520        struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
5521
5522        pageset_init(pcp);
5523        pageset_set_high_and_batch(zone, pcp);
5524}
5525
5526void __meminit setup_zone_pageset(struct zone *zone)
5527{
5528        int cpu;
5529        zone->pageset = alloc_percpu(struct per_cpu_pageset);
5530        for_each_possible_cpu(cpu)
5531                zone_pageset_init(zone, cpu);
5532}
5533
5534/*
5535 * Allocate per cpu pagesets and initialize them.
5536 * Before this call only boot pagesets were available.
5537 */
5538void __init setup_per_cpu_pageset(void)
5539{
5540        struct pglist_data *pgdat;
5541        struct zone *zone;
5542
5543        for_each_populated_zone(zone)
5544                setup_zone_pageset(zone);
5545
5546        for_each_online_pgdat(pgdat)
5547                pgdat->per_cpu_nodestats =
5548                        alloc_percpu(struct per_cpu_nodestat);
5549}
5550
5551static __meminit void zone_pcp_init(struct zone *zone)
5552{
5553        /*
5554         * per cpu subsystem is not up at this point. The following code
5555         * relies on the ability of the linker to provide the
5556         * offset of a (static) per cpu variable into the per cpu area.
5557         */
5558        zone->pageset = &boot_pageset;
5559
5560        if (populated_zone(zone))
5561                printk(KERN_DEBUG "  %s zone: %lu pages, LIFO batch:%u\n",
5562                        zone->name, zone->present_pages,
5563                                         zone_batchsize(zone));
5564}
5565
5566void __meminit init_currently_empty_zone(struct zone *zone,
5567                                        unsigned long zone_start_pfn,
5568                                        unsigned long size)
5569{
5570        struct pglist_data *pgdat = zone->zone_pgdat;
5571
5572        pgdat->nr_zones = zone_idx(zone) + 1;
5573
5574        zone->zone_start_pfn = zone_start_pfn;
5575
5576        mminit_dprintk(MMINIT_TRACE, "memmap_init",
5577                        "Initialising map node %d zone %lu pfns %lu -> %lu\n",
5578                        pgdat->node_id,
5579                        (unsigned long)zone_idx(zone),
5580                        zone_start_pfn, (zone_start_pfn + size));
5581
5582        zone_init_free_lists(zone);
5583        zone->initialized = 1;
5584}
5585
5586#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
5587#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
5588
5589/*
5590 * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
5591 */
5592int __meminit __early_pfn_to_nid(unsigned long pfn,
5593                                        struct mminit_pfnnid_cache *state)
5594{
5595        unsigned long start_pfn, end_pfn;
5596        int nid;
5597
5598        if (state->last_start <= pfn && pfn < state->last_end)
5599                return state->last_nid;
5600
5601        nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
5602        if (nid != -1) {
5603                state->last_start = start_pfn;
5604                state->last_end = end_pfn;
5605                state->last_nid = nid;
5606        }
5607
5608        return nid;
5609}
5610#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
5611
5612/**
5613 * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
5614 * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
5615 * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
5616 *
5617 * If an architecture guarantees that all ranges registered contain no holes
5618 * and may be freed, this this function may be used instead of calling
5619 * memblock_free_early_nid() manually.
5620 */
5621void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
5622{
5623        unsigned long start_pfn, end_pfn;
5624        int i, this_nid;
5625
5626        for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
5627                start_pfn = min(start_pfn, max_low_pfn);
5628                end_pfn = min(end_pfn, max_low_pfn);
5629
5630                if (start_pfn < end_pfn)
5631                        memblock_free_early_nid(PFN_PHYS(start_pfn),
5632                                        (end_pfn - start_pfn) << PAGE_SHIFT,
5633                                        this_nid);
5634        }
5635}
5636
5637/**
5638 * sparse_memory_present_with_active_regions - Call memory_present for each active range
5639 * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
5640 *
5641 * If an architecture guarantees that all ranges registered contain no holes and may
5642 * be freed, this function may be used instead of calling memory_present() manually.
5643 */
5644void __init sparse_memory_present_with_active_regions(int nid)
5645{
5646        unsigned long start_pfn, end_pfn;
5647        int i, this_nid;
5648
5649        for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
5650                memory_present(this_nid, start_pfn, end_pfn);
5651}
5652
5653/**
5654 * get_pfn_range_for_nid - Return the start and end page frames for a node
5655 * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
5656 * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
5657 * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
5658 *
5659 * It returns the start and end page frame of a node based on information
5660 * provided by memblock_set_node(). If called for a node
5661 * with no available memory, a warning is printed and the start and end
5662 * PFNs will be 0.
5663 */
5664void __meminit get_pfn_range_for_nid(unsigned int nid,
5665                        unsigned long *start_pfn, unsigned long *end_pfn)
5666{
5667        unsigned long this_start_pfn, this_end_pfn;
5668        int i;
5669
5670        *start_pfn = -1UL;
5671        *end_pfn = 0;
5672
5673        for_each_mem_pfn_range(i, nid, &this_start_pfn, &this_end_pfn, NULL) {
5674                *start_pfn = min(*start_pfn, this_start_pfn);
5675                *end_pfn = max(*end_pfn, this_end_pfn);
5676        }
5677
5678        if (*start_pfn == -1UL)
5679                *start_pfn = 0;
5680}
5681
5682/*
5683 * This finds a zone that can be used for ZONE_MOVABLE pages. The
5684 * assumption is made that zones within a node are ordered in monotonic
5685 * increasing memory addresses so that the "highest" populated zone is used
5686 */
5687static void __init find_usable_zone_for_movable(void)
5688{
5689        int zone_index;
5690        for (zone_index = MAX_NR_ZONES - 1; zone_index >= 0; zone_index--) {
5691                if (zone_index == ZONE_MOVABLE)
5692                        continue;
5693
5694                if (arch_zone_highest_possible_pfn[zone_index] >
5695                                arch_zone_lowest_possible_pfn[zone_index])
5696                        break;
5697        }
5698
5699        VM_BUG_ON(zone_index == -1);
5700        movable_zone = zone_index;
5701}
5702
5703/*
5704 * The zone ranges provided by the architecture do not include ZONE_MOVABLE
5705 * because it is sized independent of architecture. Unlike the other zones,
5706 * the starting point for ZONE_MOVABLE is not fixed. It may be different
5707 * in each node depending on the size of each node and how evenly kernelcore
5708 * is distributed. This helper function adjusts the zone ranges
5709 * provided by the architecture for a given node by using the end of the
5710 * highest usable zone for ZONE_MOVABLE. This preserves the assumption that
5711 * zones within a node are in order of monotonic increases memory addresses
5712 */
5713static void __meminit adjust_zone_range_for_zone_movable(int nid,
5714                                        unsigned long zone_type,
5715                                        unsigned long node_start_pfn,
5716                                        unsigned long node_end_pfn,
5717                                        unsigned long *zone_start_pfn,
5718                                        unsigned long *zone_end_pfn)
5719{
5720        /* Only adjust if ZONE_MOVABLE is on this node */
5721        if (zone_movable_pfn[nid]) {
5722                /* Size ZONE_MOVABLE */
5723                if (zone_type == ZONE_MOVABLE) {
5724                        *zone_start_pfn = zone_movable_pfn[nid];
5725                        *zone_end_pfn = min(node_end_pfn,
5726                                arch_zone_highest_possible_pfn[movable_zone]);
5727
5728                /* Adjust for ZONE_MOVABLE starting within this range */
5729                } else if (!mirrored_kernelcore &&
5730                        *zone_start_pfn < zone_movable_pfn[nid] &&
5731                        *zone_end_pfn > zone_movable_pfn[nid]) {
5732                        *zone_end_pfn = zone_movable_pfn[nid];
5733
5734                /* Check if this whole range is within ZONE_MOVABLE */
5735                } else if (*zone_start_pfn >= zone_movable_pfn[nid])
5736                        *zone_start_pfn = *zone_end_pfn;
5737        }
5738}
5739
5740/*
5741 * Return the number of pages a zone spans in a node, including holes
5742 * present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
5743 */
5744static unsigned long __meminit zone_spanned_pages_in_node(int nid,
5745                                        unsigned long zone_type,
5746                                        unsigned long node_start_pfn,
5747                                        unsigned long node_end_pfn,
5748                                        unsigned long *zone_start_pfn,
5749                                        unsigned long *zone_end_pfn,
5750                                        unsigned long *ignored)
5751{
5752        /* When hotadd a new node from cpu_up(), the node should be empty */
5753        if (!node_start_pfn && !node_end_pfn)
5754                return 0;
5755
5756        /* Get the start and end of the zone */
5757        *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
5758        *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
5759        adjust_zone_range_for_zone_movable(nid, zone_type,
5760                                node_start_pfn, node_end_pfn,
5761                                zone_start_pfn, zone_end_pfn);
5762
5763        /* Check that this node has pages within the zone's required range */
5764        if (*zone_end_pfn < node_start_pfn || *zone_start_pfn > node_end_pfn)
5765                return 0;
5766
5767        /* Move the zone boundaries inside the node if necessary */
5768        *zone_end_pfn = min(*zone_end_pfn, node_end_pfn);
5769        *zone_start_pfn = max(*zone_start_pfn, node_start_pfn);
5770
5771        /* Return the spanned pages */
5772        return *zone_end_pfn - *zone_start_pfn;
5773}
5774
5775/*
5776 * Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
5777 * then all holes in the requested range will be accounted for.
5778 */
5779unsigned long __meminit __absent_pages_in_range(int nid,
5780                                unsigned long range_start_pfn,
5781                                unsigned long range_end_pfn)
5782{
5783        unsigned long nr_absent = range_end_pfn - range_start_pfn;
5784        unsigned long start_pfn, end_pfn;
5785        int i;
5786
5787        for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
5788                start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
5789                end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
5790                nr_absent -= end_pfn - start_pfn;
5791        }
5792        return nr_absent;
5793}
5794
5795/**
5796 * absent_pages_in_range - Return number of page frames in holes within a range
5797 * @start_pfn: The start PFN to start searching for holes
5798 * @end_pfn: The end PFN to stop searching for holes
5799 *
5800 * It returns the number of pages frames in memory holes within a range.
5801 */
5802unsigned long __init absent_pages_in_range(unsigned long start_pfn,
5803                                                        unsigned long end_pfn)
5804{
5805        return __absent_pages_in_range(MAX_NUMNODES, start_pfn, end_pfn);
5806}
5807
5808/* Return the number of page frames in holes in a zone on a node */
5809static unsigned long __meminit zone_absent_pages_in_node(int nid,
5810                                        unsigned long zone_type,
5811                                        unsigned long node_start_pfn,
5812                                        unsigned long node_end_pfn,
5813                                        unsigned long *ignored)
5814{
5815        unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
5816        unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
5817        unsigned long zone_start_pfn, zone_end_pfn;
5818        unsigned long nr_absent;
5819
5820        /* When hotadd a new node from cpu_up(), the node should be empty */
5821        if (!node_start_pfn && !node_end_pfn)
5822                return 0;
5823
5824        zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
5825        zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
5826
5827        adjust_zone_range_for_zone_movable(nid, zone_type,
5828                        node_start_pfn, node_end_pfn,
5829                        &zone_start_pfn, &zone_end_pfn);
5830        nr_absent = __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
5831
5832        /*
5833         * ZONE_MOVABLE handling.
5834         * Treat pages to be ZONE_MOVABLE in ZONE_NORMAL as absent pages
5835         * and vice versa.
5836         */
5837        if (mirrored_kernelcore && zone_movable_pfn[nid]) {
5838                unsigned long start_pfn, end_pfn;
5839                struct memblock_region *r;
5840
5841                for_each_memblock(memory, r) {
5842                        start_pfn = clamp(memblock_region_memory_base_pfn(r),
5843                                          zone_start_pfn, zone_end_pfn);
5844                        end_pfn = clamp(memblock_region_memory_end_pfn(r),
5845                                        zone_start_pfn, zone_end_pfn);
5846
5847                        if (zone_type == ZONE_MOVABLE &&
5848                            memblock_is_mirror(r))
5849                                nr_absent += end_pfn - start_pfn;
5850
5851                        if (zone_type == ZONE_NORMAL &&
5852                            !memblock_is_mirror(r))
5853                                nr_absent += end_pfn - start_pfn;
5854                }
5855        }
5856
5857        return nr_absent;
5858}
5859
5860#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5861static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
5862                                        unsigned long zone_type,
5863                                        unsigned long node_start_pfn,
5864                                        unsigned long node_end_pfn,
5865                                        unsigned long *zone_start_pfn,
5866                                        unsigned long *zone_end_pfn,
5867                                        unsigned long *zones_size)
5868{
5869        unsigned int zone;
5870
5871        *zone_start_pfn = node_start_pfn;
5872        for (zone = 0; zone < zone_type; zone++)
5873                *zone_start_pfn += zones_size[zone];
5874
5875        *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
5876
5877        return zones_size[zone_type];
5878}
5879
5880static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
5881                                                unsigned long zone_type,
5882                                                unsigned long node_start_pfn,
5883                                                unsigned long node_end_pfn,
5884                                                unsigned long *zholes_size)
5885{
5886        if (!zholes_size)
5887                return 0;
5888
5889        return zholes_size[zone_type];
5890}
5891
5892#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
5893
5894static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
5895                                                unsigned long node_start_pfn,
5896                                                unsigned long node_end_pfn,
5897                                                unsigned long *zones_size,
5898                                                unsigned long *zholes_size)
5899{
5900        unsigned long realtotalpages = 0, totalpages = 0;
5901        enum zone_type i;
5902
5903        for (i = 0; i < MAX_NR_ZONES; i++) {
5904                struct zone *zone = pgdat->node_zones + i;
5905                unsigned long zone_start_pfn, zone_end_pfn;
5906                unsigned long size, real_size;
5907
5908                size = zone_spanned_pages_in_node(pgdat->node_id, i,
5909                                                  node_start_pfn,
5910                                                  node_end_pfn,
5911                                                  &zone_start_pfn,
5912                                                  &zone_end_pfn,
5913                                                  zones_size);
5914                real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
5915                                                  node_start_pfn, node_end_pfn,
5916                                                  zholes_size);
5917                if (size)
5918                        zone->zone_start_pfn = zone_start_pfn;
5919                else
5920                        zone->zone_start_pfn = 0;
5921                zone->spanned_pages = size;
5922                zone->present_pages = real_size;
5923
5924                totalpages += size;
5925                realtotalpages += real_size;
5926        }
5927
5928        pgdat->node_spanned_pages = totalpages;
5929        pgdat->node_present_pages = realtotalpages;
5930        printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
5931                                                        realtotalpages);
5932}
5933
5934#ifndef CONFIG_SPARSEMEM
5935/*
5936 * Calculate the size of the zone->blockflags rounded to an unsigned long
5937 * Start by making sure zonesize is a multiple of pageblock_order by rounding
5938 * up. Then use 1 NR_PAGEBLOCK_BITS worth of bits per pageblock, finally
5939 * round what is now in bits to nearest long in bits, then return it in
5940 * bytes.
5941 */
5942static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned long zonesize)
5943{
5944        unsigned long usemapsize;
5945
5946        zonesize += zone_start_pfn & (pageblock_nr_pages-1);
5947        usemapsize = roundup(zonesize, pageblock_nr_pages);
5948        usemapsize = usemapsize >> pageblock_order;
5949        usemapsize *= NR_PAGEBLOCK_BITS;
5950        usemapsize = roundup(usemapsize, 8 * sizeof(unsigned long));
5951
5952        return usemapsize / 8;
5953}
5954
5955static void __init setup_usemap(struct pglist_data *pgdat,
5956                                struct zone *zone,
5957                                unsigned long zone_start_pfn,
5958                                unsigned long zonesize)
5959{
5960        unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
5961        zone->pageblock_flags = NULL;
5962        if (usemapsize)
5963                zone->pageblock_flags =
5964                        memblock_virt_alloc_node_nopanic(usemapsize,
5965                                                         pgdat->node_id);
5966}
5967#else
5968static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
5969                                unsigned long zone_start_pfn, unsigned long zonesize) {}
5970#endif /* CONFIG_SPARSEMEM */
5971
5972#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
5973
5974/* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */
5975void __paginginit set_pageblock_order(void)
5976{
5977        unsigned int order;
5978
5979        /* Check that pageblock_nr_pages has not already been setup */
5980        if (pageblock_order)
5981                return;
5982
5983        if (HPAGE_SHIFT > PAGE_SHIFT)
5984                order = HUGETLB_PAGE_ORDER;
5985        else
5986                order = MAX_ORDER - 1;
5987
5988        /*
5989         * Assume the largest contiguous order of interest is a huge page.
5990         * This value may be variable depending on boot parameters on IA64 and
5991         * powerpc.
5992         */
5993        pageblock_order = order;
5994}
5995#else /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
5996
5997/*
5998 * When CONFIG_HUGETLB_PAGE_SIZE_VARIABLE is not set, set_pageblock_order()
5999 * is unused as pageblock_order is set at compile-time. See
6000 * include/linux/pageblock-flags.h for the values of pageblock_order based on
6001 * the kernel config
6002 */
6003void __paginginit set_pageblock_order(void)
6004{
6005}
6006
6007#endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */
6008
6009static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
6010                                                   unsigned long present_pages)
6011{
6012        unsigned long pages = spanned_pages;
6013
6014        /*
6015         * Provide a more accurate estimation if there are holes within
6016         * the zone and SPARSEMEM is in use. If there are holes within the
6017         * zone, each populated memory region may cost us one or two extra
6018         * memmap pages due to alignment because memmap pages for each
6019         * populated regions may not be naturally aligned on page boundary.
6020         * So the (present_pages >> 4) heuristic is a tradeoff for that.
6021         */
6022        if (spanned_pages > present_pages + (present_pages >> 4) &&
6023            IS_ENABLED(CONFIG_SPARSEMEM))
6024                pages = present_pages;
6025
6026        return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT;
6027}
6028
6029/*
6030 * Set up the zone data structures:
6031 *   - mark all pages reserved
6032 *   - mark all memory queues empty
6033 *   - clear the memory bitmaps
6034 *
6035 * NOTE: pgdat should get zeroed by caller.
6036 */
6037static void __paginginit free_area_init_core(struct pglist_data *pgdat)
6038{
6039        enum zone_type j;
6040        int nid = pgdat->node_id;
6041
6042        pgdat_resize_init(pgdat);
6043#ifdef CONFIG_NUMA_BALANCING
6044        spin_lock_init(&pgdat->numabalancing_migrate_lock);
6045        pgdat->numabalancing_migrate_nr_pages = 0;
6046        pgdat->numabalancing_migrate_next_window = jiffies;
6047#endif
6048#ifdef CONFIG_TRANSPARENT_HUGEPAGE
6049        spin_lock_init(&pgdat->split_queue_lock);
6050        INIT_LIST_HEAD(&pgdat->split_queue);
6051        pgdat->split_queue_len = 0;
6052#endif
6053        init_waitqueue_head(&pgdat->kswapd_wait);
6054        init_waitqueue_head(&pgdat->pfmemalloc_wait);
6055#ifdef CONFIG_COMPACTION
6056        init_waitqueue_head(&pgdat->kcompactd_wait);
6057#endif
6058        pgdat_page_ext_init(pgdat);
6059        spin_lock_init(&pgdat->lru_lock);
6060        lruvec_init(node_lruvec(pgdat));
6061
6062        pgdat->per_cpu_nodestats = &boot_nodestats;
6063
6064        for (j = 0; j < MAX_NR_ZONES; j++) {
6065                struct zone *zone = pgdat->node_zones + j;
6066                unsigned long size, realsize, freesize, memmap_pages;
6067                unsigned long zone_start_pfn = zone->zone_start_pfn;
6068
6069                size = zone->spanned_pages;
6070                realsize = freesize = zone->present_pages;
6071
6072                /*
6073                 * Adjust freesize so that it accounts for how much memory
6074                 * is used by this zone for memmap. This affects the watermark
6075                 * and per-cpu initialisations
6076                 */
6077                memmap_pages = calc_memmap_size(size, realsize);
6078                if (!is_highmem_idx(j)) {
6079                        if (freesize >= memmap_pages) {
6080                                freesize -= memmap_pages;
6081                                if (memmap_pages)
6082                                        printk(KERN_DEBUG
6083                                               "  %s zone: %lu pages used for memmap\n",
6084                                               zone_names[j], memmap_pages);
6085                        } else
6086                                pr_warn("  %s zone: %lu pages exceeds freesize %lu\n",
6087                                        zone_names[j], memmap_pages, freesize);
6088                }
6089
6090                /* Account for reserved pages */
6091                if (j == 0 && freesize > dma_reserve) {
6092                        freesize -= dma_reserve;
6093                        printk(KERN_DEBUG "  %s zone: %lu pages reserved\n",
6094                                        zone_names[0], dma_reserve);
6095                }
6096
6097                if (!is_highmem_idx(j))
6098                        nr_kernel_pages += freesize;
6099                /* Charge for highmem memmap if there are enough kernel pages */
6100                else if (nr_kernel_pages > memmap_pages * 2)
6101                        nr_kernel_pages -= memmap_pages;
6102                nr_all_pages += freesize;
6103
6104                /*
6105                 * Set an approximate value for lowmem here, it will be adjusted
6106                 * when the bootmem allocator frees pages into the buddy system.
6107                 * And all highmem pages will be managed by the buddy system.
6108                 */
6109                zone->managed_pages = is_highmem_idx(j) ? realsize : freesize;
6110#ifdef CONFIG_NUMA
6111                zone->node = nid;
6112#endif
6113                zone->name = zone_names[j];
6114                zone->zone_pgdat = pgdat;
6115                spin_lock_init(&zone->lock);
6116                zone_seqlock_init(zone);
6117                zone_pcp_init(zone);
6118
6119                if (!size)
6120                        continue;
6121
6122                set_pageblock_order();
6123                setup_usemap(pgdat, zone, zone_start_pfn, size);
6124                init_currently_empty_zone(zone, zone_start_pfn, size);
6125                memmap_init(size, nid, j, zone_start_pfn);
6126        }
6127}
6128
6129static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
6130{
6131        unsigned long __maybe_unused start = 0;
6132        unsigned long __maybe_unused offset = 0;
6133
6134        /* Skip empty nodes */
6135        if (!pgdat->node_spanned_pages)
6136                return;
6137
6138#ifdef CONFIG_FLAT_NODE_MEM_MAP
6139        start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1);
6140        offset = pgdat->node_start_pfn - start;
6141        /* ia64 gets its own node_mem_map, before this, without bootmem */
6142        if (!pgdat->node_mem_map) {
6143                unsigned long size, end;
6144                struct page *map;
6145
6146                /*
6147                 * The zone's endpoints aren't required to be MAX_ORDER
6148                 * aligned but the node_mem_map endpoints must be in order
6149                 * for the buddy allocator to function correctly.
6150                 */
6151                end = pgdat_end_pfn(pgdat);
6152                end = ALIGN(end, MAX_ORDER_NR_PAGES);
6153                size =  (end - start) * sizeof(struct page);
6154                map = alloc_remap(pgdat->node_id, size);
6155                if (!map)
6156                        map = memblock_virt_alloc_node_nopanic(size,
6157                                                               pgdat->node_id);
6158                pgdat->node_mem_map = map + offset;
6159        }
6160#ifndef CONFIG_NEED_MULTIPLE_NODES
6161        /*
6162         * With no DISCONTIG, the global mem_map is just set as node 0's
6163         */
6164        if (pgdat == NODE_DATA(0)) {
6165                mem_map = NODE_DATA(0)->node_mem_map;
6166#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
6167                if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
6168                        mem_map -= offset;
6169#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6170        }
6171#endif
6172#endif /* CONFIG_FLAT_NODE_MEM_MAP */
6173}
6174
6175void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
6176                unsigned long node_start_pfn, unsigned long *zholes_size)
6177{
6178        pg_data_t *pgdat = NODE_DATA(nid);
6179        unsigned long start_pfn = 0;
6180        unsigned long end_pfn = 0;
6181
6182        /* pg_data_t should be reset to zero when it's allocated */
6183        WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
6184
6185        pgdat->node_id = nid;
6186        pgdat->node_start_pfn = node_start_pfn;
6187        pgdat->per_cpu_nodestats = NULL;
6188#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6189        get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
6190        pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
6191                (u64)start_pfn << PAGE_SHIFT,
6192                end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
6193#else
6194        start_pfn = node_start_pfn;
6195#endif
6196        calculate_node_totalpages(pgdat, start_pfn, end_pfn,
6197                                  zones_size, zholes_size);
6198
6199        alloc_node_mem_map(pgdat);
6200#ifdef CONFIG_FLAT_NODE_MEM_MAP
6201        printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n",
6202                nid, (unsigned long)pgdat,
6203                (unsigned long)pgdat->node_mem_map);
6204#endif
6205
6206        reset_deferred_meminit(pgdat);
6207        free_area_init_core(pgdat);
6208}
6209
6210#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
6211
6212#if MAX_NUMNODES > 1
6213/*
6214 * Figure out the number of possible node ids.
6215 */
6216void __init setup_nr_node_ids(void)
6217{
6218        unsigned int highest;
6219
6220        highest = find_last_bit(node_possible_map.bits, MAX_NUMNODES);
6221        nr_node_ids = highest + 1;
6222}
6223#endif
6224
6225/**
6226 * node_map_pfn_alignment - determine the maximum internode alignment
6227 *
6228 * This function should be called after node map is populated and sorted.
6229 * It calculates the maximum power of two alignment which can distinguish
6230 * all the nodes.
6231 *
6232 * For example, if all nodes are 1GiB and aligned to 1GiB, the return value
6233 * would indicate 1GiB alignment with (1 << (30 - PAGE_SHIFT)).  If the
6234 * nodes are shifted by 256MiB, 256MiB.  Note that if only the last node is
6235 * shifted, 1GiB is enough and this function will indicate so.
6236 *
6237 * This is used to test whether pfn -> nid mapping of the chosen memory
6238 * model has fine enough granularity to avoid incorrect mapping for the
6239 * populated node map.
6240 *
6241 * Returns the determined alignment in pfn's.  0 if there is no alignment
6242 * requirement (single node).
6243 */
6244unsigned long __init node_map_pfn_alignment(void)
6245{
6246        unsigned long accl_mask = 0, last_end = 0;
6247        unsigned long start, end, mask;
6248        int last_nid = -1;
6249        int i, nid;
6250
6251        for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
6252                if (!start || last_nid < 0 || last_nid == nid) {
6253                        last_nid = nid;
6254                        last_end = end;
6255                        continue;
6256                }
6257
6258                /*
6259                 * Start with a mask granular enough to pin-point to the
6260                 * start pfn and tick off bits one-by-one until it becomes
6261                 * too coarse to separate the current node from the last.
6262                 */
6263                mask = ~((1 << __ffs(start)) - 1);
6264                while (mask && last_end <= (start & (mask << 1)))
6265                        mask <<= 1;
6266
6267                /* accumulate all internode masks */
6268                accl_mask |= mask;
6269        }
6270
6271        /* convert mask to number of pages */
6272        return ~accl_mask + 1;
6273}
6274
6275/* Find the lowest pfn for a node */
6276static unsigned long __init find_min_pfn_for_node(int nid)
6277{
6278        unsigned long min_pfn = ULONG_MAX;
6279        unsigned long start_pfn;
6280        int i;
6281
6282        for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
6283                min_pfn = min(min_pfn, start_pfn);
6284
6285        if (min_pfn == ULONG_MAX) {
6286                pr_warn("Could not find start_pfn for node %d\n", nid);
6287                return 0;
6288        }
6289
6290        return min_pfn;
6291}
6292
6293/**
6294 * find_min_pfn_with_active_regions - Find the minimum PFN registered
6295 *
6296 * It returns the minimum PFN based on information provided via
6297 * memblock_set_node().
6298 */
6299unsigned long __init find_min_pfn_with_active_regions(void)
6300{
6301        return find_min_pfn_for_node(MAX_NUMNODES);
6302}
6303
6304/*
6305 * early_calculate_totalpages()
6306 * Sum pages in active regions for movable zone.
6307 * Populate N_MEMORY for calculating usable_nodes.
6308 */
6309static unsigned long __init early_calculate_totalpages(void)
6310{
6311        unsigned long totalpages = 0;
6312        unsigned long start_pfn, end_pfn;
6313        int i, nid;
6314
6315        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
6316                unsigned long pages = end_pfn - start_pfn;
6317
6318                totalpages += pages;
6319                if (pages)
6320                        node_set_state(nid, N_MEMORY);
6321        }
6322        return totalpages;
6323}
6324
6325/*
6326 * Find the PFN the Movable zone begins in each node. Kernel memory
6327 * is spread evenly between nodes as long as the nodes have enough
6328 * memory. When they don't, some nodes will have more kernelcore than
6329 * others
6330 */
6331static void __init find_zone_movable_pfns_for_nodes(void)
6332{
6333        int i, nid;
6334        unsigned long usable_startpfn;
6335        unsigned long kernelcore_node, kernelcore_remaining;
6336        /* save the state before borrow the nodemask */
6337        nodemask_t saved_node_state = node_states[N_MEMORY];
6338        unsigned long totalpages = early_calculate_totalpages();
6339        int usable_nodes = nodes_weight(node_states[N_MEMORY]);
6340        struct memblock_region *r;
6341
6342        /* Need to find movable_zone earlier when movable_node is specified. */
6343        find_usable_zone_for_movable();
6344
6345        /*
6346         * If movable_node is specified, ignore kernelcore and movablecore
6347         * options.
6348         */
6349        if (movable_node_is_enabled()) {
6350                for_each_memblock(memory, r) {
6351                        if (!memblock_is_hotpluggable(r))
6352                                continue;
6353
6354                        nid = r->nid;
6355
6356                        usable_startpfn = PFN_DOWN(r->base);
6357                        zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6358                                min(usable_startpfn, zone_movable_pfn[nid]) :
6359                                usable_startpfn;
6360                }
6361
6362                goto out2;
6363        }
6364
6365        /*
6366         * If kernelcore=mirror is specified, ignore movablecore option
6367         */
6368        if (mirrored_kernelcore) {
6369                bool mem_below_4gb_not_mirrored = false;
6370
6371                for_each_memblock(memory, r) {
6372                        if (memblock_is_mirror(r))
6373                                continue;
6374
6375                        nid = r->nid;
6376
6377                        usable_startpfn = memblock_region_memory_base_pfn(r);
6378
6379                        if (usable_startpfn < 0x100000) {
6380                                mem_below_4gb_not_mirrored = true;
6381                                continue;
6382                        }
6383
6384                        zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
6385                                min(usable_startpfn, zone_movable_pfn[nid]) :
6386                                usable_startpfn;
6387                }
6388
6389                if (mem_below_4gb_not_mirrored)
6390                        pr_warn("This configuration results in unmirrored kernel memory.");
6391
6392                goto out2;
6393        }
6394
6395        /*
6396         * If movablecore=nn[KMG] was specified, calculate what size of
6397         * kernelcore that corresponds so that memory usable for
6398         * any allocation type is evenly spread. If both kernelcore
6399         * and movablecore are specified, then the value of kernelcore
6400         * will be used for required_kernelcore if it's greater than
6401         * what movablecore would have allowed.
6402         */
6403        if (required_movablecore) {
6404                unsigned long corepages;
6405
6406                /*
6407                 * Round-up so that ZONE_MOVABLE is at least as large as what
6408                 * was requested by the user
6409                 */
6410                required_movablecore =
6411                        roundup(required_movablecore, MAX_ORDER_NR_PAGES);
6412                required_movablecore = min(totalpages, required_movablecore);
6413                corepages = totalpages - required_movablecore;
6414
6415                required_kernelcore = max(required_kernelcore, corepages);
6416        }
6417
6418        /*
6419         * If kernelcore was not specified or kernelcore size is larger
6420         * than totalpages, there is no ZONE_MOVABLE.
6421         */
6422        if (!required_kernelcore || required_kernelcore >= totalpages)
6423                goto out;
6424
6425        /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
6426        usable_startpfn = arch_zone_lowest_possible_pfn[movable_zone];
6427
6428restart:
6429        /* Spread kernelcore memory as evenly as possible throughout nodes */
6430        kernelcore_node = required_kernelcore / usable_nodes;
6431        for_each_node_state(nid, N_MEMORY) {
6432                unsigned long start_pfn, end_pfn;
6433
6434                /*
6435                 * Recalculate kernelcore_node if the division per node
6436                 * now exceeds what is necessary to satisfy the requested
6437                 * amount of memory for the kernel
6438                 */
6439                if (required_kernelcore < kernelcore_node)
6440                        kernelcore_node = required_kernelcore / usable_nodes;
6441
6442                /*
6443                 * As the map is walked, we track how much memory is usable
6444                 * by the kernel using kernelcore_remaining. When it is
6445                 * 0, the rest of the node is usable by ZONE_MOVABLE
6446                 */
6447                kernelcore_remaining = kernelcore_node;
6448
6449                /* Go through each range of PFNs within this node */
6450                for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
6451                        unsigned long size_pages;
6452
6453                        start_pfn = max(start_pfn, zone_movable_pfn[nid]);
6454                        if (start_pfn >= end_pfn)
6455                                continue;
6456
6457                        /* Account for what is only usable for kernelcore */
6458                        if (start_pfn < usable_startpfn) {
6459                                unsigned long kernel_pages;
6460                                kernel_pages = min(end_pfn, usable_startpfn)
6461                                                                - start_pfn;
6462
6463                                kernelcore_remaining -= min(kernel_pages,
6464                                                        kernelcore_remaining);
6465                                required_kernelcore -= min(kernel_pages,
6466                                                        required_kernelcore);
6467
6468                                /* Continue if range is now fully accounted */
6469                                if (end_pfn <= usable_startpfn) {
6470
6471                                        /*
6472                                         * Push zone_movable_pfn to the end so
6473                                         * that if we have to rebalance
6474                                         * kernelcore across nodes, we will
6475                                         * not double account here
6476                                         */
6477                                        zone_movable_pfn[nid] = end_pfn;
6478                                        continue;
6479                                }
6480                                start_pfn = usable_startpfn;
6481                        }
6482
6483                        /*
6484                         * The usable PFN range for ZONE_MOVABLE is from
6485                         * start_pfn->end_pfn. Calculate size_pages as the
6486                         * number of pages used as kernelcore
6487                         */
6488                        size_pages = end_pfn - start_pfn;
6489                        if (size_pages > kernelcore_remaining)
6490                                size_pages = kernelcore_remaining;
6491                        zone_movable_pfn[nid] = start_pfn + size_pages;
6492
6493                        /*
6494                         * Some kernelcore has been met, update counts and
6495                         * break if the kernelcore for this node has been
6496                         * satisfied
6497                         */
6498                        required_kernelcore -= min(required_kernelcore,
6499                                                                size_pages);
6500                        kernelcore_remaining -= size_pages;
6501                        if (!kernelcore_remaining)
6502                                break;
6503                }
6504        }
6505
6506        /*
6507         * If there is still required_kernelcore, we do another pass with one
6508         * less node in the count. This will push zone_movable_pfn[nid] further
6509         * along on the nodes that still have memory until kernelcore is
6510         * satisfied
6511         */
6512        usable_nodes--;
6513        if (usable_nodes && required_kernelcore > usable_nodes)
6514                goto restart;
6515
6516out2:
6517        /* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
6518        for (nid = 0; nid < MAX_NUMNODES; nid++)
6519                zone_movable_pfn[nid] =
6520                        roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
6521
6522out:
6523        /* restore the node_state */
6524        node_states[N_MEMORY] = saved_node_state;
6525}
6526
6527/* Any regular or high memory on that node ? */
6528static void check_for_memory(pg_data_t *pgdat, int nid)
6529{
6530        enum zone_type zone_type;
6531
6532        if (N_MEMORY == N_NORMAL_MEMORY)
6533                return;
6534
6535        for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
6536                struct zone *zone = &pgdat->node_zones[zone_type];
6537                if (populated_zone(zone)) {
6538                        node_set_state(nid, N_HIGH_MEMORY);
6539                        if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
6540                            zone_type <= ZONE_NORMAL)
6541                                node_set_state(nid, N_NORMAL_MEMORY);
6542                        break;
6543                }
6544        }
6545}
6546
6547/**
6548 * free_area_init_nodes - Initialise all pg_data_t and zone data
6549 * @max_zone_pfn: an array of max PFNs for each zone
6550 *
6551 * This will call free_area_init_node() for each active node in the system.
6552 * Using the page ranges provided by memblock_set_node(), the size of each
6553 * zone in each node and their holes is calculated. If the maximum PFN
6554 * between two adjacent zones match, it is assumed that the zone is empty.
6555 * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed
6556 * that arch_max_dma32_pfn has no pages. It is also assumed that a zone
6557 * starts where the previous one ended. For example, ZONE_DMA32 starts
6558 * at arch_max_dma_pfn.
6559 */
6560void __init free_area_init_nodes(unsigned long *max_zone_pfn)
6561{
6562        unsigned long start_pfn, end_pfn;
6563        int i, nid;
6564
6565        /* Record where the zone boundaries are */
6566        memset(arch_zone_lowest_possible_pfn, 0,
6567                                sizeof(arch_zone_lowest_possible_pfn));
6568        memset(arch_zone_highest_possible_pfn, 0,
6569                                sizeof(arch_zone_highest_possible_pfn));
6570
6571        start_pfn = find_min_pfn_with_active_regions();
6572
6573        for (i = 0; i < MAX_NR_ZONES; i++) {
6574                if (i == ZONE_MOVABLE)
6575                        continue;
6576
6577                end_pfn = max(max_zone_pfn[i], start_pfn);
6578                arch_zone_lowest_possible_pfn[i] = start_pfn;
6579                arch_zone_highest_possible_pfn[i] = end_pfn;
6580
6581                start_pfn = end_pfn;
6582        }
6583
6584        /* Find the PFNs that ZONE_MOVABLE begins at in each node */
6585        memset(zone_movable_pfn, 0, sizeof(zone_movable_pfn));
6586        find_zone_movable_pfns_for_nodes();
6587
6588        /* Print out the zone ranges */
6589        pr_info("Zone ranges:\n");
6590        for (i = 0; i < MAX_NR_ZONES; i++) {
6591                if (i == ZONE_MOVABLE)
6592                        continue;
6593                pr_info("  %-8s ", zone_names[i]);
6594                if (arch_zone_lowest_possible_pfn[i] ==
6595                                arch_zone_highest_possible_pfn[i])
6596                        pr_cont("empty\n");
6597                else
6598                        pr_cont("[mem %#018Lx-%#018Lx]\n",
6599                                (u64)arch_zone_lowest_possible_pfn[i]
6600                                        << PAGE_SHIFT,
6601                                ((u64)arch_zone_highest_possible_pfn[i]
6602                                        << PAGE_SHIFT) - 1);
6603        }
6604
6605        /* Print out the PFNs ZONE_MOVABLE begins at in each node */
6606        pr_info("Movable zone start for each node\n");
6607        for (i = 0; i < MAX_NUMNODES; i++) {
6608                if (zone_movable_pfn[i])
6609                        pr_info("  Node %d: %#018Lx\n", i,
6610                               (u64)zone_movable_pfn[i] << PAGE_SHIFT);
6611        }
6612
6613        /* Print out the early node map */
6614        pr_info("Early memory node ranges\n");
6615        for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
6616                pr_info("  node %3d: [mem %#018Lx-%#018Lx]\n", nid,
6617                        (u64)start_pfn << PAGE_SHIFT,
6618                        ((u64)end_pfn << PAGE_SHIFT) - 1);
6619
6620        /* Initialise every node */
6621        mminit_verify_pageflags_layout();
6622        setup_nr_node_ids();
6623        for_each_online_node(nid) {
6624                pg_data_t *pgdat = NODE_DATA(nid);
6625                free_area_init_node(nid, NULL,
6626                                find_min_pfn_for_node(nid), NULL);
6627
6628                /* Any memory on that node */
6629                if (pgdat->node_present_pages)
6630                        node_set_state(nid, N_MEMORY);
6631                check_for_memory(pgdat, nid);
6632        }
6633}
6634
6635static int __init cmdline_parse_core(char *p, unsigned long *core)
6636{
6637        unsigned long long coremem;
6638        if (!p)
6639                return -EINVAL;
6640
6641        coremem = memparse(p, &p);
6642        *core = coremem >> PAGE_SHIFT;
6643
6644        /* Paranoid check that UL is enough for the coremem value */
6645        WARN_ON((coremem >> PAGE_SHIFT) > ULONG_MAX);
6646
6647        return 0;
6648}
6649
6650/*
6651 * kernelcore=size sets the amount of memory for use for allocations that
6652 * cannot be reclaimed or migrated.
6653 */
6654static int __init cmdline_parse_kernelcore(char *p)
6655{
6656        /* parse kernelcore=mirror */
6657        if (parse_option_str(p, "mirror")) {
6658                mirrored_kernelcore = true;
6659                return 0;
6660        }
6661
6662        return cmdline_parse_core(p, &required_kernelcore);
6663}
6664
6665/*
6666 * movablecore=size sets the amount of memory for use for allocations that
6667 * can be reclaimed or migrated.
6668 */
6669static int __init cmdline_parse_movablecore(char *p)
6670{
6671        return cmdline_parse_core(p, &required_movablecore);
6672}
6673
6674early_param("kernelcore", cmdline_parse_kernelcore);
6675early_param("movablecore", cmdline_parse_movablecore);
6676
6677#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
6678
6679void adjust_managed_page_count(struct page *page, long count)
6680{
6681        spin_lock(&managed_page_count_lock);
6682        page_zone(page)->managed_pages += count;
6683        totalram_pages += count;
6684#ifdef CONFIG_HIGHMEM
6685        if (PageHighMem(page))
6686                totalhigh_pages += count;
6687#endif
6688        spin_unlock(&managed_page_count_lock);
6689}
6690EXPORT_SYMBOL(adjust_managed_page_count);
6691
6692unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
6693{
6694        void *pos;
6695        unsigned long pages = 0;
6696
6697        start = (void *)PAGE_ALIGN((unsigned long)start);
6698        end = (void *)((unsigned long)end & PAGE_MASK);
6699        for (pos = start; pos < end; pos += PAGE_SIZE, pages++) {
6700                if ((unsigned int)poison <= 0xFF)
6701                        memset(pos, poison, PAGE_SIZE);
6702                free_reserved_page(virt_to_page(pos));
6703        }
6704
6705        if (pages && s)
6706                pr_info("Freeing %s memory: %ldK\n",
6707                        s, pages << (PAGE_SHIFT - 10));
6708
6709        return pages;
6710}
6711EXPORT_SYMBOL(free_reserved_area);
6712
6713#ifdef  CONFIG_HIGHMEM
6714void free_highmem_page(struct page *page)
6715{
6716        __free_reserved_page(page);
6717        totalram_pages++;
6718        page_zone(page)->managed_pages++;
6719        totalhigh_pages++;
6720}
6721#endif
6722
6723
6724void __init mem_init_print_info(const char *str)
6725{
6726        unsigned long physpages, codesize, datasize, rosize, bss_size;
6727        unsigned long init_code_size, init_data_size;
6728
6729        physpages = get_num_physpages();
6730        codesize = _etext - _stext;
6731        datasize = _edata - _sdata;
6732        rosize = __end_rodata - __start_rodata;
6733        bss_size = __bss_stop - __bss_start;
6734        init_data_size = __init_end - __init_begin;
6735        init_code_size = _einittext - _sinittext;
6736
6737        /*
6738         * Detect special cases and adjust section sizes accordingly:
6739         * 1) .init.* may be embedded into .data sections
6740         * 2) .init.text.* may be out of [__init_begin, __init_end],
6741         *    please refer to arch/tile/kernel/vmlinux.lds.S.
6742         * 3) .rodata.* may be embedded into .text or .data sections.
6743         */
6744#define adj_init_size(start, end, size, pos, adj) \
6745        do { \
6746                if (start <= pos && pos < end && size > adj) \
6747                        size -= adj; \
6748        } while (0)
6749
6750        adj_init_size(__init_begin, __init_end, init_data_size,
6751                     _sinittext, init_code_size);
6752        adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size);
6753        adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size);
6754        adj_init_size(_stext, _etext, codesize, __start_rodata, rosize);
6755        adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize);
6756
6757#undef  adj_init_size
6758
6759        pr_info("Memory: %luK/%luK available (%luK kernel code, %luK rwdata, %luK rodata, %luK init, %luK bss, %luK reserved, %luK cma-reserved"
6760#ifdef  CONFIG_HIGHMEM
6761                ", %luK highmem"
6762#endif
6763                "%s%s)\n",
6764                nr_free_pages() << (PAGE_SHIFT - 10),
6765                physpages << (PAGE_SHIFT - 10),
6766                codesize >> 10, datasize >> 10, rosize >> 10,
6767                (init_data_size + init_code_size) >> 10, bss_size >> 10,
6768                (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
6769                totalcma_pages << (PAGE_SHIFT - 10),
6770#ifdef  CONFIG_HIGHMEM
6771                totalhigh_pages << (PAGE_SHIFT - 10),
6772#endif
6773                str ? ", " : "", str ? str : "");
6774}
6775
6776/**
6777 * set_dma_reserve - set the specified number of pages reserved in the first zone
6778 * @new_dma_reserve: The number of pages to mark reserved
6779 *
6780 * The per-cpu batchsize and zone watermarks are determined by managed_pages.
6781 * In the DMA zone, a significant percentage may be consumed by kernel image
6782 * and other unfreeable allocations which can skew the watermarks badly. This
6783 * function may optionally be used to account for unfreeable pages in the
6784 * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
6785 * smaller per-cpu batchsize.
6786 */
6787void __init set_dma_reserve(unsigned long new_dma_reserve)
6788{
6789        dma_reserve = new_dma_reserve;
6790}
6791
6792void __init free_area_init(unsigned long *zones_size)
6793{
6794        free_area_init_node(0, zones_size,
6795                        __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
6796}
6797
6798static int page_alloc_cpu_dead(unsigned int cpu)
6799{
6800
6801        lru_add_drain_cpu(cpu);
6802        drain_pages(cpu);
6803
6804        /*
6805         * Spill the event counters of the dead processor
6806         * into the current processors event counters.
6807         * This artificially elevates the count of the current
6808         * processor.
6809         */
6810        vm_events_fold_cpu(cpu);
6811
6812        /*
6813         * Zero the differential counters of the dead processor
6814         * so that the vm statistics are consistent.
6815         *
6816         * This is only okay since the processor is dead and cannot
6817         * race with what we are doing.
6818         */
6819        cpu_vm_stats_fold(cpu);
6820        return 0;
6821}
6822
6823void __init page_alloc_init(void)
6824{
6825        int ret;
6826
6827        ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
6828                                        "mm/page_alloc:dead", NULL,
6829                                        page_alloc_cpu_dead);
6830        WARN_ON(ret < 0);
6831}
6832
6833/*
6834 * calculate_totalreserve_pages - called when sysctl_lowmem_reserve_ratio
6835 *      or min_free_kbytes changes.
6836 */
6837static void calculate_totalreserve_pages(void)
6838{
6839        struct pglist_data *pgdat;
6840        unsigned long reserve_pages = 0;
6841        enum zone_type i, j;
6842
6843        for_each_online_pgdat(pgdat) {
6844
6845                pgdat->totalreserve_pages = 0;
6846
6847                for (i = 0; i < MAX_NR_ZONES; i++) {
6848                        struct zone *zone = pgdat->node_zones + i;
6849                        long max = 0;
6850
6851                        /* Find valid and maximum lowmem_reserve in the zone */
6852                        for (j = i; j < MAX_NR_ZONES; j++) {
6853                                if (zone->lowmem_reserve[j] > max)
6854                                        max = zone->lowmem_reserve[j];
6855                        }
6856
6857                        /* we treat the high watermark as reserved pages. */
6858                        max += high_wmark_pages(zone);
6859
6860                        if (max > zone->managed_pages)
6861                                max = zone->managed_pages;
6862
6863                        pgdat->totalreserve_pages += max;
6864
6865                        reserve_pages += max;
6866                }
6867        }
6868        totalreserve_pages = reserve_pages;
6869}
6870
6871/*
6872 * setup_per_zone_lowmem_reserve - called whenever
6873 *      sysctl_lowmem_reserve_ratio changes.  Ensures that each zone
6874 *      has a correct pages reserved value, so an adequate number of
6875 *      pages are left in the zone after a successful __alloc_pages().
6876 */
6877static void setup_per_zone_lowmem_reserve(void)
6878{
6879        struct pglist_data *pgdat;
6880        enum zone_type j, idx;
6881
6882        for_each_online_pgdat(pgdat) {
6883                for (j = 0; j < MAX_NR_ZONES; j++) {
6884                        struct zone *zone = pgdat->node_zones + j;
6885                        unsigned long managed_pages = zone->managed_pages;
6886
6887                        zone->lowmem_reserve[j] = 0;
6888
6889                        idx = j;
6890                        while (idx) {
6891                                struct zone *lower_zone;
6892
6893                                idx--;
6894
6895                                if (sysctl_lowmem_reserve_ratio[idx] < 1)
6896                                        sysctl_lowmem_reserve_ratio[idx] = 1;
6897
6898                                lower_zone = pgdat->node_zones + idx;
6899                                lower_zone->lowmem_reserve[j] = managed_pages /
6900                                        sysctl_lowmem_reserve_ratio[idx];
6901                                managed_pages += lower_zone->managed_pages;
6902                        }
6903                }
6904        }
6905
6906        /* update totalreserve_pages */
6907        calculate_totalreserve_pages();
6908}
6909
6910static void __setup_per_zone_wmarks(void)
6911{
6912        unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
6913        unsigned long lowmem_pages = 0;
6914        struct zone *zone;
6915        unsigned long flags;
6916
6917        /* Calculate total number of !ZONE_HIGHMEM pages */
6918        for_each_zone(zone) {
6919                if (!is_highmem(zone))
6920                        lowmem_pages += zone->managed_pages;
6921        }
6922
6923        for_each_zone(zone) {
6924                u64 tmp;
6925
6926                spin_lock_irqsave(&zone->lock, flags);
6927                tmp = (u64)pages_min * zone->managed_pages;
6928                do_div(tmp, lowmem_pages);
6929                if (is_highmem(zone)) {
6930                        /*
6931                         * __GFP_HIGH and PF_MEMALLOC allocations usually don't
6932                         * need highmem pages, so cap pages_min to a small
6933                         * value here.
6934                         *
6935                         * The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
6936                         * deltas control asynch page reclaim, and so should
6937                         * not be capped for highmem.
6938                         */
6939                        unsigned long min_pages;
6940
6941                        min_pages = zone->managed_pages / 1024;
6942                        min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
6943                        zone->watermark[WMARK_MIN] = min_pages;
6944                } else {
6945                        /*
6946                         * If it's a lowmem zone, reserve a number of pages
6947                         * proportionate to the zone's size.
6948                         */
6949                        zone->watermark[WMARK_MIN] = tmp;
6950                }
6951
6952                /*
6953                 * Set the kswapd watermarks distance according to the
6954                 * scale factor in proportion to available memory, but
6955                 * ensure a minimum size on small systems.
6956                 */
6957                tmp = max_t(u64, tmp >> 2,
6958                            mult_frac(zone->managed_pages,
6959                                      watermark_scale_factor, 10000));
6960
6961                zone->watermark[WMARK_LOW]  = min_wmark_pages(zone) + tmp;
6962                zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
6963
6964                spin_unlock_irqrestore(&zone->lock, flags);
6965        }
6966
6967        /* update totalreserve_pages */
6968        calculate_totalreserve_pages();
6969}
6970
6971/**
6972 * setup_per_zone_wmarks - called when min_free_kbytes changes
6973 * or when memory is hot-{added|removed}
6974 *
6975 * Ensures that the watermark[min,low,high] values for each zone are set
6976 * correctly with respect to min_free_kbytes.
6977 */
6978void setup_per_zone_wmarks(void)
6979{
6980        static DEFINE_SPINLOCK(lock);
6981
6982        spin_lock(&lock);
6983        __setup_per_zone_wmarks();
6984        spin_unlock(&lock);
6985}
6986
6987/*
6988 * Initialise min_free_kbytes.
6989 *
6990 * For small machines we want it small (128k min).  For large machines
6991 * we want it large (64MB max).  But it is not linear, because network
6992 * bandwidth does not increase linearly with machine size.  We use
6993 *
6994 *      min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
6995 *      min_free_kbytes = sqrt(lowmem_kbytes * 16)
6996 *
6997 * which yields
6998 *
6999 * 16MB:        512k
7000 * 32MB:        724k
7001 * 64MB:        1024k
7002 * 128MB:       1448k
7003 * 256MB:       2048k
7004 * 512MB:       2896k
7005 * 1024MB:      4096k
7006 * 2048MB:      5792k
7007 * 4096MB:      8192k
7008 * 8192MB:      11584k
7009 * 16384MB:     16384k
7010 */
7011int __meminit init_per_zone_wmark_min(void)
7012{
7013        unsigned long lowmem_kbytes;
7014        int new_min_free_kbytes;
7015
7016        lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
7017        new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
7018
7019        if (new_min_free_kbytes > user_min_free_kbytes) {
7020                min_free_kbytes = new_min_free_kbytes;
7021                if (min_free_kbytes < 128)
7022                        min_free_kbytes = 128;
7023                if (min_free_kbytes > 65536)
7024                        min_free_kbytes = 65536;
7025        } else {
7026                pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
7027                                new_min_free_kbytes, user_min_free_kbytes);
7028        }
7029        setup_per_zone_wmarks();
7030        refresh_zone_stat_thresholds();
7031        setup_per_zone_lowmem_reserve();
7032
7033#ifdef CONFIG_NUMA
7034        setup_min_unmapped_ratio();
7035        setup_min_slab_ratio();
7036#endif
7037
7038        return 0;
7039}
7040core_initcall(init_per_zone_wmark_min)
7041
7042/*
7043 * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
7044 *      that we can call two helper functions whenever min_free_kbytes
7045 *      changes.
7046 */
7047int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
7048        void __user *buffer, size_t *length, loff_t *ppos)
7049{
7050        int rc;
7051
7052        rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7053        if (rc)
7054                return rc;
7055
7056        if (write) {
7057                user_min_free_kbytes = min_free_kbytes;
7058                setup_per_zone_wmarks();
7059        }
7060        return 0;
7061}
7062
7063int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
7064        void __user *buffer, size_t *length, loff_t *ppos)
7065{
7066        int rc;
7067
7068        rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7069        if (rc)
7070                return rc;
7071
7072        if (write)
7073                setup_per_zone_wmarks();
7074
7075        return 0;
7076}
7077
7078#ifdef CONFIG_NUMA
7079static void setup_min_unmapped_ratio(void)
7080{
7081        pg_data_t *pgdat;
7082        struct zone *zone;
7083
7084        for_each_online_pgdat(pgdat)
7085                pgdat->min_unmapped_pages = 0;
7086
7087        for_each_zone(zone)
7088                zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
7089                                sysctl_min_unmapped_ratio) / 100;
7090}
7091
7092
7093int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
7094        void __user *buffer, size_t *length, loff_t *ppos)
7095{
7096        int rc;
7097
7098        rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7099        if (rc)
7100                return rc;
7101
7102        setup_min_unmapped_ratio();
7103
7104        return 0;
7105}
7106
7107static void setup_min_slab_ratio(void)
7108{
7109        pg_data_t *pgdat;
7110        struct zone *zone;
7111
7112        for_each_online_pgdat(pgdat)
7113                pgdat->min_slab_pages = 0;
7114
7115        for_each_zone(zone)
7116                zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
7117                                sysctl_min_slab_ratio) / 100;
7118}
7119
7120int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
7121        void __user *buffer, size_t *length, loff_t *ppos)
7122{
7123        int rc;
7124
7125        rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
7126        if (rc)
7127                return rc;
7128
7129        setup_min_slab_ratio();
7130
7131        return 0;
7132}
7133#endif
7134
7135/*
7136 * lowmem_reserve_ratio_sysctl_handler - just a wrapper around
7137 *      proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
7138 *      whenever sysctl_lowmem_reserve_ratio changes.
7139 *
7140 * The reserve ratio obviously has absolutely no relation with the
7141 * minimum watermarks. The lowmem reserve ratio can only make sense
7142 * if in function of the boot time zone sizes.
7143 */
7144int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
7145        void __user *buffer, size_t *length, loff_t *ppos)
7146{
7147        proc_dointvec_minmax(table, write, buffer, length, ppos);
7148        setup_per_zone_lowmem_reserve();
7149        return 0;
7150}
7151
7152/*
7153 * percpu_pagelist_fraction - changes the pcp->high for each zone on each
7154 * cpu.  It is the fraction of total pages in each zone that a hot per cpu
7155 * pagelist can have before it gets flushed back to buddy allocator.
7156 */
7157int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
7158        void __user *buffer, size_t *length, loff_t *ppos)
7159{
7160        struct zone *zone;
7161        int old_percpu_pagelist_fraction;
7162        int ret;
7163
7164        mutex_lock(&pcp_batch_high_lock);
7165        old_percpu_pagelist_fraction = percpu_pagelist_fraction;
7166
7167        ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
7168        if (!write || ret < 0)
7169                goto out;
7170
7171        /* Sanity checking to avoid pcp imbalance */
7172        if (percpu_pagelist_fraction &&
7173            percpu_pagelist_fraction < MIN_PERCPU_PAGELIST_FRACTION) {
7174                percpu_pagelist_fraction = old_percpu_pagelist_fraction;
7175                ret = -EINVAL;
7176                goto out;
7177        }
7178
7179        /* No change? */
7180        if (percpu_pagelist_fraction == old_percpu_pagelist_fraction)
7181                goto out;
7182
7183        for_each_populated_zone(zone) {
7184                unsigned int cpu;
7185
7186                for_each_possible_cpu(cpu)
7187                        pageset_set_high_and_batch(zone,
7188                                        per_cpu_ptr(zone->pageset, cpu));
7189        }
7190out:
7191        mutex_unlock(&pcp_batch_high_lock);
7192        return ret;
7193}
7194
7195#ifdef CONFIG_NUMA
7196int hashdist = HASHDIST_DEFAULT;
7197
7198static int __init set_hashdist(char *str)
7199{
7200        if (!str)
7201                return 0;
7202        hashdist = simple_strtoul(str, &str, 0);
7203        return 1;
7204}
7205__setup("hashdist=", set_hashdist);
7206#endif
7207
7208#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
7209/*
7210 * Returns the number of pages that arch has reserved but
7211 * is not known to alloc_large_system_hash().
7212 */
7213static unsigned long __init arch_reserved_kernel_pages(void)
7214{
7215        return 0;
7216}
7217#endif
7218
7219/*
7220 * Adaptive scale is meant to reduce sizes of hash tables on large memory
7221 * machines. As memory size is increased the scale is also increased but at
7222 * slower pace.  Starting from ADAPT_SCALE_BASE (64G), every time memory
7223 * quadruples the scale is increased by one, which means the size of hash table
7224 * only doubles, instead of quadrupling as well.
7225 * Because 32-bit systems cannot have large physical memory, where this scaling
7226 * makes sense, it is disabled on such platforms.
7227 */
7228#if __BITS_PER_LONG > 32
7229#define ADAPT_SCALE_BASE        (64ul << 30)
7230#define ADAPT_SCALE_SHIFT       2
7231#define ADAPT_SCALE_NPAGES      (ADAPT_SCALE_BASE >> PAGE_SHIFT)
7232#endif
7233
7234/*
7235 * allocate a large system hash table from bootmem
7236 * - it is assumed that the hash table must contain an exact power-of-2
7237 *   quantity of entries
7238 * - limit is the number of hash buckets, not the total allocation size
7239 */
7240void *__init alloc_large_system_hash(const char *tablename,
7241                                     unsigned long bucketsize,
7242                                     unsigned long numentries,
7243                                     int scale,
7244                                     int flags,
7245                                     unsigned int *_hash_shift,
7246                                     unsigned int *_hash_mask,
7247                                     unsigned long low_limit,
7248                                     unsigned long high_limit)
7249{
7250        unsigned long long max = high_limit;
7251        unsigned long log2qty, size;
7252        void *table = NULL;
7253        gfp_t gfp_flags;
7254
7255        /* allow the kernel cmdline to have a say */
7256        if (!numentries) {
7257                /* round applicable memory size up to nearest megabyte */
7258                numentries = nr_kernel_pages;
7259                numentries -= arch_reserved_kernel_pages();
7260
7261                /* It isn't necessary when PAGE_SIZE >= 1MB */
7262                if (PAGE_SHIFT < 20)
7263                        numentries = round_up(numentries, (1<<20)/PAGE_SIZE);
7264
7265#if __BITS_PER_LONG > 32
7266                if (!high_limit) {
7267                        unsigned long adapt;
7268
7269                        for (adapt = ADAPT_SCALE_NPAGES; adapt < numentries;
7270                             adapt <<= ADAPT_SCALE_SHIFT)
7271                                scale++;
7272                }
7273#endif
7274
7275                /* limit to 1 bucket per 2^scale bytes of low memory */
7276                if (scale > PAGE_SHIFT)
7277                        numentries >>= (scale - PAGE_SHIFT);
7278                else
7279                        numentries <<= (PAGE_SHIFT - scale);
7280
7281                /* Make sure we've got at least a 0-order allocation.. */
7282                if (unlikely(flags & HASH_SMALL)) {
7283                        /* Makes no sense without HASH_EARLY */
7284                        WARN_ON(!(flags & HASH_EARLY));
7285                        if (!(numentries >> *_hash_shift)) {
7286                                numentries = 1UL << *_hash_shift;
7287                                BUG_ON(!numentries);
7288                        }
7289                } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
7290                        numentries = PAGE_SIZE / bucketsize;
7291        }
7292        numentries = roundup_pow_of_two(numentries);
7293
7294        /* limit allocation size to 1/16 total memory by default */
7295        if (max == 0) {
7296                max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
7297                do_div(max, bucketsize);
7298        }
7299        max = min(max, 0x80000000ULL);
7300
7301        if (numentries < low_limit)
7302                numentries = low_limit;
7303        if (numentries > max)
7304                numentries = max;
7305
7306        log2qty = ilog2(numentries);
7307
7308        /*
7309         * memblock allocator returns zeroed memory already, so HASH_ZERO is
7310         * currently not used when HASH_EARLY is specified.
7311         */
7312        gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
7313        do {
7314                size = bucketsize << log2qty;
7315                if (flags & HASH_EARLY)
7316                        table = memblock_virt_alloc_nopanic(size, 0);
7317                else if (hashdist)
7318                        table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
7319                else {
7320                        /*
7321                         * If bucketsize is not a power-of-two, we may free
7322                         * some pages at the end of hash table which
7323                         * alloc_pages_exact() automatically does
7324                         */
7325                        if (get_order(size) < MAX_ORDER) {
7326                                table = alloc_pages_exact(size, gfp_flags);
7327                                kmemleak_alloc(table, size, 1, gfp_flags);
7328                        }
7329                }
7330        } while (!table && size > PAGE_SIZE && --log2qty);
7331
7332        if (!table)
7333                panic("Failed to allocate %s hash table\n", tablename);
7334
7335        pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
7336                tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
7337
7338        if (_hash_shift)
7339                *_hash_shift = log2qty;
7340        if (_hash_mask)
7341                *_hash_mask = (1 << log2qty) - 1;
7342
7343        return table;
7344}
7345
7346/*
7347 * This function checks whether pageblock includes unmovable pages or not.
7348 * If @count is not zero, it is okay to include less @count unmovable pages
7349 *
7350 * PageLRU check without isolation or lru_lock could race so that
7351 * MIGRATE_MOVABLE block might include unmovable pages. And __PageMovable
7352 * check without lock_page also may miss some movable non-lru pages at
7353 * race condition. So you can't expect this function should be exact.
7354 */
7355bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
7356                         bool skip_hwpoisoned_pages)
7357{
7358        unsigned long pfn, iter, found;
7359        int mt;
7360
7361        /*
7362         * For avoiding noise data, lru_add_drain_all() should be called
7363         * If ZONE_MOVABLE, the zone never contains unmovable pages
7364         */
7365        if (zone_idx(zone) == ZONE_MOVABLE)
7366                return false;
7367        mt = get_pageblock_migratetype(page);
7368        if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt))
7369                return false;
7370
7371        pfn = page_to_pfn(page);
7372        for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
7373                unsigned long check = pfn + iter;
7374
7375                if (!pfn_valid_within(check))
7376                        continue;
7377
7378                page = pfn_to_page(check);
7379
7380                /*
7381                 * Hugepages are not in LRU lists, but they're movable.
7382                 * We need not scan over tail pages bacause we don't
7383                 * handle each tail page individually in migration.
7384                 */
7385                if (PageHuge(page)) {
7386                        iter = round_up(iter + 1, 1<<compound_order(page)) - 1;
7387                        continue;
7388                }
7389
7390                /*
7391                 * We can't use page_count without pin a page
7392                 * because another CPU can free compound page.
7393                 * This check already skips compound tails of THP
7394                 * because their page->_refcount is zero at all time.
7395                 */
7396                if (!page_ref_count(page)) {
7397                        if (PageBuddy(page))
7398                                iter += (1 << page_order(page)) - 1;
7399                        continue;
7400                }
7401
7402                /*
7403                 * The HWPoisoned page may be not in buddy system, and
7404                 * page_count() is not 0.
7405                 */
7406                if (skip_hwpoisoned_pages && PageHWPoison(page))
7407                        continue;
7408
7409                if (__PageMovable(page))
7410                        continue;
7411
7412                if (!PageLRU(page))
7413                        found++;
7414                /*
7415                 * If there are RECLAIMABLE pages, we need to check
7416                 * it.  But now, memory offline itself doesn't call
7417                 * shrink_node_slabs() and it still to be fixed.
7418                 */
7419                /*
7420                 * If the page is not RAM, page_count()should be 0.
7421                 * we don't need more check. This is an _used_ not-movable page.
7422                 *
7423                 * The problematic thing here is PG_reserved pages. PG_reserved
7424                 * is set to both of a memory hole page and a _used_ kernel
7425                 * page at boot.
7426                 */
7427                if (found > count)
7428                        return true;
7429        }
7430        return false;
7431}
7432
7433bool is_pageblock_removable_nolock(struct page *page)
7434{
7435        struct zone *zone;
7436        unsigned long pfn;
7437
7438        /*
7439         * We have to be careful here because we are iterating over memory
7440         * sections which are not zone aware so we might end up outside of
7441         * the zone but still within the section.
7442         * We have to take care about the node as well. If the node is offline
7443         * its NODE_DATA will be NULL - see page_zone.
7444         */
7445        if (!node_online(page_to_nid(page)))
7446                return false;
7447
7448        zone = page_zone(page);
7449        pfn = page_to_pfn(page);
7450        if (!zone_spans_pfn(zone, pfn))
7451                return false;
7452
7453        return !has_unmovable_pages(zone, page, 0, true);
7454}
7455
7456#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
7457
7458static unsigned long pfn_max_align_down(unsigned long pfn)
7459{
7460        return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
7461                             pageblock_nr_pages) - 1);
7462}
7463
7464static unsigned long pfn_max_align_up(unsigned long pfn)
7465{
7466        return ALIGN(pfn, max_t(unsigned long, MAX_ORDER_NR_PAGES,
7467                                pageblock_nr_pages));
7468}
7469
7470/* [start, end) must belong to a single zone. */
7471static int __alloc_contig_migrate_range(struct compact_control *cc,
7472                                        unsigned long start, unsigned long end)
7473{
7474        /* This function is based on compact_zone() from compaction.c. */
7475        unsigned long nr_reclaimed;
7476        unsigned long pfn = start;
7477        unsigned int tries = 0;
7478        int ret = 0;
7479
7480        migrate_prep();
7481
7482        while (pfn < end || !list_empty(&cc->migratepages)) {
7483                if (fatal_signal_pending(current)) {
7484                        ret = -EINTR;
7485                        break;
7486                }
7487
7488                if (list_empty(&cc->migratepages)) {
7489                        cc->nr_migratepages = 0;
7490                        pfn = isolate_migratepages_range(cc, pfn, end);
7491                        if (!pfn) {
7492                                ret = -EINTR;
7493                                break;
7494                        }
7495                        tries = 0;
7496                } else if (++tries == 5) {
7497                        ret = ret < 0 ? ret : -EBUSY;
7498                        break;
7499                }
7500
7501                nr_reclaimed = reclaim_clean_pages_from_list(cc->zone,
7502                                                        &cc->migratepages);
7503                cc->nr_migratepages -= nr_reclaimed;
7504
7505                ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
7506                                    NULL, 0, cc->mode, MR_CMA);
7507        }
7508        if (ret < 0) {
7509                putback_movable_pages(&cc->migratepages);
7510                return ret;
7511        }
7512        return 0;
7513}
7514
7515/**
7516 * alloc_contig_range() -- tries to allocate given range of pages
7517 * @start:      start PFN to allocate
7518 * @end:        one-past-the-last PFN to allocate
7519 * @migratetype:        migratetype of the underlaying pageblocks (either
7520 *                      #MIGRATE_MOVABLE or #MIGRATE_CMA).  All pageblocks
7521 *                      in range must have the same migratetype and it must
7522 *                      be either of the two.
7523 * @gfp_mask:   GFP mask to use during compaction
7524 *
7525 * The PFN range does not have to be pageblock or MAX_ORDER_NR_PAGES
7526 * aligned, however it's the caller's responsibility to guarantee that
7527 * we are the only thread that changes migrate type of pageblocks the
7528 * pages fall in.
7529 *
7530 * The PFN range must belong to a single zone.
7531 *
7532 * Returns zero on success or negative error code.  On success all
7533 * pages which PFN is in [start, end) are allocated for the caller and
7534 * need to be freed with free_contig_range().
7535 */
7536int alloc_contig_range(unsigned long start, unsigned long end,
7537                       unsigned migratetype, gfp_t gfp_mask)
7538{
7539        unsigned long outer_start, outer_end;
7540        unsigned int order;
7541        int ret = 0;
7542
7543        struct compact_control cc = {
7544                .nr_migratepages = 0,
7545                .order = -1,
7546                .zone = page_zone(pfn_to_page(start)),
7547                .mode = MIGRATE_SYNC,
7548                .ignore_skip_hint = true,
7549                .gfp_mask = current_gfp_context(gfp_mask),
7550        };
7551        INIT_LIST_HEAD(&cc.migratepages);
7552
7553        /*
7554         * What we do here is we mark all pageblocks in range as
7555         * MIGRATE_ISOLATE.  Because pageblock and max order pages may
7556         * have different sizes, and due to the way page allocator
7557         * work, we align the range to biggest of the two pages so
7558         * that page allocator won't try to merge buddies from
7559         * different pageblocks and change MIGRATE_ISOLATE to some
7560         * other migration type.
7561         *
7562         * Once the pageblocks are marked as MIGRATE_ISOLATE, we
7563         * migrate the pages from an unaligned range (ie. pages that
7564         * we are interested in).  This will put all the pages in
7565         * range back to page allocator as MIGRATE_ISOLATE.
7566         *
7567         * When this is done, we take the pages in range from page
7568         * allocator removing them from the buddy system.  This way
7569         * page allocator will never consider using them.
7570         *
7571         * This lets us mark the pageblocks back as
7572         * MIGRATE_CMA/MIGRATE_MOVABLE so that free pages in the
7573         * aligned range but not in the unaligned, original range are
7574         * put back to page allocator so that buddy can use them.
7575         */
7576
7577        ret = start_isolate_page_range(pfn_max_align_down(start),
7578                                       pfn_max_align_up(end), migratetype,
7579                                       false);
7580        if (ret)
7581                return ret;
7582
7583        /*
7584         * In case of -EBUSY, we'd like to know which page causes problem.
7585         * So, just fall through. We will check it in test_pages_isolated().
7586         */
7587        ret = __alloc_contig_migrate_range(&cc, start, end);
7588        if (ret && ret != -EBUSY)
7589                goto done;
7590
7591        /*
7592         * Pages from [start, end) are within a MAX_ORDER_NR_PAGES
7593         * aligned blocks that are marked as MIGRATE_ISOLATE.  What's
7594         * more, all pages in [start, end) are free in page allocator.
7595         * What we are going to do is to allocate all pages from
7596         * [start, end) (that is remove them from page allocator).
7597         *
7598         * The only problem is that pages at the beginning and at the
7599         * end of interesting range may be not aligned with pages that
7600         * page allocator holds, ie. they can be part of higher order
7601         * pages.  Because of this, we reserve the bigger range and
7602         * once this is done free the pages we are not interested in.
7603         *
7604         * We don't have to hold zone->lock here because the pages are
7605         * isolated thus they won't get removed from buddy.
7606         */
7607
7608        lru_add_drain_all();
7609        drain_all_pages(cc.zone);
7610
7611        order = 0;
7612        outer_start = start;
7613        while (!PageBuddy(pfn_to_page(outer_start))) {
7614                if (++order >= MAX_ORDER) {
7615                        outer_start = start;
7616                        break;
7617                }
7618                outer_start &= ~0UL << order;
7619        }
7620
7621        if (outer_start != start) {
7622                order = page_order(pfn_to_page(outer_start));
7623
7624                /*
7625                 * outer_start page could be small order buddy page and
7626                 * it doesn't include start page. Adjust outer_start
7627                 * in this case to report failed page properly
7628                 * on tracepoint in test_pages_isolated()
7629                 */
7630                if (outer_start + (1UL << order) <= start)
7631                        outer_start = start;
7632        }
7633
7634        /* Make sure the range is really isolated. */
7635        if (test_pages_isolated(outer_start, end, false)) {
7636                pr_debug("%s: [%lx, %lx) PFNs busy\n",
7637                        __func__, outer_start, end);
7638                ret = -EBUSY;
7639                goto done;
7640        }
7641
7642        /* Grab isolated pages from freelists. */
7643        outer_end = isolate_freepages_range(&cc, outer_start, end);
7644        if (!outer_end) {
7645                ret = -EBUSY;
7646                goto done;
7647        }
7648
7649        /* Free head and tail (if any) */
7650        if (start != outer_start)
7651                free_contig_range(outer_start, start - outer_start);
7652        if (end != outer_end)
7653                free_contig_range(end, outer_end - end);
7654
7655done:
7656        undo_isolate_page_range(pfn_max_align_down(start),
7657                                pfn_max_align_up(end), migratetype);
7658        return ret;
7659}
7660
7661void free_contig_range(unsigned long pfn, unsigned nr_pages)
7662{
7663        unsigned int count = 0;
7664
7665        for (; nr_pages--; pfn++) {
7666                struct page *page = pfn_to_page(pfn);
7667
7668                count += page_count(page) != 1;
7669                __free_page(page);
7670        }
7671        WARN(count != 0, "%d pages are still in use!\n", count);
7672}
7673#endif
7674
7675#ifdef CONFIG_MEMORY_HOTPLUG
7676/*
7677 * The zone indicated has a new number of managed_pages; batch sizes and percpu
7678 * page high values need to be recalulated.
7679 */
7680void __meminit zone_pcp_update(struct zone *zone)
7681{
7682        unsigned cpu;
7683        mutex_lock(&pcp_batch_high_lock);
7684        for_each_possible_cpu(cpu)
7685                pageset_set_high_and_batch(zone,
7686                                per_cpu_ptr(zone->pageset, cpu));
7687        mutex_unlock(&pcp_batch_high_lock);
7688}
7689#endif
7690
7691void zone_pcp_reset(struct zone *zone)
7692{
7693        unsigned long flags;
7694        int cpu;
7695        struct per_cpu_pageset *pset;
7696
7697        /* avoid races with drain_pages()  */
7698        local_irq_save(flags);
7699        if (zone->pageset != &boot_pageset) {
7700                for_each_online_cpu(cpu) {
7701                        pset = per_cpu_ptr(zone->pageset, cpu);
7702                        drain_zonestat(zone, pset);
7703                }
7704                free_percpu(zone->pageset);
7705                zone->pageset = &boot_pageset;
7706        }
7707        local_irq_restore(flags);
7708}
7709
7710#ifdef CONFIG_MEMORY_HOTREMOVE
7711/*
7712 * All pages in the range must be in a single zone and isolated
7713 * before calling this.
7714 */
7715void
7716__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
7717{
7718        struct page *page;
7719        struct zone *zone;
7720        unsigned int order, i;
7721        unsigned long pfn;
7722        unsigned long flags;
7723        /* find the first valid pfn */
7724        for (pfn = start_pfn; pfn < end_pfn; pfn++)
7725                if (pfn_valid(pfn))
7726                        break;
7727        if (pfn == end_pfn)
7728                return;
7729        offline_mem_sections(pfn, end_pfn);
7730        zone = page_zone(pfn_to_page(pfn));
7731        spin_lock_irqsave(&zone->lock, flags);
7732        pfn = start_pfn;
7733        while (pfn < end_pfn) {
7734                if (!pfn_valid(pfn)) {
7735                        pfn++;
7736                        continue;
7737                }
7738                page = pfn_to_page(pfn);
7739                /*
7740                 * The HWPoisoned page may be not in buddy system, and
7741                 * page_count() is not 0.
7742                 */
7743                if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
7744                        pfn++;
7745                        SetPageReserved(page);
7746                        continue;
7747                }
7748
7749                BUG_ON(page_count(page));
7750                BUG_ON(!PageBuddy(page));
7751                order = page_order(page);
7752#ifdef CONFIG_DEBUG_VM
7753                pr_info("remove from free list %lx %d %lx\n",
7754                        pfn, 1 << order, end_pfn);
7755#endif
7756                list_del(&page->lru);
7757                rmv_page_order(page);
7758                zone->free_area[order].nr_free--;
7759                for (i = 0; i < (1 << order); i++)
7760                        SetPageReserved((page+i));
7761                pfn += (1 << order);
7762        }
7763        spin_unlock_irqrestore(&zone->lock, flags);
7764}
7765#endif
7766
7767bool is_free_buddy_page(struct page *page)
7768{
7769        struct zone *zone = page_zone(page);
7770        unsigned long pfn = page_to_pfn(page);
7771        unsigned long flags;
7772        unsigned int order;
7773
7774        spin_lock_irqsave(&zone->lock, flags);
7775        for (order = 0; order < MAX_ORDER; order++) {
7776                struct page *page_head = page - (pfn & ((1 << order) - 1));
7777
7778                if (PageBuddy(page_head) && page_order(page_head) >= order)
7779                        break;
7780        }
7781        spin_unlock_irqrestore(&zone->lock, flags);
7782
7783        return order < MAX_ORDER;
7784}
7785