linux/mm/compaction.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * linux/mm/compaction.c
   4 *
   5 * Memory compaction for the reduction of external fragmentation. Note that
   6 * this heavily depends upon page migration to do all the real heavy
   7 * lifting
   8 *
   9 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
  10 */
  11#include <linux/cpu.h>
  12#include <linux/swap.h>
  13#include <linux/migrate.h>
  14#include <linux/compaction.h>
  15#include <linux/mm_inline.h>
  16#include <linux/sched/signal.h>
  17#include <linux/backing-dev.h>
  18#include <linux/sysctl.h>
  19#include <linux/sysfs.h>
  20#include <linux/page-isolation.h>
  21#include <linux/kasan.h>
  22#include <linux/kthread.h>
  23#include <linux/freezer.h>
  24#include <linux/page_owner.h>
  25#include <linux/psi.h>
  26#include "internal.h"
  27
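     /*
      * VM compaction event counters only exist when CONFIG_COMPACTION is
      * enabled; the #else branch below compiles these helpers away (e.g. for
      * CMA-only builds of this file).
      */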
  28#ifdef CONFIG_COMPACTION
  29static inline void count_compact_event(enum vm_event_item item)
  30{
  31        count_vm_event(item);
  32}
  33
  34static inline void count_compact_events(enum vm_event_item item, long delta)
  35{
  36        count_vm_events(item, delta);
  37}
  38#else
  39#define count_compact_event(item) do { } while (0)
  40#define count_compact_events(item, delta) do { } while (0)
  41#endif
  42
  43#if defined CONFIG_COMPACTION || defined CONFIG_CMA
  44
  45#define CREATE_TRACE_POINTS
  46#include <trace/events/compaction.h>
  47
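     /*
      * Round a pfn down to the first, or up to one past the last, pfn of the
      * 1 << order aligned block containing it; the pageblock_* variants
      * operate on pageblock_order sized blocks.
      */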
  48#define block_start_pfn(pfn, order)     round_down(pfn, 1UL << (order))
  49#define block_end_pfn(pfn, order)       ALIGN((pfn) + 1, 1UL << (order))
  50#define pageblock_start_pfn(pfn)        block_start_pfn(pfn, pageblock_order)
  51#define pageblock_end_pfn(pfn)          block_end_pfn(pfn, pageblock_order)
  52
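     /*
      * Give isolated free pages back to the buddy allocator and report the
      * highest PFN that was released.
      */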
  53static unsigned long release_freepages(struct list_head *freelist)
  54{
  55        struct page *page, *next;
  56        unsigned long high_pfn = 0;
  57
  58        list_for_each_entry_safe(page, next, freelist, lru) {
  59                unsigned long pfn = page_to_pfn(page);
  60                list_del(&page->lru);
  61                __free_page(page);
  62                if (pfn > high_pfn)
  63                        high_pfn = pfn;
  64        }
  65
  66        return high_pfn;
  67}
  68
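     /*
      * Finish "allocating" the pages isolated by __isolate_free_page(): run the
      * post-allocation hook and split any high-order pages into order-0 pages
      * before putting them back on the list.
      */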
  69static void map_pages(struct list_head *list)
  70{
  71        unsigned int i, order, nr_pages;
  72        struct page *page, *next;
  73        LIST_HEAD(tmp_list);
  74
  75        list_for_each_entry_safe(page, next, list, lru) {
  76                list_del(&page->lru);
  77
  78                order = page_private(page);
  79                nr_pages = 1 << order;
  80
  81                post_alloc_hook(page, order, __GFP_MOVABLE);
  82                if (order)
  83                        split_page(page, order);
  84
  85                for (i = 0; i < nr_pages; i++) {
  86                        list_add(&page->lru, &tmp_list);
  87                        page++;
  88                }
  89        }
  90
  91        list_splice(&tmp_list, list);
  92}
  93
  94#ifdef CONFIG_COMPACTION
  95
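     /*
      * A locked page is "movable" (non-LRU migratable) when a driver has
      * registered, via __SetPageMovable(), an address_space whose a_ops
      * provide an isolate_page callback.
      */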
  96int PageMovable(struct page *page)
  97{
  98        struct address_space *mapping;
  99
 100        VM_BUG_ON_PAGE(!PageLocked(page), page);
 101        if (!__PageMovable(page))
 102                return 0;
 103
 104        mapping = page_mapping(page);
 105        if (mapping && mapping->a_ops && mapping->a_ops->isolate_page)
 106                return 1;
 107
 108        return 0;
 109}
 110EXPORT_SYMBOL(PageMovable);
 111
 112void __SetPageMovable(struct page *page, struct address_space *mapping)
 113{
 114        VM_BUG_ON_PAGE(!PageLocked(page), page);
 115        VM_BUG_ON_PAGE((unsigned long)mapping & PAGE_MAPPING_MOVABLE, page);
 116        page->mapping = (void *)((unsigned long)mapping | PAGE_MAPPING_MOVABLE);
 117}
 118EXPORT_SYMBOL(__SetPageMovable);
 119
 120void __ClearPageMovable(struct page *page)
 121{
 122        VM_BUG_ON_PAGE(!PageLocked(page), page);
 123        VM_BUG_ON_PAGE(!PageMovable(page), page);
 124        /*
  125         * Clear the registered address_space value but keep the PAGE_MAPPING_MOVABLE
  126         * flag, so that the VM can tell that the driver has released the page after
  127         * isolation and migration will not try to put it back.
 128         */
 129        page->mapping = (void *)((unsigned long)page->mapping &
 130                                PAGE_MAPPING_MOVABLE);
 131}
 132EXPORT_SYMBOL(__ClearPageMovable);
 133
 134/* Do not skip compaction more than 64 times */
 135#define COMPACT_MAX_DEFER_SHIFT 6
 136
 137/*
 138 * Compaction is deferred when compaction fails to result in a page
  139 * allocation success. 1 << compact_defer_shift compactions are skipped up
  140 * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT.
 141 */
 142void defer_compaction(struct zone *zone, int order)
 143{
 144        zone->compact_considered = 0;
 145        zone->compact_defer_shift++;
 146
 147        if (order < zone->compact_order_failed)
 148                zone->compact_order_failed = order;
 149
 150        if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
 151                zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
 152
 153        trace_mm_compaction_defer_compaction(zone, order);
 154}
 155
 156/* Returns true if compaction should be skipped this time */
 157bool compaction_deferred(struct zone *zone, int order)
 158{
 159        unsigned long defer_limit = 1UL << zone->compact_defer_shift;
 160
 161        if (order < zone->compact_order_failed)
 162                return false;
 163
 164        /* Avoid possible overflow */
 165        if (++zone->compact_considered > defer_limit)
 166                zone->compact_considered = defer_limit;
 167
 168        if (zone->compact_considered >= defer_limit)
 169                return false;
 170
 171        trace_mm_compaction_deferred(zone, order);
 172
 173        return true;
 174}
 175
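     /*
      * For example, after four consecutive failures at a given order,
      * compact_defer_shift is 4, so roughly the next 1 << 4 = 16 attempts are
      * deferred before compaction is tried again at that order.
      */
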
 176/*
 177 * Update defer tracking counters after successful compaction of given order,
 178 * which means an allocation either succeeded (alloc_success == true) or is
 179 * expected to succeed.
 180 */
 181void compaction_defer_reset(struct zone *zone, int order,
 182                bool alloc_success)
 183{
 184        if (alloc_success) {
 185                zone->compact_considered = 0;
 186                zone->compact_defer_shift = 0;
 187        }
 188        if (order >= zone->compact_order_failed)
 189                zone->compact_order_failed = order + 1;
 190
 191        trace_mm_compaction_defer_reset(zone, order);
 192}
 193
 194/* Returns true if restarting compaction after many failures */
 195bool compaction_restarting(struct zone *zone, int order)
 196{
 197        if (order < zone->compact_order_failed)
 198                return false;
 199
 200        return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
 201                zone->compact_considered >= 1UL << zone->compact_defer_shift;
 202}
 203
 204/* Returns true if the pageblock should be scanned for pages to isolate. */
 205static inline bool isolation_suitable(struct compact_control *cc,
 206                                        struct page *page)
 207{
 208        if (cc->ignore_skip_hint)
 209                return true;
 210
 211        return !get_pageblock_skip(page);
 212}
 213
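     /*
      * Reset the cached scanner positions: the migration scanners restart at
      * the start of the zone, the free scanner at the zone's last pageblock.
      */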
 214static void reset_cached_positions(struct zone *zone)
 215{
 216        zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn;
 217        zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn;
 218        zone->compact_cached_free_pfn =
 219                                pageblock_start_pfn(zone_end_pfn(zone) - 1);
 220}
 221
 222/*
  223 * Compound pages of >= pageblock_order should consistently be skipped until
 224 * released. It is always pointless to compact pages of such order (if they are
 225 * migratable), and the pageblocks they occupy cannot contain any free pages.
 226 */
 227static bool pageblock_skip_persistent(struct page *page)
 228{
 229        if (!PageCompound(page))
 230                return false;
 231
 232        page = compound_head(page);
 233
 234        if (compound_order(page) >= pageblock_order)
 235                return true;
 236
 237        return false;
 238}
 239
 240/*
 241 * This function is called to clear all cached information on pageblocks that
 242 * should be skipped for page isolation when the migrate and free page scanner
 243 * meet.
 244 */
 245static void __reset_isolation_suitable(struct zone *zone)
 246{
 247        unsigned long start_pfn = zone->zone_start_pfn;
 248        unsigned long end_pfn = zone_end_pfn(zone);
 249        unsigned long pfn;
 250
 251        zone->compact_blockskip_flush = false;
 252
 253        /* Walk the zone and mark every pageblock as suitable for isolation */
 254        for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
 255                struct page *page;
 256
 257                cond_resched();
 258
 259                page = pfn_to_online_page(pfn);
 260                if (!page)
 261                        continue;
 262                if (zone != page_zone(page))
 263                        continue;
 264                if (pageblock_skip_persistent(page))
 265                        continue;
 266
 267                clear_pageblock_skip(page);
 268        }
 269
 270        reset_cached_positions(zone);
 271}
 272
 273void reset_isolation_suitable(pg_data_t *pgdat)
 274{
 275        int zoneid;
 276
 277        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
 278                struct zone *zone = &pgdat->node_zones[zoneid];
 279                if (!populated_zone(zone))
 280                        continue;
 281
 282                /* Only flush if a full compaction finished recently */
 283                if (zone->compact_blockskip_flush)
 284                        __reset_isolation_suitable(zone);
 285        }
 286}
 287
 288/*
 289 * If no pages were isolated then mark this pageblock to be skipped in the
 290 * future. The information is later cleared by __reset_isolation_suitable().
 291 */
 292static void update_pageblock_skip(struct compact_control *cc,
 293                        struct page *page, unsigned long nr_isolated,
 294                        bool migrate_scanner)
 295{
 296        struct zone *zone = cc->zone;
 297        unsigned long pfn;
 298
 299        if (cc->no_set_skip_hint)
 300                return;
 301
 302        if (!page)
 303                return;
 304
 305        if (nr_isolated)
 306                return;
 307
 308        set_pageblock_skip(page);
 309
 310        pfn = page_to_pfn(page);
 311
 312        /* Update where async and sync compaction should restart */
 313        if (migrate_scanner) {
 314                if (pfn > zone->compact_cached_migrate_pfn[0])
 315                        zone->compact_cached_migrate_pfn[0] = pfn;
 316                if (cc->mode != MIGRATE_ASYNC &&
 317                    pfn > zone->compact_cached_migrate_pfn[1])
 318                        zone->compact_cached_migrate_pfn[1] = pfn;
 319        } else {
 320                if (pfn < zone->compact_cached_free_pfn)
 321                        zone->compact_cached_free_pfn = pfn;
 322        }
 323}
 324#else
 325static inline bool isolation_suitable(struct compact_control *cc,
 326                                        struct page *page)
 327{
 328        return true;
 329}
 330
 331static inline bool pageblock_skip_persistent(struct page *page)
 332{
 333        return false;
 334}
 335
 336static inline void update_pageblock_skip(struct compact_control *cc,
 337                        struct page *page, unsigned long nr_isolated,
 338                        bool migrate_scanner)
 339{
 340}
 341#endif /* CONFIG_COMPACTION */
 342
 343/*
 344 * Compaction requires the taking of some coarse locks that are potentially
 345 * very heavily contended. For async compaction, back out if the lock cannot
 346 * be taken immediately. For sync compaction, spin on the lock if needed.
 347 *
 348 * Returns true if the lock is held
 349 * Returns false if the lock is not held and compaction should abort
 350 */
 351static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
 352                                                struct compact_control *cc)
 353{
 354        if (cc->mode == MIGRATE_ASYNC) {
 355                if (!spin_trylock_irqsave(lock, *flags)) {
 356                        cc->contended = true;
 357                        return false;
 358                }
 359        } else {
 360                spin_lock_irqsave(lock, *flags);
 361        }
 362
 363        return true;
 364}
 365
 366/*
 367 * Compaction requires the taking of some coarse locks that are potentially
 368 * very heavily contended. The lock should be periodically unlocked to avoid
 369 * having disabled IRQs for a long time, even when there is nobody waiting on
 370 * the lock. It might also be that allowing the IRQs will result in
 371 * need_resched() becoming true. If scheduling is needed, async compaction
 372 * aborts. Sync compaction schedules.
 373 * Either compaction type will also abort if a fatal signal is pending.
 374 * In either case if the lock was locked, it is dropped and not regained.
 375 *
 376 * Returns true if compaction should abort due to fatal signal pending, or
 377 *              async compaction due to need_resched()
 378 * Returns false when compaction can continue (sync compaction might have
 379 *              scheduled)
 380 */
 381static bool compact_unlock_should_abort(spinlock_t *lock,
 382                unsigned long flags, bool *locked, struct compact_control *cc)
 383{
 384        if (*locked) {
 385                spin_unlock_irqrestore(lock, flags);
 386                *locked = false;
 387        }
 388
 389        if (fatal_signal_pending(current)) {
 390                cc->contended = true;
 391                return true;
 392        }
 393
 394        if (need_resched()) {
 395                if (cc->mode == MIGRATE_ASYNC) {
 396                        cc->contended = true;
 397                        return true;
 398                }
 399                cond_resched();
 400        }
 401
 402        return false;
 403}
 404
 405/*
 406 * Aside from avoiding lock contention, compaction also periodically checks
 407 * need_resched() and either schedules in sync compaction or aborts async
 408 * compaction. This is similar to what compact_unlock_should_abort() does, but
 409 * is used where no lock is concerned.
 410 *
 411 * Returns false when no scheduling was needed, or sync compaction scheduled.
 412 * Returns true when async compaction should abort.
 413 */
 414static inline bool compact_should_abort(struct compact_control *cc)
 415{
 416        /* async compaction aborts if contended */
 417        if (need_resched()) {
 418                if (cc->mode == MIGRATE_ASYNC) {
 419                        cc->contended = true;
 420                        return true;
 421                }
 422
 423                cond_resched();
 424        }
 425
 426        return false;
 427}
 428
 429/*
  430 * Isolate free pages onto a private freelist. If @strict is true, abort and
  431 * return 0 if any invalid PFNs or non-free pages are found inside the pageblock
 432 * (even though it may still end up isolating some pages).
 433 */
 434static unsigned long isolate_freepages_block(struct compact_control *cc,
 435                                unsigned long *start_pfn,
 436                                unsigned long end_pfn,
 437                                struct list_head *freelist,
 438                                bool strict)
 439{
 440        int nr_scanned = 0, total_isolated = 0;
 441        struct page *cursor, *valid_page = NULL;
 442        unsigned long flags = 0;
 443        bool locked = false;
 444        unsigned long blockpfn = *start_pfn;
 445        unsigned int order;
 446
 447        cursor = pfn_to_page(blockpfn);
 448
 449        /* Isolate free pages. */
 450        for (; blockpfn < end_pfn; blockpfn++, cursor++) {
 451                int isolated;
 452                struct page *page = cursor;
 453
 454                /*
 455                 * Periodically drop the lock (if held) regardless of its
 456                 * contention, to give chance to IRQs. Abort if fatal signal
 457                 * pending or async compaction detects need_resched()
 458                 */
 459                if (!(blockpfn % SWAP_CLUSTER_MAX)
 460                    && compact_unlock_should_abort(&cc->zone->lock, flags,
 461                                                                &locked, cc))
 462                        break;
 463
 464                nr_scanned++;
 465                if (!pfn_valid_within(blockpfn))
 466                        goto isolate_fail;
 467
 468                if (!valid_page)
 469                        valid_page = page;
 470
 471                /*
 472                 * For compound pages such as THP and hugetlbfs, we can save
 473                 * potentially a lot of iterations if we skip them at once.
 474                 * The check is racy, but we can consider only valid values
 475                 * and the only danger is skipping too much.
 476                 */
 477                if (PageCompound(page)) {
 478                        const unsigned int order = compound_order(page);
 479
 480                        if (likely(order < MAX_ORDER)) {
 481                                blockpfn += (1UL << order) - 1;
 482                                cursor += (1UL << order) - 1;
 483                        }
 484                        goto isolate_fail;
 485                }
 486
 487                if (!PageBuddy(page))
 488                        goto isolate_fail;
 489
 490                /*
 491                 * If we already hold the lock, we can skip some rechecking.
  492                 * Note that if we hold the lock now, the pageblock was already
  493                 * checked in some previous iteration (or strict is true),
 494                 * so it is correct to skip the suitable migration target
 495                 * recheck as well.
 496                 */
 497                if (!locked) {
 498                        /*
 499                         * The zone lock must be held to isolate freepages.
 500                         * Unfortunately this is a very coarse lock and can be
 501                         * heavily contended if there are parallel allocations
 502                         * or parallel compactions. For async compaction do not
 503                         * spin on the lock and we acquire the lock as late as
 504                         * possible.
 505                         */
 506                        locked = compact_trylock_irqsave(&cc->zone->lock,
 507                                                                &flags, cc);
 508                        if (!locked)
 509                                break;
 510
 511                        /* Recheck this is a buddy page under lock */
 512                        if (!PageBuddy(page))
 513                                goto isolate_fail;
 514                }
 515
 516                /* Found a free page, will break it into order-0 pages */
 517                order = page_order(page);
 518                isolated = __isolate_free_page(page, order);
 519                if (!isolated)
 520                        break;
 521                set_page_private(page, order);
 522
 523                total_isolated += isolated;
 524                cc->nr_freepages += isolated;
 525                list_add_tail(&page->lru, freelist);
 526
 527                if (!strict && cc->nr_migratepages <= cc->nr_freepages) {
 528                        blockpfn += isolated;
 529                        break;
 530                }
 531                /* Advance to the end of split page */
 532                blockpfn += isolated - 1;
 533                cursor += isolated - 1;
 534                continue;
 535
 536isolate_fail:
 537                if (strict)
 538                        break;
 539                else
 540                        continue;
 541
 542        }
 543
 544        if (locked)
 545                spin_unlock_irqrestore(&cc->zone->lock, flags);
 546
 547        /*
 548         * There is a tiny chance that we have read bogus compound_order(),
 549         * so be careful to not go outside of the pageblock.
 550         */
 551        if (unlikely(blockpfn > end_pfn))
 552                blockpfn = end_pfn;
 553
 554        trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
 555                                        nr_scanned, total_isolated);
 556
 557        /* Record how far we have got within the block */
 558        *start_pfn = blockpfn;
 559
 560        /*
 561         * If strict isolation is requested by CMA then check that all the
 562         * pages requested were isolated. If there were any failures, 0 is
 563         * returned and CMA will fail.
 564         */
 565        if (strict && blockpfn < end_pfn)
 566                total_isolated = 0;
 567
 568        /* Update the pageblock-skip if the whole pageblock was scanned */
 569        if (blockpfn == end_pfn)
 570                update_pageblock_skip(cc, valid_page, total_isolated, false);
 571
 572        cc->total_free_scanned += nr_scanned;
 573        if (total_isolated)
 574                count_compact_events(COMPACTISOLATED, total_isolated);
 575        return total_isolated;
 576}
 577
 578/**
 579 * isolate_freepages_range() - isolate free pages.
 580 * @cc:        Compaction control structure.
 581 * @start_pfn: The first PFN to start isolating.
 582 * @end_pfn:   The one-past-last PFN.
 583 *
 584 * Non-free pages, invalid PFNs, or zone boundaries within the
  585 * [start_pfn, end_pfn) range are considered errors and cause the function to
  586 * undo its actions and return zero.
  587 *
  588 * Otherwise, the function returns the one-past-the-last PFN of the isolated
  589 * pages (which may be greater than end_pfn if the end fell in the middle of
  590 * a free page).
 591 */
 592unsigned long
 593isolate_freepages_range(struct compact_control *cc,
 594                        unsigned long start_pfn, unsigned long end_pfn)
 595{
 596        unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
 597        LIST_HEAD(freelist);
 598
 599        pfn = start_pfn;
 600        block_start_pfn = pageblock_start_pfn(pfn);
 601        if (block_start_pfn < cc->zone->zone_start_pfn)
 602                block_start_pfn = cc->zone->zone_start_pfn;
 603        block_end_pfn = pageblock_end_pfn(pfn);
 604
 605        for (; pfn < end_pfn; pfn += isolated,
 606                                block_start_pfn = block_end_pfn,
 607                                block_end_pfn += pageblock_nr_pages) {
 608                /* Protect pfn from changing by isolate_freepages_block */
 609                unsigned long isolate_start_pfn = pfn;
 610
 611                block_end_pfn = min(block_end_pfn, end_pfn);
 612
 613                /*
  614                 * pfn could advance past block_end_pfn if an isolated free
  615                 * page is larger than a pageblock. In that case, adjust the
  616                 * scanning range to the block that now contains pfn.
 617                 */
 618                if (pfn >= block_end_pfn) {
 619                        block_start_pfn = pageblock_start_pfn(pfn);
 620                        block_end_pfn = pageblock_end_pfn(pfn);
 621                        block_end_pfn = min(block_end_pfn, end_pfn);
 622                }
 623
 624                if (!pageblock_pfn_to_page(block_start_pfn,
 625                                        block_end_pfn, cc->zone))
 626                        break;
 627
 628                isolated = isolate_freepages_block(cc, &isolate_start_pfn,
 629                                                block_end_pfn, &freelist, true);
 630
 631                /*
 632                 * In strict mode, isolate_freepages_block() returns 0 if
 633                 * there are any holes in the block (ie. invalid PFNs or
 634                 * non-free pages).
 635                 */
 636                if (!isolated)
 637                        break;
 638
 639                /*
  640                 * If we managed to isolate pages, the count is always (1 << n) *
  641                 * pageblock_nr_pages for some non-negative n. (A max order
  642                 * page may span two pageblocks.)
 643                 */
 644        }
 645
 646        /* __isolate_free_page() does not map the pages */
 647        map_pages(&freelist);
 648
 649        if (pfn < end_pfn) {
 650                /* Loop terminated early, cleanup. */
 651                release_freepages(&freelist);
 652                return 0;
 653        }
 654
 655        /* We don't use freelists for anything. */
 656        return pfn;
 657}
 658
 659/* Similar to reclaim, but different enough that they don't share logic */
 660static bool too_many_isolated(struct zone *zone)
 661{
 662        unsigned long active, inactive, isolated;
 663
 664        inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
 665                        node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
 666        active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
 667                        node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
 668        isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
 669                        node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
 670
 671        return isolated > (inactive + active) / 2;
 672}
 673
 674/**
 675 * isolate_migratepages_block() - isolate all migrate-able pages within
 676 *                                a single pageblock
 677 * @cc:         Compaction control structure.
 678 * @low_pfn:    The first PFN to isolate
 679 * @end_pfn:    The one-past-the-last PFN to isolate, within same pageblock
 680 * @isolate_mode: Isolation mode to be used.
 681 *
 682 * Isolate all pages that can be migrated from the range specified by
  683 * [low_pfn, end_pfn). The range is expected to be within the same pageblock.
  684 * Returns zero if there is a fatal signal pending, otherwise the PFN of the
  685 * first page that was not scanned (which may be less than, equal to, or
  686 * greater than end_pfn).
 687 *
 688 * The pages are isolated on cc->migratepages list (not required to be empty),
 689 * and cc->nr_migratepages is updated accordingly. The cc->migrate_pfn field
 690 * is neither read nor updated.
 691 */
 692static unsigned long
 693isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 694                        unsigned long end_pfn, isolate_mode_t isolate_mode)
 695{
 696        struct zone *zone = cc->zone;
 697        unsigned long nr_scanned = 0, nr_isolated = 0;
 698        struct lruvec *lruvec;
 699        unsigned long flags = 0;
 700        bool locked = false;
 701        struct page *page = NULL, *valid_page = NULL;
 702        unsigned long start_pfn = low_pfn;
 703        bool skip_on_failure = false;
 704        unsigned long next_skip_pfn = 0;
 705
 706        /*
 707         * Ensure that there are not too many pages isolated from the LRU
 708         * list by either parallel reclaimers or compaction. If there are,
 709         * delay for some time until fewer pages are isolated
 710         */
 711        while (unlikely(too_many_isolated(zone))) {
 712                /* async migration should just abort */
 713                if (cc->mode == MIGRATE_ASYNC)
 714                        return 0;
 715
 716                congestion_wait(BLK_RW_ASYNC, HZ/10);
 717
 718                if (fatal_signal_pending(current))
 719                        return 0;
 720        }
 721
 722        if (compact_should_abort(cc))
 723                return 0;
 724
 725        if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
 726                skip_on_failure = true;
 727                next_skip_pfn = block_end_pfn(low_pfn, cc->order);
 728        }
 729
 730        /* Time to isolate some pages for migration */
 731        for (; low_pfn < end_pfn; low_pfn++) {
 732
 733                if (skip_on_failure && low_pfn >= next_skip_pfn) {
 734                        /*
 735                         * We have isolated all migration candidates in the
 736                         * previous order-aligned block, and did not skip it due
 737                         * to failure. We should migrate the pages now and
 738                         * hopefully succeed compaction.
 739                         */
 740                        if (nr_isolated)
 741                                break;
 742
 743                        /*
 744                         * We failed to isolate in the previous order-aligned
 745                         * block. Set the new boundary to the end of the
 746                         * current block. Note we can't simply increase
 747                         * next_skip_pfn by 1 << order, as low_pfn might have
 748                         * been incremented by a higher number due to skipping
 749                         * a compound or a high-order buddy page in the
 750                         * previous loop iteration.
 751                         */
 752                        next_skip_pfn = block_end_pfn(low_pfn, cc->order);
 753                }
 754
 755                /*
 756                 * Periodically drop the lock (if held) regardless of its
 757                 * contention, to give chance to IRQs. Abort async compaction
 758                 * if contended.
 759                 */
 760                if (!(low_pfn % SWAP_CLUSTER_MAX)
 761                    && compact_unlock_should_abort(zone_lru_lock(zone), flags,
 762                                                                &locked, cc))
 763                        break;
 764
 765                if (!pfn_valid_within(low_pfn))
 766                        goto isolate_fail;
 767                nr_scanned++;
 768
 769                page = pfn_to_page(low_pfn);
 770
 771                if (!valid_page)
 772                        valid_page = page;
 773
 774                /*
 775                 * Skip if free. We read page order here without zone lock
 776                 * which is generally unsafe, but the race window is small and
 777                 * the worst thing that can happen is that we skip some
 778                 * potential isolation targets.
 779                 */
 780                if (PageBuddy(page)) {
 781                        unsigned long freepage_order = page_order_unsafe(page);
 782
 783                        /*
 784                         * Without lock, we cannot be sure that what we got is
 785                         * a valid page order. Consider only values in the
 786                         * valid order range to prevent low_pfn overflow.
 787                         */
 788                        if (freepage_order > 0 && freepage_order < MAX_ORDER)
 789                                low_pfn += (1UL << freepage_order) - 1;
 790                        continue;
 791                }
 792
 793                /*
 794                 * Regardless of being on LRU, compound pages such as THP and
 795                 * hugetlbfs are not to be compacted. We can potentially save
 796                 * a lot of iterations if we skip them at once. The check is
 797                 * racy, but we can consider only valid values and the only
 798                 * danger is skipping too much.
 799                 */
 800                if (PageCompound(page)) {
 801                        const unsigned int order = compound_order(page);
 802
 803                        if (likely(order < MAX_ORDER))
 804                                low_pfn += (1UL << order) - 1;
 805                        goto isolate_fail;
 806                }
 807
 808                /*
 809                 * Check may be lockless but that's ok as we recheck later.
 810                 * It's possible to migrate LRU and non-lru movable pages.
 811                 * Skip any other type of page
 812                 */
 813                if (!PageLRU(page)) {
 814                        /*
 815                         * __PageMovable can return false positive so we need
 816                         * to verify it under page_lock.
 817                         */
 818                        if (unlikely(__PageMovable(page)) &&
 819                                        !PageIsolated(page)) {
 820                                if (locked) {
 821                                        spin_unlock_irqrestore(zone_lru_lock(zone),
 822                                                                        flags);
 823                                        locked = false;
 824                                }
 825
 826                                if (!isolate_movable_page(page, isolate_mode))
 827                                        goto isolate_success;
 828                        }
 829
 830                        goto isolate_fail;
 831                }
 832
 833                /*
 834                 * Migration will fail if an anonymous page is pinned in memory,
 835                 * so avoid taking lru_lock and isolating it unnecessarily in an
 836                 * admittedly racy check.
 837                 */
 838                if (!page_mapping(page) &&
 839                    page_count(page) > page_mapcount(page))
 840                        goto isolate_fail;
 841
 842                /*
 843                 * Only allow to migrate anonymous pages in GFP_NOFS context
 844                 * because those do not depend on fs locks.
 845                 */
 846                if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
 847                        goto isolate_fail;
 848
 849                /* If we already hold the lock, we can skip some rechecking */
 850                if (!locked) {
 851                        locked = compact_trylock_irqsave(zone_lru_lock(zone),
 852                                                                &flags, cc);
 853                        if (!locked)
 854                                break;
 855
 856                        /* Recheck PageLRU and PageCompound under lock */
 857                        if (!PageLRU(page))
 858                                goto isolate_fail;
 859
 860                        /*
  861                         * The page became compound since the non-locked check,
 862                         * and it's on LRU. It can only be a THP so the order
 863                         * is safe to read and it's 0 for tail pages.
 864                         */
 865                        if (unlikely(PageCompound(page))) {
 866                                low_pfn += (1UL << compound_order(page)) - 1;
 867                                goto isolate_fail;
 868                        }
 869                }
 870
 871                lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
 872
 873                /* Try isolate the page */
 874                if (__isolate_lru_page(page, isolate_mode) != 0)
 875                        goto isolate_fail;
 876
 877                VM_BUG_ON_PAGE(PageCompound(page), page);
 878
 879                /* Successfully isolated */
 880                del_page_from_lru_list(page, lruvec, page_lru(page));
 881                inc_node_page_state(page,
 882                                NR_ISOLATED_ANON + page_is_file_cache(page));
 883
 884isolate_success:
 885                list_add(&page->lru, &cc->migratepages);
 886                cc->nr_migratepages++;
 887                nr_isolated++;
 888
 889                /*
  890                 * Record where pages could be freed by migration but have not
  891                 * yet been flushed to the buddy allocator: this is the lowest
  892                 * PFN that was isolated and is therefore likely to be freed by
  893                 * migration first.
 894                 */
 895                if (!cc->last_migrated_pfn)
 896                        cc->last_migrated_pfn = low_pfn;
 897
 898                /* Avoid isolating too much */
 899                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
 900                        ++low_pfn;
 901                        break;
 902                }
 903
 904                continue;
 905isolate_fail:
 906                if (!skip_on_failure)
 907                        continue;
 908
 909                /*
 910                 * We have isolated some pages, but then failed. Release them
 911                 * instead of migrating, as we cannot form the cc->order buddy
 912                 * page anyway.
 913                 */
 914                if (nr_isolated) {
 915                        if (locked) {
 916                                spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 917                                locked = false;
 918                        }
 919                        putback_movable_pages(&cc->migratepages);
 920                        cc->nr_migratepages = 0;
 921                        cc->last_migrated_pfn = 0;
 922                        nr_isolated = 0;
 923                }
 924
 925                if (low_pfn < next_skip_pfn) {
 926                        low_pfn = next_skip_pfn - 1;
 927                        /*
 928                         * The check near the loop beginning would have updated
 929                         * next_skip_pfn too, but this is a bit simpler.
 930                         */
 931                        next_skip_pfn += 1UL << cc->order;
 932                }
 933        }
 934
 935        /*
 936         * The PageBuddy() check could have potentially brought us outside
 937         * the range to be scanned.
 938         */
 939        if (unlikely(low_pfn > end_pfn))
 940                low_pfn = end_pfn;
 941
 942        if (locked)
 943                spin_unlock_irqrestore(zone_lru_lock(zone), flags);
 944
 945        /*
 946         * Update the pageblock-skip information and cached scanner pfn,
 947         * if the whole pageblock was scanned without isolating any page.
 948         */
 949        if (low_pfn == end_pfn)
 950                update_pageblock_skip(cc, valid_page, nr_isolated, true);
 951
 952        trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
 953                                                nr_scanned, nr_isolated);
 954
 955        cc->total_migrate_scanned += nr_scanned;
 956        if (nr_isolated)
 957                count_compact_events(COMPACTISOLATED, nr_isolated);
 958
 959        return low_pfn;
 960}
 961
 962/**
 963 * isolate_migratepages_range() - isolate migrate-able pages in a PFN range
 964 * @cc:        Compaction control structure.
 965 * @start_pfn: The first PFN to start isolating.
 966 * @end_pfn:   The one-past-last PFN.
 967 *
 968 * Returns zero if isolation fails fatally due to e.g. pending signal.
 969 * Otherwise, function returns one-past-the-last PFN of isolated page
  970 * (which may be greater than end_pfn if the end fell in the middle of a THP).
 971 */
 972unsigned long
 973isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn,
 974                                                        unsigned long end_pfn)
 975{
 976        unsigned long pfn, block_start_pfn, block_end_pfn;
 977
 978        /* Scan block by block. First and last block may be incomplete */
 979        pfn = start_pfn;
 980        block_start_pfn = pageblock_start_pfn(pfn);
 981        if (block_start_pfn < cc->zone->zone_start_pfn)
 982                block_start_pfn = cc->zone->zone_start_pfn;
 983        block_end_pfn = pageblock_end_pfn(pfn);
 984
 985        for (; pfn < end_pfn; pfn = block_end_pfn,
 986                                block_start_pfn = block_end_pfn,
 987                                block_end_pfn += pageblock_nr_pages) {
 988
 989                block_end_pfn = min(block_end_pfn, end_pfn);
 990
 991                if (!pageblock_pfn_to_page(block_start_pfn,
 992                                        block_end_pfn, cc->zone))
 993                        continue;
 994
 995                pfn = isolate_migratepages_block(cc, pfn, block_end_pfn,
 996                                                        ISOLATE_UNEVICTABLE);
 997
 998                if (!pfn)
 999                        break;
1000
1001                if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
1002                        break;
1003        }
1004
1005        return pfn;
1006}
1007
1008#endif /* CONFIG_COMPACTION || CONFIG_CMA */
1009#ifdef CONFIG_COMPACTION
1010
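     /*
      * Async direct compaction only takes migration candidates from pageblocks
      * of the requested migratetype (any movable block for MOVABLE requests);
      * all other compaction modes may use any pageblock as a source.
      */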
1011static bool suitable_migration_source(struct compact_control *cc,
1012                                                        struct page *page)
1013{
1014        int block_mt;
1015
1016        if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
1017                return true;
1018
1019        block_mt = get_pageblock_migratetype(page);
1020
1021        if (cc->migratetype == MIGRATE_MOVABLE)
1022                return is_migrate_movable(block_mt);
1023        else
1024                return block_mt == cc->migratetype;
1025}
1026
1027/* Returns true if the page is within a block suitable for migration to */
1028static bool suitable_migration_target(struct compact_control *cc,
1029                                                        struct page *page)
1030{
1031        /* If the page is a large free page, then disallow migration */
1032        if (PageBuddy(page)) {
1033                /*
1034                 * We are checking page_order without zone->lock taken. But
1035                 * the only small danger is that we skip a potentially suitable
 1036                 * pageblock, so it's not worth checking that the order is valid.
1037                 */
1038                if (page_order_unsafe(page) >= pageblock_order)
1039                        return false;
1040        }
1041
1042        if (cc->ignore_block_suitable)
1043                return true;
1044
1045        /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */
1046        if (is_migrate_movable(get_pageblock_migratetype(page)))
1047                return true;
1048
1049        /* Otherwise skip the block */
1050        return false;
1051}
1052
1053/*
1054 * Test whether the free scanner has reached the same or lower pageblock than
1055 * the migration scanner, and compaction should thus terminate.
1056 */
1057static inline bool compact_scanners_met(struct compact_control *cc)
1058{
1059        return (cc->free_pfn >> pageblock_order)
1060                <= (cc->migrate_pfn >> pageblock_order);
1061}
1062
1063/*
1064 * Based on information in the current compact_control, find blocks
1065 * suitable for isolating free pages from and then isolate them.
1066 */
1067static void isolate_freepages(struct compact_control *cc)
1068{
1069        struct zone *zone = cc->zone;
1070        struct page *page;
1071        unsigned long block_start_pfn;  /* start of current pageblock */
1072        unsigned long isolate_start_pfn; /* exact pfn we start at */
1073        unsigned long block_end_pfn;    /* end of current pageblock */
1074        unsigned long low_pfn;       /* lowest pfn scanner is able to scan */
1075        struct list_head *freelist = &cc->freepages;
1076
1077        /*
1078         * Initialise the free scanner. The starting point is where we last
1079         * successfully isolated from, zone-cached value, or the end of the
1080         * zone when isolating for the first time. For looping we also need
1081         * this pfn aligned down to the pageblock boundary, because we do
1082         * block_start_pfn -= pageblock_nr_pages in the for loop.
 1083         * For the ending point, take care when isolating in the last pageblock
 1084         * of a zone which ends in the middle of a pageblock.
1085         * The low boundary is the end of the pageblock the migration scanner
1086         * is using.
1087         */
1088        isolate_start_pfn = cc->free_pfn;
1089        block_start_pfn = pageblock_start_pfn(cc->free_pfn);
1090        block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
1091                                                zone_end_pfn(zone));
1092        low_pfn = pageblock_end_pfn(cc->migrate_pfn);
1093
1094        /*
1095         * Isolate free pages until enough are available to migrate the
1096         * pages on cc->migratepages. We stop searching if the migrate
1097         * and free page scanners meet or enough free pages are isolated.
1098         */
1099        for (; block_start_pfn >= low_pfn;
1100                                block_end_pfn = block_start_pfn,
1101                                block_start_pfn -= pageblock_nr_pages,
1102                                isolate_start_pfn = block_start_pfn) {
1103                /*
1104                 * This can iterate a massively long zone without finding any
1105                 * suitable migration targets, so periodically check if we need
1106                 * to schedule, or even abort async compaction.
1107                 */
1108                if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1109                                                && compact_should_abort(cc))
1110                        break;
1111
1112                page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1113                                                                        zone);
1114                if (!page)
1115                        continue;
1116
1117                /* Check the block is suitable for migration */
1118                if (!suitable_migration_target(cc, page))
1119                        continue;
1120
1121                /* If isolation recently failed, do not retry */
1122                if (!isolation_suitable(cc, page))
1123                        continue;
1124
1125                /* Found a block suitable for isolating free pages from. */
1126                isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
1127                                        freelist, false);
1128
1129                /*
1130                 * If we isolated enough freepages, or aborted due to lock
1131                 * contention, terminate.
1132                 */
1133                if ((cc->nr_freepages >= cc->nr_migratepages)
1134                                                        || cc->contended) {
1135                        if (isolate_start_pfn >= block_end_pfn) {
1136                                /*
1137                                 * Restart at previous pageblock if more
1138                                 * freepages can be isolated next time.
1139                                 */
1140                                isolate_start_pfn =
1141                                        block_start_pfn - pageblock_nr_pages;
1142                        }
1143                        break;
1144                } else if (isolate_start_pfn < block_end_pfn) {
1145                        /*
1146                         * If isolation failed early, do not continue
1147                         * needlessly.
1148                         */
1149                        break;
1150                }
1151        }
1152
1153        /* __isolate_free_page() does not map the pages */
1154        map_pages(freelist);
1155
1156        /*
1157         * Record where the free scanner will restart next time. Either we
1158         * broke from the loop and set isolate_start_pfn based on the last
1159         * call to isolate_freepages_block(), or we met the migration scanner
1160         * and the loop terminated due to isolate_start_pfn < low_pfn
1161         */
1162        cc->free_pfn = isolate_start_pfn;
1163}
1164
1165/*
1166 * This is a migrate-callback that "allocates" freepages by taking pages
1167 * from the isolated freelists in the block we are migrating to.
1168 */
1169static struct page *compaction_alloc(struct page *migratepage,
1170                                        unsigned long data)
1171{
1172        struct compact_control *cc = (struct compact_control *)data;
1173        struct page *freepage;
1174
1175        /*
1176         * Isolate free pages if necessary, and if we are not aborting due to
1177         * contention.
1178         */
1179        if (list_empty(&cc->freepages)) {
1180                if (!cc->contended)
1181                        isolate_freepages(cc);
1182
1183                if (list_empty(&cc->freepages))
1184                        return NULL;
1185        }
1186
1187        freepage = list_entry(cc->freepages.next, struct page, lru);
1188        list_del(&freepage->lru);
1189        cc->nr_freepages--;
1190
1191        return freepage;
1192}
1193
1194/*
1195 * This is a migrate-callback that "frees" freepages back to the isolated
1196 * freelist.  All pages on the freelist are from the same zone, so there is no
1197 * special handling needed for NUMA.
1198 */
1199static void compaction_free(struct page *page, unsigned long data)
1200{
1201        struct compact_control *cc = (struct compact_control *)data;
1202
1203        list_add(&page->lru, &cc->freepages);
1204        cc->nr_freepages++;
1205}
1206
1207/* possible outcome of isolate_migratepages */
1208typedef enum {
1209        ISOLATE_ABORT,          /* Abort compaction now */
1210        ISOLATE_NONE,           /* No pages isolated, continue scanning */
1211        ISOLATE_SUCCESS,        /* Pages isolated, migrate */
1212} isolate_migrate_t;
1213
1214/*
1215 * Allow userspace to control policy on scanning the unevictable LRU for
1216 * compactable pages.
1217 */
1218int sysctl_compact_unevictable_allowed __read_mostly = 1;
1219
1220/*
1221 * Isolate all pages that can be migrated from the first suitable block,
1222 * starting at the block pointed to by the migrate scanner pfn within
1223 * compact_control.
1224 */
1225static isolate_migrate_t isolate_migratepages(struct zone *zone,
1226                                        struct compact_control *cc)
1227{
1228        unsigned long block_start_pfn;
1229        unsigned long block_end_pfn;
1230        unsigned long low_pfn;
1231        struct page *page;
1232        const isolate_mode_t isolate_mode =
1233                (sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
1234                (cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
1235
1236        /*
1237         * Start at where we last stopped, or beginning of the zone as
1238         * initialized by compact_zone()
1239         */
1240        low_pfn = cc->migrate_pfn;
1241        block_start_pfn = pageblock_start_pfn(low_pfn);
1242        if (block_start_pfn < zone->zone_start_pfn)
1243                block_start_pfn = zone->zone_start_pfn;
1244
1245        /* Only scan within a pageblock boundary */
1246        block_end_pfn = pageblock_end_pfn(low_pfn);
1247
1248        /*
1249         * Iterate over whole pageblocks until we find the first suitable.
1250         * Do not cross the free scanner.
1251         */
1252        for (; block_end_pfn <= cc->free_pfn;
1253                        low_pfn = block_end_pfn,
1254                        block_start_pfn = block_end_pfn,
1255                        block_end_pfn += pageblock_nr_pages) {
1256
1257                /*
1258                 * This can potentially iterate a massively long zone with
1259                 * many pageblocks unsuitable, so periodically check if we
1260                 * need to schedule, or even abort async compaction.
1261                 */
1262                if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
1263                                                && compact_should_abort(cc))
1264                        break;
1265
1266                page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
1267                                                                        zone);
1268                if (!page)
1269                        continue;
1270
1271                /* If isolation recently failed, do not retry */
1272                if (!isolation_suitable(cc, page))
1273                        continue;
1274
1275                /*
1276                 * For async compaction, also only scan in MOVABLE blocks.
1277                 * Async compaction is optimistic to see if the minimum amount
1278                 * of work satisfies the allocation.
1279                 */
1280                if (!suitable_migration_source(cc, page))
1281                        continue;
1282
1283                /* Perform the isolation */
1284                low_pfn = isolate_migratepages_block(cc, low_pfn,
1285                                                block_end_pfn, isolate_mode);
1286
1287                if (!low_pfn || cc->contended)
1288                        return ISOLATE_ABORT;
1289
1290                /*
 1291                 * Either we isolated something and can proceed with migration,
 1292                 * or we failed and compact_zone() should decide whether to
 1293                 * continue or not.
1294                 */
1295                break;
1296        }
1297
1298        /* Record where migration scanner will be restarted. */
1299        cc->migrate_pfn = low_pfn;
1300
1301        return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
1302}
1303
1304/*
1305 * order == -1 is expected when compacting via
1306 * /proc/sys/vm/compact_memory
1307 */
1308static inline bool is_via_compact_memory(int order)
1309{
1310        return order == -1;
1311}
1312
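     /*
      * Decide whether this compaction run can stop: either the scanners have
      * met, or a page of the requested order and migratetype (or a suitable
      * fallback) is now available on the free lists.
      */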
1313static enum compact_result __compact_finished(struct zone *zone,
1314                                                struct compact_control *cc)
1315{
1316        unsigned int order;
1317        const int migratetype = cc->migratetype;
1318
1319        if (cc->contended || fatal_signal_pending(current))
1320                return COMPACT_CONTENDED;
1321
1322        /* Compaction run completes if the migrate and free scanner meet */
1323        if (compact_scanners_met(cc)) {
1324                /* Let the next compaction start anew. */
1325                reset_cached_positions(zone);
1326
1327                /*
1328                 * Mark that the PG_migrate_skip information should be cleared
1329                 * by kswapd when it goes to sleep. kcompactd does not set the
1330                 * flag itself as the decision to be clear should be directly
 1331                 * flag itself as the decision to clear it should be directly
1332                 */
1333                if (cc->direct_compaction)
1334                        zone->compact_blockskip_flush = true;
1335
1336                if (cc->whole_zone)
1337                        return COMPACT_COMPLETE;
1338                else
1339                        return COMPACT_PARTIAL_SKIPPED;
1340        }
1341
1342        if (is_via_compact_memory(cc->order))
1343                return COMPACT_CONTINUE;
1344
1345        if (cc->finishing_block) {
1346                /*
1347                 * We have finished the pageblock, but better check again that
1348                 * we really succeeded.
1349                 */
1350                if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
1351                        cc->finishing_block = false;
1352                else
1353                        return COMPACT_CONTINUE;
1354        }
1355
1356        /* Direct compactor: Is a suitable page free? */
1357        for (order = cc->order; order < MAX_ORDER; order++) {
1358                struct free_area *area = &zone->free_area[order];
1359                bool can_steal;
1360
1361                /* Job done if page is free of the right migratetype */
1362                if (!list_empty(&area->free_list[migratetype]))
1363                        return COMPACT_SUCCESS;
1364
1365#ifdef CONFIG_CMA
1366                /* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
1367                if (migratetype == MIGRATE_MOVABLE &&
1368                        !list_empty(&area->free_list[MIGRATE_CMA]))
1369                        return COMPACT_SUCCESS;
1370#endif
1371                /*
1372                 * Job done if allocation would steal freepages from
1373                 * other migratetype buddy lists.
1374                 */
1375                if (find_suitable_fallback(area, order, migratetype,
1376                                                true, &can_steal) != -1) {
1377
1378                        /* movable pages are OK in any pageblock */
1379                        if (migratetype == MIGRATE_MOVABLE)
1380                                return COMPACT_SUCCESS;
1381
1382                        /*
1383                         * We are stealing for a non-movable allocation. Make
1384                         * sure we finish compacting the current pageblock
1385                         * first so it is as free as possible and we won't
1386                         * have to steal another one soon. This only applies
1387                         * to sync compaction, as async compaction operates
1388                         * on pageblocks of the same migratetype.
1389                         */
1390                        if (cc->mode == MIGRATE_ASYNC ||
1391                                        IS_ALIGNED(cc->migrate_pfn,
1392                                                        pageblock_nr_pages)) {
1393                                return COMPACT_SUCCESS;
1394                        }
1395
1396                        cc->finishing_block = true;
1397                        return COMPACT_CONTINUE;
1398                }
1399        }
1400
1401        return COMPACT_NO_SUITABLE_PAGE;
1402}
1403
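/*
 * Wrapper around __compact_finished() that emits the tracepoint and hides
 * COMPACT_NO_SUITABLE_PAGE from callers; that state is reported only via
 * the tracepoint and otherwise treated as COMPACT_CONTINUE.
 */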
1404static enum compact_result compact_finished(struct zone *zone,
1405                        struct compact_control *cc)
1406{
1407        int ret;
1408
1409        ret = __compact_finished(zone, cc);
1410        trace_mm_compaction_finished(zone, cc->order, ret);
1411        if (ret == COMPACT_NO_SUITABLE_PAGE)
1412                ret = COMPACT_CONTINUE;
1413
1414        return ret;
1415}
1416
1417/*
1418 * compaction_suitable: Is this suitable to run compaction on this zone now?
1419 * Returns
1420 *   COMPACT_SKIPPED  - If there are too few free pages for compaction
1421 *   COMPACT_SUCCESS  - If the allocation would succeed without compaction
1422 *   COMPACT_CONTINUE - If compaction should run now
1423 */
1424static enum compact_result __compaction_suitable(struct zone *zone, int order,
1425                                        unsigned int alloc_flags,
1426                                        int classzone_idx,
1427                                        unsigned long wmark_target)
1428{
1429        unsigned long watermark;
1430
1431        if (is_via_compact_memory(order))
1432                return COMPACT_CONTINUE;
1433
1434        watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
1435        /*
1436         * If watermarks for high-order allocation are already met, there
1437         * should be no need for compaction at all.
1438         */
1439        if (zone_watermark_ok(zone, order, watermark, classzone_idx,
1440                                                                alloc_flags))
1441                return COMPACT_SUCCESS;
1442
1443        /*
1444         * Watermarks for order-0 must be met for compaction to be able to
1445         * isolate free pages for migration targets. This means that the
1446         * watermark and alloc_flags have to match, or be more pessimistic than
1447         * the check in __isolate_free_page(). We don't use the direct
1448         * compactor's alloc_flags, as they are not relevant for freepage
1449         * isolation. We however do use the direct compactor's classzone_idx to
1450         * skip over zones where lowmem reserves would prevent allocation even
1451         * if compaction succeeds.
1452         * For costly orders, we require low watermark instead of min for
1453         * compaction to proceed to increase its chances.
1454         * ALLOC_CMA is used, as pages in CMA pageblocks are considered
1455         * suitable migration targets
1456         */
1457        watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
1458                                low_wmark_pages(zone) : min_wmark_pages(zone);
1459        watermark += compact_gap(order);
1460        if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
1461                                                ALLOC_CMA, wmark_target))
1462                return COMPACT_SKIPPED;
1463
1464        return COMPACT_CONTINUE;
1465}
1466
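/*
 * Wrapper around __compaction_suitable() that additionally consults the
 * fragmentation index for costly orders, so compaction is skipped when an
 * allocation failure is more likely due to low memory than to external
 * fragmentation.
 */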
1467enum compact_result compaction_suitable(struct zone *zone, int order,
1468                                        unsigned int alloc_flags,
1469                                        int classzone_idx)
1470{
1471        enum compact_result ret;
1472        int fragindex;
1473
1474        ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
1475                                    zone_page_state(zone, NR_FREE_PAGES));
1476        /*
1477         * fragmentation index determines if allocation failures are due to
1478         * low memory or external fragmentation
1479         *
1480         * index of -1000 would imply allocations might succeed depending on
1481         * watermarks, but we already failed the high-order watermark check
1482         * index towards 0 implies failure is due to lack of memory
1483         * index towards 1000 implies failure is due to fragmentation
1484         *
1485         * Only compact if a failure would be due to fragmentation. Also
1486         * ignore fragindex for non-costly orders where the alternative to
1487         * a successful reclaim/compaction is OOM. Fragindex and the
1488         * vm.extfrag_threshold sysctl are meant as a heuristic to prevent
1489         * excessive compaction for costly orders, but not at the expense of
1490         * system stability.
1491         */
1492        if (ret == COMPACT_CONTINUE && (order > PAGE_ALLOC_COSTLY_ORDER)) {
1493                fragindex = fragmentation_index(zone, order);
1494                if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
1495                        ret = COMPACT_NOT_SUITABLE_ZONE;
1496        }
1497
1498        trace_mm_compaction_suitable(zone, order, ret);
1499        if (ret == COMPACT_NOT_SUITABLE_ZONE)
1500                ret = COMPACT_SKIPPED;
1501
1502        return ret;
1503}
1504
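/*
 * Check whether compaction could plausibly succeed in at least one zone of
 * the allocation context if reclaim keeps making progress. Only a fraction
 * of the reclaimable pages (scaled down by the requested order) is added to
 * the current free pages when re-running the watermark check in
 * __compaction_suitable().
 */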
1505bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
1506                int alloc_flags)
1507{
1508        struct zone *zone;
1509        struct zoneref *z;
1510
1511        /*
1512         * Make sure at least one zone would pass __compaction_suitable if we continue
1513         * retrying the reclaim.
1514         */
1515        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1516                                        ac->nodemask) {
1517                unsigned long available;
1518                enum compact_result compact_result;
1519
1520                /*
1521                 * Do not consider all the reclaimable memory because we do not
1522                 * want to thrash just for a single high-order allocation which
1523                 * is not even guaranteed to appear even if __compaction_suitable
1524                 * is happy about the watermark check.
1525                 */
1526                available = zone_reclaimable_pages(zone) / order;
1527                available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
1528                compact_result = __compaction_suitable(zone, order, alloc_flags,
1529                                ac_classzone_idx(ac), available);
1530                if (compact_result != COMPACT_SKIPPED)
1531                        return true;
1532        }
1533
1534        return false;
1535}
1536
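/*
 * Compact a single zone: position the migration and free scanners (from the
 * cached pfns unless a whole-zone run was requested), then repeatedly
 * isolate and migrate pages until compact_finished() reports a terminal
 * state or the run is aborted due to contention.
 */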
1537static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
1538{
1539        enum compact_result ret;
1540        unsigned long start_pfn = zone->zone_start_pfn;
1541        unsigned long end_pfn = zone_end_pfn(zone);
1542        const bool sync = cc->mode != MIGRATE_ASYNC;
1543
1544        cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
1545        ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
1546                                                        cc->classzone_idx);
1547        /* Compaction is likely to fail */
1548        if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
1549                return ret;
1550
1551        /* huh, compaction_suitable is returning something unexpected */
1552        VM_BUG_ON(ret != COMPACT_CONTINUE);
1553
1554        /*
1555         * Clear pageblock skip if there were failures recently and compaction
1556         * is about to be retried after being deferred.
1557         */
1558        if (compaction_restarting(zone, cc->order))
1559                __reset_isolation_suitable(zone);
1560
1561        /*
1562         * Setup to move all movable pages to the end of the zone. Use cached
1563         * information on where the scanners should start (unless we explicitly
1564         * want to compact the whole zone), but check that it is initialised
1565         * by ensuring the values are within zone boundaries.
1566         */
1567        if (cc->whole_zone) {
1568                cc->migrate_pfn = start_pfn;
1569                cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1570        } else {
1571                cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
1572                cc->free_pfn = zone->compact_cached_free_pfn;
1573                if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
1574                        cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
1575                        zone->compact_cached_free_pfn = cc->free_pfn;
1576                }
1577                if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
1578                        cc->migrate_pfn = start_pfn;
1579                        zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
1580                        zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
1581                }
1582
1583                if (cc->migrate_pfn == start_pfn)
1584                        cc->whole_zone = true;
1585        }
1586
1587        cc->last_migrated_pfn = 0;
1588
1589        trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
1590                                cc->free_pfn, end_pfn, sync);
1591
1592        migrate_prep_local();
1593
1594        while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
1595                int err;
1596
1597                switch (isolate_migratepages(zone, cc)) {
1598                case ISOLATE_ABORT:
1599                        ret = COMPACT_CONTENDED;
1600                        putback_movable_pages(&cc->migratepages);
1601                        cc->nr_migratepages = 0;
1602                        goto out;
1603                case ISOLATE_NONE:
1604                        /*
1605                         * We haven't isolated and migrated anything, but
1606                         * there might still be unflushed migrations from
1607                         * previous cc->order aligned block.
1608                         */
1609                        goto check_drain;
1610                case ISOLATE_SUCCESS:
1611                        ;
1612                }
1613
1614                err = migrate_pages(&cc->migratepages, compaction_alloc,
1615                                compaction_free, (unsigned long)cc, cc->mode,
1616                                MR_COMPACTION);
1617
1618                trace_mm_compaction_migratepages(cc->nr_migratepages, err,
1619                                                        &cc->migratepages);
1620
1621                /* All pages were either migrated or will be released */
1622                cc->nr_migratepages = 0;
1623                if (err) {
1624                        putback_movable_pages(&cc->migratepages);
1625                        /*
1626                         * migrate_pages() may return -ENOMEM when scanners meet
1627                         * and we want compact_finished() to detect it
1628                         */
1629                        if (err == -ENOMEM && !compact_scanners_met(cc)) {
1630                                ret = COMPACT_CONTENDED;
1631                                goto out;
1632                        }
1633                        /*
1634                         * We failed to migrate at least one page in the current
1635                         * order-aligned block, so skip the rest of it.
1636                         */
1637                        if (cc->direct_compaction &&
1638                                                (cc->mode == MIGRATE_ASYNC)) {
1639                                cc->migrate_pfn = block_end_pfn(
1640                                                cc->migrate_pfn - 1, cc->order);
1641                                /* Draining pcplists is useless in this case */
1642                                cc->last_migrated_pfn = 0;
1643
1644                        }
1645                }
1646
1647check_drain:
1648                /*
1649                 * Has the migration scanner moved away from the previous
1650                 * cc->order aligned block where we migrated from? If yes,
1651                 * flush the pages that were freed, so that they can merge and
1652                 * compact_finished() can detect immediately if allocation
1653                 * would succeed.
1654                 */
1655                if (cc->order > 0 && cc->last_migrated_pfn) {
1656                        int cpu;
1657                        unsigned long current_block_start =
1658                                block_start_pfn(cc->migrate_pfn, cc->order);
1659
1660                        if (cc->last_migrated_pfn < current_block_start) {
1661                                cpu = get_cpu();
1662                                lru_add_drain_cpu(cpu);
1663                                drain_local_pages(zone);
1664                                put_cpu();
1665                                /* No more flushing until we migrate again */
1666                                cc->last_migrated_pfn = 0;
1667                        }
1668                }
1669
1670        }
1671
1672out:
1673        /*
1674         * Release free pages and update where the free scanner should restart,
1675         * so we don't leave any returned pages behind in the next attempt.
1676         */
1677        if (cc->nr_freepages > 0) {
1678                unsigned long free_pfn = release_freepages(&cc->freepages);
1679
1680                cc->nr_freepages = 0;
1681                VM_BUG_ON(free_pfn == 0);
1682                /* The cached pfn is always the first in a pageblock */
1683                free_pfn = pageblock_start_pfn(free_pfn);
1684                /*
1685                 * Only go back, not forward. The cached pfn might have been
1686                 * already reset to zone end in compact_finished()
1687                 */
1688                if (free_pfn > zone->compact_cached_free_pfn)
1689                        zone->compact_cached_free_pfn = free_pfn;
1690        }
1691
1692        count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
1693        count_compact_events(COMPACTFREE_SCANNED, cc->total_free_scanned);
1694
1695        trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
1696                                cc->free_pfn, end_pfn, sync, ret);
1697
1698        return ret;
1699}
1700
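/*
 * Set up a compact_control for direct compaction of one zone and run it.
 * The lowest priority (MIN_COMPACT_PRIORITY) forces a whole-zone run that
 * ignores the pageblock skip hints and suitability checks.
 */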
1701static enum compact_result compact_zone_order(struct zone *zone, int order,
1702                gfp_t gfp_mask, enum compact_priority prio,
1703                unsigned int alloc_flags, int classzone_idx)
1704{
1705        enum compact_result ret;
1706        struct compact_control cc = {
1707                .nr_freepages = 0,
1708                .nr_migratepages = 0,
1709                .total_migrate_scanned = 0,
1710                .total_free_scanned = 0,
1711                .order = order,
1712                .gfp_mask = gfp_mask,
1713                .zone = zone,
1714                .mode = (prio == COMPACT_PRIO_ASYNC) ?
1715                                        MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
1716                .alloc_flags = alloc_flags,
1717                .classzone_idx = classzone_idx,
1718                .direct_compaction = true,
1719                .whole_zone = (prio == MIN_COMPACT_PRIORITY),
1720                .ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
1721                .ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
1722        };
1723        INIT_LIST_HEAD(&cc.freepages);
1724        INIT_LIST_HEAD(&cc.migratepages);
1725
1726        ret = compact_zone(zone, &cc);
1727
1728        VM_BUG_ON(!list_empty(&cc.freepages));
1729        VM_BUG_ON(!list_empty(&cc.migratepages));
1730
1731        return ret;
1732}
1733
1734int sysctl_extfrag_threshold = 500;
1735
1736/**
1737 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
1738 * @gfp_mask: The GFP mask of the current allocation
1739 * @order: The order of the current allocation
1740 * @alloc_flags: The allocation flags of the current allocation
1741 * @ac: The context of current allocation
1742 * @prio: Determines how hard direct compaction should try to succeed
1743 *
1744 * This is the main entry point for direct page compaction.
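 *
 * Return: the highest compact_result reached across the zones tried, or
 * COMPACT_SKIPPED if the GFP flags do not allow compaction.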
1745 */
1746enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
1747                unsigned int alloc_flags, const struct alloc_context *ac,
1748                enum compact_priority prio)
1749{
1750        int may_perform_io = gfp_mask & __GFP_IO;
1751        struct zoneref *z;
1752        struct zone *zone;
1753        enum compact_result rc = COMPACT_SKIPPED;
1754
1755        /*
1756         * Check if the GFP flags allow compaction - GFP_NOIO is really
1757         * tricky context because the migration might require IO
1758         */
1759        if (!may_perform_io)
1760                return COMPACT_SKIPPED;
1761
1762        trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
1763
1764        /* Compact each zone in the list */
1765        for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
1766                                                                ac->nodemask) {
1767                enum compact_result status;
1768
1769                if (prio > MIN_COMPACT_PRIORITY
1770                                        && compaction_deferred(zone, order)) {
1771                        rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
1772                        continue;
1773                }
1774
1775                status = compact_zone_order(zone, order, gfp_mask, prio,
1776                                        alloc_flags, ac_classzone_idx(ac));
1777                rc = max(status, rc);
1778
1779                /* The allocation should succeed, stop compacting */
1780                if (status == COMPACT_SUCCESS) {
1781                        /*
1782                         * We think the allocation will succeed in this zone,
1783                         * but it is not certain, hence the false. The caller
1784                         * will repeat this with true if allocation indeed
1785                         * succeeds in this zone.
1786                         */
1787                        compaction_defer_reset(zone, order, false);
1788
1789                        break;
1790                }
1791
1792                if (prio != COMPACT_PRIO_ASYNC && (status == COMPACT_COMPLETE ||
1793                                        status == COMPACT_PARTIAL_SKIPPED))
1794                        /*
1795                         * We think that allocation won't succeed in this zone
1796                         * so we defer compaction there. If it ends up
1797                         * succeeding after all, it will be reset.
1798                         */
1799                        defer_compaction(zone, order);
1800
1801                /*
1802                 * We might have stopped compacting due to need_resched() in
1803                 * async compaction, or due to a fatal signal detected. In that
1804                 * case do not try further zones
1805                 */
1806                if ((prio == COMPACT_PRIO_ASYNC && need_resched())
1807                                        || fatal_signal_pending(current))
1808                        break;
1809        }
1810
1811        return rc;
1812}
1813
1814
1815/* Compact all zones within a node */
1816static void compact_node(int nid)
1817{
1818        pg_data_t *pgdat = NODE_DATA(nid);
1819        int zoneid;
1820        struct zone *zone;
1821        struct compact_control cc = {
1822                .order = -1,
1823                .total_migrate_scanned = 0,
1824                .total_free_scanned = 0,
1825                .mode = MIGRATE_SYNC,
1826                .ignore_skip_hint = true,
1827                .whole_zone = true,
1828                .gfp_mask = GFP_KERNEL,
1829        };
1830
1831
1832        for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
1833
1834                zone = &pgdat->node_zones[zoneid];
1835                if (!populated_zone(zone))
1836                        continue;
1837
1838                cc.nr_freepages = 0;
1839                cc.nr_migratepages = 0;
1840                cc.zone = zone;
1841                INIT_LIST_HEAD(&cc.freepages);
1842                INIT_LIST_HEAD(&cc.migratepages);
1843
1844                compact_zone(zone, &cc);
1845
1846                VM_BUG_ON(!list_empty(&cc.freepages));
1847                VM_BUG_ON(!list_empty(&cc.migratepages));
1848        }
1849}
1850
1851/* Compact all nodes in the system */
1852static void compact_nodes(void)
1853{
1854        int nid;
1855
1856        /* Flush pending updates to the LRU lists */
1857        lru_add_drain_all();
1858
1859        for_each_online_node(nid)
1860                compact_node(nid);
1861}
1862
1863/* The written value is actually unused, all memory is compacted */
1864int sysctl_compact_memory;
1865
1866/*
1867 * This is the entry point for compacting all nodes via
1868 * /proc/sys/vm/compact_memory
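 * Writing any value (e.g. "echo 1 > /proc/sys/vm/compact_memory") compacts
 * all online nodes; the written value itself is ignored.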
1869 */
1870int sysctl_compaction_handler(struct ctl_table *table, int write,
1871                        void __user *buffer, size_t *length, loff_t *ppos)
1872{
1873        if (write)
1874                compact_nodes();
1875
1876        return 0;
1877}
1878
1879int sysctl_extfrag_handler(struct ctl_table *table, int write,
1880                        void __user *buffer, size_t *length, loff_t *ppos)
1881{
1882        proc_dointvec_minmax(table, write, buffer, length, ppos);
1883
1884        return 0;
1885}
1886
1887#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
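/*
 * Per-node compaction trigger exposed through sysfs: writing to a node's
 * "compact" attribute (e.g. /sys/devices/system/node/node0/compact)
 * compacts just that node.
 */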
1888static ssize_t sysfs_compact_node(struct device *dev,
1889                        struct device_attribute *attr,
1890                        const char *buf, size_t count)
1891{
1892        int nid = dev->id;
1893
1894        if (nid >= 0 && nid < nr_node_ids && node_online(nid)) {
1895                /* Flush pending updates to the LRU lists */
1896                lru_add_drain_all();
1897
1898                compact_node(nid);
1899        }
1900
1901        return count;
1902}
1903static DEVICE_ATTR(compact, 0200, NULL, sysfs_compact_node);
1904
1905int compaction_register_node(struct node *node)
1906{
1907        return device_create_file(&node->dev, &dev_attr_compact);
1908}
1909
1910void compaction_unregister_node(struct node *node)
1911{
1912        return device_remove_file(&node->dev, &dev_attr_compact);
1913}
1914#endif /* CONFIG_SYSFS && CONFIG_NUMA */
1915
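/*
 * True when a compaction request is pending for this node or the kthread
 * has been asked to stop; used as the wait condition for kcompactd.
 */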
1916static inline bool kcompactd_work_requested(pg_data_t *pgdat)
1917{
1918        return pgdat->kcompactd_max_order > 0 || kthread_should_stop();
1919}
1920
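/*
 * True if at least one populated zone up to the requested classzone_idx is
 * currently suitable for compaction at the pending order.
 */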
1921static bool kcompactd_node_suitable(pg_data_t *pgdat)
1922{
1923        int zoneid;
1924        struct zone *zone;
1925        enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
1926
1927        for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
1928                zone = &pgdat->node_zones[zoneid];
1929
1930                if (!populated_zone(zone))
1931                        continue;
1932
1933                if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
1934                                        classzone_idx) == COMPACT_CONTINUE)
1935                        return true;
1936        }
1937
1938        return false;
1939}
1940
1941static void kcompactd_do_work(pg_data_t *pgdat)
1942{
1943        /*
1944         * With no special task, compact all zones so that a page of requested
1945         * order is allocatable.
1946         */
1947        int zoneid;
1948        struct zone *zone;
1949        struct compact_control cc = {
1950                .order = pgdat->kcompactd_max_order,
1951                .total_migrate_scanned = 0,
1952                .total_free_scanned = 0,
1953                .classzone_idx = pgdat->kcompactd_classzone_idx,
1954                .mode = MIGRATE_SYNC_LIGHT,
1955                .ignore_skip_hint = false,
1956                .gfp_mask = GFP_KERNEL,
1957        };
1958        trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
1959                                                        cc.classzone_idx);
1960        count_compact_event(KCOMPACTD_WAKE);
1961
1962        for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
1963                int status;
1964
1965                zone = &pgdat->node_zones[zoneid];
1966                if (!populated_zone(zone))
1967                        continue;
1968
1969                if (compaction_deferred(zone, cc.order))
1970                        continue;
1971
1972                if (compaction_suitable(zone, cc.order, 0, zoneid) !=
1973                                                        COMPACT_CONTINUE)
1974                        continue;
1975
1976                cc.nr_freepages = 0;
1977                cc.nr_migratepages = 0;
1978                cc.total_migrate_scanned = 0;
1979                cc.total_free_scanned = 0;
1980                cc.zone = zone;
1981                INIT_LIST_HEAD(&cc.freepages);
1982                INIT_LIST_HEAD(&cc.migratepages);
1983
1984                if (kthread_should_stop())
1985                        return;
1986                status = compact_zone(zone, &cc);
1987
1988                if (status == COMPACT_SUCCESS) {
1989                        compaction_defer_reset(zone, cc.order, false);
1990                } else if (status == COMPACT_PARTIAL_SKIPPED || status == COMPACT_COMPLETE) {
1991                        /*
1992                         * Buddy pages may become stranded on pcps that could
1993                         * otherwise coalesce on the zone's free area for
1994                         * order >= cc.order.  This is ratelimited by the
1995                         * upcoming deferral.
1996                         */
1997                        drain_all_pages(zone);
1998
1999                        /*
2000                         * We use sync migration mode here, so we defer like
2001                         * sync direct compaction does.
2002                         */
2003                        defer_compaction(zone, cc.order);
2004                }
2005
2006                count_compact_events(KCOMPACTD_MIGRATE_SCANNED,
2007                                     cc.total_migrate_scanned);
2008                count_compact_events(KCOMPACTD_FREE_SCANNED,
2009                                     cc.total_free_scanned);
2010
2011                VM_BUG_ON(!list_empty(&cc.freepages));
2012                VM_BUG_ON(!list_empty(&cc.migratepages));
2013        }
2014
2015        /*
2016         * Regardless of success, we are done until woken up next. But remember
2017         * the requested order/classzone_idx in case it was higher/tighter than
2018         * our current ones
2019         */
2020        if (pgdat->kcompactd_max_order <= cc.order)
2021                pgdat->kcompactd_max_order = 0;
2022        if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
2023                pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
2024}
2025
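/*
 * Record a pending compaction request (order and classzone_idx) for the
 * node and wake kcompactd if it is sleeping and some zone looks suitable.
 */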
2026void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
2027{
2028        if (!order)
2029                return;
2030
2031        if (pgdat->kcompactd_max_order < order)
2032                pgdat->kcompactd_max_order = order;
2033
2034        if (pgdat->kcompactd_classzone_idx > classzone_idx)
2035                pgdat->kcompactd_classzone_idx = classzone_idx;
2036
2037        /*
2038         * Pairs with implicit barrier in wait_event_freezable()
2039         * such that wakeups are not missed.
2040         */
2041        if (!wq_has_sleeper(&pgdat->kcompactd_wait))
2042                return;
2043
2044        if (!kcompactd_node_suitable(pgdat))
2045                return;
2046
2047        trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
2048                                                        classzone_idx);
2049        wake_up_interruptible(&pgdat->kcompactd_wait);
2050}
2051
2052/*
2053 * The background compaction daemon, started as a kernel thread
2054 * from the init process.
2055 */
2056static int kcompactd(void *p)
2057{
2058        pg_data_t *pgdat = (pg_data_t *)p;
2059        struct task_struct *tsk = current;
2060
2061        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
2062
2063        if (!cpumask_empty(cpumask))
2064                set_cpus_allowed_ptr(tsk, cpumask);
2065
2066        set_freezable();
2067
2068        pgdat->kcompactd_max_order = 0;
2069        pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
2070
2071        while (!kthread_should_stop()) {
2072                unsigned long pflags;
2073
2074                trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
2075                wait_event_freezable(pgdat->kcompactd_wait,
2076                                kcompactd_work_requested(pgdat));
2077
2078                psi_memstall_enter(&pflags);
2079                kcompactd_do_work(pgdat);
2080                psi_memstall_leave(&pflags);
2081        }
2082
2083        return 0;
2084}
2085
2086/*
2087 * This kcompactd start function will be called by init and node-hot-add.
2088 * On node-hot-add, kcompactd will be moved to the proper cpus if cpus are hot-added.
2089 */
2090int kcompactd_run(int nid)
2091{
2092        pg_data_t *pgdat = NODE_DATA(nid);
2093        int ret = 0;
2094
2095        if (pgdat->kcompactd)
2096                return 0;
2097
2098        pgdat->kcompactd = kthread_run(kcompactd, pgdat, "kcompactd%d", nid);
2099        if (IS_ERR(pgdat->kcompactd)) {
2100                pr_err("Failed to start kcompactd on node %d\n", nid);
2101                ret = PTR_ERR(pgdat->kcompactd);
2102                pgdat->kcompactd = NULL;
2103        }
2104        return ret;
2105}
2106
2107/*
2108 * Called by memory hotplug when all memory in a node is offlined. Caller must
2109 * hold mem_hotplug_begin/end().
2110 */
2111void kcompactd_stop(int nid)
2112{
2113        struct task_struct *kcompactd = NODE_DATA(nid)->kcompactd;
2114
2115        if (kcompactd) {
2116                kthread_stop(kcompactd);
2117                NODE_DATA(nid)->kcompactd = NULL;
2118        }
2119}
2120
2121/*
2122 * It's optimal to keep kcompactd threads on the same CPUs as their node's
2123 * memory, but that is not required for correctness. So if the last CPU in
2124 * a node goes offline, its kcompactd may run anywhere; when one of the
2125 * node's CPUs comes back online, restore the binding.
2126 */
2127static int kcompactd_cpu_online(unsigned int cpu)
2128{
2129        int nid;
2130
2131        for_each_node_state(nid, N_MEMORY) {
2132                pg_data_t *pgdat = NODE_DATA(nid);
2133                const struct cpumask *mask;
2134
2135                mask = cpumask_of_node(pgdat->node_id);
2136
2137                if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2138                        /* One of our CPUs online: restore mask */
2139                        set_cpus_allowed_ptr(pgdat->kcompactd, mask);
2140        }
2141        return 0;
2142}
2143
2144static int __init kcompactd_init(void)
2145{
2146        int nid;
2147        int ret;
2148
2149        ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
2150                                        "mm/compaction:online",
2151                                        kcompactd_cpu_online, NULL);
2152        if (ret < 0) {
2153                pr_err("kcompactd: failed to register hotplug callbacks.\n");
2154                return ret;
2155        }
2156
2157        for_each_node_state(nid, N_MEMORY)
2158                kcompactd_run(nid);
2159        return 0;
2160}
2161subsys_initcall(kcompactd_init)
2162
2163#endif /* CONFIG_COMPACTION */
2164