linux/mm/vmscan.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   4 *
   5 *  Swap reorganised 29.12.95, Stephen Tweedie.
   6 *  kswapd added: 7.1.96  sct
   7 *  Removed kswapd_ctl limits, and swap out as many pages as needed
   8 *  to bring the system back to freepages.high: 2.4.97, Rik van Riel.
   9 *  Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
  10 *  Multiqueue VM started 5.8.00, Rik van Riel.
  11 */
  12
  13#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  14
  15#include <linux/mm.h>
  16#include <linux/sched/mm.h>
  17#include <linux/module.h>
  18#include <linux/gfp.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/swap.h>
  21#include <linux/pagemap.h>
  22#include <linux/init.h>
  23#include <linux/highmem.h>
  24#include <linux/vmpressure.h>
  25#include <linux/vmstat.h>
  26#include <linux/file.h>
  27#include <linux/writeback.h>
  28#include <linux/blkdev.h>
  29#include <linux/buffer_head.h>  /* for try_to_release_page(),
  30                                    buffer_heads_over_limit */
  31#include <linux/mm_inline.h>
  32#include <linux/backing-dev.h>
  33#include <linux/rmap.h>
  34#include <linux/topology.h>
  35#include <linux/cpu.h>
  36#include <linux/cpuset.h>
  37#include <linux/compaction.h>
  38#include <linux/notifier.h>
  39#include <linux/rwsem.h>
  40#include <linux/delay.h>
  41#include <linux/kthread.h>
  42#include <linux/freezer.h>
  43#include <linux/memcontrol.h>
  44#include <linux/delayacct.h>
  45#include <linux/sysctl.h>
  46#include <linux/oom.h>
  47#include <linux/pagevec.h>
  48#include <linux/prefetch.h>
  49#include <linux/printk.h>
  50#include <linux/dax.h>
  51#include <linux/psi.h>
  52
  53#include <asm/tlbflush.h>
  54#include <asm/div64.h>
  55
  56#include <linux/swapops.h>
  57#include <linux/balloon_compaction.h>
  58
  59#include "internal.h"
  60
  61#define CREATE_TRACE_POINTS
  62#include <trace/events/vmscan.h>
  63
  64struct scan_control {
  65        /* How many pages shrink_list() should reclaim */
  66        unsigned long nr_to_reclaim;
  67
  68        /*
  69         * Nodemask of nodes allowed by the caller. If NULL, all nodes
  70         * are scanned.
  71         */
  72        nodemask_t      *nodemask;
  73
  74        /*
  75         * The memory cgroup that hit its limit and as a result is the
  76         * primary target of this reclaim invocation.
  77         */
  78        struct mem_cgroup *target_mem_cgroup;
  79
  80        /*
  81         * Scan pressure balancing between anon and file LRUs
  82         */
  83        unsigned long   anon_cost;
  84        unsigned long   file_cost;
  85
  86        /* Can active pages be deactivated as part of reclaim? */
  87#define DEACTIVATE_ANON 1
  88#define DEACTIVATE_FILE 2
  89        unsigned int may_deactivate:2;
  90        unsigned int force_deactivate:1;
  91        unsigned int skipped_deactivate:1;
  92
  93        /* Writepage batching in laptop mode; RECLAIM_WRITE */
  94        unsigned int may_writepage:1;
  95
  96        /* Can mapped pages be reclaimed? */
  97        unsigned int may_unmap:1;
  98
  99        /* Can pages be swapped as part of reclaim? */
 100        unsigned int may_swap:1;
 101
 102        /*
 103         * Cgroups are not reclaimed below their configured memory.low,
 104         * unless we threaten to OOM. If any cgroups are skipped due to
 105         * memory.low and nothing was reclaimed, go back for memory.low.
 106         */
 107        unsigned int memcg_low_reclaim:1;
 108        unsigned int memcg_low_skipped:1;
 109
 110        unsigned int hibernation_mode:1;
 111
 112        /* One of the zones is ready for compaction */
 113        unsigned int compaction_ready:1;
 114
 115        /* There is easily reclaimable cold cache in the current node */
 116        unsigned int cache_trim_mode:1;
 117
 118        /* The file pages on the current node are dangerously low */
 119        unsigned int file_is_tiny:1;
 120
 121        /* Allocation order */
 122        s8 order;
 123
 124        /* Scan (total_size >> priority) pages at once */
 125        s8 priority;
 126
 127        /* The highest zone to isolate pages for reclaim from */
 128        s8 reclaim_idx;
 129
 130        /* This context's GFP mask */
 131        gfp_t gfp_mask;
 132
 133        /* Incremented by the number of inactive pages that were scanned */
 134        unsigned long nr_scanned;
 135
 136        /* Number of pages freed so far during a call to shrink_zones() */
 137        unsigned long nr_reclaimed;
 138
 139        struct {
 140                unsigned int dirty;
 141                unsigned int unqueued_dirty;
 142                unsigned int congested;
 143                unsigned int writeback;
 144                unsigned int immediate;
 145                unsigned int file_taken;
 146                unsigned int taken;
 147        } nr;
 148
  149        /* records the amount of slab reclaimed so far */
 150        struct reclaim_state reclaim_state;
 151};
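/*
 * Illustrative note (added, not part of the original file): a direct
 * reclaim entry point such as try_to_free_pages() typically sets this
 * structure up roughly as follows, leaving the remaining fields zeroed:
 *
 *      struct scan_control sc = {
 *              .nr_to_reclaim  = SWAP_CLUSTER_MAX,
 *              .gfp_mask       = current_gfp_context(gfp_mask),
 *              .reclaim_idx    = gfp_zone(gfp_mask),
 *              .order          = order,
 *              .nodemask       = nodemask,
 *              .priority       = DEF_PRIORITY,
 *              .may_writepage  = !laptop_mode,
 *              .may_unmap      = 1,
 *              .may_swap       = 1,
 *      };
 */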
 152
 153#ifdef ARCH_HAS_PREFETCHW
 154#define prefetchw_prev_lru_page(_page, _base, _field)                   \
 155        do {                                                            \
 156                if ((_page)->lru.prev != _base) {                       \
 157                        struct page *prev;                              \
 158                                                                        \
 159                        prev = lru_to_page(&(_page->lru));              \
 160                        prefetchw(&prev->_field);                       \
 161                }                                                       \
 162        } while (0)
 163#else
 164#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
 165#endif
 166
 167/*
 168 * From 0 .. 200.  Higher means more swappy.
 169 */
 170int vm_swappiness = 60;
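/*
 * Note (added): vm_swappiness is exposed as /proc/sys/vm/swappiness, so the
 * anon/file balance can be tuned at runtime, e.g. "sysctl vm.swappiness=10".
 */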
 171
 172static void set_task_reclaim_state(struct task_struct *task,
 173                                   struct reclaim_state *rs)
 174{
 175        /* Check for an overwrite */
 176        WARN_ON_ONCE(rs && task->reclaim_state);
 177
 178        /* Check for the nulling of an already-nulled member */
 179        WARN_ON_ONCE(!rs && !task->reclaim_state);
 180
 181        task->reclaim_state = rs;
 182}
 183
 184static LIST_HEAD(shrinker_list);
 185static DECLARE_RWSEM(shrinker_rwsem);
 186
 187#ifdef CONFIG_MEMCG
 188/*
 189 * We allow subsystems to populate their shrinker-related
 190 * LRU lists before register_shrinker_prepared() is called
 191 * for the shrinker, since we don't want to impose
 192 * restrictions on their internal registration order.
   193 * In this case shrink_slab_memcg() may find the corresponding
   194 * bit set in the shrinker map.
   195 *
   196 * This value is used by the function to detect shrinkers that are
   197 * still registering and to skip do_shrink_slab() calls for them.
 198 */
 199#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
 200
 201static DEFINE_IDR(shrinker_idr);
 202static int shrinker_nr_max;
 203
 204static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 205{
 206        int id, ret = -ENOMEM;
 207
 208        down_write(&shrinker_rwsem);
 209        /* This may call shrinker, so it must use down_read_trylock() */
 210        id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
 211        if (id < 0)
 212                goto unlock;
 213
 214        if (id >= shrinker_nr_max) {
 215                if (memcg_expand_shrinker_maps(id)) {
 216                        idr_remove(&shrinker_idr, id);
 217                        goto unlock;
 218                }
 219
 220                shrinker_nr_max = id + 1;
 221        }
 222        shrinker->id = id;
 223        ret = 0;
 224unlock:
 225        up_write(&shrinker_rwsem);
 226        return ret;
 227}
 228
 229static void unregister_memcg_shrinker(struct shrinker *shrinker)
 230{
 231        int id = shrinker->id;
 232
 233        BUG_ON(id < 0);
 234
 235        down_write(&shrinker_rwsem);
 236        idr_remove(&shrinker_idr, id);
 237        up_write(&shrinker_rwsem);
 238}
 239
 240static bool cgroup_reclaim(struct scan_control *sc)
 241{
 242        return sc->target_mem_cgroup;
 243}
 244
 245/**
 246 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 247 * @sc: scan_control in question
 248 *
 249 * The normal page dirty throttling mechanism in balance_dirty_pages() is
   250 * completely broken with the legacy memcg, and direct stalling in
   251 * shrink_page_list() is used for throttling instead, which lacks all the
   252 * niceties such as fairness, adaptive pausing, bandwidth-proportional
 253 * allocation and configurability.
 254 *
 255 * This function tests whether the vmscan currently in progress can assume
 256 * that the normal dirty throttling mechanism is operational.
 257 */
 258static bool writeback_throttling_sane(struct scan_control *sc)
 259{
 260        if (!cgroup_reclaim(sc))
 261                return true;
 262#ifdef CONFIG_CGROUP_WRITEBACK
 263        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
 264                return true;
 265#endif
 266        return false;
 267}
 268#else
 269static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 270{
 271        return 0;
 272}
 273
 274static void unregister_memcg_shrinker(struct shrinker *shrinker)
 275{
 276}
 277
 278static bool cgroup_reclaim(struct scan_control *sc)
 279{
 280        return false;
 281}
 282
 283static bool writeback_throttling_sane(struct scan_control *sc)
 284{
 285        return true;
 286}
 287#endif
 288
 289/*
 290 * This misses isolated pages which are not accounted for to save counters.
 291 * As the data only determines if reclaim or compaction continues, it is
 292 * not expected that isolated pages will be a dominating factor.
 293 */
 294unsigned long zone_reclaimable_pages(struct zone *zone)
 295{
 296        unsigned long nr;
 297
 298        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
 299                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
 300        if (get_nr_swap_pages() > 0)
 301                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
 302                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 303
 304        return nr;
 305}
 306
 307/**
 308 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 309 * @lruvec: lru vector
 310 * @lru: lru to use
 311 * @zone_idx: zones to consider (use MAX_NR_ZONES for the whole LRU list)
 312 */
 313unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 314{
 315        unsigned long size = 0;
 316        int zid;
 317
 318        for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
 319                struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
 320
 321                if (!managed_zone(zone))
 322                        continue;
 323
 324                if (!mem_cgroup_disabled())
 325                        size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
 326                else
 327                        size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
 328        }
 329        return size;
 330}
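/*
 * Example (added): lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES)
 * returns the total number of inactive file pages on the lruvec, while a
 * smaller zone index restricts the count to the zones eligible for the
 * current allocation.
 */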
 331
 332/*
 333 * Add a shrinker callback to be called from the vm.
 334 */
 335int prealloc_shrinker(struct shrinker *shrinker)
 336{
 337        unsigned int size = sizeof(*shrinker->nr_deferred);
 338
 339        if (shrinker->flags & SHRINKER_NUMA_AWARE)
 340                size *= nr_node_ids;
 341
 342        shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
 343        if (!shrinker->nr_deferred)
 344                return -ENOMEM;
 345
 346        if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
 347                if (prealloc_memcg_shrinker(shrinker))
 348                        goto free_deferred;
 349        }
 350
 351        return 0;
 352
 353free_deferred:
 354        kfree(shrinker->nr_deferred);
 355        shrinker->nr_deferred = NULL;
 356        return -ENOMEM;
 357}
 358
 359void free_prealloced_shrinker(struct shrinker *shrinker)
 360{
 361        if (!shrinker->nr_deferred)
 362                return;
 363
 364        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 365                unregister_memcg_shrinker(shrinker);
 366
 367        kfree(shrinker->nr_deferred);
 368        shrinker->nr_deferred = NULL;
 369}
 370
 371void register_shrinker_prepared(struct shrinker *shrinker)
 372{
 373        down_write(&shrinker_rwsem);
 374        list_add_tail(&shrinker->list, &shrinker_list);
 375#ifdef CONFIG_MEMCG
 376        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 377                idr_replace(&shrinker_idr, shrinker, shrinker->id);
 378#endif
 379        up_write(&shrinker_rwsem);
 380}
 381
 382int register_shrinker(struct shrinker *shrinker)
 383{
 384        int err = prealloc_shrinker(shrinker);
 385
 386        if (err)
 387                return err;
 388        register_shrinker_prepared(shrinker);
 389        return 0;
 390}
 391EXPORT_SYMBOL(register_shrinker);
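/*
 * Illustrative usage (added; foo_count()/foo_scan() are hypothetical
 * callbacks): a cache that wants to participate in reclaim would typically
 * do something like
 *
 *      static struct shrinker foo_shrinker = {
 *              .count_objects  = foo_count,
 *              .scan_objects   = foo_scan,
 *              .seeks          = DEFAULT_SEEKS,
 *              .flags          = SHRINKER_NUMA_AWARE,
 *      };
 *
 *      err = register_shrinker(&foo_shrinker);
 *      ...
 *      unregister_shrinker(&foo_shrinker);
 */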
 392
 393/*
   394 * Remove one shrinker.
 395 */
 396void unregister_shrinker(struct shrinker *shrinker)
 397{
 398        if (!shrinker->nr_deferred)
 399                return;
 400        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
 401                unregister_memcg_shrinker(shrinker);
 402        down_write(&shrinker_rwsem);
 403        list_del(&shrinker->list);
 404        up_write(&shrinker_rwsem);
 405        kfree(shrinker->nr_deferred);
 406        shrinker->nr_deferred = NULL;
 407}
 408EXPORT_SYMBOL(unregister_shrinker);
 409
 410#define SHRINK_BATCH 128
 411
 412static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 413                                    struct shrinker *shrinker, int priority)
 414{
 415        unsigned long freed = 0;
 416        unsigned long long delta;
 417        long total_scan;
 418        long freeable;
 419        long nr;
 420        long new_nr;
 421        int nid = shrinkctl->nid;
 422        long batch_size = shrinker->batch ? shrinker->batch
 423                                          : SHRINK_BATCH;
 424        long scanned = 0, next_deferred;
 425
 426        if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
 427                nid = 0;
 428
 429        freeable = shrinker->count_objects(shrinker, shrinkctl);
 430        if (freeable == 0 || freeable == SHRINK_EMPTY)
 431                return freeable;
 432
 433        /*
 434         * copy the current shrinker scan count into a local variable
 435         * and zero it so that other concurrent shrinker invocations
 436         * don't also do this scanning work.
 437         */
 438        nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
 439
 440        total_scan = nr;
 441        if (shrinker->seeks) {
 442                delta = freeable >> priority;
 443                delta *= 4;
 444                do_div(delta, shrinker->seeks);
 445        } else {
 446                /*
 447                 * These objects don't require any IO to create. Trim
 448                 * them aggressively under memory pressure to keep
 449                 * them from causing refetches in the IO caches.
 450                 */
 451                delta = freeable / 2;
 452        }
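        /*
         * Worked example (added): with freeable == 10000, seeks ==
         * DEFAULT_SEEKS (2) and priority == DEF_PRIORITY (12), the above
         * gives delta = (10000 >> 12) * 4 / 2 = 4 objects; at priority 0
         * the same cache would see delta = 10000 * 4 / 2 = 20000.
         */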
 453
 454        total_scan += delta;
 455        if (total_scan < 0) {
 456                pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
 457                       shrinker->scan_objects, total_scan);
 458                total_scan = freeable;
 459                next_deferred = nr;
 460        } else
 461                next_deferred = total_scan;
 462
 463        /*
 464         * We need to avoid excessive windup on filesystem shrinkers
 465         * due to large numbers of GFP_NOFS allocations causing the
 466         * shrinkers to return -1 all the time. This results in a large
 467         * nr being built up so when a shrink that can do some work
 468         * comes along it empties the entire cache due to nr >>>
 469         * freeable. This is bad for sustaining a working set in
 470         * memory.
 471         *
 472         * Hence only allow the shrinker to scan the entire cache when
 473         * a large delta change is calculated directly.
 474         */
 475        if (delta < freeable / 4)
 476                total_scan = min(total_scan, freeable / 2);
 477
 478        /*
   479         * Avoid risking looping forever due to a too-large nr value:
   480         * never try to free more than twice the estimated number of
   481         * freeable entries.
 482         */
 483        if (total_scan > freeable * 2)
 484                total_scan = freeable * 2;
 485
 486        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
 487                                   freeable, delta, total_scan, priority);
 488
 489        /*
 490         * Normally, we should not scan less than batch_size objects in one
 491         * pass to avoid too frequent shrinker calls, but if the slab has less
 492         * than batch_size objects in total and we are really tight on memory,
 493         * we will try to reclaim all available objects, otherwise we can end
 494         * up failing allocations although there are plenty of reclaimable
 495         * objects spread over several slabs with usage less than the
 496         * batch_size.
 497         *
 498         * We detect the "tight on memory" situations by looking at the total
 499         * number of objects we want to scan (total_scan). If it is greater
 500         * than the total number of objects on slab (freeable), we must be
 501         * scanning at high prio and therefore should try to reclaim as much as
 502         * possible.
 503         */
 504        while (total_scan >= batch_size ||
 505               total_scan >= freeable) {
 506                unsigned long ret;
 507                unsigned long nr_to_scan = min(batch_size, total_scan);
 508
 509                shrinkctl->nr_to_scan = nr_to_scan;
 510                shrinkctl->nr_scanned = nr_to_scan;
 511                ret = shrinker->scan_objects(shrinker, shrinkctl);
 512                if (ret == SHRINK_STOP)
 513                        break;
 514                freed += ret;
 515
 516                count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
 517                total_scan -= shrinkctl->nr_scanned;
 518                scanned += shrinkctl->nr_scanned;
 519
 520                cond_resched();
 521        }
 522
 523        if (next_deferred >= scanned)
 524                next_deferred -= scanned;
 525        else
 526                next_deferred = 0;
 527        /*
 528         * move the unused scan count back into the shrinker in a
 529         * manner that handles concurrent updates. If we exhausted the
 530         * scan, there is no need to do an update.
 531         */
 532        if (next_deferred > 0)
 533                new_nr = atomic_long_add_return(next_deferred,
 534                                                &shrinker->nr_deferred[nid]);
 535        else
 536                new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
 537
 538        trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
 539        return freed;
 540}
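/*
 * Note (added): ->count_objects() may return SHRINK_EMPTY when a memcg-aware
 * shrinker has nothing at all for this memcg/node (handled by the callers
 * below), and ->scan_objects() may return SHRINK_STOP, which ends the scan
 * loop above early.
 */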
 541
 542#ifdef CONFIG_MEMCG
 543static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 544                        struct mem_cgroup *memcg, int priority)
 545{
 546        struct memcg_shrinker_map *map;
 547        unsigned long ret, freed = 0;
 548        int i;
 549
 550        if (!mem_cgroup_online(memcg))
 551                return 0;
 552
 553        if (!down_read_trylock(&shrinker_rwsem))
 554                return 0;
 555
 556        map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
 557                                        true);
 558        if (unlikely(!map))
 559                goto unlock;
 560
 561        for_each_set_bit(i, map->map, shrinker_nr_max) {
 562                struct shrink_control sc = {
 563                        .gfp_mask = gfp_mask,
 564                        .nid = nid,
 565                        .memcg = memcg,
 566                };
 567                struct shrinker *shrinker;
 568
 569                shrinker = idr_find(&shrinker_idr, i);
 570                if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
 571                        if (!shrinker)
 572                                clear_bit(i, map->map);
 573                        continue;
 574                }
 575
 576                /* Call non-slab shrinkers even though kmem is disabled */
 577                if (!memcg_kmem_enabled() &&
 578                    !(shrinker->flags & SHRINKER_NONSLAB))
 579                        continue;
 580
 581                ret = do_shrink_slab(&sc, shrinker, priority);
 582                if (ret == SHRINK_EMPTY) {
 583                        clear_bit(i, map->map);
 584                        /*
 585                         * After the shrinker reported that it had no objects to
 586                         * free, but before we cleared the corresponding bit in
 587                         * the memcg shrinker map, a new object might have been
  588                         * added. To make sure we have the bit set in this
 589                         * case, we invoke the shrinker one more time and reset
 590                         * the bit if it reports that it is not empty anymore.
 591                         * The memory barrier here pairs with the barrier in
 592                         * memcg_set_shrinker_bit():
 593                         *
 594                         * list_lru_add()     shrink_slab_memcg()
 595                         *   list_add_tail()    clear_bit()
 596                         *   <MB>               <MB>
 597                         *   set_bit()          do_shrink_slab()
 598                         */
 599                        smp_mb__after_atomic();
 600                        ret = do_shrink_slab(&sc, shrinker, priority);
 601                        if (ret == SHRINK_EMPTY)
 602                                ret = 0;
 603                        else
 604                                memcg_set_shrinker_bit(memcg, nid, i);
 605                }
 606                freed += ret;
 607
 608                if (rwsem_is_contended(&shrinker_rwsem)) {
 609                        freed = freed ? : 1;
 610                        break;
 611                }
 612        }
 613unlock:
 614        up_read(&shrinker_rwsem);
 615        return freed;
 616}
 617#else /* CONFIG_MEMCG */
 618static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 619                        struct mem_cgroup *memcg, int priority)
 620{
 621        return 0;
 622}
 623#endif /* CONFIG_MEMCG */
 624
 625/**
 626 * shrink_slab - shrink slab caches
 627 * @gfp_mask: allocation context
 628 * @nid: node whose slab caches to target
 629 * @memcg: memory cgroup whose slab caches to target
 630 * @priority: the reclaim priority
 631 *
 632 * Call the shrink functions to age shrinkable caches.
 633 *
 634 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
 635 * unaware shrinkers will receive a node id of 0 instead.
 636 *
 637 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 638 * are called only if it is the root cgroup.
 639 *
  640 * @priority is sc->priority; the number of freeable objects is shifted
  641 * right by @priority to get the scan target.
 642 *
 643 * Returns the number of reclaimed slab objects.
 644 */
 645static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
 646                                 struct mem_cgroup *memcg,
 647                                 int priority)
 648{
 649        unsigned long ret, freed = 0;
 650        struct shrinker *shrinker;
 651
 652        /*
 653         * The root memcg might be allocated even though memcg is disabled
 654         * via "cgroup_disable=memory" boot parameter.  This could make
 655         * mem_cgroup_is_root() return false, then just run memcg slab
 656         * shrink, but skip global shrink.  This may result in premature
 657         * oom.
 658         */
 659        if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
 660                return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
 661
 662        if (!down_read_trylock(&shrinker_rwsem))
 663                goto out;
 664
 665        list_for_each_entry(shrinker, &shrinker_list, list) {
 666                struct shrink_control sc = {
 667                        .gfp_mask = gfp_mask,
 668                        .nid = nid,
 669                        .memcg = memcg,
 670                };
 671
 672                ret = do_shrink_slab(&sc, shrinker, priority);
 673                if (ret == SHRINK_EMPTY)
 674                        ret = 0;
 675                freed += ret;
 676                /*
  677                 * Bail out if someone wants to register a new shrinker to
 678                 * prevent the registration from being stalled for long periods
 679                 * by parallel ongoing shrinking.
 680                 */
 681                if (rwsem_is_contended(&shrinker_rwsem)) {
 682                        freed = freed ? : 1;
 683                        break;
 684                }
 685        }
 686
 687        up_read(&shrinker_rwsem);
 688out:
 689        cond_resched();
 690        return freed;
 691}
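/*
 * Note (added): besides drop_slab_node() below, the main caller is the node
 * reclaim path, which in this kernel invokes
 * shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority) once per
 * memory cgroup being reclaimed.
 */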
 692
 693void drop_slab_node(int nid)
 694{
 695        unsigned long freed;
 696
 697        do {
 698                struct mem_cgroup *memcg = NULL;
 699
 700                if (fatal_signal_pending(current))
 701                        return;
 702
 703                freed = 0;
 704                memcg = mem_cgroup_iter(NULL, NULL, NULL);
 705                do {
 706                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
 707                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
 708        } while (freed > 10);
 709}
 710
 711void drop_slab(void)
 712{
 713        int nid;
 714
 715        for_each_online_node(nid)
 716                drop_slab_node(nid);
 717}
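/*
 * Note (added): drop_slab() services the slab half of
 * "echo 2 > /proc/sys/vm/drop_caches" (and of "echo 3"); see
 * drop_caches_sysctl_handler() in fs/drop_caches.c.
 */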
 718
 719static inline int is_page_cache_freeable(struct page *page)
 720{
 721        /*
 722         * A freeable page cache page is referenced only by the caller
 723         * that isolated the page, the page cache and optional buffer
 724         * heads at page->private.
 725         */
 726        int page_cache_pins = thp_nr_pages(page);
 727        return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
 728}
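/*
 * Worked example (added): a plain 4K page cache page held only by the page
 * cache plus the isolating caller has page_count() == 2 and no private data,
 * so 2 - 0 == 1 + 1 and it is considered freeable; any extra reference
 * (e.g. a GUP pin) makes the test fail.
 */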
 729
 730static int may_write_to_inode(struct inode *inode)
 731{
 732        if (current->flags & PF_SWAPWRITE)
 733                return 1;
 734        if (!inode_write_congested(inode))
 735                return 1;
 736        if (inode_to_bdi(inode) == current->backing_dev_info)
 737                return 1;
 738        return 0;
 739}
 740
 741/*
 742 * We detected a synchronous write error writing a page out.  Probably
 743 * -ENOSPC.  We need to propagate that into the address_space for a subsequent
 744 * fsync(), msync() or close().
 745 *
 746 * The tricky part is that after writepage we cannot touch the mapping: nothing
 747 * prevents it from being freed up.  But we have a ref on the page and once
 748 * that page is locked, the mapping is pinned.
 749 *
 750 * We're allowed to run sleeping lock_page() here because we know the caller has
 751 * __GFP_FS.
 752 */
 753static void handle_write_error(struct address_space *mapping,
 754                                struct page *page, int error)
 755{
 756        lock_page(page);
 757        if (page_mapping(page) == mapping)
 758                mapping_set_error(mapping, error);
 759        unlock_page(page);
 760}
 761
 762/* possible outcome of pageout() */
 763typedef enum {
 764        /* failed to write page out, page is locked */
 765        PAGE_KEEP,
 766        /* move page to the active list, page is locked */
 767        PAGE_ACTIVATE,
 768        /* page has been sent to the disk successfully, page is unlocked */
 769        PAGE_SUCCESS,
 770        /* page is clean and locked */
 771        PAGE_CLEAN,
 772} pageout_t;
 773
 774/*
 775 * pageout is called by shrink_page_list() for each dirty page.
 776 * Calls ->writepage().
 777 */
 778static pageout_t pageout(struct page *page, struct address_space *mapping)
 779{
 780        /*
 781         * If the page is dirty, only perform writeback if that write
   782         * will be non-blocking, to prevent this allocation from being
 783         * stalled by pagecache activity.  But note that there may be
 784         * stalls if we need to run get_block().  We could test
 785         * PagePrivate for that.
 786         *
 787         * If this process is currently in __generic_file_write_iter() against
 788         * this page's queue, we can perform writeback even if that
 789         * will block.
 790         *
 791         * If the page is swapcache, write it back even if that would
 792         * block, for some throttling. This happens by accident, because
 793         * swap_backing_dev_info is bust: it doesn't reflect the
 794         * congestion state of the swapdevs.  Easy to fix, if needed.
 795         */
 796        if (!is_page_cache_freeable(page))
 797                return PAGE_KEEP;
 798        if (!mapping) {
 799                /*
 800                 * Some data journaling orphaned pages can have
 801                 * page->mapping == NULL while being dirty with clean buffers.
 802                 */
 803                if (page_has_private(page)) {
 804                        if (try_to_free_buffers(page)) {
 805                                ClearPageDirty(page);
 806                                pr_info("%s: orphaned page\n", __func__);
 807                                return PAGE_CLEAN;
 808                        }
 809                }
 810                return PAGE_KEEP;
 811        }
 812        if (mapping->a_ops->writepage == NULL)
 813                return PAGE_ACTIVATE;
 814        if (!may_write_to_inode(mapping->host))
 815                return PAGE_KEEP;
 816
 817        if (clear_page_dirty_for_io(page)) {
 818                int res;
 819                struct writeback_control wbc = {
 820                        .sync_mode = WB_SYNC_NONE,
 821                        .nr_to_write = SWAP_CLUSTER_MAX,
 822                        .range_start = 0,
 823                        .range_end = LLONG_MAX,
 824                        .for_reclaim = 1,
 825                };
 826
 827                SetPageReclaim(page);
 828                res = mapping->a_ops->writepage(page, &wbc);
 829                if (res < 0)
 830                        handle_write_error(mapping, page, res);
 831                if (res == AOP_WRITEPAGE_ACTIVATE) {
 832                        ClearPageReclaim(page);
 833                        return PAGE_ACTIVATE;
 834                }
 835
 836                if (!PageWriteback(page)) {
 837                        /* synchronous write or broken a_ops? */
 838                        ClearPageReclaim(page);
 839                }
 840                trace_mm_vmscan_writepage(page);
 841                inc_node_page_state(page, NR_VMSCAN_WRITE);
 842                return PAGE_SUCCESS;
 843        }
 844
 845        return PAGE_CLEAN;
 846}
 847
 848/*
 849 * Same as remove_mapping, but if the page is removed from the mapping, it
 850 * gets returned with a refcount of 0.
 851 */
 852static int __remove_mapping(struct address_space *mapping, struct page *page,
 853                            bool reclaimed, struct mem_cgroup *target_memcg)
 854{
 855        unsigned long flags;
 856        int refcount;
 857        void *shadow = NULL;
 858
 859        BUG_ON(!PageLocked(page));
 860        BUG_ON(mapping != page_mapping(page));
 861
 862        xa_lock_irqsave(&mapping->i_pages, flags);
 863        /*
   864         * The non-racy check for a busy page.
 865         *
 866         * Must be careful with the order of the tests. When someone has
 867         * a ref to the page, it may be possible that they dirty it then
 868         * drop the reference. So if PageDirty is tested before page_count
 869         * here, then the following race may occur:
 870         *
 871         * get_user_pages(&page);
 872         * [user mapping goes away]
 873         * write_to(page);
 874         *                              !PageDirty(page)    [good]
 875         * SetPageDirty(page);
 876         * put_page(page);
 877         *                              !page_count(page)   [good, discard it]
 878         *
 879         * [oops, our write_to data is lost]
 880         *
 881         * Reversing the order of the tests ensures such a situation cannot
 882         * escape unnoticed. The smp_rmb is needed to ensure the page->flags
 883         * load is not satisfied before that of page->_refcount.
 884         *
 885         * Note that if SetPageDirty is always performed via set_page_dirty,
 886         * and thus under the i_pages lock, then this ordering is not required.
 887         */
 888        refcount = 1 + compound_nr(page);
 889        if (!page_ref_freeze(page, refcount))
 890                goto cannot_free;
 891        /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
 892        if (unlikely(PageDirty(page))) {
 893                page_ref_unfreeze(page, refcount);
 894                goto cannot_free;
 895        }
 896
 897        if (PageSwapCache(page)) {
 898                swp_entry_t swap = { .val = page_private(page) };
 899                mem_cgroup_swapout(page, swap);
 900                if (reclaimed && !mapping_exiting(mapping))
 901                        shadow = workingset_eviction(page, target_memcg);
 902                __delete_from_swap_cache(page, swap, shadow);
 903                xa_unlock_irqrestore(&mapping->i_pages, flags);
 904                put_swap_page(page, swap);
 905        } else {
 906                void (*freepage)(struct page *);
 907
 908                freepage = mapping->a_ops->freepage;
 909                /*
 910                 * Remember a shadow entry for reclaimed file cache in
 911                 * order to detect refaults, thus thrashing, later on.
 912                 *
 913                 * But don't store shadows in an address space that is
 914                 * already exiting.  This is not just an optimization,
 915                 * inode reclaim needs to empty out the radix tree or
 916                 * the nodes are lost.  Don't plant shadows behind its
 917                 * back.
 918                 *
 919                 * We also don't store shadows for DAX mappings because the
 920                 * only page cache pages found in these are zero pages
 921                 * covering holes, and because we don't want to mix DAX
 922                 * exceptional entries and shadow exceptional entries in the
 923                 * same address_space.
 924                 */
 925                if (reclaimed && page_is_file_lru(page) &&
 926                    !mapping_exiting(mapping) && !dax_mapping(mapping))
 927                        shadow = workingset_eviction(page, target_memcg);
 928                __delete_from_page_cache(page, shadow);
 929                xa_unlock_irqrestore(&mapping->i_pages, flags);
 930
 931                if (freepage != NULL)
 932                        freepage(page);
 933        }
 934
 935        return 1;
 936
 937cannot_free:
 938        xa_unlock_irqrestore(&mapping->i_pages, flags);
 939        return 0;
 940}
 941
 942/*
 943 * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
 944 * someone else has a ref on the page, abort and return 0.  If it was
 945 * successfully detached, return 1.  Assumes the caller has a single ref on
 946 * this page.
 947 */
 948int remove_mapping(struct address_space *mapping, struct page *page)
 949{
 950        if (__remove_mapping(mapping, page, false, NULL)) {
 951                /*
 952                 * Unfreezing the refcount with 1 rather than 2 effectively
 953                 * drops the pagecache ref for us without requiring another
 954                 * atomic operation.
 955                 */
 956                page_ref_unfreeze(page, 1);
 957                return 1;
 958        }
 959        return 0;
 960}
 961
 962/**
 963 * putback_lru_page - put previously isolated page onto appropriate LRU list
 964 * @page: page to be put back to appropriate lru list
 965 *
 966 * Add previously isolated @page to appropriate LRU list.
 967 * Page may still be unevictable for other reasons.
 968 *
 969 * lru_lock must not be held, interrupts must be enabled.
 970 */
 971void putback_lru_page(struct page *page)
 972{
 973        lru_cache_add(page);
 974        put_page(page);         /* drop ref from isolate */
 975}
 976
 977enum page_references {
 978        PAGEREF_RECLAIM,
 979        PAGEREF_RECLAIM_CLEAN,
 980        PAGEREF_KEEP,
 981        PAGEREF_ACTIVATE,
 982};
 983
 984static enum page_references page_check_references(struct page *page,
 985                                                  struct scan_control *sc)
 986{
 987        int referenced_ptes, referenced_page;
 988        unsigned long vm_flags;
 989
 990        referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
 991                                          &vm_flags);
 992        referenced_page = TestClearPageReferenced(page);
 993
 994        /*
 995         * Mlock lost the isolation race with us.  Let try_to_unmap()
 996         * move the page to the unevictable list.
 997         */
 998        if (vm_flags & VM_LOCKED)
 999                return PAGEREF_RECLAIM;
1000
1001        if (referenced_ptes) {
1002                /*
1003                 * All mapped pages start out with page table
1004                 * references from the instantiating fault, so we need
1005                 * to look twice if a mapped file page is used more
1006                 * than once.
1007                 *
1008                 * Mark it and spare it for another trip around the
1009                 * inactive list.  Another page table reference will
1010                 * lead to its activation.
1011                 *
1012                 * Note: the mark is set for activated pages as well
1013                 * so that recently deactivated but used pages are
1014                 * quickly recovered.
1015                 */
1016                SetPageReferenced(page);
1017
1018                if (referenced_page || referenced_ptes > 1)
1019                        return PAGEREF_ACTIVATE;
1020
1021                /*
1022                 * Activate file-backed executable pages after first usage.
1023                 */
1024                if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
1025                        return PAGEREF_ACTIVATE;
1026
1027                return PAGEREF_KEEP;
1028        }
1029
1030        /* Reclaim if clean, defer dirty pages to writeback */
1031        if (referenced_page && !PageSwapBacked(page))
1032                return PAGEREF_RECLAIM_CLEAN;
1033
1034        return PAGEREF_RECLAIM;
1035}
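/*
 * Summary of the decision above (added for clarity):
 *
 *      mapped into a VM_LOCKED vma            -> PAGEREF_RECLAIM (unevictable)
 *      pte ref plus PG_referenced or >1 refs  -> PAGEREF_ACTIVATE
 *      pte ref on an executable file page     -> PAGEREF_ACTIVATE
 *      any other pte ref                      -> PAGEREF_KEEP
 *      PG_referenced only, not swap backed    -> PAGEREF_RECLAIM_CLEAN
 *      everything else                        -> PAGEREF_RECLAIM
 */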
1036
1037/* Check if a page is dirty or under writeback */
1038static void page_check_dirty_writeback(struct page *page,
1039                                       bool *dirty, bool *writeback)
1040{
1041        struct address_space *mapping;
1042
1043        /*
1044         * Anonymous pages are not handled by flushers and must be written
1045         * from reclaim context. Do not stall reclaim based on them
1046         */
1047        if (!page_is_file_lru(page) ||
1048            (PageAnon(page) && !PageSwapBacked(page))) {
1049                *dirty = false;
1050                *writeback = false;
1051                return;
1052        }
1053
1054        /* By default assume that the page flags are accurate */
1055        *dirty = PageDirty(page);
1056        *writeback = PageWriteback(page);
1057
1058        /* Verify dirty/writeback state if the filesystem supports it */
1059        if (!page_has_private(page))
1060                return;
1061
1062        mapping = page_mapping(page);
1063        if (mapping && mapping->a_ops->is_dirty_writeback)
1064                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
1065}
1066
1067/*
1068 * shrink_page_list() returns the number of reclaimed pages
1069 */
1070static unsigned int shrink_page_list(struct list_head *page_list,
1071                                     struct pglist_data *pgdat,
1072                                     struct scan_control *sc,
1073                                     struct reclaim_stat *stat,
1074                                     bool ignore_references)
1075{
1076        LIST_HEAD(ret_pages);
1077        LIST_HEAD(free_pages);
1078        unsigned int nr_reclaimed = 0;
1079        unsigned int pgactivate = 0;
1080
1081        memset(stat, 0, sizeof(*stat));
1082        cond_resched();
1083
1084        while (!list_empty(page_list)) {
1085                struct address_space *mapping;
1086                struct page *page;
1087                enum page_references references = PAGEREF_RECLAIM;
1088                bool dirty, writeback, may_enter_fs;
1089                unsigned int nr_pages;
1090
1091                cond_resched();
1092
1093                page = lru_to_page(page_list);
1094                list_del(&page->lru);
1095
1096                if (!trylock_page(page))
1097                        goto keep;
1098
1099                VM_BUG_ON_PAGE(PageActive(page), page);
1100
1101                nr_pages = compound_nr(page);
1102
1103                /* Account the number of base pages even though THP */
1104                sc->nr_scanned += nr_pages;
1105
1106                if (unlikely(!page_evictable(page)))
1107                        goto activate_locked;
1108
1109                if (!sc->may_unmap && page_mapped(page))
1110                        goto keep_locked;
1111
1112                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
1113                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
1114
1115                /*
1116                 * The number of dirty pages determines if a node is marked
1117                 * reclaim_congested which affects wait_iff_congested. kswapd
1118                 * will stall and start writing pages if the tail of the LRU
1119                 * is all dirty unqueued pages.
1120                 */
1121                page_check_dirty_writeback(page, &dirty, &writeback);
1122                if (dirty || writeback)
1123                        stat->nr_dirty++;
1124
1125                if (dirty && !writeback)
1126                        stat->nr_unqueued_dirty++;
1127
1128                /*
1129                 * Treat this page as congested if the underlying BDI is or if
1130                 * pages are cycling through the LRU so quickly that the
1131                 * pages marked for immediate reclaim are making it to the
1132                 * end of the LRU a second time.
1133                 */
1134                mapping = page_mapping(page);
1135                if (((dirty || writeback) && mapping &&
1136                     inode_write_congested(mapping->host)) ||
1137                    (writeback && PageReclaim(page)))
1138                        stat->nr_congested++;
1139
1140                /*
1141                 * If a page at the tail of the LRU is under writeback, there
1142                 * are three cases to consider.
1143                 *
1144                 * 1) If reclaim is encountering an excessive number of pages
1145                 *    under writeback and this page is both under writeback and
1146                 *    PageReclaim then it indicates that pages are being queued
1147                 *    for IO but are being recycled through the LRU before the
1148                 *    IO can complete. Waiting on the page itself risks an
1149                 *    indefinite stall if it is impossible to writeback the
1150                 *    page due to IO error or disconnected storage so instead
1151                 *    note that the LRU is being scanned too quickly and the
1152                 *    caller can stall after page list has been processed.
1153                 *
1154                 * 2) Global or new memcg reclaim encounters a page that is
1155                 *    not marked for immediate reclaim, or the caller does not
1156                 *    have __GFP_FS (or __GFP_IO if it's simply going to swap,
1157                 *    not to fs). In this case mark the page for immediate
1158                 *    reclaim and continue scanning.
1159                 *
1160                 *    Require may_enter_fs because we would wait on fs, which
1161                 *    may not have submitted IO yet. And the loop driver might
 1162                 *    enter reclaim, and deadlock if it waits on a page for
 1163                 *    which it needs to do the write (loop masks off
1164                 *    __GFP_IO|__GFP_FS for this reason); but more thought
1165                 *    would probably show more reasons.
1166                 *
1167                 * 3) Legacy memcg encounters a page that is already marked
1168                 *    PageReclaim. memcg does not have any dirty pages
1169                 *    throttling so we could easily OOM just because too many
1170                 *    pages are in writeback and there is nothing else to
1171                 *    reclaim. Wait for the writeback to complete.
1172                 *
1173                 * In cases 1) and 2) we activate the pages to get them out of
1174                 * the way while we continue scanning for clean pages on the
1175                 * inactive list and refilling from the active list. The
1176                 * observation here is that waiting for disk writes is more
1177                 * expensive than potentially causing reloads down the line.
1178                 * Since they're marked for immediate reclaim, they won't put
1179                 * memory pressure on the cache working set any longer than it
1180                 * takes to write them to disk.
1181                 */
1182                if (PageWriteback(page)) {
1183                        /* Case 1 above */
1184                        if (current_is_kswapd() &&
1185                            PageReclaim(page) &&
1186                            test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
1187                                stat->nr_immediate++;
1188                                goto activate_locked;
1189
1190                        /* Case 2 above */
1191                        } else if (writeback_throttling_sane(sc) ||
1192                            !PageReclaim(page) || !may_enter_fs) {
1193                                /*
1194                                 * This is slightly racy - end_page_writeback()
1195                                 * might have just cleared PageReclaim, then
 1196                                 * setting PageReclaim here ends up interpreted
1197                                 * as PageReadahead - but that does not matter
1198                                 * enough to care.  What we do want is for this
1199                                 * page to have PageReclaim set next time memcg
1200                                 * reclaim reaches the tests above, so it will
1201                                 * then wait_on_page_writeback() to avoid OOM;
1202                                 * and it's also appropriate in global reclaim.
1203                                 */
1204                                SetPageReclaim(page);
1205                                stat->nr_writeback++;
1206                                goto activate_locked;
1207
1208                        /* Case 3 above */
1209                        } else {
1210                                unlock_page(page);
1211                                wait_on_page_writeback(page);
1212                                /* then go back and try same page again */
1213                                list_add_tail(&page->lru, page_list);
1214                                continue;
1215                        }
1216                }
1217
1218                if (!ignore_references)
1219                        references = page_check_references(page, sc);
1220
1221                switch (references) {
1222                case PAGEREF_ACTIVATE:
1223                        goto activate_locked;
1224                case PAGEREF_KEEP:
1225                        stat->nr_ref_keep += nr_pages;
1226                        goto keep_locked;
1227                case PAGEREF_RECLAIM:
1228                case PAGEREF_RECLAIM_CLEAN:
1229                        ; /* try to reclaim the page below */
1230                }
1231
1232                /*
1233                 * Anonymous process memory has backing store?
1234                 * Try to allocate it some swap space here.
1235                 * Lazyfree page could be freed directly
1236                 */
1237                if (PageAnon(page) && PageSwapBacked(page)) {
1238                        if (!PageSwapCache(page)) {
1239                                if (!(sc->gfp_mask & __GFP_IO))
1240                                        goto keep_locked;
1241                                if (page_maybe_dma_pinned(page))
1242                                        goto keep_locked;
1243                                if (PageTransHuge(page)) {
1244                                        /* cannot split THP, skip it */
1245                                        if (!can_split_huge_page(page, NULL))
1246                                                goto activate_locked;
1247                                        /*
1248                                         * Split pages without a PMD map right
1249                                         * away. Chances are some or all of the
1250                                         * tail pages can be freed without IO.
1251                                         */
1252                                        if (!compound_mapcount(page) &&
1253                                            split_huge_page_to_list(page,
1254                                                                    page_list))
1255                                                goto activate_locked;
1256                                }
1257                                if (!add_to_swap(page)) {
1258                                        if (!PageTransHuge(page))
1259                                                goto activate_locked_split;
1260                                        /* Fallback to swap normal pages */
1261                                        if (split_huge_page_to_list(page,
1262                                                                    page_list))
1263                                                goto activate_locked;
1264#ifdef CONFIG_TRANSPARENT_HUGEPAGE
1265                                        count_vm_event(THP_SWPOUT_FALLBACK);
1266#endif
1267                                        if (!add_to_swap(page))
1268                                                goto activate_locked_split;
1269                                }
1270
1271                                may_enter_fs = true;
1272
1273                                /* Adding to swap updated mapping */
1274                                mapping = page_mapping(page);
1275                        }
1276                } else if (unlikely(PageTransHuge(page))) {
1277                        /* Split file THP */
1278                        if (split_huge_page_to_list(page, page_list))
1279                                goto keep_locked;
1280                }
1281
1282                /*
 1283                 * THP may get split above; if so, subtract the tail pages
 1284                 * and update nr_pages to avoid accounting tail pages twice.
 1285                 *
 1286                 * Tail pages that were successfully added to the swap
 1287                 * cache reach here.
1288                 */
1289                if ((nr_pages > 1) && !PageTransHuge(page)) {
1290                        sc->nr_scanned -= (nr_pages - 1);
1291                        nr_pages = 1;
1292                }
1293
1294                /*
1295                 * The page is mapped into the page tables of one or more
1296                 * processes. Try to unmap it here.
1297                 */
1298                if (page_mapped(page)) {
1299                        enum ttu_flags flags = TTU_BATCH_FLUSH;
1300                        bool was_swapbacked = PageSwapBacked(page);
1301
1302                        if (unlikely(PageTransHuge(page)))
1303                                flags |= TTU_SPLIT_HUGE_PMD;
1304
1305                        if (!try_to_unmap(page, flags)) {
1306                                stat->nr_unmap_fail += nr_pages;
1307                                if (!was_swapbacked && PageSwapBacked(page))
1308                                        stat->nr_lazyfree_fail += nr_pages;
1309                                goto activate_locked;
1310                        }
1311                }
1312
1313                if (PageDirty(page)) {
1314                        /*
1315                         * Only kswapd can writeback filesystem pages
1316                         * to avoid risk of stack overflow. But avoid
1317                         * injecting inefficient single-page IO into
1318                         * flusher writeback as much as possible: only
1319                         * write pages when we've encountered many
1320                         * dirty pages, and when we've already scanned
1321                         * the rest of the LRU for clean pages and see
1322                         * the same dirty pages again (PageReclaim).
1323                         */
1324                        if (page_is_file_lru(page) &&
1325                            (!current_is_kswapd() || !PageReclaim(page) ||
1326                             !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
1327                                /*
1328                                 * Immediately reclaim when written back.
 1329                         * Similar in principle to deactivate_page()
1330                                 * except we already have the page isolated
1331                                 * and know it's dirty
1332                                 */
1333                                inc_node_page_state(page, NR_VMSCAN_IMMEDIATE);
1334                                SetPageReclaim(page);
1335
1336                                goto activate_locked;
1337                        }
1338
1339                        if (references == PAGEREF_RECLAIM_CLEAN)
1340                                goto keep_locked;
1341                        if (!may_enter_fs)
1342                                goto keep_locked;
1343                        if (!sc->may_writepage)
1344                                goto keep_locked;
1345
1346                        /*
1347                         * Page is dirty. Flush the TLB if a writable entry
1348                         * potentially exists to avoid CPU writes after IO
1349                         * starts and then write it out here.
1350                         */
1351                        try_to_unmap_flush_dirty();
1352                        switch (pageout(page, mapping)) {
1353                        case PAGE_KEEP:
1354                                goto keep_locked;
1355                        case PAGE_ACTIVATE:
1356                                goto activate_locked;
1357                        case PAGE_SUCCESS:
1358                                stat->nr_pageout += thp_nr_pages(page);
1359
1360                                if (PageWriteback(page))
1361                                        goto keep;
1362                                if (PageDirty(page))
1363                                        goto keep;
1364
1365                                /*
1366                                 * A synchronous write - probably a ramdisk.  Go
1367                                 * ahead and try to reclaim the page.
1368                                 */
1369                                if (!trylock_page(page))
1370                                        goto keep;
1371                                if (PageDirty(page) || PageWriteback(page))
1372                                        goto keep_locked;
1373                                mapping = page_mapping(page);
1374                                fallthrough;
1375                        case PAGE_CLEAN:
1376                                ; /* try to free the page below */
1377                        }
1378                }
1379
1380                /*
1381                 * If the page has buffers, try to free the buffer mappings
1382                 * associated with this page. If we succeed we try to free
1383                 * the page as well.
1384                 *
1385                 * We do this even if the page is PageDirty().
1386                 * try_to_release_page() does not perform I/O, but it is
1387                 * possible for a page to have PageDirty set while it is
1388                 * actually clean (all its buffers are clean).  This happens if the
1389                 * buffers were written out directly, with submit_bh(). ext3
1390                 * will do this, as well as the blockdev mapping.
1391                 * try_to_release_page() will discover that cleanness and will
1392                 * drop the buffers and mark the page clean - it can be freed.
1393                 *
1394                 * Rarely, pages can have buffers and no ->mapping.  These are
1395                 * the pages which were not successfully invalidated in
1396                 * truncate_cleanup_page().  We try to drop those buffers here
1397                 * and if that worked, and the page is no longer mapped into
1398                 * process address space (page_count == 1) it can be freed.
1399                 * Otherwise, leave the page on the LRU so it is swappable.
1400                 */
1401                if (page_has_private(page)) {
1402                        if (!try_to_release_page(page, sc->gfp_mask))
1403                                goto activate_locked;
1404                        if (!mapping && page_count(page) == 1) {
1405                                unlock_page(page);
1406                                if (put_page_testzero(page))
1407                                        goto free_it;
1408                                else {
1409                                        /*
1410                                         * Rare race with a speculative reference:
1411                                         * the speculative reference will free
1412                                         * this page shortly, so we may
1413                                         * increment nr_reclaimed here (and
1414                                         * leave it off the LRU).
1415                                         */
1416                                        nr_reclaimed++;
1417                                        continue;
1418                                }
1419                        }
1420                }
1421
1422                if (PageAnon(page) && !PageSwapBacked(page)) {
1423                        /* follow __remove_mapping for reference */
1424                        if (!page_ref_freeze(page, 1))
1425                                goto keep_locked;
1426                        if (PageDirty(page)) {
1427                                page_ref_unfreeze(page, 1);
1428                                goto keep_locked;
1429                        }
1430
1431                        count_vm_event(PGLAZYFREED);
1432                        count_memcg_page_event(page, PGLAZYFREED);
1433                } else if (!mapping || !__remove_mapping(mapping, page, true,
1434                                                         sc->target_mem_cgroup))
1435                        goto keep_locked;
1436
1437                unlock_page(page);
1438free_it:
1439                /*
1440                 * A THP may get swapped out as a whole, so account
1441                 * all of its base pages.
1442                 */
1443                nr_reclaimed += nr_pages;
1444
1445                /*
1446                 * Is there a need to periodically free the free_pages list? It
1447                 * would appear not, as the counts should be low.
1448                 */
1449                if (unlikely(PageTransHuge(page)))
1450                        destroy_compound_page(page);
1451                else
1452                        list_add(&page->lru, &free_pages);
1453                continue;
1454
1455activate_locked_split:
1456                /*
1457                 * Tail pages that failed to be added to the swap cache
1458                 * reach here.  Fix up nr_scanned and nr_pages.
1459                 */
1460                if (nr_pages > 1) {
1461                        sc->nr_scanned -= (nr_pages - 1);
1462                        nr_pages = 1;
1463                }
1464activate_locked:
1465                /* Not a candidate for swapping, so reclaim swap space. */
1466                if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
1467                                                PageMlocked(page)))
1468                        try_to_free_swap(page);
1469                VM_BUG_ON_PAGE(PageActive(page), page);
1470                if (!PageMlocked(page)) {
1471                        int type = page_is_file_lru(page);
1472                        SetPageActive(page);
1473                        stat->nr_activate[type] += nr_pages;
1474                        count_memcg_page_event(page, PGACTIVATE);
1475                }
1476keep_locked:
1477                unlock_page(page);
1478keep:
1479                list_add(&page->lru, &ret_pages);
1480                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
1481        }
1482
1483        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
1484
1485        mem_cgroup_uncharge_list(&free_pages);
1486        try_to_unmap_flush();
1487        free_unref_page_list(&free_pages);
1488
1489        list_splice(&ret_pages, page_list);
1490        count_vm_events(PGACTIVATE, pgactivate);
1491
1492        return nr_reclaimed;
1493}
1494
1495unsigned int reclaim_clean_pages_from_list(struct zone *zone,
1496                                            struct list_head *page_list)
1497{
1498        struct scan_control sc = {
1499                .gfp_mask = GFP_KERNEL,
1500                .priority = DEF_PRIORITY,
1501                .may_unmap = 1,
1502        };
1503        struct reclaim_stat stat;
1504        unsigned int nr_reclaimed;
1505        struct page *page, *next;
1506        LIST_HEAD(clean_pages);
1507
1508        list_for_each_entry_safe(page, next, page_list, lru) {
1509                if (page_is_file_lru(page) && !PageDirty(page) &&
1510                    !__PageMovable(page) && !PageUnevictable(page)) {
1511                        ClearPageActive(page);
1512                        list_move(&page->lru, &clean_pages);
1513                }
1514        }
1515
1516        nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
1517                                        &stat, true);
1518        list_splice(&clean_pages, page_list);
1519        mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1520                            -(long)nr_reclaimed);
1521        /*
1522         * Since lazyfree pages are isolated from the file LRU from the beginning,
1523         * they will rotate back to the anonymous LRU in the end if discarding
1524         * fails, so the isolated counts will be mismatched.
1525         * Compensate the isolated count for both LRU lists.
1526         */
1527        mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
1528                            stat.nr_lazyfree_fail);
1529        mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
1530                            -(long)stat.nr_lazyfree_fail);
1531        return nr_reclaimed;
1532}
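
/*
 * Editor's note: a minimal standalone sketch (userspace, not part of
 * vmscan.c) of the NR_ISOLATED_* compensation above, using made-up
 * counter values.  Freed pages leave the file-isolated count, while
 * lazyfree pages whose discard failed are re-accounted as isolated
 * anon rather than isolated file.
 */
#include <stdio.h>

int main(void)
{
        /* Hypothetical batch: 32 clean file pages isolated by the caller. */
        long isolated_file = 32, isolated_anon = 0;
        long nr_reclaimed = 24, nr_lazyfree_fail = 5;

        isolated_file -= nr_reclaimed;       /* reclaimed pages drop off      */
        isolated_anon += nr_lazyfree_fail;   /* failed lazyfree is anon now   */
        isolated_file -= nr_lazyfree_fail;   /* ...and no longer file         */

        printf("NR_ISOLATED_FILE=%ld NR_ISOLATED_ANON=%ld\n",
               isolated_file, isolated_anon);
        return 0;
}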
1533
1534/*
1535 * Check whether the specified page can be isolated from its LRU list for
1536 * the given isolation mode.  Pages which are being freed elsewhere are
1537 * also rejected.
1538 *
1539 * page:        page to consider
1540 * mode:        one of the LRU isolation modes defined above
1541 *
1542 * returns 0 on success, -ve errno on failure.
1543 */
1544int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
1545{
1546        int ret = -EBUSY;
1547
1548        /* Only take pages on the LRU. */
1549        if (!PageLRU(page))
1550                return ret;
1551
1552        /* Compaction should not handle unevictable pages but CMA can do so */
1553        if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
1554                return ret;
1555
1556        /*
1557         * To minimise LRU disruption, the caller can indicate that it only
1558         * wants to isolate pages it will be able to operate on without
1559         * blocking - clean pages for the most part.
1560         *
1561                 * ISOLATE_ASYNC_MIGRATE is used to indicate that it only wants pages
1562                 * that can be migrated without blocking.
1563         */
1564        if (mode & ISOLATE_ASYNC_MIGRATE) {
1565                /* All the caller can do on PageWriteback is block */
1566                if (PageWriteback(page))
1567                        return ret;
1568
1569                if (PageDirty(page)) {
1570                        struct address_space *mapping;
1571                        bool migrate_dirty;
1572
1573                        /*
1574                         * Only pages without mappings or that have a
1575                         * ->migratepage callback are possible to migrate
1576                         * without blocking. However, we can be racing with
1577                         * truncation so it's necessary to lock the page
1578                         * to stabilise the mapping as truncation holds
1579                         * the page lock until after the page is removed
1580                         * from the page cache.
1581                         */
1582                        if (!trylock_page(page))
1583                                return ret;
1584
1585                        mapping = page_mapping(page);
1586                        migrate_dirty = !mapping || mapping->a_ops->migratepage;
1587                        unlock_page(page);
1588                        if (!migrate_dirty)
1589                                return ret;
1590                }
1591        }
1592
1593        if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
1594                return ret;
1595
1596        return 0;
1597}
1598
1599/*
1600 * Update LRU sizes after isolating pages. The LRU size updates must
1601 * be complete before mem_cgroup_update_lru_size due to a sanity check.
1602 */
1603static __always_inline void update_lru_sizes(struct lruvec *lruvec,
1604                        enum lru_list lru, unsigned long *nr_zone_taken)
1605{
1606        int zid;
1607
1608        for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1609                if (!nr_zone_taken[zid])
1610                        continue;
1611
1612                update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
1613        }
1614
1615}
1616
1617/**
1618 * Isolate up to nr_to_scan pages from the lruvec and move them to the @dst list.
1619 *
1620 * lruvec->lru_lock is heavily contended.  Some of the functions that
1621 * shrink the lists perform better by taking out a batch of pages
1622 * and working on them outside the LRU lock.
1623 *
1624 * For pagecache intensive workloads, this function is the hottest
1625 * spot in the kernel (apart from copy_*_user functions).
1626 *
1627 * lruvec->lru_lock must be held before calling this function.
1628 *
1629 * @nr_to_scan: The number of eligible pages to look through on the list.
1630 * @lruvec:     The LRU vector to pull pages from.
1631 * @dst:        The temp list to put pages on to.
1632 * @nr_scanned: The number of pages that were scanned.
1633 * @sc:         The scan_control struct for this reclaim session
1634 * @lru:        LRU list id for isolating
1635 *
1636 * returns how many pages were moved onto *@dst.
1637 */
1638static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
1639                struct lruvec *lruvec, struct list_head *dst,
1640                unsigned long *nr_scanned, struct scan_control *sc,
1641                enum lru_list lru)
1642{
1643        struct list_head *src = &lruvec->lists[lru];
1644        unsigned long nr_taken = 0;
1645        unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
1646        unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
1647        unsigned long skipped = 0;
1648        unsigned long scan, total_scan, nr_pages;
1649        LIST_HEAD(pages_skipped);
1650        isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
1651
1652        total_scan = 0;
1653        scan = 0;
1654        while (scan < nr_to_scan && !list_empty(src)) {
1655                struct page *page;
1656
1657                page = lru_to_page(src);
1658                prefetchw_prev_lru_page(page, src, flags);
1659
1660                nr_pages = compound_nr(page);
1661                total_scan += nr_pages;
1662
1663                if (page_zonenum(page) > sc->reclaim_idx) {
1664                        list_move(&page->lru, &pages_skipped);
1665                        nr_skipped[page_zonenum(page)] += nr_pages;
1666                        continue;
1667                }
1668
1669                /*
1670                 * Do not count skipped pages because that makes the function
1671                 * return with no isolated pages if the LRU mostly contains
1672                 * ineligible pages.  This causes the VM to not reclaim any
1673                 * pages, triggering a premature OOM.
1674                 *
1675                 * Account all tail pages of THP.  This would not cause
1676                 * premature OOM since __isolate_lru_page_prepare() returns -EBUSY
1677                 * only when the page is being freed somewhere else.
1678                 */
1679                scan += nr_pages;
1680                switch (__isolate_lru_page_prepare(page, mode)) {
1681                case 0:
1682                        /*
1683                         * Be careful not to clear PageLRU until after we're
1684                         * sure the page is not being freed elsewhere -- the
1685                         * page release code relies on it.
1686                         */
1687                        if (unlikely(!get_page_unless_zero(page)))
1688                                goto busy;
1689
1690                        if (!TestClearPageLRU(page)) {
1691                                /*
1692                                 * This page may be in another isolation path,
1693                                 * but we still hold the lru_lock.
1694                                 */
1695                                put_page(page);
1696                                goto busy;
1697                        }
1698
1699                        nr_taken += nr_pages;
1700                        nr_zone_taken[page_zonenum(page)] += nr_pages;
1701                        list_move(&page->lru, dst);
1702                        break;
1703
1704                default:
1705busy:
1706                        /* else it is being freed elsewhere */
1707                        list_move(&page->lru, src);
1708                }
1709        }
1710
1711        /*
1712         * Splice any skipped pages to the start of the LRU list. Note that
1713         * this disrupts the LRU order when reclaiming for lower zones but
1714         * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
1715         * scanning would soon rescan the same pages to skip and put the
1716         * system at risk of premature OOM.
1717         */
1718        if (!list_empty(&pages_skipped)) {
1719                int zid;
1720
1721                list_splice(&pages_skipped, src);
1722                for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1723                        if (!nr_skipped[zid])
1724                                continue;
1725
1726                        __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
1727                        skipped += nr_skipped[zid];
1728                }
1729        }
1730        *nr_scanned = total_scan;
1731        trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
1732                                    total_scan, skipped, nr_taken, mode, lru);
1733        update_lru_sizes(lruvec, lru, nr_zone_taken);
1734        return nr_taken;
1735}
1736
1737/**
1738 * isolate_lru_page - tries to isolate a page from its LRU list
1739 * @page: page to isolate from its LRU list
1740 *
1741 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
1742 * vmstat statistic corresponding to whatever LRU list the page was on.
1743 *
1744 * Returns 0 if the page was removed from an LRU list.
1745 * Returns -EBUSY if the page was not on an LRU list.
1746 *
1747 * The returned page will have PageLRU() cleared.  If it was found on
1748 * the active list, it will have PageActive set.  If it was found on
1749 * the unevictable list, it will have the PageUnevictable bit set. That flag
1750 * may need to be cleared by the caller before letting the page go.
1751 *
1752 * The vmstat statistic corresponding to the list on which the page was
1753 * found will be decremented.
1754 *
1755 * Restrictions:
1756 *
1757 * (1) Must be called with an elevated refcount on the page. This is a
1758 *     fundamental difference from isolate_lru_pages (which is called
1759 *     without a stable reference).
1760 * (2) the lru_lock must not be held.
1761 * (3) interrupts must be enabled.
1762 */
1763int isolate_lru_page(struct page *page)
1764{
1765        int ret = -EBUSY;
1766
1767        VM_BUG_ON_PAGE(!page_count(page), page);
1768        WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
1769
1770        if (TestClearPageLRU(page)) {
1771                struct lruvec *lruvec;
1772
1773                get_page(page);
1774                lruvec = lock_page_lruvec_irq(page);
1775                del_page_from_lru_list(page, lruvec, page_lru(page));
1776                unlock_page_lruvec_irq(lruvec);
1777                ret = 0;
1778        }
1779
1780        return ret;
1781}
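
/*
 * Editor's note: a minimal usage sketch for a hypothetical caller,
 * following the restrictions documented above.  example_isolate_and_putback()
 * is not part of vmscan.c; it only illustrates the expected pairing of
 * isolate_lru_page() with putback_lru_page().
 */
static int example_isolate_and_putback(struct page *page)
{
        /* Restriction (1): the caller already holds a reference on @page. */
        if (isolate_lru_page(page))
                return -EBUSY;          /* page was not on an LRU list */

        /* ... operate on the isolated page here ... */

        /*
         * Put the page back on the appropriate LRU; this also drops the
         * reference that isolate_lru_page() took.
         */
        putback_lru_page(page);
        return 0;
}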
1782
1783/*
1784 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
1785 * then get rescheduled. When there is a massive number of tasks doing page
1786 * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
1787 * the LRU list then shrinks and is scanned faster than necessary, leading to
1788 * unnecessary swapping, thrashing and OOM.
1789 */
1790static int too_many_isolated(struct pglist_data *pgdat, int file,
1791                struct scan_control *sc)
1792{
1793        unsigned long inactive, isolated;
1794
1795        if (current_is_kswapd())
1796                return 0;
1797
1798        if (!writeback_throttling_sane(sc))
1799                return 0;
1800
1801        if (file) {
1802                inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
1803                isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
1804        } else {
1805                inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
1806                isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
1807        }
1808
1809        /*
1810         * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
1811         * won't get blocked behind normal direct reclaimers and end up in a
1812         * circular deadlock.
1813         */
1814        if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
1815                inactive >>= 3;
1816
1817        return isolated > inactive;
1818}
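
/*
 * Editor's note: a standalone userspace model (not part of vmscan.c) of
 * the throttling threshold above, with made-up counter values.  Callers
 * that may enter both the IO and FS layers are throttled once isolated
 * pages exceed roughly 1/8 of the inactive list, while GFP_NOIO/GFP_NOFS
 * callers are compared against the full list so they do not pile up
 * behind ordinary direct reclaimers.
 */
#include <stdbool.h>
#include <stdio.h>

static bool model_too_many_isolated(unsigned long inactive,
                                    unsigned long isolated, bool can_io_fs)
{
        if (can_io_fs)
                inactive >>= 3;         /* threshold becomes inactive / 8 */

        return isolated > inactive;
}

int main(void)
{
        printf("GFP_KERNEL caller, 4096 inactive, 600 isolated -> %d\n",
               model_too_many_isolated(4096, 600, true));   /* 600 > 512: throttle */
        printf("GFP_NOFS caller,   4096 inactive, 600 isolated -> %d\n",
               model_too_many_isolated(4096, 600, false));  /* 600 > 4096: false   */
        return 0;
}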
1819
1820/*
1821 * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
1822 * On return, @list is reused as a list of pages to be freed by the caller.
1823 *
1824 * Returns the number of pages moved to the given lruvec.
1825 */
1826static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
1827                                                     struct list_head *list)
1828{
1829        int nr_pages, nr_moved = 0;
1830        LIST_HEAD(pages_to_free);
1831        struct page *page;
1832        enum lru_list lru;
1833
1834        while (!list_empty(list)) {
1835                page = lru_to_page(list);
1836                VM_BUG_ON_PAGE(PageLRU(page), page);
1837                list_del(&page->lru);
1838                if (unlikely(!page_evictable(page))) {
1839                        spin_unlock_irq(&lruvec->lru_lock);
1840                        putback_lru_page(page);
1841                        spin_lock_irq(&lruvec->lru_lock);
1842                        continue;
1843                }
1844
1845                /*
1846                 * The SetPageLRU needs to be kept here for list integrity.
1847                 * Otherwise:
1848                 *   #0 move_pages_to_lru             #1 release_pages
1849                 *   if !put_page_testzero
1850                 *                                    if (put_page_testzero())
1851                 *                                      !PageLRU //skip lru_lock
1852                 *     SetPageLRU()
1853                 *     list_add(&page->lru,)
1854                 *                                        list_add(&page->lru,)
1855                 */
1856                SetPageLRU(page);
1857
1858                if (unlikely(put_page_testzero(page))) {
1859                        __ClearPageLRU(page);
1860                        __ClearPageActive(page);
1861
1862                        if (unlikely(PageCompound(page))) {
1863                                spin_unlock_irq(&lruvec->lru_lock);
1864                                destroy_compound_page(page);
1865                                spin_lock_irq(&lruvec->lru_lock);
1866                        } else
1867                                list_add(&page->lru, &pages_to_free);
1868
1869                        continue;
1870                }
1871
1872                /*
1873                 * All pages were isolated from the same lruvec (and isolation
1874                 * inhibits memcg migration).
1875                 */
1876                VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
1877                lru = page_lru(page);
1878                nr_pages = thp_nr_pages(page);
1879
1880                update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
1881                list_add(&page->lru, &lruvec->lists[lru]);
1882                nr_moved += nr_pages;
1883                if (PageActive(page))
1884                        workingset_age_nonresident(lruvec, nr_pages);
1885        }
1886
1887        /*
1888         * To save our caller's stack, now reuse the input list for the pages to free.
1889         */
1890        list_splice(&pages_to_free, list);
1891
1892        return nr_moved;
1893}
1894
1895/*
1896 * If a kernel thread (such as nfsd for loop-back mounts) services
1897 * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
1898 * In that case we should only throttle if the backing device it is
1899 * writing to is congested.  In other cases it is safe to throttle.
1900 */
1901static int current_may_throttle(void)
1902{
1903        return !(current->flags & PF_LOCAL_THROTTLE) ||
1904                current->backing_dev_info == NULL ||
1905                bdi_write_congested(current->backing_dev_info);
1906}
1907
1908/*
1909 * shrink_inactive_list() is a helper for shrink_node().  It returns the number
1910 * of reclaimed pages
1911 */
1912static noinline_for_stack unsigned long
1913shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
1914                     struct scan_control *sc, enum lru_list lru)
1915{
1916        LIST_HEAD(page_list);
1917        unsigned long nr_scanned;
1918        unsigned int nr_reclaimed = 0;
1919        unsigned long nr_taken;
1920        struct reclaim_stat stat;
1921        bool file = is_file_lru(lru);
1922        enum vm_event_item item;
1923        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
1924        bool stalled = false;
1925
1926        while (unlikely(too_many_isolated(pgdat, file, sc))) {
1927                if (stalled)
1928                        return 0;
1929
1930                /* wait a bit for the reclaimer. */
1931                msleep(100);
1932                stalled = true;
1933
1934                /* We are about to die and free our memory. Return now. */
1935                if (fatal_signal_pending(current))
1936                        return SWAP_CLUSTER_MAX;
1937        }
1938
1939        lru_add_drain();
1940
1941        spin_lock_irq(&lruvec->lru_lock);
1942
1943        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
1944                                     &nr_scanned, sc, lru);
1945
1946        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
1947        item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
1948        if (!cgroup_reclaim(sc))
1949                __count_vm_events(item, nr_scanned);
1950        __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
1951        __count_vm_events(PGSCAN_ANON + file, nr_scanned);
1952
1953        spin_unlock_irq(&lruvec->lru_lock);
1954
1955        if (nr_taken == 0)
1956                return 0;
1957
1958        nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
1959
1960        spin_lock_irq(&lruvec->lru_lock);
1961        move_pages_to_lru(lruvec, &page_list);
1962
1963        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
1964        item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
1965        if (!cgroup_reclaim(sc))
1966                __count_vm_events(item, nr_reclaimed);
1967        __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
1968        __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
1969        spin_unlock_irq(&lruvec->lru_lock);
1970
1971        lru_note_cost(lruvec, file, stat.nr_pageout);
1972        mem_cgroup_uncharge_list(&page_list);
1973        free_unref_page_list(&page_list);
1974
1975        /*
1976         * If dirty pages are scanned that are not queued for IO, it
1977         * implies that flushers are not doing their job. This can
1978         * happen when memory pressure pushes dirty pages to the end of
1979         * the LRU before the dirty limits are breached and the dirty
1980         * data has expired. It can also happen when the proportion of
1981         * dirty pages grows not through writes but through memory
1982         * pressure reclaiming all the clean cache. And in some cases,
1983         * the flushers simply cannot keep up with the allocation
1984         * rate. Nudge the flusher threads in case they are asleep.
1985         */
1986        if (stat.nr_unqueued_dirty == nr_taken)
1987                wakeup_flusher_threads(WB_REASON_VMSCAN);
1988
1989        sc->nr.dirty += stat.nr_dirty;
1990        sc->nr.congested += stat.nr_congested;
1991        sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
1992        sc->nr.writeback += stat.nr_writeback;
1993        sc->nr.immediate += stat.nr_immediate;
1994        sc->nr.taken += nr_taken;
1995        if (file)
1996                sc->nr.file_taken += nr_taken;
1997
1998        trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
1999                        nr_scanned, nr_reclaimed, &stat, sc->priority, file);
2000        return nr_reclaimed;
2001}
2002
2003/*
2004 * shrink_active_list() moves pages from the active LRU to the inactive LRU.
2005 *
2006 * We move them the other way if the page is referenced by one or more
2007 * processes.
2008 *
2009 * If the pages are mostly unmapped, the processing is fast and it is
2010 * appropriate to hold lru_lock across the whole operation.  But if
2011 * the pages are mapped, the processing is slow (page_referenced()), so
2012 * we should drop lru_lock around each page.  It's impossible to balance
2013 * this, so instead we remove the pages from the LRU while processing them.
2014 * It is safe to rely on PG_active against the non-LRU pages in here because
2015 * nobody will play with that bit on a non-LRU page.
2016 *
2017 * The downside is that we have to touch page->_refcount against each page.
2018 * But we had to alter page->flags anyway.
2019 */
2020static void shrink_active_list(unsigned long nr_to_scan,
2021                               struct lruvec *lruvec,
2022                               struct scan_control *sc,
2023                               enum lru_list lru)
2024{
2025        unsigned long nr_taken;
2026        unsigned long nr_scanned;
2027        unsigned long vm_flags;
2028        LIST_HEAD(l_hold);      /* The pages which were snipped off */
2029        LIST_HEAD(l_active);
2030        LIST_HEAD(l_inactive);
2031        struct page *page;
2032        unsigned nr_deactivate, nr_activate;
2033        unsigned nr_rotated = 0;
2034        int file = is_file_lru(lru);
2035        struct pglist_data *pgdat = lruvec_pgdat(lruvec);
2036
2037        lru_add_drain();
2038
2039        spin_lock_irq(&lruvec->lru_lock);
2040
2041        nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
2042                                     &nr_scanned, sc, lru);
2043
2044        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
2045
2046        if (!cgroup_reclaim(sc))
2047                __count_vm_events(PGREFILL, nr_scanned);
2048        __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
2049
2050        spin_unlock_irq(&lruvec->lru_lock);
2051
2052        while (!list_empty(&l_hold)) {
2053                cond_resched();
2054                page = lru_to_page(&l_hold);
2055                list_del(&page->lru);
2056
2057                if (unlikely(!page_evictable(page))) {
2058                        putback_lru_page(page);
2059                        continue;
2060                }
2061
2062                if (unlikely(buffer_heads_over_limit)) {
2063                        if (page_has_private(page) && trylock_page(page)) {
2064                                if (page_has_private(page))
2065                                        try_to_release_page(page, 0);
2066                                unlock_page(page);
2067                        }
2068                }
2069
2070                if (page_referenced(page, 0, sc->target_mem_cgroup,
2071                                    &vm_flags)) {
2072                        /*
2073                         * Identify referenced, file-backed active pages and
2074                         * give them one more trip around the active list, so
2075                         * that executable code gets a better chance to stay in
2076                         * memory under moderate memory pressure.  Anon pages
2077                         * are not likely to be evicted by use-once streaming
2078                         * IO, plus JVM can create lots of anon VM_EXEC pages,
2079                         * so we ignore them here.
2080                         */
2081                        if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
2082                                nr_rotated += thp_nr_pages(page);
2083                                list_add(&page->lru, &l_active);
2084                                continue;
2085                        }
2086                }
2087
2088                ClearPageActive(page);  /* we are de-activating */
2089                SetPageWorkingset(page);
2090                list_add(&page->lru, &l_inactive);
2091        }
2092
2093        /*
2094         * Move pages back to the lru list.
2095         */
2096        spin_lock_irq(&lruvec->lru_lock);
2097
2098        nr_activate = move_pages_to_lru(lruvec, &l_active);
2099        nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
2100        /* Keep all free pages in l_active list */
2101        list_splice(&l_inactive, &l_active);
2102
2103        __count_vm_events(PGDEACTIVATE, nr_deactivate);
2104        __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
2105
2106        __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
2107        spin_unlock_irq(&lruvec->lru_lock);
2108
2109        mem_cgroup_uncharge_list(&l_active);
2110        free_unref_page_list(&l_active);
2111        trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
2112                        nr_deactivate, nr_rotated, sc->priority, file);
2113}
2114
2115unsigned long reclaim_pages(struct list_head *page_list)
2116{
2117        int nid = NUMA_NO_NODE;
2118        unsigned int nr_reclaimed = 0;
2119        LIST_HEAD(node_page_list);
2120        struct reclaim_stat dummy_stat;
2121        struct page *page;
2122        struct scan_control sc = {
2123                .gfp_mask = GFP_KERNEL,
2124                .priority = DEF_PRIORITY,
2125                .may_writepage = 1,
2126                .may_unmap = 1,
2127                .may_swap = 1,
2128        };
2129
2130        while (!list_empty(page_list)) {
2131                page = lru_to_page(page_list);
2132                if (nid == NUMA_NO_NODE) {
2133                        nid = page_to_nid(page);
2134                        INIT_LIST_HEAD(&node_page_list);
2135                }
2136
2137                if (nid == page_to_nid(page)) {
2138                        ClearPageActive(page);
2139                        list_move(&page->lru, &node_page_list);
2140                        continue;
2141                }
2142
2143                nr_reclaimed += shrink_page_list(&node_page_list,
2144                                                NODE_DATA(nid),
2145                                                &sc, &dummy_stat, false);
2146                while (!list_empty(&node_page_list)) {
2147                        page = lru_to_page(&node_page_list);
2148                        list_del(&page->lru);
2149                        putback_lru_page(page);
2150                }
2151
2152                nid = NUMA_NO_NODE;
2153        }
2154
2155        if (!list_empty(&node_page_list)) {
2156                nr_reclaimed += shrink_page_list(&node_page_list,
2157                                                NODE_DATA(nid),
2158                                                &sc, &dummy_stat, false);
2159                while (!list_empty(&node_page_list)) {
2160                        page = lru_to_page(&node_page_list);
2161                        list_del(&page->lru);
2162                        putback_lru_page(page);
2163                }
2164        }
2165
2166        return nr_reclaimed;
2167}
2168
2169static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
2170                                 struct lruvec *lruvec, struct scan_control *sc)
2171{
2172        if (is_active_lru(lru)) {
2173                if (sc->may_deactivate & (1 << is_file_lru(lru)))
2174                        shrink_active_list(nr_to_scan, lruvec, sc, lru);
2175                else
2176                        sc->skipped_deactivate = 1;
2177                return 0;
2178        }
2179
2180        return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
2181}
2182
2183/*
2184 * The inactive anon list should be small enough that the VM never has
2185 * to do too much work.
2186 *
2187 * The inactive file list should be small enough to leave most memory
2188 * to the established workingset on the scan-resistant active list,
2189 * but large enough to avoid thrashing the aggregate readahead window.
2190 *
2191 * Both inactive lists should also be large enough that each inactive
2192 * page has a chance to be referenced again before it is reclaimed.
2193 *
2194 * If that fails and refaulting is observed, the inactive list grows.
2195 *
2196 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
2197 * on this LRU, maintained by the pageout code. An inactive_ratio
2198 * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
2199 *
2200 * total     target    max
2201 * memory    ratio     inactive
2202 * -------------------------------------
2203 *   10MB       1         5MB
2204 *  100MB       1        50MB
2205 *    1GB       3       250MB
2206 *   10GB      10       0.9GB
2207 *  100GB      31         3GB
2208 *    1TB     101        10GB
2209 *   10TB     320        32GB
2210 */
2211static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
2212{
2213        enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
2214        unsigned long inactive, active;
2215        unsigned long inactive_ratio;
2216        unsigned long gb;
2217
2218        inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
2219        active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
2220
2221        gb = (inactive + active) >> (30 - PAGE_SHIFT);
2222        if (gb)
2223                inactive_ratio = int_sqrt(10 * gb);
2224        else
2225                inactive_ratio = 1;
2226
2227        return inactive * inactive_ratio < active;
2228}
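
/*
 * Editor's note: a standalone userspace sketch (not part of vmscan.c)
 * that reproduces the inactive_ratio table above.  isqrt() is a naive
 * stand-in for the kernel's int_sqrt(); the sizes are the table rows
 * expressed in gigabytes (1TB = 1024GB, 10TB = 10240GB).
 */
#include <stdio.h>

static unsigned long isqrt(unsigned long x)
{
        unsigned long r = 0;

        while ((r + 1) * (r + 1) <= x)
                r++;
        return r;
}

int main(void)
{
        unsigned long gbs[] = { 0, 1, 10, 100, 1024, 10240 };
        unsigned long i, ratio;

        for (i = 0; i < sizeof(gbs) / sizeof(gbs[0]); i++) {
                ratio = gbs[i] ? isqrt(10 * gbs[i]) : 1;
                printf("%5lu GB -> inactive_ratio %lu\n", gbs[i], ratio);
        }
        return 0;       /* prints ratios 1, 3, 10, 31, 101, 320 */
}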
2229
2230enum scan_balance {
2231        SCAN_EQUAL,
2232        SCAN_FRACT,
2233        SCAN_ANON,
2234        SCAN_FILE,
2235};
2236
2237/*
2238 * Determine how aggressively the anon and file LRU lists should be
2239 * scanned.  The relative value of each set of LRU lists is determined
2240 * by looking at the fraction of the scanned pages that were rotated back
2241 * onto the active list instead of being evicted.
2242 *
2243 * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
2244 * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
2245 */
2246static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
2247                           unsigned long *nr)
2248{
2249        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
2250        unsigned long anon_cost, file_cost, total_cost;
2251        int swappiness = mem_cgroup_swappiness(memcg);
2252        u64 fraction[ANON_AND_FILE];
2253        u64 denominator = 0;    /* gcc */
2254        enum scan_balance scan_balance;
2255        unsigned long ap, fp;
2256        enum lru_list lru;
2257
2258        /* If we have no swap space, do not bother scanning anon pages. */
2259        if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
2260                scan_balance = SCAN_FILE;
2261                goto out;
2262        }
2263
2264        /*
2265         * Global reclaim will swap to prevent OOM even with no
2266         * swappiness, but memcg users want to use this knob to
2267         * disable swapping for individual groups completely when
2268         * using the memory controller's swap limit feature would be
2269         * too expensive.
2270         */
2271        if (cgroup_reclaim(sc) && !swappiness) {
2272                scan_balance = SCAN_FILE;
2273                goto out;
2274        }
2275
2276        /*
2277         * Do not apply any pressure balancing cleverness when the
2278         * system is close to OOM, scan both anon and file equally
2279         * (unless the swappiness setting disagrees with swapping).
2280         */
2281        if (!sc->priority && swappiness) {
2282                scan_balance = SCAN_EQUAL;
2283                goto out;
2284        }
2285
2286        /*
2287         * If the system is almost out of file pages, force-scan anon.
2288         */
2289        if (sc->file_is_tiny) {
2290                scan_balance = SCAN_ANON;
2291                goto out;
2292        }
2293
2294        /*
2295         * If there is enough inactive page cache, we do not reclaim
2296         * anything from the anonymous working set right now.
2297         */
2298        if (sc->cache_trim_mode) {
2299                scan_balance = SCAN_FILE;
2300                goto out;
2301        }
2302
2303        scan_balance = SCAN_FRACT;
2304        /*
2305         * Calculate the pressure balance between anon and file pages.
2306         *
2307         * The amount of pressure we put on each LRU is inversely
2308         * proportional to the cost of reclaiming each list, as
2309         * determined by the share of pages that are refaulting, times
2310         * the relative IO cost of bringing back a swapped out
2311         * anonymous page vs reloading a filesystem page (swappiness).
2312         *
2313         * Although we limit that influence to ensure no list gets
2314         * left behind completely: at least a third of the pressure is
2315         * applied, before swappiness.
2316         *
2317         * With swappiness at 100, anon and file have equal IO cost.
2318         */
2319        total_cost = sc->anon_cost + sc->file_cost;
2320        anon_cost = total_cost + sc->anon_cost;
2321        file_cost = total_cost + sc->file_cost;
2322        total_cost = anon_cost + file_cost;
2323
2324        ap = swappiness * (total_cost + 1);
2325        ap /= anon_cost + 1;
2326
2327        fp = (200 - swappiness) * (total_cost + 1);
2328        fp /= file_cost + 1;
2329
2330        fraction[0] = ap;
2331        fraction[1] = fp;
2332        denominator = ap + fp;
2333out:
2334        for_each_evictable_lru(lru) {
2335                int file = is_file_lru(lru);
2336                unsigned long lruvec_size;
2337                unsigned long scan;
2338                unsigned long protection;
2339
2340                lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
2341                protection = mem_cgroup_protection(sc->target_mem_cgroup,
2342                                                   memcg,
2343                                                   sc->memcg_low_reclaim);
2344
2345                if (protection) {
2346                        /*
2347                         * Scale a cgroup's reclaim pressure by proportioning
2348                         * its current usage to its memory.low or memory.min
2349                         * setting.
2350                         *
2351                         * This is important, as otherwise scanning aggression
2352                         * becomes extremely binary -- from nothing as we
2353                         * approach the memory protection threshold, to totally
2354                         * nominal as we exceed it.  This results in requiring
2355                         * setting extremely liberal protection thresholds. It
2356                         * also means we simply get no protection at all if we
2357                         * set it too low, which is not ideal.
2358                         *
2359                         * If there is any protection in place, we reduce scan
2360                         * pressure by how much of the total memory used is
2361                         * within protection thresholds.
2362                         *
2363                         * There is one special case: in the first reclaim pass,
2364                         * we skip over all groups that are within their low
2365                         * protection. If that fails to reclaim enough pages to
2366                         * satisfy the reclaim goal, we come back and override
2367                         * the best-effort low protection. However, we still
2368                         * ideally want to honor how well-behaved groups are in
2369                         * that case instead of simply punishing them all
2370                         * equally. As such, we reclaim them based on how much
2371                         * memory they are using, reducing the scan pressure
2372                         * again by how much of the total memory used is under
2373                         * hard protection.
2374                         */
2375                        unsigned long cgroup_size = mem_cgroup_size(memcg);
2376
2377                        /* Avoid TOCTOU with earlier protection check */
2378                        cgroup_size = max(cgroup_size, protection);
2379
2380                        scan = lruvec_size - lruvec_size * protection /
2381                                cgroup_size;
2382
2383                        /*
2384                         * Minimally target SWAP_CLUSTER_MAX pages to keep
2385                         * reclaim moving forwards, avoiding decrementing
2386                         * sc->priority further than desirable.
2387                         */
2388                        scan = max(scan, SWAP_CLUSTER_MAX);
2389                } else {
2390                        scan = lruvec_size;
2391                }
2392
2393                scan >>= sc->priority;
2394
2395                /*
2396                 * If the cgroup's already been deleted, make sure to
2397                 * scrape out the remaining cache.
2398                 */
2399                if (!scan && !mem_cgroup_online(memcg))
2400                        scan = min(lruvec_size, SWAP_CLUSTER_MAX);
2401
2402                switch (scan_balance) {
2403                case SCAN_EQUAL:
2404                        /* Scan lists relative to size */
2405                        break;
2406                case SCAN_FRACT:
2407                        /*
2408                         * Scan types proportional to swappiness and
2409                         * their relative recent reclaim efficiency.
2410                         * Make sure we don't miss the last page on
2411                         * the offlined memory cgroups because of a
2412                         * round-off error.
2413                         */
2414                        scan = mem_cgroup_online(memcg) ?
2415                               div64_u64(scan * fraction[file], denominator) :
2416                               DIV64_U64_ROUND_UP(scan * fraction[file],
2417                                                  denominator);
2418                        break;
2419                case SCAN_FILE:
2420                case SCAN_ANON:
2421                        /* Scan one type exclusively */
2422                        if ((scan_balance == SCAN_FILE) != file)
2423                                scan = 0;
2424                        break;
2425                default:
2426                        /* Look ma, no brain */
2427                        BUG();
2428                }
2429
2430                nr[lru] = scan;
2431        }
2432}
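
/*
 * Editor's note: a standalone userspace model (not part of vmscan.c) of
 * the SCAN_FRACT arithmetic above, with made-up anon/file costs and the
 * default swappiness of 60.  The real code additionally applies memcg
 * protection and shifts each LRU size right by sc->priority before the
 * fraction is used.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t anon_cost = 100, file_cost = 300;      /* sc->anon_cost / sc->file_cost */
        uint64_t swappiness = 60;
        uint64_t total, ap, fp;
        uint64_t file_lru_size = 1 << 16;               /* pages on one file LRU */

        /* Add the total to each cost so neither list is starved completely. */
        total = anon_cost + file_cost;
        anon_cost += total;
        file_cost += total;
        total = anon_cost + file_cost;

        ap = swappiness * (total + 1) / (anon_cost + 1);
        fp = (200 - swappiness) * (total + 1) / (file_cost + 1);

        printf("anon fraction %llu/%llu, file fraction %llu/%llu\n",
               (unsigned long long)ap, (unsigned long long)(ap + fp),
               (unsigned long long)fp, (unsigned long long)(ap + fp));
        printf("file LRU scan target: %llu of %llu pages\n",
               (unsigned long long)(file_lru_size * fp / (ap + fp)),
               (unsigned long long)file_lru_size);
        return 0;
}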
2433
2434static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
2435{
2436        unsigned long nr[NR_LRU_LISTS];
2437        unsigned long targets[NR_LRU_LISTS];
2438        unsigned long nr_to_scan;
2439        enum lru_list lru;
2440        unsigned long nr_reclaimed = 0;
2441        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
2442        struct blk_plug plug;
2443        bool scan_adjusted;
2444
2445        get_scan_count(lruvec, sc, nr);
2446
2447        /* Record the original scan target for proportional adjustments later */
2448        memcpy(targets, nr, sizeof(nr));
2449
2450        /*
2451         * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
2452         * event that can occur when there is little memory pressure e.g.
2453         * multiple streaming readers/writers. Hence, we do not abort scanning
2454         * when the requested number of pages has been reclaimed while scanning at
2455         * DEF_PRIORITY on the assumption that the fact we are direct
2456         * reclaiming implies that kswapd is not keeping up and it is best to
2457         * do a batch of work at once. For memcg reclaim one check is made to
2458         * abort proportional reclaim if either the file or anon lru has already
2459         * dropped to zero at the first pass.
2460         */
2461        scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
2462                         sc->priority == DEF_PRIORITY);
2463
2464        blk_start_plug(&plug);
2465        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
2466                                        nr[LRU_INACTIVE_FILE]) {
2467                unsigned long nr_anon, nr_file, percentage;
2468                unsigned long nr_scanned;
2469
2470                for_each_evictable_lru(lru) {
2471                        if (nr[lru]) {
2472                                nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
2473                                nr[lru] -= nr_to_scan;
2474
2475                                nr_reclaimed += shrink_list(lru, nr_to_scan,
2476                                                            lruvec, sc);
2477                        }
2478                }
2479
2480                cond_resched();
2481
2482                if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
2483                        continue;
2484
2485                /*
2486                 * For kswapd and memcg, reclaim at least the number of pages
2487                 * requested. Ensure that the anon and file LRUs are scanned
2488                 * proportionally what was requested by get_scan_count(). We
2489                 * stop reclaiming one LRU and reduce the amount scanning
2490                 * proportional to the original scan target.
2491                 */
2492                nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
2493                nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
2494
2495                /*
2496                 * It's just vindictive to attack the larger once the smaller
2497                 * has gone to zero.  And given the way we stop scanning the
2498                 * smaller below, this makes sure that we only make one nudge
2499                 * towards proportionality once we've got nr_to_reclaim.
2500                 */
2501                if (!nr_file || !nr_anon)
2502                        break;
2503
2504                if (nr_file > nr_anon) {
2505                        unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
2506                                                targets[LRU_ACTIVE_ANON] + 1;
2507                        lru = LRU_BASE;
2508                        percentage = nr_anon * 100 / scan_target;
2509                } else {
2510                        unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
2511                                                targets[LRU_ACTIVE_FILE] + 1;
2512                        lru = LRU_FILE;
2513                        percentage = nr_file * 100 / scan_target;
2514                }
2515
2516                /* Stop scanning the smaller of the LRU */
2517                nr[lru] = 0;
2518                nr[lru + LRU_ACTIVE] = 0;
2519
2520                /*
2521                 * Recalculate the other LRU scan count based on its original
2522                 * scan target and the percentage scanning already complete
2523                 */
2524                lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
2525                nr_scanned = targets[lru] - nr[lru];
2526                nr[lru] = targets[lru] * (100 - percentage) / 100;
2527                nr[lru] -= min(nr[lru], nr_scanned);
2528
2529                lru += LRU_ACTIVE;
2530                nr_scanned = targets[lru] - nr[lru];
2531                nr[lru] = targets[lru] * (100 - percentage) / 100;
2532                nr[lru] -= min(nr[lru], nr_scanned);
2533
2534                scan_adjusted = true;
2535        }
2536        blk_finish_plug(&plug);
2537        sc->nr_reclaimed += nr_reclaimed;
2538
2539        /*
2540         * Even if we did not try to evict anon pages at all, we want to
2541         * rebalance the anon lru active/inactive ratio.
2542         */
2543        if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
2544                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
2545                                   sc, LRU_ACTIVE_ANON);
2546}
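
/*
 * Editor's note: a standalone userspace model (not part of vmscan.c) of
 * the proportional adjustment in the loop above, collapsed to a single
 * file LRU and using made-up numbers.  Once nr_to_reclaim is met and
 * anon is the smaller type, anon scanning stops and the file target is
 * scaled down so that both types end up scanned in roughly the same
 * proportion of their original targets.
 */
#include <stdio.h>

int main(void)
{
        unsigned long targets_file = 1000, targets_anon = 200;  /* original targets */
        unsigned long nr_file = 900, nr_anon = 100;             /* still to scan    */
        unsigned long percentage, nr_scanned, new_nr;

        /* anon is smaller: compute how much of its target is still left */
        percentage = nr_anon * 100 / (targets_anon + 1);        /* ~49% remaining */

        /* stop anon, rescale the remaining file work */
        nr_scanned = targets_file - nr_file;                    /* 100 already done  */
        new_nr = targets_file * (100 - percentage) / 100;       /* ~51% of original  */
        new_nr -= (new_nr < nr_scanned) ? new_nr : nr_scanned;

        printf("file pages still to scan: %lu\n", new_nr);      /* prints 410 */
        return 0;
}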
2547
2548/* Use reclaim/compaction for costly allocs or under memory pressure */
2549static bool in_reclaim_compaction(struct scan_control *sc)
2550{
2551        if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
2552                        (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
2553                         sc->priority < DEF_PRIORITY - 2))
2554                return true;
2555
2556        return false;
2557}
2558
2559/*
2560 * Reclaim/compaction is used for high-order allocation requests. It reclaims
2561 * order-0 pages before compacting the zone. should_continue_reclaim() returns
2562 * true if more pages should be reclaimed so that when the page allocator
2563 * calls try_to_compact_pages(), it will have enough free pages to succeed.
2564 * It will give up earlier than that if there is difficulty reclaiming pages.
2565 */
2566static inline bool should_continue_reclaim(struct pglist_data *pgdat,
2567                                        unsigned long nr_reclaimed,
2568                                        struct scan_control *sc)
2569{
2570        unsigned long pages_for_compaction;
2571        unsigned long inactive_lru_pages;
2572        int z;
2573
2574        /* If not in reclaim/compaction mode, stop */
2575        if (!in_reclaim_compaction(sc))
2576                return false;
2577
2578        /*
2579         * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
2580         * number of pages that were scanned. This returns to the caller with
2581         * the risk that reclaim/compaction and the resulting allocation attempt
2582         * fail. In the past we tried harder for __GFP_RETRY_MAYFAIL
2583         * allocations by requiring that the full LRU list had been scanned
2584         * first, assuming that a zero delta of sc->nr_scanned meant a full LRU
2585         * scan, but that approximation was wrong, and there were corner cases
2586         * where a non-zero number of pages was always scanned.
2587         */
2588        if (!nr_reclaimed)
2589                return false;
2590
2591        /* If compaction would go ahead or the allocation would succeed, stop */
2592        for (z = 0; z <= sc->reclaim_idx; z++) {
2593                struct zone *zone = &pgdat->node_zones[z];
2594                if (!managed_zone(zone))
2595                        continue;
2596
2597                switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
2598                case COMPACT_SUCCESS:
2599                case COMPACT_CONTINUE:
2600                        return false;
2601                default:
2602                        /* check next zone */
2603                        ;
2604                }
2605        }
2606
2607        /*
2608         * If we have not reclaimed enough pages for compaction and the
2609         * inactive lists are large enough, continue reclaiming
2610         */
2611        pages_for_compaction = compact_gap(sc->order);
2612        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
2613        if (get_nr_swap_pages() > 0)
2614                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
2615
2616        return inactive_lru_pages > pages_for_compaction;
2617}
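
/*
 * Editor's note: a short standalone sketch (not part of vmscan.c) of the
 * compaction headroom used above.  It assumes compact_gap() behaves as
 * defined in mm/internal.h, i.e. roughly twice the allocation size in
 * pages; treat the exact formula as an assumption.
 */
#include <stdio.h>

static unsigned long model_compact_gap(unsigned int order)
{
        return 2UL << order;    /* assumption: mirrors compact_gap() */
}

int main(void)
{
        printf("order 3 (PAGE_ALLOC_COSTLY_ORDER): %lu pages\n",
               model_compact_gap(3));                   /* 16 pages   */
        printf("order 9 (2MB THP with 4KB pages): %lu pages\n",
               model_compact_gap(9));                   /* 1024 pages */
        return 0;
}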
2618
2619static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
2620{
2621        struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
2622        struct mem_cgroup *memcg;
2623
2624        memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
2625        do {
2626                struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
2627                unsigned long reclaimed;
2628                unsigned long scanned;
2629
2630                /*
2631                 * This loop can become CPU-bound when target memcgs
2632                 * aren't eligible for reclaim - either because they
2633                 * don't have any reclaimable pages, or because their
2634                 * memory is explicitly protected. Avoid soft lockups.
2635                 */
2636                cond_resched();
2637
2638                mem_cgroup_calculate_protection(target_memcg, memcg);
2639
2640                if (mem_cgroup_below_min(memcg)) {
2641                        /*
2642                         * Hard protection.
2643                         * If there is no reclaimable memory, OOM.
2644                         */
2645                        continue;
2646                } else if (mem_cgroup_below_low(memcg)) {
2647                        /*
2648                         * Soft protection.
2649                         * Respect the protection only as long as
2650                         * there is an unprotected supply
2651                         * of reclaimable memory from other cgroups.
2652                         */
2653                        if (!sc->memcg_low_reclaim) {
2654                                sc->memcg_low_skipped = 1;
2655                                continue;
2656                        }
2657                        memcg_memory_event(memcg, MEMCG_LOW);
2658                }
2659
2660                reclaimed = sc->nr_reclaimed;
2661                scanned = sc->nr_scanned;
2662
2663                shrink_lruvec(lruvec, sc);
2664
2665                shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
2666                            sc->priority);
2667
2668                /* Record the group's reclaim efficiency */
2669                vmpressure(sc->gfp_mask, memcg, false,
2670                           sc->nr_scanned - scanned,
2671                           sc->nr_reclaimed - reclaimed);
2672
2673        } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
2674}
2675
2676static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
2677{
2678        struct reclaim_state *reclaim_state = current->reclaim_state;
2679        unsigned long nr_reclaimed, nr_scanned;
2680        struct lruvec *target_lruvec;
2681        bool reclaimable = false;
2682        unsigned long file;
2683
2684        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
2685
2686again:
2687        memset(&sc->nr, 0, sizeof(sc->nr));
2688
2689        nr_reclaimed = sc->nr_reclaimed;
2690        nr_scanned = sc->nr_scanned;
2691
2692        /*
2693         * Determine the scan balance between anon and file LRUs.
2694         */
2695        spin_lock_irq(&target_lruvec->lru_lock);
2696        sc->anon_cost = target_lruvec->anon_cost;
2697        sc->file_cost = target_lruvec->file_cost;
2698        spin_unlock_irq(&target_lruvec->lru_lock);
2699
2700        /*
2701         * Target desirable inactive:active list ratios for the anon
2702         * and file LRU lists.
2703         */
2704        if (!sc->force_deactivate) {
2705                unsigned long refaults;
2706
2707                refaults = lruvec_page_state(target_lruvec,
2708                                WORKINGSET_ACTIVATE_ANON);
2709                if (refaults != target_lruvec->refaults[0] ||
2710                        inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
2711                        sc->may_deactivate |= DEACTIVATE_ANON;
2712                else
2713                        sc->may_deactivate &= ~DEACTIVATE_ANON;
2714
2715                /*
2716                 * When refaults are being observed, it means a new
2717                 * workingset is being established. Deactivate to get
2718                 * rid of any stale active pages quickly.
2719                 */
2720                refaults = lruvec_page_state(target_lruvec,
2721                                WORKINGSET_ACTIVATE_FILE);
2722                if (refaults != target_lruvec->refaults[1] ||
2723                    inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
2724                        sc->may_deactivate |= DEACTIVATE_FILE;
2725                else
2726                        sc->may_deactivate &= ~DEACTIVATE_FILE;
2727        } else
2728                sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
2729
2730        /*
2731         * If we have plenty of inactive file pages that aren't
2732         * thrashing, try to reclaim those first before touching
2733         * anonymous pages.
2734         */
2735        file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
2736        if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
2737                sc->cache_trim_mode = 1;
2738        else
2739                sc->cache_trim_mode = 0;
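        /*
         * The shift above makes this a sliding threshold: at DEF_PRIORITY
         * (12), cache trim mode needs at least 2^12 inactive file pages
         * (16MB with 4K pages), and the bar halves each time sc->priority
         * drops by one.
         */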
2740
2741        /*
2742         * Prevent the reclaimer from falling into the cache trap: as
2743         * cache pages start out inactive, every cache fault will tip
2744         * the scan balance towards the file LRU.  And as the file LRU
2745         * shrinks, so does the window for rotation from references.
2746         * This means we have a runaway feedback loop where a tiny
2747         * thrashing file LRU becomes infinitely more attractive than
2748         * anon pages.  Try to detect this based on file LRU size.
2749         */
2750        if (!cgroup_reclaim(sc)) {
2751                unsigned long total_high_wmark = 0;
2752                unsigned long free, anon;
2753                int z;
2754
2755                free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
2756                file = node_page_state(pgdat, NR_ACTIVE_FILE) +
2757                           node_page_state(pgdat, NR_INACTIVE_FILE);
2758
2759                for (z = 0; z < MAX_NR_ZONES; z++) {
2760                        struct zone *zone = &pgdat->node_zones[z];
2761                        if (!managed_zone(zone))
2762                                continue;
2763
2764                        total_high_wmark += high_wmark_pages(zone);
2765                }
2766
2767                /*
2768                 * Consider anon: if that's low too, this isn't a
2769                 * runaway file reclaim problem, but rather just
2770                 * extreme pressure. Reclaim as per usual then.
2771                 */
2772                anon = node_page_state(pgdat, NR_INACTIVE_ANON);
2773
2774                sc->file_is_tiny =
2775                        file + free <= total_high_wmark &&
2776                        !(sc->may_deactivate & DEACTIVATE_ANON) &&
2777                        anon >> sc->priority;
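                /*
                 * The anon >> sc->priority term requires at least
                 * 2^sc->priority pages of inactive anon before flagging the
                 * file set as tiny, matching the "extreme pressure" note
                 * above: with little anon as well, balance as usual.
                 */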
2778        }
2779
2780        shrink_node_memcgs(pgdat, sc);
2781
2782        if (reclaim_state) {
2783                sc->nr_reclaimed += reclaim_state->reclaimed_slab;
2784                reclaim_state->reclaimed_slab = 0;
2785        }
2786
2787        /* Record the subtree's reclaim efficiency */
2788        vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
2789                   sc->nr_scanned - nr_scanned,
2790                   sc->nr_reclaimed - nr_reclaimed);
2791
2792        if (sc->nr_reclaimed - nr_reclaimed)
2793                reclaimable = true;
2794
2795        if (current_is_kswapd()) {
2796                /*
2797                 * If reclaim is isolating dirty pages under writeback,
2798                 * it implies that the long-lived page allocation rate
2799                 * is exceeding the page laundering rate. Either the
2800                 * global limits are not being effective at throttling
2801                 * processes due to the page distribution throughout
2802                 * zones or there is heavy usage of a slow backing
2803                 * device. The only option is to throttle from reclaim
2804                 * context which is not ideal as there is no guarantee
2805         * the dirtying process is throttled in the same way that
2806         * balance_dirty_pages() manages.
2807         *
2808         * Once a node is flagged PGDAT_WRITEBACK, kswapd will
2809         * count the number of pages under writeback that are
2810         * flagged for immediate reclaim and stall if any are
2811         * encountered in the nr_immediate check below.
2812                 */
2813                if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
2814                        set_bit(PGDAT_WRITEBACK, &pgdat->flags);
2815
2816                /* Allow kswapd to start writing pages during reclaim. */
2817                if (sc->nr.unqueued_dirty == sc->nr.file_taken)
2818                        set_bit(PGDAT_DIRTY, &pgdat->flags);
2819
2820                /*
2821                 * If kswapd scans pages marked for immediate
2822                 * reclaim and under writeback (nr_immediate), it
2823                 * implies that pages are cycling through the LRU
2824                 * faster than they are written so also forcibly stall.
2825                 */
2826                if (sc->nr.immediate)
2827                        congestion_wait(BLK_RW_ASYNC, HZ/10);
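                /*
                 * HZ/10 bounds the stall at a tenth of a second per pass,
                 * giving the flusher threads and the backing device a short
                 * window to make writeback progress.
                 */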
2828        }
2829
2830        /*
2831         * Tag a node/memcg as congested if all the dirty pages
2832         * scanned were backed by a congested BDI and
2833         * wait_iff_congested will stall.
2834         *
2835         * Legacy memcg will stall in page writeback so avoid forcibly
2836         * stalling in wait_iff_congested().
2837         */
2838        if ((current_is_kswapd() ||
2839             (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
2840            sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
2841                set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
2842
2843        /*
2844         * Stall direct reclaim for IO completions if the underlying BDIs
2845         * and the node are congested. Allow kswapd to continue until it
2846         * starts encountering unqueued dirty pages or cycling through
2847         * the LRU too quickly.
2848         */
2849        if (!current_is_kswapd() && current_may_throttle() &&
2850            !sc->hibernation_mode &&
2851            test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
2852                wait_iff_congested(BLK_RW_ASYNC, HZ/10);
2853
2854        if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
2855                                    sc))
2856                goto again;
2857
2858        /*
2859         * Kswapd gives up on balancing particular nodes after too
2860         * many failures to reclaim anything from them and goes to
2861         * sleep. On reclaim progress, reset the failure counter. A
2862         * successful direct reclaim run will revive a dormant kswapd.
2863         */
2864        if (reclaimable)
2865                pgdat->kswapd_failures = 0;
2866}
2867
2868/*
2869 * Returns true if compaction should go ahead for a costly-order request, or
2870 * the allocation would already succeed without compaction. Return false if we
2871 * should reclaim first.
2872 */
2873static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
2874{
2875        unsigned long watermark;
2876        enum compact_result suitable;
2877
2878        suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
2879        if (suitable == COMPACT_SUCCESS)
2880                /* Allocation should succeed already. Don't reclaim. */
2881                return true;
2882        if (suitable == COMPACT_SKIPPED)
2883                /* Compaction cannot yet proceed. Do reclaim. */
2884                return false;
2885
2886        /*
2887         * Compaction is already possible, but it takes time to run and there
2888         * are potentially other callers using the pages just freed. So proceed
2889         * with reclaim to make a buffer of free pages available to give
2890         * compaction a reasonable chance of completing and allocating the page.
2891         * Note that we won't actually reclaim the whole buffer in one attempt
2892         * as the target watermark in should_continue_reclaim() is lower. But if
2893         * we are already above the high+gap watermark, don't reclaim at all.
2894         */
2895        watermark = high_wmark_pages(zone) + compact_gap(sc->order);
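        /*
         * For example, an order-9 request adds a gap of 1024 pages (twice
         * the 512-page allocation) on top of the zone's high watermark as
         * the level checked here.
         */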
2896
2897        return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
2898}
2899
2900/*
2901 * This is the direct reclaim path, for page-allocating processes.  We only
2902 * try to reclaim pages from zones which will satisfy the caller's allocation
2903 * request.
2904 *
2905 * If a zone is deemed to be full of pinned pages then just give it a light
2906 * scan and then give up on it.
2907 */
2908static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
2909{
2910        struct zoneref *z;
2911        struct zone *zone;
2912        unsigned long nr_soft_reclaimed;
2913        unsigned long nr_soft_scanned;
2914        gfp_t orig_mask;
2915        pg_data_t *last_pgdat = NULL;
2916
2917        /*
2918         * If the number of buffer_heads in the machine exceeds the maximum
2919         * allowed level, force direct reclaim to scan the highmem zone as
2920         * highmem pages could be pinning lowmem pages storing buffer_heads
2921         */
2922        orig_mask = sc->gfp_mask;
2923        if (buffer_heads_over_limit) {
2924                sc->gfp_mask |= __GFP_HIGHMEM;
2925                sc->reclaim_idx = gfp_zone(sc->gfp_mask);
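                /*
                 * gfp_zone() of the widened mask raises reclaim_idx so the
                 * zonelist walk below also covers highmem zones where they
                 * are configured.
                 */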
2926        }
2927
2928        for_each_zone_zonelist_nodemask(zone, z, zonelist,
2929                                        sc->reclaim_idx, sc->nodemask) {
2930                /*
2931                 * Take care that memory controller reclaim has only a small
2932                 * influence on the global LRU.
2933                 */
2934                if (!cgroup_reclaim(sc)) {
2935                        if (!cpuset_zone_allowed(zone,
2936                                                 GFP_KERNEL | __GFP_HARDWALL))
2937                                continue;
2938
2939                        /*
2940                         * If we already have plenty of memory free for
2941                         * compaction in this zone, don't free any more.
2942                         * Even though compaction is invoked for any
2943                         * non-zero order, only frequent costly order
2944                         * reclamation is disruptive enough to become a
2945                         * noticeable problem, like transparent huge
2946                         * page allocations.
2947                         */
2948                        if (IS_ENABLED(CONFIG_COMPACTION) &&
2949                            sc->order > PAGE_ALLOC_COSTLY_ORDER &&
2950                            compaction_ready(zone, sc)) {
2951                                sc->compaction_ready = true;
2952                                continue;
2953                        }
2954
2955                        /*
2956                         * Shrink each node in the zonelist once. If the
2957                         * zonelist is ordered by zone (not the default) then a
2958                         * node may be shrunk multiple times but in that case
2959                         * the user prefers lower zones being preserved.
2960                         */
2961                        if (zone->zone_pgdat == last_pgdat)
2962                                continue;
2963
2964                        /*
2965                         * This steals pages from memory cgroups over softlimit
2966                         * and returns the number of reclaimed pages and
2967                         * scanned pages. This works for global memory pressure
2968                         * and balancing, not for a memcg's limit.
2969                         */
2970                        nr_soft_scanned = 0;
2971                        nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
2972                                                sc->order, sc->gfp_mask,
2973                                                &nr_soft_scanned);
2974                        sc->nr_reclaimed += nr_soft_reclaimed;
2975                        sc->nr_scanned += nr_soft_scanned;
2976                        /* need some check to avoid more shrink_node() calls */
2977                }
2978
2979                /* See comment about same check for global reclaim above */
2980                if (zone->zone_pgdat == last_pgdat)
2981                        continue;
2982                last_pgdat = zone->zone_pgdat;
2983                shrink_node(zone->zone_pgdat, sc);
2984        }
2985
2986        /*
2987         * Restore to original mask to avoid the impact on the caller if we
2988         * promoted it to __GFP_HIGHMEM.
2989         */
2990        sc->gfp_mask = orig_mask;
2991}
2992
2993static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
2994{
2995        struct lruvec *target_lruvec;
2996        unsigned long refaults;
2997
2998        target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
2999        refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
3000        target_lruvec->refaults[0] = refaults;
3001        refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
3002        target_lruvec->refaults[1] = refaults;
3003}
3004
3005/*
3006 * This is the main entry point to direct page reclaim.
3007 *
3008 * If a full scan of the inactive list fails to free enough memory then we
3009 * are "out of memory" and something needs to be killed.
3010 *
3011 * If the caller is !__GFP_FS then the probability of a failure is reasonably
3012 * high - the zone may be full of dirty or under-writeback pages, which this
3013 * caller can't do much about.  We kick the writeback threads and take explicit
3014 * naps in the hope that some of these pages can be written.  But if the
3015 * allocating task holds filesystem locks which prevent writeout this might not
3016 * work, and the allocation attempt will fail.
3017 *
3018 * returns:     0, if no pages reclaimed
3019 *              else, the number of pages reclaimed
3020 */
3021static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
3022                                          struct scan_control *sc)
3023{
3024        int initial_priority = sc->priority;
3025        pg_data_t *last_pgdat;
3026        struct zoneref *z;
3027        struct zone *zone;
3028retry:
3029        delayacct_freepages_start();
3030
3031        if (!cgroup_reclaim(sc))
3032                __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
3033
3034        do {
3035                vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
3036                                sc->priority);
3037                sc->nr_scanned = 0;
3038                shrink_zones(zonelist, sc);
3039
3040                if (sc->nr_reclaimed >= sc->nr_to_reclaim)
3041                        break;
3042
3043                if (sc->compaction_ready)
3044                        break;
3045
3046                /*
3047                 * If we're getting trouble reclaiming, start doing
3048                 * writepage even in laptop mode.
3049                 */
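                /*
                 * With DEF_PRIORITY at 12, this kicks in once sc->priority
                 * has dropped below 10, i.e. only after several full passes
                 * have failed to reclaim nr_to_reclaim pages.
                 */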
3050                if (sc->priority < DEF_PRIORITY - 2)
3051                        sc->may_writepage = 1;
3052        } while (--sc->priority >= 0);
3053
3054        last_pgdat = NULL;
3055        for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
3056                                        sc->nodemask) {
3057                if (zone->zone_pgdat == last_pgdat)
3058                        continue;
3059                last_pgdat = zone->zone_pgdat;
3060
3061                snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
3062
3063                if (cgroup_reclaim(sc)) {
3064                        struct lruvec *lruvec;
3065
3066                        lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
3067                                                   zone->zone_pgdat);
3068                        clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3069                }
3070        }
3071
3072        delayacct_freepages_end();
3073
3074        if (sc->nr_reclaimed)
3075                return sc->nr_reclaimed;
3076
3077        /* Aborted reclaim to try compaction? don't OOM, then */
3078        if (sc->compaction_ready)
3079                return 1;
3080
3081        /*
3082         * We make inactive:active ratio decisions based on the node's
3083         * composition of memory, but a restrictive reclaim_idx or a
3084         * memory.low cgroup setting can exempt large amounts of
3085         * memory from reclaim, neither of which is very common, so
3086         * instead of doing costly eligibility calculations of the
3087         * entire cgroup subtree up front, we assume the estimates are
3088         * good, and retry with forcible deactivation if that fails.
3089         */
3090        if (sc->skipped_deactivate) {
3091                sc->priority = initial_priority;
3092                sc->force_deactivate = 1;
3093                sc->skipped_deactivate = 0;
3094                goto retry;
3095        }
3096
3097        /* Untapped cgroup reserves?  Don't OOM, retry. */
3098        if (sc->memcg_low_skipped) {
3099                sc->priority = initial_priority;
3100                sc->force_deactivate = 0;
3101                sc->memcg_low_reclaim = 1;
3102                sc->memcg_low_skipped = 0;
3103                goto retry;
3104        }
3105
3106        return 0;
3107}
3108
3109static bool allow_direct_reclaim(pg_data_t *pgdat)
3110{
3111        struct zone *zone;
3112        unsigned long pfmemalloc_reserve = 0;
3113        unsigned long free_pages = 0;
3114        int i;
3115        bool wmark_ok;
3116
3117        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3118                return true;
3119
3120        for (i = 0; i <= ZONE_NORMAL; i++) {
3121                zone = &pgdat->node_zones[i];
3122                if (!managed_zone(zone))
3123                        continue;
3124
3125                if (!zone_reclaimable_pages(zone))
3126                        continue;
3127
3128                pfmemalloc_reserve += min_wmark_pages(zone);
3129                free_pages += zone_page_state(zone, NR_FREE_PAGES);
3130        }
3131
3132        /* If there are no reserves (unexpected config) then do not throttle */
3133        if (!pfmemalloc_reserve)
3134                return true;
3135
3136        wmark_ok = free_pages > pfmemalloc_reserve / 2;
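        /*
         * Throttling triggers once free pages in the ZONE_NORMAL-and-below
         * zones fall under half of their summed min watermarks, i.e. well
         * into the PFMEMALLOC reserves.
         */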
3137
3138        /* kswapd must be awake if processes are being throttled */
3139        if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
3140                if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
3141                        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
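                /*
                 * Pointing the wakeup at ZONE_NORMAL or below makes kswapd
                 * work on the same lowmem zones whose reserves were summed
                 * above.
                 */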
3142
3143                wake_up_interruptible(&pgdat->kswapd_wait);
3144        }
3145
3146        return wmark_ok;
3147}
3148
3149/*
3150 * Throttle direct reclaimers if backing storage is backed by the network
3151 * and the PFMEMALLOC reserve for the preferred node is getting dangerously
3152 * depleted. kswapd will continue to make progress and wake the processes
3153 * when the low watermark is reached.
3154 *
3155 * Returns true if a fatal signal was delivered during throttling. If this
3156 * happens, the page allocator should not consider triggering the OOM killer.
3157 */
3158static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
3159                                        nodemask_t *nodemask)
3160{
3161        struct zoneref *z;
3162        struct zone *zone;
3163        pg_data_t *pgdat = NULL;
3164
3165        /*
3166         * Kernel threads should not be throttled as they may be indirectly
3167         * responsible for cleaning pages necessary for reclaim to make forward
3168         * progress. kjournald for example may enter direct reclaim while
3169         * committing a transaction, where throttling it could force other
3170         * processes to block on log_wait_commit().
3171         */
3172        if (current->flags & PF_KTHREAD)
3173                goto out;
3174
3175        /*
3176         * If a fatal signal is pending, this process should not throttle.
3177         * It should return quickly so it can exit and free its memory
3178         */
3179        if (fatal_signal_pending(current))
3180                goto out;
3181
3182        /*
3183         * Check if the pfmemalloc reserves are ok by finding the first node
3184         * with a usable ZONE_NORMAL or lower zone. The expectation is that
3185         * GFP_KERNEL will be required for allocating network buffers when
3186         * swapping over the network so ZONE_HIGHMEM is unusable.
3187         *
3188         * Throttling is based on the first usable node and throttled processes
3189         * wait on a queue until kswapd makes progress and wakes them. There
3190         * is then an affinity between processes waking up and where reclaim
3191         * progress has been made, assuming the process wakes on the same node.
3192         * More importantly, processes running on remote nodes will not compete
3193         * for remote pfmemalloc reserves and processes on different nodes
3194         * should make reasonable progress.
3195         */
3196        for_each_zone_zonelist_nodemask(zone, z, zonelist,
3197                                        gfp_zone(gfp_mask), nodemask) {
3198                if (zone_idx(zone) > ZONE_NORMAL)
3199                        continue;
3200
3201                /* Throttle based on the first usable node */
3202                pgdat = zone->zone_pgdat;
3203                if (allow_direct_reclaim(pgdat))
3204                        goto out;
3205                break;
3206        }
3207
3208        /* If no zone was usable by the allocation flags then do not throttle */
3209        if (!pgdat)
3210                goto out;
3211
3212        /* Account for the throttling */
3213        count_vm_event(PGSCAN_DIRECT_THROTTLE);
3214
3215        /*
3216         * If the caller cannot enter the filesystem, it's possible that it
3217         * is due to the caller holding an FS lock or performing a journal
3218         * transaction in the case of a filesystem like ext[3|4]. In this case,
3219         * it is not safe to block on pfmemalloc_wait as kswapd could be
3220         * blocked waiting on the same lock. Instead, throttle for up to a
3221         * second before continuing.
3222         */
3223        if (!(gfp_mask & __GFP_FS)) {
3224                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
3225                        allow_direct_reclaim(pgdat), HZ);
3226
3227                goto check_pending;
3228        }
3229
3230        /* Throttle until kswapd wakes the process */
3231        wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
3232                allow_direct_reclaim(pgdat));
3233
3234check_pending:
3235        if (fatal_signal_pending(current))
3236                return true;
3237
3238out:
3239        return false;
3240}
3241
3242unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
3243                                gfp_t gfp_mask, nodemask_t *nodemask)
3244{
3245        unsigned long nr_reclaimed;
3246        struct scan_control sc = {
3247                .nr_to_reclaim = SWAP_CLUSTER_MAX,
3248                .gfp_mask = current_gfp_context(gfp_mask),
3249                .reclaim_idx = gfp_zone(gfp_mask),
3250                .order = order,
3251                .nodemask = nodemask,
3252                .priority = DEF_PRIORITY,
3253                .may_writepage = !laptop_mode,
3254                .may_unmap = 1,
3255                .may_swap = 1,
3256        };
3257
3258        /*
3259         * scan_control uses s8 fields for order, priority, and reclaim_idx.
3260         * Confirm they are large enough for max values.
3261         */
3262        BUILD_BUG_ON(MAX_ORDER > S8_MAX);
3263        BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
3264        BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
3265
3266        /*
3267         * Do not enter reclaim if fatal signal was delivered while throttled.
3268         * 1 is returned so that the page allocator does not OOM kill at this
3269         * point.
3270         */
3271        if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
3272                return 1;
3273
3274        set_task_reclaim_state(current, &sc.reclaim_state);
3275        trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
3276
3277        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3278
3279        trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
3280        set_task_reclaim_state(current, NULL);
3281
3282        return nr_reclaimed;
3283}
3284
3285#ifdef CONFIG_MEMCG
3286
3287/* Only used by soft limit reclaim. Do not reuse for anything else. */
3288unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
3289                                                gfp_t gfp_mask, bool noswap,
3290                                                pg_data_t *pgdat,
3291                                                unsigned long *nr_scanned)
3292{
3293        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
3294        struct scan_control sc = {
3295                .nr_to_reclaim = SWAP_CLUSTER_MAX,
3296                .target_mem_cgroup = memcg,
3297                .may_writepage = !laptop_mode,
3298                .may_unmap = 1,
3299                .reclaim_idx = MAX_NR_ZONES - 1,
3300                .may_swap = !noswap,
3301        };
3302
3303        WARN_ON_ONCE(!current->reclaim_state);
3304
3305        sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
3306                        (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
3307
3308        trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
3309                                                      sc.gfp_mask);
3310
3311        /*
3312         * NOTE: Although we can get the priority field, using it
3313         * here is not a good idea, since it limits the pages we can scan.
3314         * If we don't reclaim here, the shrink_node() from balance_pgdat()
3315         * will pick up pages from other mem cgroups as well. We hack
3316         * the priority and make it zero.
3317         */
3318        shrink_lruvec(lruvec, &sc);
3319
3320        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
3321
3322        *nr_scanned = sc.nr_scanned;
3323
3324        return sc.nr_reclaimed;
3325}
3326
3327unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
3328                                           unsigned long nr_pages,
3329                                           gfp_t gfp_mask,
3330                                           bool may_swap)
3331{
3332        unsigned long nr_reclaimed;
3333        unsigned int noreclaim_flag;
3334        struct scan_control sc = {
3335                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
3336                .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
3337                                (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
3338                .reclaim_idx = MAX_NR_ZONES - 1,
3339                .target_mem_cgroup = memcg,
3340                .priority = DEF_PRIORITY,
3341                .may_writepage = !laptop_mode,
3342                .may_unmap = 1,
3343                .may_swap = may_swap,
3344        };
3345        /*
3346         * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
3347         * equal pressure on all the nodes. This is based on the assumption that
3348         * the reclaim does not bail out early.
3349         */
3350        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
3351
3352        set_task_reclaim_state(current, &sc.reclaim_state);
3353        trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
3354        noreclaim_flag = memalloc_noreclaim_save();
3355
3356        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
3357
3358        memalloc_noreclaim_restore(noreclaim_flag);
3359        trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
3360        set_task_reclaim_state(current, NULL);
3361
3362        return nr_reclaimed;
3363}
3364#endif
3365
3366static void age_active_anon(struct pglist_data *pgdat,
3367                                struct scan_control *sc)
3368{
3369        struct mem_cgroup *memcg;
3370        struct lruvec *lruvec;
3371
3372        if (!total_swap_pages)
3373                return;
3374
3375        lruvec = mem_cgroup_lruvec(NULL, pgdat);
3376        if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
3377                return;
3378
3379        memcg = mem_cgroup_iter(NULL, NULL, NULL);
3380        do {
3381                lruvec = mem_cgroup_lruvec(memcg, pgdat);
3382                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
3383                                   sc, LRU_ACTIVE_ANON);
3384                memcg = mem_cgroup_iter(NULL, memcg, NULL);
3385        } while (memcg);
3386}
3387
3388static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
3389{
3390        int i;
3391        struct zone *zone;
3392
3393        /*
3394         * Check for watermark boosts top-down as the higher zones
3395         * are more likely to be boosted. Both watermarks and boosts
3396         * should not be checked at the same time as reclaim would
3397         * start prematurely when there is no boosting and a lower
3398         * zone is balanced.
3399         */
3400        for (i = highest_zoneidx; i >= 0; i--) {
3401                zone = pgdat->node_zones + i;
3402                if (!managed_zone(zone))
3403                        continue;
3404
3405                if (zone->watermark_boost)
3406                        return true;
3407        }
3408
3409        return false;
3410}
3411
3412/*
3413 * Returns true if there is an eligible zone balanced for the request order
3414 * and highest_zoneidx
3415 */
3416static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
3417{
3418        int i;
3419        unsigned long mark = -1;
3420        struct zone *zone;
3421
3422        /*
3423         * Check watermarks bottom-up as lower zones are more likely to
3424         * meet watermarks.
3425         */
3426        for (i = 0; i <= highest_zoneidx; i++) {
3427                zone = pgdat->node_zones + i;
3428
3429                if (!managed_zone(zone))
3430                        continue;
3431
3432                mark = high_wmark_pages(zone);
3433                if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
3434                        return true;
3435        }
3436
3437        /*
3438         * If a node has no populated zone within highest_zoneidx, it does not
3439         * need balancing by definition. This can happen if a zone-restricted
3440         * allocation tries to wake a remote kswapd.
3441         */
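        /*
         * mark still holds its (unsigned) -1 initializer, i.e. no managed
         * zone was found below highest_zoneidx.
         */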
3442        if (mark == -1)
3443                return true;
3444
3445        return false;
3446}
3447
3448/* Clear pgdat state for congested, dirty or under writeback. */
3449static void clear_pgdat_congested(pg_data_t *pgdat)
3450{
3451        struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
3452
3453        clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
3454        clear_bit(PGDAT_DIRTY, &pgdat->flags);
3455        clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
3456}
3457
3458/*
3459 * Prepare kswapd for sleeping. This verifies that there are no processes
3460 * waiting in throttle_direct_reclaim() and that watermarks have been met.
3461 *
3462 * Returns true if kswapd is ready to sleep
3463 */
3464static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
3465                                int highest_zoneidx)
3466{
3467        /*
3468         * The throttled processes are normally woken up in balance_pgdat() as
3469         * soon as allow_direct_reclaim() is true. But there is a potential
3470         * race between when kswapd checks the watermarks and a process gets
3471         * throttled. There is also a potential race if processes get
3472         * throttled, kswapd wakes, a large process exits thereby balancing the
3473         * zones, which causes kswapd to exit balance_pgdat() before reaching
3474         * the wake up checks. If kswapd is going to sleep, no process should
3475         * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
3476         * the wake up is premature, processes will wake kswapd and get
3477         * throttled again. The difference from wake ups in balance_pgdat() is
3478         * that here we are under prepare_to_wait().
3479         */
3480        if (waitqueue_active(&pgdat->pfmemalloc_wait))
3481                wake_up_all(&pgdat->pfmemalloc_wait);
3482
3483        /* Hopeless node, leave it to direct reclaim */
3484        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
3485                return true;
3486
3487        if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
3488                clear_pgdat_congested(pgdat);
3489                return true;
3490        }
3491
3492        return false;
3493}
3494
3495/*
3496 * kswapd shrinks a node of pages that are at or below the highest usable
3497 * zone that is currently unbalanced.
3498 *
3499 * Returns true if kswapd scanned at least the requested number of pages to
3500 * reclaim or if the lack of progress was due to pages under writeback.
3501 * This is used to determine if the scanning priority needs to be raised.
3502 */
3503static bool kswapd_shrink_node(pg_data_t *pgdat,
3504                               struct scan_control *sc)
3505{
3506        struct zone *zone;
3507        int z;
3508
3509        /* Reclaim a number of pages proportional to the number of zones */
3510        sc->nr_to_reclaim = 0;
3511        for (z = 0; z <= sc->reclaim_idx; z++) {
3512                zone = pgdat->node_zones + z;
3513                if (!managed_zone(zone))
3514                        continue;
3515
3516                sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
3517        }
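        /*
         * Each eligible zone contributes at least SWAP_CLUSTER_MAX (32)
         * pages to the target, so even a tiny zone yields a meaningful
         * reclaim batch.
         */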
3518
3519        /*
3520         * Historically care was taken to put equal pressure on all zones but
3521         * now pressure is applied based on node LRU order.
3522         */
3523        shrink_node(pgdat, sc);
3524
3525        /*
3526         * Fragmentation may mean that the system cannot be rebalanced for
3527         * high-order allocations. If twice the allocation size has been
3528         * reclaimed then recheck watermarks only at order-0 to prevent
3529         * excessive reclaim. Assume that a process that requested a high-order
3530         * allocation can direct reclaim/compact.
3531         */
3532        if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
3533                sc->order = 0;
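        /*
         * For example, after an order-9 request this falls back to order-0
         * checks once 1024 pages (twice the request) have been reclaimed.
         */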
3534
3535        return sc->nr_scanned >= sc->nr_to_reclaim;
3536}
3537
3538/*
3539 * For kswapd, balance_pgdat() will reclaim pages across a node from zones
3540 * that are eligible for use by the caller until at least one zone is
3541 * balanced.
3542 *
3543 * Returns the order kswapd finished reclaiming at.
3544 *
3545 * kswapd scans the zones in the highmem->normal->dma direction.  It skips
3546 * zones which have free_pages > high_wmark_pages(zone), but once a zone is
3547 * found to have free_pages <= high_wmark_pages(zone), any page in that zone
3548 * or lower is eligible for reclaim until at least one usable zone is
3549 * balanced.
3550 */
3551static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
3552{
3553        int i;
3554        unsigned long nr_soft_reclaimed;
3555        unsigned long nr_soft_scanned;
3556        unsigned long pflags;
3557        unsigned long nr_boost_reclaim;
3558        unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
3559        bool boosted;
3560        struct zone *zone;
3561        struct scan_control sc = {
3562                .gfp_mask = GFP_KERNEL,
3563                .order = order,
3564                .may_unmap = 1,
3565        };
3566
3567        set_task_reclaim_state(current, &sc.reclaim_state);
3568        psi_memstall_enter(&pflags);
3569        __fs_reclaim_acquire();
3570
3571        count_vm_event(PAGEOUTRUN);
3572
3573        /*
3574         * Account for the reclaim boost. Note that the zone boost is left in
3575         * place so that parallel allocations that are near the watermark will
3576         * stall or enter direct reclaim until kswapd is finished.
3577         */
3578        nr_boost_reclaim = 0;
3579        for (i = 0; i <= highest_zoneidx; i++) {
3580                zone = pgdat->node_zones + i;
3581                if (!managed_zone(zone))
3582                        continue;
3583
3584                nr_boost_reclaim += zone->watermark_boost;
3585                zone_boosts[i] = zone->watermark_boost;
3586        }
3587        boosted = nr_boost_reclaim;
3588
3589restart:
3590        sc.priority = DEF_PRIORITY;
3591        do {
3592                unsigned long nr_reclaimed = sc.nr_reclaimed;
3593                bool raise_priority = true;
3594                bool balanced;
3595                bool ret;
3596
3597                sc.reclaim_idx = highest_zoneidx;
3598
3599                /*
3600                 * If the number of buffer_heads exceeds the maximum allowed
3601                 * then consider reclaiming from all zones. This has a dual
3602                 * purpose -- on 64-bit systems it is expected that
3603                 * buffer_heads are stripped during active rotation. On 32-bit
3604                 * systems, highmem pages can pin lowmem memory and shrinking
3605                 * buffers can relieve lowmem pressure. Reclaim may still not
3606                 * go ahead if all eligible zones for the original allocation
3607                 * request are balanced to avoid excessive reclaim from kswapd.
3608                 */
3609                if (buffer_heads_over_limit) {
3610                        for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
3611                                zone = pgdat->node_zones + i;
3612                                if (!managed_zone(zone))
3613                                        continue;
3614
3615                                sc.reclaim_idx = i;
3616                                break;
3617                        }
3618                }
3619
3620                /*
3621                 * If the pgdat is imbalanced then ignore boosting and preserve
3622                 * the watermarks for a later time and restart. Note that the
3623                 * zone watermarks will be still reset at the end of balancing
3624                 * on the grounds that the normal reclaim should be enough to
3625                 * re-evaluate if boosting is required when kswapd next wakes.
3626                 */
3627                balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
3628                if (!balanced && nr_boost_reclaim) {
3629                        nr_boost_reclaim = 0;
3630                        goto restart;
3631                }
3632
3633                /*
3634                 * If boosting is not active then only reclaim if there are no
3635                 * eligible zones. Note that sc.reclaim_idx is not used as
3636                 * buffer_heads_over_limit may have adjusted it.
3637                 */
3638                if (!nr_boost_reclaim && balanced)
3639                        goto out;
3640
3641                /* Limit the priority of boosting to avoid reclaim writeback */
3642                if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
3643                        raise_priority = false;
3644
3645                /*
3646                 * Do not writeback or swap pages for boosted reclaim. The
3647                 * intent is to relieve pressure not issue sub-optimal IO
3648                 * from reclaim context. If no pages are reclaimed, the
3649                 * reclaim will be aborted.
3650                 */
3651                sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
3652                sc.may_swap = !nr_boost_reclaim;
3653
3654                /*
3655                 * Do some background aging of the anon list, to give
3656                 * pages a chance to be referenced before reclaiming. All
3657                 * pages are rotated regardless of classzone as this is
3658                 * about consistent aging.
3659                 */
3660                age_active_anon(pgdat, &sc);
3661
3662                /*
3663                 * If we're getting trouble reclaiming, start doing writepage
3664                 * even in laptop mode.
3665                 */
3666                if (sc.priority < DEF_PRIORITY - 2)
3667                        sc.may_writepage = 1;
3668
3669                /* Call soft limit reclaim before calling shrink_node. */
3670                sc.nr_scanned = 0;
3671                nr_soft_scanned = 0;
3672                nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
3673                                                sc.gfp_mask, &nr_soft_scanned);
3674                sc.nr_reclaimed += nr_soft_reclaimed;
3675
3676                /*
3677                 * There should be no need to raise the scanning priority if
3678                 * enough pages are already being scanned that the high
3679                 * watermark would be met at 100% efficiency.
3680                 */
3681                if (kswapd_shrink_node(pgdat, &sc))
3682                        raise_priority = false;
3683
3684                /*
3685                 * If the low watermark is met there is no need for processes
3686                 * to be throttled on pfmemalloc_wait as they should now be
3687                 * able to safely make forward progress. Wake them up.
3688                 */
3689                if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
3690                                allow_direct_reclaim(pgdat))
3691                        wake_up_all(&pgdat->pfmemalloc_wait);
3692
3693                /* Check if kswapd should be suspending */
3694                __fs_reclaim_release();
3695                ret = try_to_freeze();
3696                __fs_reclaim_acquire();
3697                if (ret || kthread_should_stop())
3698                        break;
3699
3700                /*
3701                 * Raise priority if scanning rate is too low or there was no
3702                 * progress in reclaiming pages
3703                 */
3704                nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
3705                nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
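                /*
                 * The min() clamps the subtraction so the unsigned boost
                 * budget cannot wrap when a pass reclaims more than the
                 * remaining boost.
                 */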
3706
3707                /*
3708                 * If reclaim made no progress for a boost, stop reclaim as
3709                 * IO cannot be queued and it could be an infinite loop in
3710                 * extreme circumstances.
3711                 */
3712                if (nr_boost_reclaim && !nr_reclaimed)
3713                        break;
3714
3715                if (raise_priority || !nr_reclaimed)
3716                        sc.priority--;
3717        } while (sc.priority >= 1);
3718
3719        if (!sc.nr_reclaimed)
3720                pgdat->kswapd_failures++;
3721
3722out:
3723        /* If reclaim was boosted, account for the reclaim done in this pass */
3724        if (boosted) {
3725                unsigned long flags;
3726
3727                for (i = 0; i <= highest_zoneidx; i++) {
3728                        if (!zone_boosts[i])
3729                                continue;
3730
3731                        /* Increments are under the zone lock */
3732                        zone = pgdat->node_zones + i;
3733                        spin_lock_irqsave(&zone->lock, flags);
3734                        zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
3735                        spin_unlock_irqrestore(&zone->lock, flags);
3736                }
3737
3738                /*
3739                 * As there is now likely space, wake up kcompactd to defragment
3740                 * pageblocks.
3741                 */
3742                wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
3743        }
3744
3745        snapshot_refaults(NULL, pgdat);
3746        __fs_reclaim_release();
3747        psi_memstall_leave(&pflags);
3748        set_task_reclaim_state(current, NULL);
3749
3750        /*
3751         * Return the order kswapd stopped reclaiming at as
3752         * prepare_kswapd_sleep() takes it into account. If another caller
3753         * entered the allocator slow path while kswapd was awake, order will
3754         * remain at the higher level.
3755         */
3756        return sc.order;
3757}
3758
3759/*
3760 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
3761 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
3762 * not a valid index, then either kswapd runs for the first time or it couldn't
3763 * sleep after the previous reclaim attempt (the node is still unbalanced). In that
3764 * case return the zone index of the previous kswapd reclaim cycle.
3765 */
3766static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
3767                                           enum zone_type prev_highest_zoneidx)
3768{
3769        enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3770
3771        return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
3772}
3773
3774static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
3775                                unsigned int highest_zoneidx)
3776{
3777        long remaining = 0;
3778        DEFINE_WAIT(wait);
3779
3780        if (freezing(current) || kthread_should_stop())
3781                return;
3782
3783        prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3784
3785        /*
3786         * Try to sleep for a short interval. Note that kcompactd will only be
3787         * woken if it is possible to sleep for a short interval. This is
3788         * deliberate on the assumption that if reclaim cannot keep an
3789         * eligible zone balanced that it's also unlikely that compaction will
3790         * succeed.
3791         */
3792        if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3793                /*
3794                 * Compaction records what page blocks it recently failed to
3795                 * isolate pages from and skips them in the future scanning.
3796                 * When kswapd is going to sleep, it is reasonable to assume that
3797                 * page isolation and compaction may now succeed, so reset the cache.
3798                 */
3799                reset_isolation_suitable(pgdat);
3800
3801                /*
3802                 * We have freed the memory, now we should compact it to make
3803                 * allocation of the requested order possible.
3804                 */
3805                wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
3806
3807                remaining = schedule_timeout(HZ/10);
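                /*
                 * schedule_timeout() returns the jiffies left on the
                 * timeout, so a non-zero "remaining" means kswapd was woken
                 * before the short trial sleep expired.
                 */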
3808
3809                /*
3810                 * If woken prematurely then reset kswapd_highest_zoneidx and
3811                 * order. The values will either be from a wakeup request or
3812                 * the previous request that slept prematurely.
3813                 */
3814                if (remaining) {
3815                        WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
3816                                        kswapd_highest_zoneidx(pgdat,
3817                                                        highest_zoneidx));
3818
3819                        if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
3820                                WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
3821                }
3822
3823                finish_wait(&pgdat->kswapd_wait, &wait);
3824                prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
3825        }
3826
3827        /*
3828         * After a short sleep, check if it was a premature sleep. If not, then
3829         * go fully to sleep until explicitly woken up.
3830         */
3831        if (!remaining &&
3832            prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
3833                trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
3834
3835                /*
3836                 * vmstat counters are not perfectly accurate and the estimated
3837                 * value for counters such as NR_FREE_PAGES can deviate from the
3838                 * true value by nr_online_cpus * threshold. To avoid the zone
3839                 * watermarks being breached while under pressure, we reduce the
3840                 * per-cpu vmstat threshold while kswapd is awake and restore
3841                 * them before going back to sleep.
3842                 */
3843                set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
3844
3845                if (!kthread_should_stop())
3846                        schedule();
3847
3848                set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
3849        } else {
3850                if (remaining)
3851                        count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
3852                else
3853                        count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
3854        }
3855        finish_wait(&pgdat->kswapd_wait, &wait);
3856}
3857
3858/*
3859 * The background pageout daemon, started as a kernel thread
3860 * from the init process.
3861 *
3862 * This basically trickles out pages so that we have _some_
3863 * free memory available even if there is no other activity
3864 * that frees anything up. This is needed for things like routing
3865 * etc, where we otherwise might have all activity going on in
3866 * asynchronous contexts that cannot page things out.
3867 *
3868 * If there are applications that are active memory-allocators
3869 * (most normal use), this basically shouldn't matter.
3870 */
3871static int kswapd(void *p)
3872{
3873        unsigned int alloc_order, reclaim_order;
3874        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
3875        pg_data_t *pgdat = (pg_data_t*)p;
3876        struct task_struct *tsk = current;
3877        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
3878
3879        if (!cpumask_empty(cpumask))
3880                set_cpus_allowed_ptr(tsk, cpumask);
3881
3882        /*
3883         * Tell the memory management that we're a "memory allocator",
3884         * and that if we need more memory we should get access to it
3885         * regardless (see "__alloc_pages()"). "kswapd" should
3886         * never get caught in the normal page freeing logic.
3887         *
3888         * (Kswapd normally doesn't need memory anyway, but sometimes
3889         * you need a small amount of memory in order to be able to
3890         * page out something else, and this flag essentially protects
3891         * us from recursively trying to free more memory as we're
3892         * trying to free the first piece of memory in the first place).
3893         */
3894        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
3895        set_freezable();
3896
3897        WRITE_ONCE(pgdat->kswapd_order, 0);
3898        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3899        for ( ; ; ) {
3900                bool ret;
3901
3902                alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
3903                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3904                                                        highest_zoneidx);
3905
3906kswapd_try_sleep:
3907                kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
3908                                        highest_zoneidx);
3909
3910                /* Read the new order and highest_zoneidx */
3911                alloc_order = READ_ONCE(pgdat->kswapd_order);
3912                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
3913                                                        highest_zoneidx);
3914                WRITE_ONCE(pgdat->kswapd_order, 0);
3915                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
3916
3917                ret = try_to_freeze();
3918                if (kthread_should_stop())
3919                        break;
3920
3921                /*
3922                 * We can speed up thawing tasks if we don't call balance_pgdat
3923                 * after returning from the refrigerator
3924                 */
3925                if (ret)
3926                        continue;
3927
3928                /*
3929                 * Reclaim begins at the requested order but if a high-order
3930                 * reclaim fails then kswapd falls back to reclaiming for
3931                 * order-0. If that happens, kswapd will consider sleeping
3932                 * for the order it finished reclaiming at (reclaim_order)
3933                 * but kcompactd is woken to compact for the original
3934                 * request (alloc_order).
3935                 */
3936                trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
3937                                                alloc_order);
3938                reclaim_order = balance_pgdat(pgdat, alloc_order,
3939                                                highest_zoneidx);
3940                if (reclaim_order < alloc_order)
3941                        goto kswapd_try_sleep;
3942        }
3943
3944        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
3945
3946        return 0;
3947}
3948
3949/*
3950 * A zone is low on free memory or too fragmented for high-order memory.  If
3951 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
3952 * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
3953 * has failed or is not needed, still wake up kcompactd if only compaction is
3954 * needed.
3955 */
3956void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
3957                   enum zone_type highest_zoneidx)
3958{
3959        pg_data_t *pgdat;
3960        enum zone_type curr_idx;
3961
3962        if (!managed_zone(zone))
3963                return;
3964
3965        if (!cpuset_zone_allowed(zone, gfp_flags))
3966                return;
3967
3968        pgdat = zone->zone_pgdat;
3969        curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
3970
3971        if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
3972                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
3973
3974        if (READ_ONCE(pgdat->kswapd_order) < order)
3975                WRITE_ONCE(pgdat->kswapd_order, order);
3976
3977        if (!waitqueue_active(&pgdat->kswapd_wait))
3978                return;
3979
3980        /* Hopeless node, leave it to direct reclaim if possible */
3981        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
3982            (pgdat_balanced(pgdat, order, highest_zoneidx) &&
3983             !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
3984                /*
3985                 * There may be plenty of free memory available, but it's too
3986                 * fragmented for high-order allocations.  Wake up kcompactd
3987                 * and rely on compaction_suitable() to determine if it's
3988                 * needed.  If it fails, it will defer subsequent attempts to
3989                 * ratelimit its work.
3990                 */
3991                if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
3992                        wakeup_kcompactd(pgdat, order, highest_zoneidx);
3993                return;
3994        }
3995
3996        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
3997                                      gfp_flags);
3998        wake_up_interruptible(&pgdat->kswapd_wait);
3999}
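
/*
 * Illustrative sketch, not part of this file: the page allocator slow path
 * (wake_all_kswapds() in mm/page_alloc.c) typically drives this hook by
 * walking the zonelist and waking each eligible pgdat once, roughly:
 *
 *	for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
 *					ac->highest_zoneidx, ac->nodemask) {
 *		if (last_pgdat != zone->zone_pgdat)
 *			wakeup_kswapd(zone, gfp_mask, order,
 *				      ac->highest_zoneidx);
 *		last_pgdat = zone->zone_pgdat;
 *	}
 *
 * The details are simplified and may differ between kernel versions.
 */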
4000
4001#ifdef CONFIG_HIBERNATION
4002/*
4003 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
4004 * freed pages.
4005 *
4006 * Rather than trying to age LRUs, the aim is to preserve the overall
4007 * LRU order by reclaiming preferentially:
4008 * inactive > active > active referenced > active mapped
4009 */
4010unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
4011{
4012        struct scan_control sc = {
4013                .nr_to_reclaim = nr_to_reclaim,
4014                .gfp_mask = GFP_HIGHUSER_MOVABLE,
4015                .reclaim_idx = MAX_NR_ZONES - 1,
4016                .priority = DEF_PRIORITY,
4017                .may_writepage = 1,
4018                .may_unmap = 1,
4019                .may_swap = 1,
4020                .hibernation_mode = 1,
4021        };
4022        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
4023        unsigned long nr_reclaimed;
4024        unsigned int noreclaim_flag;
4025
4026        fs_reclaim_acquire(sc.gfp_mask);
4027        noreclaim_flag = memalloc_noreclaim_save();
4028        set_task_reclaim_state(current, &sc.reclaim_state);
4029
4030        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
4031
4032        set_task_reclaim_state(current, NULL);
4033        memalloc_noreclaim_restore(noreclaim_flag);
4034        fs_reclaim_release(sc.gfp_mask);
4035
4036        return nr_reclaimed;
4037}
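
/*
 * Illustrative usage sketch, not part of this file: the hibernation core
 * (hibernate_preallocate_memory() in kernel/power/snapshot.c) calls this
 * while sizing the snapshot image, roughly:
 *
 *	if (saveable > size)
 *		shrink_all_memory(saveable - size);
 *
 * i.e. it asks for only as much reclaim as is needed to make the image fit.
 * The exact arithmetic is simplified here.
 */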
4038#endif /* CONFIG_HIBERNATION */
4039
4040/*
4041 * This kswapd start function will be called by init and node-hot-add.
4042 * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
4043 */
4044int kswapd_run(int nid)
4045{
4046        pg_data_t *pgdat = NODE_DATA(nid);
4047        int ret = 0;
4048
4049        if (pgdat->kswapd)
4050                return 0;
4051
4052        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
4053        if (IS_ERR(pgdat->kswapd)) {
4054                /* failure at boot is fatal */
4055                BUG_ON(system_state < SYSTEM_RUNNING);
4056                pr_err("Failed to start kswapd on node %d\n", nid);
4057                ret = PTR_ERR(pgdat->kswapd);
4058                pgdat->kswapd = NULL;
4059        }
4060        return ret;
4061}
4062
4063/*
4064 * Called by memory hotplug when all memory in a node is offlined.  Caller must
4065 * hold mem_hotplug_begin/end().
4066 */
4067void kswapd_stop(int nid)
4068{
4069        struct task_struct *kswapd = NODE_DATA(nid)->kswapd;
4070
4071        if (kswapd) {
4072                kthread_stop(kswapd);
4073                NODE_DATA(nid)->kswapd = NULL;
4074        }
4075}
4076
4077static int __init kswapd_init(void)
4078{
4079        int nid;
4080
4081        swap_setup();
4082        for_each_node_state(nid, N_MEMORY)
4083                kswapd_run(nid);
4084        return 0;
4085}
4086
4087module_init(kswapd_init)
4088
4089#ifdef CONFIG_NUMA
4090/*
4091 * Node reclaim mode
4092 *
4093 * If non-zero call node_reclaim when the number of free pages falls below
4094 * the watermarks.
4095 */
4096int node_reclaim_mode __read_mostly;
4097
/*
 * These bit positions form part of the vm.zone_reclaim_mode sysctl ABI
 * (1 = reclaim on, 2 = write out dirty pages, 4 = unmap/swap pages), so
 * existing bits must keep their documented values.
 */
#define RECLAIM_ZONE  (1<<0)    /* Run node reclaim when free pages are low */
#define RECLAIM_WRITE (1<<1)    /* Writeout pages during reclaim */
#define RECLAIM_UNMAP (1<<2)    /* Unmap pages during reclaim */
4100
4101/*
4102 * Priority for NODE_RECLAIM. This determines the fraction of a node's
4103 * pages scanned in each node_reclaim pass. Priority 4 scans 1/16th of
4104 * the node.
4105 */
4106#define NODE_RECLAIM_PRIORITY 4
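
/*
 * Rough worked example of what this priority means, assuming the scan size
 * is simply the LRU size shifted right by the priority (as in
 * get_scan_count()):
 *
 *	scan = lru_size >> priority;	priority 4 => 1/2^4 = 1/16 of the
 *					node's LRU pages per pass
 *
 * __node_reclaim() below then lowers the priority towards 0, scanning
 * progressively larger fractions, until nr_pages have been reclaimed.
 */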
4107
4108/*
4109 * Percentage of pages in a node that must be unmapped for node_reclaim to
4110 * occur.
4111 */
4112int sysctl_min_unmapped_ratio = 1;
4113
4114/*
4115 * If the number of slab pages in a node grows beyond this percentage then
4116 * slab reclaim needs to occur.
4117 */
4118int sysctl_min_slab_ratio = 5;
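
/*
 * Both ratios are percentages of the node's managed pages. They are turned
 * into the absolute pgdat->min_unmapped_pages and pgdat->min_slab_pages
 * thresholds at init time and when the sysctls are updated, roughly
 * (simplified sketch, the real code lives in mm/page_alloc.c):
 *
 *	pgdat->min_unmapped_pages = managed_pages * sysctl_min_unmapped_ratio / 100;
 *	pgdat->min_slab_pages     = managed_pages * sysctl_min_slab_ratio / 100;
 */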
4119
4120static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
4121{
4122        unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
4123        unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
4124                node_page_state(pgdat, NR_ACTIVE_FILE);
4125
4126        /*
4127         * It's possible for there to be more file mapped pages than
4128         * accounted for by the pages on the file LRU lists because
4129         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
4130         */
4131        return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
4132}
4133
4134/* Work out how many page cache pages we can reclaim in this reclaim_mode */
4135static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
4136{
4137        unsigned long nr_pagecache_reclaimable;
4138        unsigned long delta = 0;
4139
4140        /*
4141         * If RECLAIM_UNMAP is set, then all file pages are considered
4142         * potentially reclaimable. Otherwise, we have to worry about
4143         * pages like swapcache, and node_unmapped_file_pages() provides
4144         * a better estimate.
4145         */
4146        if (node_reclaim_mode & RECLAIM_UNMAP)
4147                nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
4148        else
4149                nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);
4150
4151        /* If we can't clean pages, remove dirty pages from consideration */
4152        if (!(node_reclaim_mode & RECLAIM_WRITE))
4153                delta += node_page_state(pgdat, NR_FILE_DIRTY);
4154
4155        /* Watch for any possible underflows due to delta */
4156        if (unlikely(delta > nr_pagecache_reclaimable))
4157                delta = nr_pagecache_reclaimable;
4158
4159        return nr_pagecache_reclaimable - delta;
4160}
4161
4162/*
4163 * Try to free up some pages from this node through reclaim.
4164 */
4165static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4166{
4167        /* Minimum pages needed in order to stay on node */
4168        const unsigned long nr_pages = 1 << order;
4169        struct task_struct *p = current;
4170        unsigned int noreclaim_flag;
4171        struct scan_control sc = {
4172                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
4173                .gfp_mask = current_gfp_context(gfp_mask),
4174                .order = order,
4175                .priority = NODE_RECLAIM_PRIORITY,
4176                .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
4177                .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
4178                .may_swap = 1,
4179                .reclaim_idx = gfp_zone(gfp_mask),
4180        };
4181
4182        trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
4183                                           sc.gfp_mask);
4184
4185        cond_resched();
4186        fs_reclaim_acquire(sc.gfp_mask);
4187        /*
4188         * We need to be able to allocate from the reserves for RECLAIM_UNMAP
4189         * and we also need to be able to write out pages for RECLAIM_WRITE
4190         * and RECLAIM_UNMAP.
4191         */
4192        noreclaim_flag = memalloc_noreclaim_save();
4193        p->flags |= PF_SWAPWRITE;
4194        set_task_reclaim_state(p, &sc.reclaim_state);
4195
4196        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
4197                /*
4198                 * Free memory by calling shrink node with increasing
4199                 * priorities until we have enough memory freed.
4200                 */
4201                do {
4202                        shrink_node(pgdat, &sc);
4203                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
4204        }
4205
4206        set_task_reclaim_state(p, NULL);
4207        current->flags &= ~PF_SWAPWRITE;
4208        memalloc_noreclaim_restore(noreclaim_flag);
4209        fs_reclaim_release(sc.gfp_mask);
4210
4211        trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
4212
4213        return sc.nr_reclaimed >= nr_pages;
4214}
4215
4216int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
4217{
4218        int ret;
4219
4220        /*
4221         * Node reclaim reclaims unmapped file backed pages and
4222         * slab pages if we are over the defined limits.
4223         *
4224         * A small portion of unmapped file backed pages is needed for
4225         * file I/O; otherwise pages read by file I/O will be immediately
4226         * thrown out if the node is overallocated. So we do not reclaim
4227         * if less than a specified percentage of the node is used by
4228         * unmapped file backed pages.
4229         */
4230        if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
4231            node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
4232            pgdat->min_slab_pages)
4233                return NODE_RECLAIM_FULL;
4234
4235        /*
4236         * Do not scan if the allocation should not be delayed.
4237         */
4238        if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
4239                return NODE_RECLAIM_NOSCAN;
4240
4241        /*
4242         * Only run node reclaim on the local node or on nodes that do not
4243         * have associated processors. This will favor the local processor
4244         * over remote processors and spread off-node memory allocations
4245         * as widely as possible.
4246         */
4247        if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
4248                return NODE_RECLAIM_NOSCAN;
4249
4250        if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
4251                return NODE_RECLAIM_NOSCAN;
4252
4253        ret = __node_reclaim(pgdat, gfp_mask, order);
4254        clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
4255
4256        if (!ret)
4257                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
4258
4259        return ret;
4260}
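
/*
 * Illustrative call-site sketch, not part of this file: when a zone fails
 * its watermark check in get_page_from_freelist() and node reclaim is
 * enabled, mm/page_alloc.c does roughly the following before moving on to
 * the next zone:
 *
 *	ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
 *	if (ret == NODE_RECLAIM_NOSCAN || ret == NODE_RECLAIM_FULL)
 *		continue;
 *	recheck the zone watermark and allocate from it if it now passes
 *
 * Details are simplified and may differ between kernel versions.
 */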
4261#endif
4262
4263/**
4264 * check_move_unevictable_pages - check pages for evictability and move to
4265 * appropriate zone lru list
4266 * @pvec: pagevec with lru pages to check
4267 *
4268 * Checks pages for evictability; if an evictable page is on the unevictable
4269 * lru list, it is moved to the appropriate evictable lru list. This function
4270 * should only be used for lru pages.
4271 */
4272void check_move_unevictable_pages(struct pagevec *pvec)
4273{
4274        struct lruvec *lruvec = NULL;
4275        int pgscanned = 0;
4276        int pgrescued = 0;
4277        int i;
4278
4279        for (i = 0; i < pvec->nr; i++) {
4280                struct page *page = pvec->pages[i];
4281                int nr_pages;
4282
4283                if (PageTransTail(page))
4284                        continue;
4285
4286                nr_pages = thp_nr_pages(page);
4287                pgscanned += nr_pages;
4288
4289                /* block memcg migration during page moving between lru */
4290                if (!TestClearPageLRU(page))
4291                        continue;
4292
4293                lruvec = relock_page_lruvec_irq(page, lruvec);
4294                if (page_evictable(page) && PageUnevictable(page)) {
4295                        enum lru_list lru = page_lru_base_type(page);
4296
4297                        VM_BUG_ON_PAGE(PageActive(page), page);
4298                        ClearPageUnevictable(page);
4299                        del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
4300                        add_page_to_lru_list(page, lruvec, lru);
4301                        pgrescued += nr_pages;
4302                }
4303                SetPageLRU(page);
4304        }
4305
4306        if (lruvec) {
4307                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
4308                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4309                unlock_page_lruvec_irq(lruvec);
4310        } else if (pgscanned) {
4311                count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
4312        }
4313}
4314EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
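
/*
 * Illustrative usage sketch, not part of this file: shmem's SHM_UNLOCK
 * handling (shmem_unlock_mapping() in mm/shmem.c) walks the mapping with a
 * pagevec and feeds each batch to this function, roughly:
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec);
 *	while (!mapping_unevictable(mapping) &&
 *	       pagevec_lookup(&pvec, mapping, &index)) {
 *		check_move_unevictable_pages(&pvec);
 *		pagevec_release(&pvec);
 *		cond_resched();
 *	}
 */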
4315