linux/mm/swap.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 *  linux/mm/swap.c
   4 *
   5 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
   6 */
   7
   8/*
   9 * This file contains the default values for the operation of the
  10 * Linux VM subsystem. Fine-tuning documentation can be found in
  11 * Documentation/admin-guide/sysctl/vm.rst.
  12 * Started 18.12.91
  13 * Swap aging added 23.2.95, Stephen Tweedie.
  14 * Buffermem limits added 12.3.98, Rik van Riel.
  15 */
  16
  17#include <linux/mm.h>
  18#include <linux/sched.h>
  19#include <linux/kernel_stat.h>
  20#include <linux/swap.h>
  21#include <linux/mman.h>
  22#include <linux/pagemap.h>
  23#include <linux/pagevec.h>
  24#include <linux/init.h>
  25#include <linux/export.h>
  26#include <linux/mm_inline.h>
  27#include <linux/percpu_counter.h>
  28#include <linux/memremap.h>
  29#include <linux/percpu.h>
  30#include <linux/cpu.h>
  31#include <linux/notifier.h>
  32#include <linux/backing-dev.h>
  33#include <linux/memcontrol.h>
  34#include <linux/gfp.h>
  35#include <linux/uio.h>
  36#include <linux/hugetlb.h>
  37#include <linux/page_idle.h>
  38
  39#include "internal.h"
  40
  41#define CREATE_TRACE_POINTS
  42#include <trace/events/pagemap.h>
  43
  44/* How many pages do we try to swap or page in/out together? */
  45int page_cluster;
  46
  47static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
  48static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
  49static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
  50static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
  51#ifdef CONFIG_SMP
  52static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
  53#endif
  54
  55/*
  56 * This path almost never happens for VM activity - pages are normally
  57 * freed via pagevecs.  But it gets used by networking.
  58 */
  59static void __page_cache_release(struct page *page)
  60{
  61        if (PageLRU(page)) {
  62                pg_data_t *pgdat = page_pgdat(page);
  63                struct lruvec *lruvec;
  64                unsigned long flags;
  65
  66                spin_lock_irqsave(&pgdat->lru_lock, flags);
  67                lruvec = mem_cgroup_page_lruvec(page, pgdat);
  68                VM_BUG_ON_PAGE(!PageLRU(page), page);
  69                __ClearPageLRU(page);
  70                del_page_from_lru_list(page, lruvec, page_off_lru(page));
  71                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
  72        }
  73        __ClearPageWaiters(page);
  74        mem_cgroup_uncharge(page);
  75}
  76
  77static void __put_single_page(struct page *page)
  78{
  79        __page_cache_release(page);
  80        free_unref_page(page);
  81}
  82
  83static void __put_compound_page(struct page *page)
  84{
  85        compound_page_dtor *dtor;
  86
  87        /*
  88         * __page_cache_release() is supposed to be called for thp, not for
   89         * hugetlb. This is because a hugetlb page never has PageLRU set
   90         * (it is never put on any LRU list) and no memcg routines should
   91         * be called for hugetlb (it has a separate hugetlb_cgroup).
  92         */
  93        if (!PageHuge(page))
  94                __page_cache_release(page);
  95        dtor = get_compound_page_dtor(page);
  96        (*dtor)(page);
  97}
  98
  99void __put_page(struct page *page)
 100{
 101        if (is_zone_device_page(page)) {
 102                put_dev_pagemap(page->pgmap);
 103
 104                /*
 105                 * The page belongs to the device that created pgmap. Do
  106                 * not return it to the page allocator.
 107                 */
 108                return;
 109        }
 110
 111        if (unlikely(PageCompound(page)))
 112                __put_compound_page(page);
 113        else
 114                __put_single_page(page);
 115}
 116EXPORT_SYMBOL(__put_page);
 117
 118/**
 119 * put_pages_list() - release a list of pages
 120 * @pages: list of pages threaded on page->lru
 121 *
 122 * Release a list of pages which are strung together on page.lru.  Currently
 123 * used by read_cache_pages() and related error recovery code.
 124 */
 125void put_pages_list(struct list_head *pages)
 126{
 127        while (!list_empty(pages)) {
 128                struct page *victim;
 129
 130                victim = lru_to_page(pages);
 131                list_del(&victim->lru);
 132                put_page(victim);
 133        }
 134}
 135EXPORT_SYMBOL(put_pages_list);
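
/*
 * Minimal usage sketch (illustrative, not taken from an in-tree caller):
 * a caller that threads freshly allocated pages on a private list via
 * page->lru can drop its reference on all of them in one call.
 *
 *	LIST_HEAD(pages);
 *	struct page *page = alloc_page(GFP_KERNEL);
 *
 *	if (page)
 *		list_add(&page->lru, &pages);
 *	...
 *	put_pages_list(&pages);
 */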
 136
 137/*
 138 * get_kernel_pages() - pin kernel pages in memory
 139 * @kiov:       An array of struct kvec structures
 140 * @nr_segs:    number of segments to pin
 141 * @write:      pinning for read/write, currently ignored
 142 * @pages:      array that receives pointers to the pages pinned.
 143 *              Should be at least nr_segs long.
 144 *
  145 * Returns number of pages pinned. This may be fewer than @nr_segs if a
  146 * segment is not exactly PAGE_SIZE long. If @nr_segs is 0 or negative,
  147 * returns 0. Each page returned must be released with a put_page() call
  148 * when it is finished with.
 149 */
 150int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write,
 151                struct page **pages)
 152{
 153        int seg;
 154
 155        for (seg = 0; seg < nr_segs; seg++) {
 156                if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE))
 157                        return seg;
 158
 159                pages[seg] = kmap_to_page(kiov[seg].iov_base);
 160                get_page(pages[seg]);
 161        }
 162
 163        return seg;
 164}
 165EXPORT_SYMBOL_GPL(get_kernel_pages);
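
/*
 * Usage sketch (buf0 and buf1 are assumed to be page-aligned kernel
 * addresses): every kvec segment must be exactly PAGE_SIZE long, or the
 * loop above stops at the offending segment.
 *
 *	struct kvec kiov[2] = {
 *		{ .iov_base = buf0, .iov_len = PAGE_SIZE },
 *		{ .iov_base = buf1, .iov_len = PAGE_SIZE },
 *	};
 *	struct page *pages[2];
 *	int pinned = get_kernel_pages(kiov, 2, 0, pages);
 *
 *	while (pinned-- > 0)
 *		put_page(pages[pinned]);
 */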
 166
 167/*
 168 * get_kernel_page() - pin a kernel page in memory
 169 * @start:      starting kernel address
 170 * @write:      pinning for read/write, currently ignored
 171 * @pages:      array that receives pointer to the page pinned.
  172 *              Must have room for one page pointer.
  173 *
  174 * Returns 1 if the page is pinned, 0 otherwise. The page returned
  175 * must be released with a put_page() call when it is finished
  176 * with.
 177 */
 178int get_kernel_page(unsigned long start, int write, struct page **pages)
 179{
 180        const struct kvec kiov = {
 181                .iov_base = (void *)start,
 182                .iov_len = PAGE_SIZE
 183        };
 184
 185        return get_kernel_pages(&kiov, 1, write, pages);
 186}
 187EXPORT_SYMBOL_GPL(get_kernel_page);
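
/*
 * Usage sketch: pin the page backing a single kernel address (start is
 * assumed to be page-aligned) and release it when done.
 *
 *	struct page *page;
 *
 *	if (get_kernel_page(start, 0, &page) == 1) {
 *		...
 *		put_page(page);
 *	}
 */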
 188
 189static void pagevec_lru_move_fn(struct pagevec *pvec,
 190        void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
 191        void *arg)
 192{
 193        int i;
 194        struct pglist_data *pgdat = NULL;
 195        struct lruvec *lruvec;
 196        unsigned long flags = 0;
 197
 198        for (i = 0; i < pagevec_count(pvec); i++) {
 199                struct page *page = pvec->pages[i];
 200                struct pglist_data *pagepgdat = page_pgdat(page);
 201
 202                if (pagepgdat != pgdat) {
 203                        if (pgdat)
 204                                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
 205                        pgdat = pagepgdat;
 206                        spin_lock_irqsave(&pgdat->lru_lock, flags);
 207                }
 208
 209                lruvec = mem_cgroup_page_lruvec(page, pgdat);
 210                (*move_fn)(page, lruvec, arg);
 211        }
 212        if (pgdat)
 213                spin_unlock_irqrestore(&pgdat->lru_lock, flags);
 214        release_pages(pvec->pages, pvec->nr);
 215        pagevec_reinit(pvec);
 216}
 217
 218static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
 219                                 void *arg)
 220{
 221        int *pgmoved = arg;
 222
 223        if (PageLRU(page) && !PageUnevictable(page)) {
 224                del_page_from_lru_list(page, lruvec, page_lru(page));
 225                ClearPageActive(page);
 226                add_page_to_lru_list_tail(page, lruvec, page_lru(page));
 227                (*pgmoved)++;
 228        }
 229}
 230
 231/*
 232 * pagevec_move_tail() must be called with IRQ disabled.
 233 * Otherwise this may cause nasty races.
 234 */
 235static void pagevec_move_tail(struct pagevec *pvec)
 236{
 237        int pgmoved = 0;
 238
 239        pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
 240        __count_vm_events(PGROTATED, pgmoved);
 241}
 242
 243/*
 244 * Writeback is about to end against a page which has been marked for immediate
 245 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 246 * inactive list.
 247 */
 248void rotate_reclaimable_page(struct page *page)
 249{
 250        if (!PageLocked(page) && !PageDirty(page) &&
 251            !PageUnevictable(page) && PageLRU(page)) {
 252                struct pagevec *pvec;
 253                unsigned long flags;
 254
 255                get_page(page);
 256                local_irq_save(flags);
 257                pvec = this_cpu_ptr(&lru_rotate_pvecs);
 258                if (!pagevec_add(pvec, page) || PageCompound(page))
 259                        pagevec_move_tail(pvec);
 260                local_irq_restore(flags);
 261        }
 262}
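
/*
 * This is roughly how the writeback completion path uses the hint (a
 * sketch of the pattern in end_page_writeback()): if reclaim tagged the
 * page with PG_reclaim, rotate it so reclaim finds it first.
 *
 *	if (PageReclaim(page)) {
 *		ClearPageReclaim(page);
 *		rotate_reclaimable_page(page);
 *	}
 */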
 263
 264static void update_page_reclaim_stat(struct lruvec *lruvec,
 265                                     int file, int rotated)
 266{
 267        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 268
 269        reclaim_stat->recent_scanned[file]++;
 270        if (rotated)
 271                reclaim_stat->recent_rotated[file]++;
 272}
 273
 274static void __activate_page(struct page *page, struct lruvec *lruvec,
 275                            void *arg)
 276{
 277        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 278                int file = page_is_file_cache(page);
 279                int lru = page_lru_base_type(page);
 280
 281                del_page_from_lru_list(page, lruvec, lru);
 282                SetPageActive(page);
 283                lru += LRU_ACTIVE;
 284                add_page_to_lru_list(page, lruvec, lru);
 285                trace_mm_lru_activate(page);
 286
 287                __count_vm_event(PGACTIVATE);
 288                update_page_reclaim_stat(lruvec, file, 1);
 289        }
 290}
 291
 292#ifdef CONFIG_SMP
 293static void activate_page_drain(int cpu)
 294{
 295        struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
 296
 297        if (pagevec_count(pvec))
 298                pagevec_lru_move_fn(pvec, __activate_page, NULL);
 299}
 300
 301static bool need_activate_page_drain(int cpu)
 302{
 303        return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
 304}
 305
 306void activate_page(struct page *page)
 307{
 308        page = compound_head(page);
 309        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
 310                struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
 311
 312                get_page(page);
 313                if (!pagevec_add(pvec, page) || PageCompound(page))
 314                        pagevec_lru_move_fn(pvec, __activate_page, NULL);
 315                put_cpu_var(activate_page_pvecs);
 316        }
 317}
 318
 319#else
 320static inline void activate_page_drain(int cpu)
 321{
 322}
 323
 324void activate_page(struct page *page)
 325{
 326        pg_data_t *pgdat = page_pgdat(page);
 327
 328        page = compound_head(page);
 329        spin_lock_irq(&pgdat->lru_lock);
 330        __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
 331        spin_unlock_irq(&pgdat->lru_lock);
 332}
 333#endif
 334
 335static void __lru_cache_activate_page(struct page *page)
 336{
 337        struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 338        int i;
 339
 340        /*
 341         * Search backwards on the optimistic assumption that the page being
 342         * activated has just been added to this pagevec. Note that only
 343         * the local pagevec is examined as a !PageLRU page could be in the
 344         * process of being released, reclaimed, migrated or on a remote
 345         * pagevec that is currently being drained. Furthermore, marking
 346         * a remote pagevec's page PageActive potentially hits a race where
 347         * a page is marked PageActive just after it is added to the inactive
 348         * list causing accounting errors and BUG_ON checks to trigger.
 349         */
 350        for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
 351                struct page *pagevec_page = pvec->pages[i];
 352
 353                if (pagevec_page == page) {
 354                        SetPageActive(page);
 355                        break;
 356                }
 357        }
 358
 359        put_cpu_var(lru_add_pvec);
 360}
 361
 362/*
 363 * Mark a page as having seen activity.
 364 *
 365 * inactive,unreferenced        ->      inactive,referenced
 366 * inactive,referenced          ->      active,unreferenced
 367 * active,unreferenced          ->      active,referenced
 368 *
 369 * When a newly allocated page is not yet visible, so safe for non-atomic ops,
 370 * __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
 371 */
 372void mark_page_accessed(struct page *page)
 373{
 374        page = compound_head(page);
 375        if (!PageActive(page) && !PageUnevictable(page) &&
 376                        PageReferenced(page)) {
 377
 378                /*
 379                 * If the page is on the LRU, queue it for activation via
 380                 * activate_page_pvecs. Otherwise, assume the page is on a
 381                 * pagevec, mark it active and it'll be moved to the active
 382                 * LRU on the next drain.
 383                 */
 384                if (PageLRU(page))
 385                        activate_page(page);
 386                else
 387                        __lru_cache_activate_page(page);
 388                ClearPageReferenced(page);
 389                if (page_is_file_cache(page))
 390                        workingset_activation(page);
 391        } else if (!PageReferenced(page)) {
 392                SetPageReferenced(page);
 393        }
 394        if (page_is_idle(page))
 395                clear_page_idle(page);
 396}
 397EXPORT_SYMBOL(mark_page_accessed);
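
/*
 * Worked example of the transitions documented above, for a page that is
 * already on the inactive LRU:
 *
 *	mark_page_accessed(page);	inactive,unreferenced -> inactive,referenced
 *	mark_page_accessed(page);	inactive,referenced   -> active,unreferenced
 *	mark_page_accessed(page);	active,unreferenced   -> active,referenced
 */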
 398
 399static void __lru_cache_add(struct page *page)
 400{
 401        struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
 402
 403        get_page(page);
 404        if (!pagevec_add(pvec, page) || PageCompound(page))
 405                __pagevec_lru_add(pvec);
 406        put_cpu_var(lru_add_pvec);
 407}
 408
 409/**
 410 * lru_cache_add_anon - add a page to the page lists
 411 * @page: the page to add
 412 */
 413void lru_cache_add_anon(struct page *page)
 414{
 415        if (PageActive(page))
 416                ClearPageActive(page);
 417        __lru_cache_add(page);
 418}
 419
 420void lru_cache_add_file(struct page *page)
 421{
 422        if (PageActive(page))
 423                ClearPageActive(page);
 424        __lru_cache_add(page);
 425}
 426EXPORT_SYMBOL(lru_cache_add_file);
 427
 428/**
 429 * lru_cache_add - add a page to a page list
 430 * @page: the page to be added to the LRU.
 431 *
 432 * Queue the page for addition to the LRU via pagevec. The decision on whether
 433 * to add the page to the [in]active [file|anon] list is deferred until the
  434 * pagevec is drained. This gives the caller of lru_cache_add() a chance to
  435 * have the page added to the active list using mark_page_accessed().
 436 */
 437void lru_cache_add(struct page *page)
 438{
 439        VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
 440        VM_BUG_ON_PAGE(PageLRU(page), page);
 441        __lru_cache_add(page);
 442}
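
/*
 * Usage sketch: because the page sits in a per-cpu pagevec until the next
 * drain, a caller can still promote it before it ever reaches the inactive
 * list; __lru_cache_activate_page() handles exactly this case.
 *
 *	lru_cache_add(page);
 *	mark_page_accessed(page);	sets PG_referenced
 *	mark_page_accessed(page);	activates via __lru_cache_activate_page()
 */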
 443
 444/**
 445 * lru_cache_add_active_or_unevictable
 446 * @page:  the page to be added to LRU
 447 * @vma:   vma in which page is mapped for determining reclaimability
 448 *
 449 * Place @page on the active or unevictable LRU list, depending on its
 450 * evictability.  Note that if the page is not evictable, it goes
  451 * directly back onto its zone's unevictable list; it does NOT use a
  452 * per-cpu pagevec.
 453 */
 454void lru_cache_add_active_or_unevictable(struct page *page,
 455                                         struct vm_area_struct *vma)
 456{
 457        VM_BUG_ON_PAGE(PageLRU(page), page);
 458
 459        if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
 460                SetPageActive(page);
 461        else if (!TestSetPageMlocked(page)) {
 462                /*
  463                  * We use the irq-unsafe __mod_zone_page_state() because this
  464                  * counter is not modified from interrupt context, and the pte
  465                  * lock is held (a spinlock), which implies preemption is disabled.
 466                 */
 467                __mod_zone_page_state(page_zone(page), NR_MLOCK,
 468                                    hpage_nr_pages(page));
 469                count_vm_event(UNEVICTABLE_PGMLOCKED);
 470        }
 471        lru_cache_add(page);
 472}
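
/*
 * Typical fault-path usage (a sketch mirroring the anonymous fault code,
 * with the memcg charging steps omitted): the new page is mapped and then
 * placed on the LRU according to the vma; address is the faulting user
 * address.
 *
 *	page_add_new_anon_rmap(page, vma, address, false);
 *	lru_cache_add_active_or_unevictable(page, vma);
 */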
 473
 474/*
 475 * If the page can not be invalidated, it is moved to the
 476 * inactive list to speed up its reclaim.  It is moved to the
 477 * head of the list, rather than the tail, to give the flusher
 478 * threads some time to write it out, as this is much more
 479 * effective than the single-page writeout from reclaim.
 480 *
  481 * If the page isn't mapped and is dirty or under writeback, it can be
  482 * reclaimed ASAP by setting PG_reclaim.
 483 *
 484 * 1. active, mapped page -> none
 485 * 2. active, dirty/writeback page -> inactive, head, PG_reclaim
 486 * 3. inactive, mapped page -> none
 487 * 4. inactive, dirty/writeback page -> inactive, head, PG_reclaim
 488 * 5. inactive, clean -> inactive, tail
 489 * 6. Others -> none
 490 *
  491 * In case 4, the page is moved to the head of the inactive list because
  492 * the VM expects flusher threads to write it out, which is much more
  493 * effective than the single-page writeout from reclaim.
 494 */
 495static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
 496                              void *arg)
 497{
 498        int lru, file;
 499        bool active;
 500
 501        if (!PageLRU(page))
 502                return;
 503
 504        if (PageUnevictable(page))
 505                return;
 506
 507        /* Some processes are using the page */
 508        if (page_mapped(page))
 509                return;
 510
 511        active = PageActive(page);
 512        file = page_is_file_cache(page);
 513        lru = page_lru_base_type(page);
 514
 515        del_page_from_lru_list(page, lruvec, lru + active);
 516        ClearPageActive(page);
 517        ClearPageReferenced(page);
 518        add_page_to_lru_list(page, lruvec, lru);
 519
 520        if (PageWriteback(page) || PageDirty(page)) {
 521                /*
  522                 * Setting PG_reclaim can race with end_page_writeback(),
  523                 * which may confuse readahead.  But the race window is
  524                 * _really_ small and it is a non-critical problem.
 525                 */
 526                SetPageReclaim(page);
 527        } else {
 528                /*
  529                 * The page's writeback ended while it was in the pagevec,
  530                 * so move the page to the tail of the inactive list.
 531                 */
 532                list_move_tail(&page->lru, &lruvec->lists[lru]);
 533                __count_vm_event(PGROTATED);
 534        }
 535
 536        if (active)
 537                __count_vm_event(PGDEACTIVATE);
 538        update_page_reclaim_stat(lruvec, file, 0);
 539}
 540
 541
 542static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
 543                            void *arg)
 544{
 545        if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
 546            !PageSwapCache(page) && !PageUnevictable(page)) {
 547                bool active = PageActive(page);
 548
 549                del_page_from_lru_list(page, lruvec,
 550                                       LRU_INACTIVE_ANON + active);
 551                ClearPageActive(page);
 552                ClearPageReferenced(page);
 553                /*
  554                 * Lazyfree pages are clean anonymous pages. They have the
  555                 * SwapBacked flag cleared, to distinguish them from normal
  556                 * anonymous pages.
 557                 */
 558                ClearPageSwapBacked(page);
 559                add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
 560
 561                __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
 562                count_memcg_page_event(page, PGLAZYFREE);
 563                update_page_reclaim_stat(lruvec, 1, 0);
 564        }
 565}
 566
 567/*
 568 * Drain pages out of the cpu's pagevecs.
 569 * Either "cpu" is the current CPU, and preemption has already been
 570 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 571 */
 572void lru_add_drain_cpu(int cpu)
 573{
 574        struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
 575
 576        if (pagevec_count(pvec))
 577                __pagevec_lru_add(pvec);
 578
 579        pvec = &per_cpu(lru_rotate_pvecs, cpu);
 580        if (pagevec_count(pvec)) {
 581                unsigned long flags;
 582
 583                /* No harm done if a racing interrupt already did this */
 584                local_irq_save(flags);
 585                pagevec_move_tail(pvec);
 586                local_irq_restore(flags);
 587        }
 588
 589        pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
 590        if (pagevec_count(pvec))
 591                pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 592
 593        pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
 594        if (pagevec_count(pvec))
 595                pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
 596
 597        activate_page_drain(cpu);
 598}
 599
 600/**
 601 * deactivate_file_page - forcefully deactivate a file page
 602 * @page: page to deactivate
 603 *
 604 * This function hints the VM that @page is a good reclaim candidate,
 605 * for example if its invalidation fails due to the page being dirty
 606 * or under writeback.
 607 */
 608void deactivate_file_page(struct page *page)
 609{
 610        /*
  611         * In a workload with many unevictable pages (e.g. mlocked memory),
  612         * deactivating unevictable pages to accelerate reclaim is pointless.
 613         */
 614        if (PageUnevictable(page))
 615                return;
 616
 617        if (likely(get_page_unless_zero(page))) {
 618                struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
 619
 620                if (!pagevec_add(pvec, page) || PageCompound(page))
 621                        pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
 622                put_cpu_var(lru_deactivate_file_pvecs);
 623        }
 624}
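
/*
 * Sketch of the intended use, as in mapping invalidation: when a page
 * cannot be invalidated (for example because it is dirty or under
 * writeback), hint that it is a good reclaim candidate instead of leaving
 * it where it is.
 *
 *	if (!invalidate_inode_page(page))
 *		deactivate_file_page(page);
 */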
 625
 626/**
 627 * mark_page_lazyfree - make an anon page lazyfree
  628 * @page: anon page to mark lazyfree
 629 *
 630 * mark_page_lazyfree() moves @page to the inactive file list.
 631 * This is done to accelerate the reclaim of @page.
 632 */
 633void mark_page_lazyfree(struct page *page)
 634{
 635        if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
 636            !PageSwapCache(page) && !PageUnevictable(page)) {
 637                struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
 638
 639                get_page(page);
 640                if (!pagevec_add(pvec, page) || PageCompound(page))
 641                        pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
 642                put_cpu_var(lru_lazyfree_pvecs);
 643        }
 644}
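
/*
 * Sketch of the MADV_FREE path that feeds this (a simplification of
 * madvise_free_pte_range()): the page is cleaned first, then marked
 * lazyfree so reclaim may discard it as long as it stays clean.
 *
 *	ClearPageDirty(page);
 *	mark_page_lazyfree(page);
 */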
 645
 646void lru_add_drain(void)
 647{
 648        lru_add_drain_cpu(get_cpu());
 649        put_cpu();
 650}
 651
 652#ifdef CONFIG_SMP
 653
 654static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
 655
 656static void lru_add_drain_per_cpu(struct work_struct *dummy)
 657{
 658        lru_add_drain();
 659}
 660
 661/*
 662 * Doesn't need any cpu hotplug locking because we do rely on per-cpu
 663 * kworkers being shut down before our page_alloc_cpu_dead callback is
 664 * executed on the offlined cpu.
 665 * Calling this function with cpu hotplug locks held can actually lead
 666 * to obscure indirect dependencies via WQ context.
 667 */
 668void lru_add_drain_all(void)
 669{
 670        static DEFINE_MUTEX(lock);
 671        static struct cpumask has_work;
 672        int cpu;
 673
 674        /*
 675         * Make sure nobody triggers this path before mm_percpu_wq is fully
 676         * initialized.
 677         */
 678        if (WARN_ON(!mm_percpu_wq))
 679                return;
 680
 681        mutex_lock(&lock);
 682        cpumask_clear(&has_work);
 683
 684        for_each_online_cpu(cpu) {
 685                struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
 686
 687                if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
 688                    pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
 689                    pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
 690                    pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
 691                    need_activate_page_drain(cpu)) {
 692                        INIT_WORK(work, lru_add_drain_per_cpu);
 693                        queue_work_on(cpu, mm_percpu_wq, work);
 694                        cpumask_set_cpu(cpu, &has_work);
 695                }
 696        }
 697
 698        for_each_cpu(cpu, &has_work)
 699                flush_work(&per_cpu(lru_add_drain_work, cpu));
 700
 701        mutex_unlock(&lock);
 702}
 703#else
 704void lru_add_drain_all(void)
 705{
 706        lru_add_drain();
 707}
 708#endif
 709
 710/**
 711 * release_pages - batched put_page()
 712 * @pages: array of pages to release
 713 * @nr: number of pages
 714 *
 715 * Decrement the reference count on all the pages in @pages.  If it
  716 * drops to zero, remove the page from the LRU and free it.
 717 */
 718void release_pages(struct page **pages, int nr)
 719{
 720        int i;
 721        LIST_HEAD(pages_to_free);
 722        struct pglist_data *locked_pgdat = NULL;
 723        struct lruvec *lruvec;
 724        unsigned long uninitialized_var(flags);
 725        unsigned int uninitialized_var(lock_batch);
 726
 727        for (i = 0; i < nr; i++) {
 728                struct page *page = pages[i];
 729
 730                /*
 731                 * Make sure the IRQ-safe lock-holding time does not get
 732                 * excessive with a continuous string of pages from the
 733                 * same pgdat. The lock is held only if pgdat != NULL.
 734                 */
 735                if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
 736                        spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 737                        locked_pgdat = NULL;
 738                }
 739
 740                if (is_huge_zero_page(page))
 741                        continue;
 742
 743                if (is_zone_device_page(page)) {
 744                        if (locked_pgdat) {
 745                                spin_unlock_irqrestore(&locked_pgdat->lru_lock,
 746                                                       flags);
 747                                locked_pgdat = NULL;
 748                        }
 749                        /*
 750                         * ZONE_DEVICE pages that return 'false' from
 751                         * put_devmap_managed_page() do not require special
 752                         * processing, and instead, expect a call to
 753                         * put_page_testzero().
 754                         */
 755                        if (put_devmap_managed_page(page))
 756                                continue;
 757                }
 758
 759                page = compound_head(page);
 760                if (!put_page_testzero(page))
 761                        continue;
 762
 763                if (PageCompound(page)) {
 764                        if (locked_pgdat) {
 765                                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 766                                locked_pgdat = NULL;
 767                        }
 768                        __put_compound_page(page);
 769                        continue;
 770                }
 771
 772                if (PageLRU(page)) {
 773                        struct pglist_data *pgdat = page_pgdat(page);
 774
 775                        if (pgdat != locked_pgdat) {
 776                                if (locked_pgdat)
 777                                        spin_unlock_irqrestore(&locked_pgdat->lru_lock,
 778                                                                        flags);
 779                                lock_batch = 0;
 780                                locked_pgdat = pgdat;
 781                                spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
 782                        }
 783
 784                        lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
 785                        VM_BUG_ON_PAGE(!PageLRU(page), page);
 786                        __ClearPageLRU(page);
 787                        del_page_from_lru_list(page, lruvec, page_off_lru(page));
 788                }
 789
 790                /* Clear Active bit in case of parallel mark_page_accessed */
 791                __ClearPageActive(page);
 792                __ClearPageWaiters(page);
 793
 794                list_add(&page->lru, &pages_to_free);
 795        }
 796        if (locked_pgdat)
 797                spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
 798
 799        mem_cgroup_uncharge_list(&pages_to_free);
 800        free_unref_page_list(&pages_to_free);
 801}
 802EXPORT_SYMBOL(release_pages);
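
/*
 * Usage sketch: dropping the references taken by get_user_pages_fast()
 * in one batched call rather than a put_page() loop.
 *
 *	int got = get_user_pages_fast(start, nr_pages, 0, pages);
 *
 *	if (got > 0)
 *		release_pages(pages, got);
 */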
 803
 804/*
 805 * The pages which we're about to release may be in the deferred lru-addition
 806 * queues.  That would prevent them from really being freed right now.  That's
 807 * OK from a correctness point of view but is inefficient - those pages may be
 808 * cache-warm and we want to give them back to the page allocator ASAP.
 809 *
  810 * So __pagevec_release() will drain those queues here.
  811 * __pagevec_lru_add() calls release_pages() directly to avoid
  812 * mutual recursion.
 813 */
 814void __pagevec_release(struct pagevec *pvec)
 815{
 816        if (!pvec->percpu_pvec_drained) {
 817                lru_add_drain();
 818                pvec->percpu_pvec_drained = true;
 819        }
 820        release_pages(pvec->pages, pagevec_count(pvec));
 821        pagevec_reinit(pvec);
 822}
 823EXPORT_SYMBOL(__pagevec_release);
 824
 825#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 826/* used by __split_huge_page_refcount() */
 827void lru_add_page_tail(struct page *page, struct page *page_tail,
 828                       struct lruvec *lruvec, struct list_head *list)
 829{
 830        const int file = 0;
 831
 832        VM_BUG_ON_PAGE(!PageHead(page), page);
 833        VM_BUG_ON_PAGE(PageCompound(page_tail), page);
 834        VM_BUG_ON_PAGE(PageLRU(page_tail), page);
 835        lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
 836
 837        if (!list)
 838                SetPageLRU(page_tail);
 839
 840        if (likely(PageLRU(page)))
 841                list_add_tail(&page_tail->lru, &page->lru);
 842        else if (list) {
 843                /* page reclaim is reclaiming a huge page */
 844                get_page(page_tail);
 845                list_add_tail(&page_tail->lru, list);
 846        } else {
 847                struct list_head *list_head;
 848                /*
 849                 * Head page has not yet been counted, as an hpage,
 850                 * so we must account for each subpage individually.
 851                 *
 852                 * Use the standard add function to put page_tail on the list,
 853                 * but then correct its position so they all end up in order.
 854                 */
 855                add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
 856                list_head = page_tail->lru.prev;
 857                list_move_tail(&page_tail->lru, list_head);
 858        }
 859
 860        if (!PageUnevictable(page))
 861                update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
 862}
 863#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 864
 865static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
 866                                 void *arg)
 867{
 868        enum lru_list lru;
 869        int was_unevictable = TestClearPageUnevictable(page);
 870
 871        VM_BUG_ON_PAGE(PageLRU(page), page);
 872
 873        SetPageLRU(page);
 874        /*
 875         * Page becomes evictable in two ways:
 876         * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
 877         * 2) Before acquiring LRU lock to put the page to correct LRU and then
 878         *   a) do PageLRU check with lock [check_move_unevictable_pages]
 879         *   b) do PageLRU check before lock [clear_page_mlock]
 880         *
 881         * (1) & (2a) are ok as LRU lock will serialize them. For (2b), we need
 882         * following strict ordering:
 883         *
 884         * #0: __pagevec_lru_add_fn             #1: clear_page_mlock
 885         *
 886         * SetPageLRU()                         TestClearPageMlocked()
 887         * smp_mb() // explicit ordering        // above provides strict
 888         *                                      // ordering
 889         * PageMlocked()                        PageLRU()
 890         *
 891         *
  892         * If '#1' does not observe the setting of PG_lru by '#0' and fails
  893         * isolation, the explicit barrier makes sure that the page_evictable()
  894         * check puts the page on the correct LRU. Without smp_mb(), SetPageLRU()
  895         * can be reordered after the PageMlocked() check, which can make '#1'
  896         * fail the isolation of a page whose Mlocked bit is being cleared (#0
  897         * is also looking at the same page), and the evictable page would be
  898         * stranded on an unevictable LRU.
 899         */
 900        smp_mb();
 901
 902        if (page_evictable(page)) {
 903                lru = page_lru(page);
 904                update_page_reclaim_stat(lruvec, page_is_file_cache(page),
 905                                         PageActive(page));
 906                if (was_unevictable)
 907                        count_vm_event(UNEVICTABLE_PGRESCUED);
 908        } else {
 909                lru = LRU_UNEVICTABLE;
 910                ClearPageActive(page);
 911                SetPageUnevictable(page);
 912                if (!was_unevictable)
 913                        count_vm_event(UNEVICTABLE_PGCULLED);
 914        }
 915
 916        add_page_to_lru_list(page, lruvec, lru);
 917        trace_mm_lru_insertion(page, lru);
 918}
 919
 920/*
 921 * Add the passed pages to the LRU, then drop the caller's refcount
 922 * on them.  Reinitialises the caller's pagevec.
 923 */
 924void __pagevec_lru_add(struct pagevec *pvec)
 925{
 926        pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
 927}
 928EXPORT_SYMBOL(__pagevec_lru_add);
 929
 930/**
 931 * pagevec_lookup_entries - gang pagecache lookup
 932 * @pvec:       Where the resulting entries are placed
 933 * @mapping:    The address_space to search
 934 * @start:      The starting entry index
  935 * @nr_entries: The maximum number of entries
 936 * @indices:    The cache indices corresponding to the entries in @pvec
 937 *
 938 * pagevec_lookup_entries() will search for and return a group of up
  939 * to @nr_entries pages and shadow entries in the mapping.  All
 940 * entries are placed in @pvec.  pagevec_lookup_entries() takes a
 941 * reference against actual pages in @pvec.
 942 *
 943 * The search returns a group of mapping-contiguous entries with
 944 * ascending indexes.  There may be holes in the indices due to
 945 * not-present entries.
 946 *
 947 * pagevec_lookup_entries() returns the number of entries which were
 948 * found.
 949 */
 950unsigned pagevec_lookup_entries(struct pagevec *pvec,
 951                                struct address_space *mapping,
 952                                pgoff_t start, unsigned nr_entries,
 953                                pgoff_t *indices)
 954{
 955        pvec->nr = find_get_entries(mapping, start, nr_entries,
 956                                    pvec->pages, indices);
 957        return pagevec_count(pvec);
 958}
 959
 960/**
 961 * pagevec_remove_exceptionals - pagevec exceptionals pruning
 962 * @pvec:       The pagevec to prune
 963 *
 964 * pagevec_lookup_entries() fills both pages and exceptional radix
 965 * tree entries into the pagevec.  This function prunes all
 966 * exceptionals from @pvec without leaving holes, so that it can be
 967 * passed on to page-only pagevec operations.
 968 */
 969void pagevec_remove_exceptionals(struct pagevec *pvec)
 970{
 971        int i, j;
 972
 973        for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
 974                struct page *page = pvec->pages[i];
 975                if (!xa_is_value(page))
 976                        pvec->pages[j++] = page;
 977        }
 978        pvec->nr = j;
 979}
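
/*
 * Usage sketch: a truncate-style walker looks up a mixed batch of pages
 * and shadow entries, then prunes the shadow entries so that only
 * struct page pointers remain before taking page locks; index is the
 * starting page cache index.
 *
 *	pgoff_t indices[PAGEVEC_SIZE];
 *	unsigned nr;
 *
 *	nr = pagevec_lookup_entries(&pvec, mapping, index,
 *				    PAGEVEC_SIZE, indices);
 *	pagevec_remove_exceptionals(&pvec);
 *	... operate on pvec.pages[0..pagevec_count(&pvec) - 1] ...
 *	pagevec_release(&pvec);
 */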
 980
 981/**
 982 * pagevec_lookup_range - gang pagecache lookup
 983 * @pvec:       Where the resulting pages are placed
 984 * @mapping:    The address_space to search
 985 * @start:      The starting page index
 986 * @end:        The final page index
 987 *
 988 * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE
  989 * pages in the mapping starting from index @start and up to index @end
  990 * (inclusive).  The pages are placed in @pvec.  pagevec_lookup_range() takes a
 991 * reference against the pages in @pvec.
 992 *
 993 * The search returns a group of mapping-contiguous pages with ascending
 994 * indexes.  There may be holes in the indices due to not-present pages. We
 995 * also update @start to index the next page for the traversal.
 996 *
 997 * pagevec_lookup_range() returns the number of pages which were found. If this
 998 * number is smaller than PAGEVEC_SIZE, the end of specified range has been
 999 * reached.
1000 */
1001unsigned pagevec_lookup_range(struct pagevec *pvec,
1002                struct address_space *mapping, pgoff_t *start, pgoff_t end)
1003{
1004        pvec->nr = find_get_pages_range(mapping, start, end, PAGEVEC_SIZE,
1005                                        pvec->pages);
1006        return pagevec_count(pvec);
1007}
1008EXPORT_SYMBOL(pagevec_lookup_range);
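
/*
 * Usage sketch: walking every cached page of a mapping in PAGEVEC_SIZE
 * batches. @start is advanced past the last page returned, so the loop
 * ends when a lookup comes back empty; process_page() is a hypothetical
 * helper standing in for whatever the caller does with each page.
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *	int i;
 *
 *	pagevec_init(&pvec);
 *	while (pagevec_lookup_range(&pvec, mapping, &index, (pgoff_t)-1)) {
 *		for (i = 0; i < pagevec_count(&pvec); i++)
 *			process_page(pvec.pages[i]);
 *		pagevec_release(&pvec);
 *	}
 */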
1009
1010unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
1011                struct address_space *mapping, pgoff_t *index, pgoff_t end,
1012                xa_mark_t tag)
1013{
1014        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1015                                        PAGEVEC_SIZE, pvec->pages);
1016        return pagevec_count(pvec);
1017}
1018EXPORT_SYMBOL(pagevec_lookup_range_tag);
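
/*
 * Usage sketch: gathering dirty pages for writeback in batches, in the
 * style of write_cache_pages(); end is the last page index of the range
 * being written back.
 *
 *	struct pagevec pvec;
 *	pgoff_t index = 0;
 *
 *	pagevec_init(&pvec);
 *	while (pagevec_lookup_range_tag(&pvec, mapping, &index, end,
 *					PAGECACHE_TAG_DIRTY)) {
 *		... write out pvec.pages[0..pagevec_count(&pvec) - 1] ...
 *		pagevec_release(&pvec);
 *	}
 */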
1019
1020unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
1021                struct address_space *mapping, pgoff_t *index, pgoff_t end,
1022                xa_mark_t tag, unsigned max_pages)
1023{
1024        pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
1025                min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
1026        return pagevec_count(pvec);
1027}
1028EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
1029/*
1030 * Perform any setup for the swap system
1031 */
1032void __init swap_setup(void)
1033{
1034        unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
1035
1036        /* Use a smaller cluster for small-memory machines */
1037        if (megs < 16)
1038                page_cluster = 2;
1039        else
1040                page_cluster = 3;
1041        /*
 1042         * Right now other parts of the system mean that we
 1043         * _really_ don't want to cluster much more.
1044         */
1045}
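
/*
 * Worked example of the sizing above: with 2 GiB of RAM, megs == 2048, so
 * page_cluster is set to 3 and swap readahead works on up to
 * 1 << 3 == 8 pages at a time; the value can be tuned later through the
 * vm.page-cluster sysctl.
 */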
1046