linux/mm/swap.c
/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>  /* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
#include <linux/memcontrol.h>

#include "internal.h"

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

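/*
 * Per-cpu pagevecs: lru_add_pvecs batches pages being added to each LRU
 * list, and lru_rotate_pvecs batches pages being moved to the tail of
 * the inactive list, so that zone->lru_lock is taken once per batch
 * rather than once per page.
 */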
static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void __page_cache_release(struct page *page)
{
        if (PageLRU(page)) {
                unsigned long flags;
                struct zone *zone = page_zone(page);

                spin_lock_irqsave(&zone->lru_lock, flags);
                VM_BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
        free_hot_page(page);
}

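/*
 * Drop a reference on a compound page.  The reference is dropped against
 * the head page; when its count reaches zero, the compound destructor
 * tears down and frees the whole compound page.
 */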
static void put_compound_page(struct page *page)
{
        page = compound_head(page);
        if (put_page_testzero(page)) {
                compound_page_dtor *dtor;

                dtor = get_compound_page_dtor(page);
                (*dtor)(page);
        }
}

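/*
 * Drop a reference on a page.  Compound pages are handled by
 * put_compound_page(); for ordinary pages, the final put removes the
 * page from the LRU (if it is on one) and frees it.
 */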
void put_page(struct page *page)
{
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))
                __page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list() - release a list of pages
 * @pages: list of pages threaded on page->lru
 *
 * Release a list of pages which are strung together on page.lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *victim;

                victim = list_entry(pages->prev, struct page, lru);
                list_del(&victim->lru);
                page_cache_release(victim);
        }
}
EXPORT_SYMBOL(put_pages_list);

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        int i;
        int pgmoved = 0;
        struct zone *zone = NULL;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock(&zone->lru_lock);
                }
                if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                        int lru = page_lru_base_type(page);
                        list_move_tail(&page->lru, &zone->lru[lru].list);
                        pgmoved++;
                }
        }
        if (zone)
                spin_unlock(&zone->lru_lock);
        __count_vm_events(PGROTATED, pgmoved);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 */
void rotate_reclaimable_page(struct page *page)
{
        if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
            !PageUnevictable(page) && PageLRU(page)) {
                struct pagevec *pvec;
                unsigned long flags;

                page_cache_get(page);
                local_irq_save(flags);
                pvec = &__get_cpu_var(lru_rotate_pvecs);
                if (!pagevec_add(pvec, page))
                        pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

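/*
 * Count the page as recently scanned - and as recently rotated if
 * @rotated - in the zone's reclaim statistics and, if the page belongs
 * to a memory cgroup, in that cgroup's statistics as well.  Reclaim
 * uses these ratios to balance scanning pressure between the anon and
 * file LRU lists.
 */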
static void update_page_reclaim_stat(struct zone *zone, struct page *page,
                                     int file, int rotated)
{
        struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
        struct zone_reclaim_stat *memcg_reclaim_stat;

        memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);

        reclaim_stat->recent_scanned[file]++;
        if (rotated)
                reclaim_stat->recent_rotated[file]++;

        if (!memcg_reclaim_stat)
                return;

        memcg_reclaim_stat->recent_scanned[file]++;
        if (rotated)
                memcg_reclaim_stat->recent_rotated[file]++;
}

/*
 * FIXME: speed this up?
 */
void activate_page(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                int file = page_is_file_cache(page);
                int lru = page_lru_base_type(page);
                del_page_from_lru_list(zone, page, lru);

                SetPageActive(page);
                lru += LRU_ACTIVE;
                add_page_to_lru_list(zone, page, lru);
                __count_vm_event(PGACTIVATE);

                update_page_reclaim_stat(zone, page, file, 1);
        }
        spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced        ->      inactive,referenced
 * inactive,referenced          ->      active,unreferenced
 * active,unreferenced          ->      active,referenced
 */
void mark_page_accessed(struct page *page)
{
        if (!PageActive(page) && !PageUnevictable(page) &&
                        PageReferenced(page) && PageLRU(page)) {
                activate_page(page);
                ClearPageReferenced(page);
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
}

EXPORT_SYMBOL(mark_page_accessed);

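/*
 * Take a reference on @page and queue it in this CPU's pagevec for
 * @lru.  When the pagevec fills up, the whole batch is drained onto
 * the zone's LRU list via ____pagevec_lru_add().
 */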
void __lru_cache_add(struct page *page, enum lru_list lru)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];

        page_cache_get(page);
        if (!pagevec_add(pvec, page))
                ____pagevec_lru_add(pvec, lru);
        put_cpu_var(lru_add_pvecs);
}

/**
 * lru_cache_add_lru - add a page to a page list
 * @page: the page to be added to the LRU.
 * @lru: the LRU list to which the page is added.
 */
void lru_cache_add_lru(struct page *page, enum lru_list lru)
{
        if (PageActive(page)) {
                VM_BUG_ON(PageUnevictable(page));
                ClearPageActive(page);
        } else if (PageUnevictable(page)) {
                VM_BUG_ON(PageActive(page));
                ClearPageUnevictable(page);
        }

        VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
        __lru_cache_add(page, lru);
}

/**
 * add_page_to_unevictable_list - add a page to the unevictable list
 * @page:  the page to be added to the unevictable list
 *
 * Add page directly to its zone's unevictable list.  To avoid races with
 * tasks that might be making the page evictable, through eg. munlock,
 * munmap or exit, while it's not on the lru, we want to add the page
 * while it's locked or otherwise "invisible" to other tasks.  This is
 * difficult to do when using the pagevec cache, so bypass that.
 */
void add_page_to_unevictable_list(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        SetPageUnevictable(page);
        SetPageLRU(page);
        add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
        spin_unlock_irq(&zone->lru_lock);
}

/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
        struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
        struct pagevec *pvec;
        int lru;

        for_each_lru(lru) {
                pvec = &pvecs[lru - LRU_BASE];
                if (pagevec_count(pvec))
                        ____pagevec_lru_add(pvec, lru);
        }

        pvec = &per_cpu(lru_rotate_pvecs, cpu);
        if (pagevec_count(pvec)) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_irq_save(flags);
                pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

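/*
 * Drain the calling CPU's LRU pagevecs.  get_cpu()/put_cpu() disable
 * preemption so the per-cpu pagevecs stay local while being drained.
 */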
void lru_add_drain(void)
{
        drain_cpu_pagevecs(get_cpu());
        put_cpu();
}

static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
        return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_inactive_list(): we recheck
 * the page count inside the lock to see whether shrink_inactive_list()
 * grabbed the page via the LRU.  If it did, give up: shrink_inactive_list()
 * will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
        int i;
        struct pagevec pages_to_free;
        struct zone *zone = NULL;
        unsigned long uninitialized_var(flags);

        pagevec_init(&pages_to_free, cold);
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                if (unlikely(PageCompound(page))) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        put_compound_page(page);
                        continue;
                }

                if (!put_page_testzero(page))
                        continue;

                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);

                        if (pagezone != zone) {
                                if (zone)
                                        spin_unlock_irqrestore(&zone->lru_lock,
                                                                        flags);
                                zone = pagezone;
                                spin_lock_irqsave(&zone->lru_lock, flags);
                        }
                        VM_BUG_ON(!PageLRU(page));
                        __ClearPageLRU(page);
                        del_page_from_lru(zone, page);
                }

                if (!pagevec_add(&pages_to_free, page)) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        __pagevec_free(&pages_to_free);
                        pagevec_reinit(&pages_to_free);
                }
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);

        pagevec_free(&pages_to_free);
}

/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
        lru_add_drain();
        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
{
        int i;
        struct zone *zone = NULL;

        VM_BUG_ON(is_unevictable_lru(lru));

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);
                int file;
                int active;

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                VM_BUG_ON(PageActive(page));
                VM_BUG_ON(PageUnevictable(page));
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                active = is_active_lru(lru);
                file = is_file_lru(lru);
                if (active)
                        SetPageActive(page);
                update_page_reclaim_stat(zone, page, file, active);
                add_page_to_lru_list(zone, page, lru);
        }
        if (zone)
                spin_unlock_irq(&zone->lru_lock);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(____pagevec_lru_add);

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                if (page_has_private(page) && trylock_page(page)) {
                        if (page_has_private(page))
                                try_to_release_page(page, 0);
                        unlock_page(page);
                }
        }
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:       Where the resulting pages are placed
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @nr_pages:   The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t start, unsigned nr_pages)
{
        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

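/*
 * Illustrative sketch (not taken from this file): a typical caller walks
 * a mapping by repeatedly looking up a batch, processing it and dropping
 * the references, e.g. given some struct address_space *mapping:
 *
 *      struct pagevec pvec;
 *      pgoff_t index = 0;
 *      int i;
 *
 *      pagevec_init(&pvec, 0);
 *      while (pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE)) {
 *              for (i = 0; i < pagevec_count(&pvec); i++) {
 *                      struct page *page = pvec.pages[i];
 *
 *                      index = page->index + 1;   (remember where to resume)
 *                      ... process page ...
 *              }
 *              pagevec_release(&pvec);            (drops the lookup references)
 *      }
 */

/**
 * pagevec_lookup_tag - gang tagged pagecache lookup
 * @pvec:       Where the resulting pages are placed
 * @mapping:    The address_space to search
 * @index:      The starting page index; advanced past the last page returned
 * @tag:        The radix-tree tag to match (eg. PAGECACHE_TAG_DIRTY)
 * @nr_pages:   The maximum number of pages
 *
 * Like pagevec_lookup(), but only returns pages which are tagged with
 * @tag in the mapping's radix tree.  A reference is taken against each
 * returned page.  Returns the number of pages which were found.
 */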
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t *index, int tag, unsigned nr_pages)
{
        pvec->nr = find_get_pages_tag(mapping, index, tag,
                                        nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

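/*
 * page_cluster, set up below, is the log2 of the swap readahead window:
 * swapin reads up to 1 << page_cluster pages around the faulting page.
 * It is exposed to userspace as the vm.page-cluster sysctl.
 */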
/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
        bdi_init(swapper_space.backing_dev_info);
#endif

        /* Use a smaller cluster for small-memory machines */
        if (megs < 16)
                page_cluster = 2;
        else
                page_cluster = 3;
        /*
         * Right now other parts of the system mean that we
         * _really_ don't want to cluster much more
         */
}