/*
 *  linux/mm/swap.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 */

/*
 * This file contains the default values for the operation of the
 * Linux VM subsystem. Fine-tuning documentation can be found in
 * Documentation/sysctl/vm.txt.
 * Started 18.12.91
 * Swap aging added 23.2.95, Stephen Tweedie.
 * Buffermem limits added 12.3.98, Rik van Riel.
 */

#include <linux/mm.h>
#include <linux/sched.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/mman.h>
#include <linux/pagemap.h>
#include <linux/pagevec.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/mm_inline.h>
#include <linux/buffer_head.h>  /* for try_to_release_page() */
#include <linux/percpu_counter.h>
#include <linux/percpu.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>

/* How many pages do we try to swap or page in/out together? */
int page_cluster;

static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs) = { 0, };

/*
 * This path almost never happens for VM activity - pages are normally
 * freed via pagevecs.  But it gets used by networking.
 */
static void fastcall __page_cache_release(struct page *page)
{
        if (PageLRU(page)) {
                unsigned long flags;
                struct zone *zone = page_zone(page);

                spin_lock_irqsave(&zone->lru_lock, flags);
                VM_BUG_ON(!PageLRU(page));
                __ClearPageLRU(page);
                del_page_from_lru(zone, page);
                spin_unlock_irqrestore(&zone->lru_lock, flags);
        }
        free_hot_page(page);
}

static void put_compound_page(struct page *page)
{
        page = compound_head(page);
        if (put_page_testzero(page)) {
                compound_page_dtor *dtor;

                dtor = get_compound_page_dtor(page);
                (*dtor)(page);
        }
}

void put_page(struct page *page)
{
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))
                __page_cache_release(page);
}
EXPORT_SYMBOL(put_page);

/**
 * put_pages_list(): release a list of pages
 *
 * Release a list of pages which are strung together on page->lru.  Currently
 * used by read_cache_pages() and related error recovery code.
 *
 * @pages: list of pages threaded on page->lru
 */
void put_pages_list(struct list_head *pages)
{
        while (!list_empty(pages)) {
                struct page *victim;

                victim = list_entry(pages->prev, struct page, lru);
                list_del(&victim->lru);
                page_cache_release(victim);
        }
}
EXPORT_SYMBOL(put_pages_list);
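
/*
 * Illustrative sketch (not from this file; uses only the generic pagecache
 * helpers): a caller that allocates pages onto a private list threaded
 * through page->lru can hand the whole list back in one call instead of
 * releasing each page individually:
 *
 *      LIST_HEAD(pages);
 *      struct page *page;
 *      int i;
 *
 *      for (i = 0; i < nr; i++) {
 *              page = page_cache_alloc(mapping);
 *              if (!page)
 *                      break;
 *              list_add(&page->lru, &pages);
 *      }
 *      ... on error, or once the pages are no longer wanted ...
 *      put_pages_list(&pages);
 */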

/*
 * pagevec_move_tail() must be called with IRQ disabled.
 * Otherwise this may cause nasty races.
 */
static void pagevec_move_tail(struct pagevec *pvec)
{
        int i;
        int pgmoved = 0;
        struct zone *zone = NULL;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock(&zone->lru_lock);
                }
                if (PageLRU(page) && !PageActive(page)) {
                        list_move_tail(&page->lru, &zone->inactive_list);
                        pgmoved++;
                }
        }
        if (zone)
                spin_unlock(&zone->lru_lock);
        __count_vm_events(PGROTATED, pgmoved);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

/*
 * Writeback is about to end against a page which has been marked for immediate
 * reclaim.  If it still appears to be reclaimable, move it to the tail of the
 * inactive list.
 *
 * Returns zero if it cleared PG_writeback.
 */
int rotate_reclaimable_page(struct page *page)
{
        struct pagevec *pvec;
        unsigned long flags;

        if (PageLocked(page))
                return 1;
        if (PageDirty(page))
                return 1;
        if (PageActive(page))
                return 1;
        if (!PageLRU(page))
                return 1;

        page_cache_get(page);
        local_irq_save(flags);
        pvec = &__get_cpu_var(lru_rotate_pvecs);
        if (!pagevec_add(pvec, page))
                pagevec_move_tail(pvec);
        local_irq_restore(flags);

        if (!test_clear_page_writeback(page))
                BUG();

        return 0;
}

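/*
 * Sketch of the expected caller (reconstructed roughly from memory, not
 * quoted from this tree): end_page_writeback() in mm/filemap.c only clears
 * PG_writeback itself when the page was not flagged PG_reclaim, or when the
 * rotation above declined the page and returned nonzero:
 *
 *      if (!TestClearPageReclaim(page) || rotate_reclaimable_page(page)) {
 *              if (!test_clear_page_writeback(page))
 *                      BUG();
 *      }
 *      smp_mb__after_clear_bit();
 *      wake_up_page(page, PG_writeback);
 */
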
/*
 * FIXME: speed this up?
 */
void fastcall activate_page(struct page *page)
{
        struct zone *zone = page_zone(page);

        spin_lock_irq(&zone->lru_lock);
        if (PageLRU(page) && !PageActive(page)) {
                del_page_from_inactive_list(zone, page);
                SetPageActive(page);
                add_page_to_active_list(zone, page);
                __count_vm_event(PGACTIVATE);
        }
        spin_unlock_irq(&zone->lru_lock);
}

/*
 * Mark a page as having seen activity.
 *
 * inactive,unreferenced        ->      inactive,referenced
 * inactive,referenced          ->      active,unreferenced
 * active,unreferenced          ->      active,referenced
 */
void fastcall mark_page_accessed(struct page *page)
{
        if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) {
                activate_page(page);
                ClearPageReferenced(page);
        } else if (!PageReferenced(page)) {
                SetPageReferenced(page);
        }
}

EXPORT_SYMBOL(mark_page_accessed);

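/*
 * Illustrative sketch (not from this file): a pagecache read path typically
 * calls mark_page_accessed() once per access, so a page touched by two
 * separate reads walks the state machine above and ends up on the active
 * list:
 *
 *      page = find_get_page(mapping, index);
 *      if (page) {
 *              ... copy data out of the page ...
 *              mark_page_accessed(page);
 *              page_cache_release(page);
 *      }
 *
 * The first access leaves the page inactive,referenced; the second one
 * promotes it via activate_page().
 */
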
/**
 * lru_cache_add: add a page to the page lists
 * @page: the page to add
 */
void fastcall lru_cache_add(struct page *page)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);

        page_cache_get(page);
        if (!pagevec_add(pvec, page))
                __pagevec_lru_add(pvec);
        put_cpu_var(lru_add_pvecs);
}

void fastcall lru_cache_add_active(struct page *page)
{
        struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);

        page_cache_get(page);
        if (!pagevec_add(pvec, page))
                __pagevec_lru_add_active(pvec);
        put_cpu_var(lru_add_active_pvecs);
}

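/*
 * Illustrative sketch (simplified and reconstructed from memory rather than
 * quoted): the anonymous-fault path in mm/memory.c puts brand-new pages
 * straight onto the active list, pairing the rmap setup with
 * lru_cache_add_active() instead of lru_cache_add():
 *
 *      page = alloc_zeroed_user_highpage_movable(vma, address);
 *      ...
 *      lru_cache_add_active(page);
 *      page_add_new_anon_rmap(page, vma, address);
 *      set_pte_at(mm, address, page_table, entry);
 */
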
/*
 * Drain pages out of the cpu's pagevecs.
 * Either "cpu" is the current CPU, and preemption has already been
 * disabled; or "cpu" is being hot-unplugged, and is already dead.
 */
static void drain_cpu_pagevecs(int cpu)
{
        struct pagevec *pvec;

        pvec = &per_cpu(lru_add_pvecs, cpu);
        if (pagevec_count(pvec))
                __pagevec_lru_add(pvec);

        pvec = &per_cpu(lru_add_active_pvecs, cpu);
        if (pagevec_count(pvec))
                __pagevec_lru_add_active(pvec);

        pvec = &per_cpu(lru_rotate_pvecs, cpu);
        if (pagevec_count(pvec)) {
                unsigned long flags;

                /* No harm done if a racing interrupt already did this */
                local_irq_save(flags);
                pagevec_move_tail(pvec);
                local_irq_restore(flags);
        }
}

void lru_add_drain(void)
{
        drain_cpu_pagevecs(get_cpu());
        put_cpu();
}

#ifdef CONFIG_NUMA
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
        lru_add_drain();
}

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
        return schedule_on_each_cpu(lru_add_drain_per_cpu);
}

#else

/*
 * Returns 0 for success
 */
int lru_add_drain_all(void)
{
        lru_add_drain();
        return 0;
}
#endif

/*
 * Batched page_cache_release().  Decrement the reference count on all the
 * passed pages.  If it fell to zero then remove the page from the LRU and
 * free it.
 *
 * Avoid taking zone->lru_lock if possible, but if it is taken, retain it
 * for the remainder of the operation.
 *
 * The locking in this function is against shrink_cache(): we recheck the
 * page count inside the lock to see whether shrink_cache grabbed the page
 * via the LRU.  If it did, give up: shrink_cache will free it.
 */
void release_pages(struct page **pages, int nr, int cold)
{
        int i;
        struct pagevec pages_to_free;
        struct zone *zone = NULL;
        unsigned long uninitialized_var(flags);

        pagevec_init(&pages_to_free, cold);
        for (i = 0; i < nr; i++) {
                struct page *page = pages[i];

                if (unlikely(PageCompound(page))) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        put_compound_page(page);
                        continue;
                }

                if (!put_page_testzero(page))
                        continue;

                if (PageLRU(page)) {
                        struct zone *pagezone = page_zone(page);
                        if (pagezone != zone) {
                                if (zone)
                                        spin_unlock_irqrestore(&zone->lru_lock,
                                                                        flags);
                                zone = pagezone;
                                spin_lock_irqsave(&zone->lru_lock, flags);
                        }
                        VM_BUG_ON(!PageLRU(page));
                        __ClearPageLRU(page);
                        del_page_from_lru(zone, page);
                }

                if (!pagevec_add(&pages_to_free, page)) {
                        if (zone) {
                                spin_unlock_irqrestore(&zone->lru_lock, flags);
                                zone = NULL;
                        }
                        __pagevec_free(&pages_to_free);
                        pagevec_reinit(&pages_to_free);
                }
        }
        if (zone)
                spin_unlock_irqrestore(&zone->lru_lock, flags);

        pagevec_free(&pages_to_free);
}

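/*
 * Illustrative sketch (not from this file): a caller holding an array of
 * page references, e.g. obtained via get_user_pages() under mmap_sem, can
 * drop them in one batched call rather than looping over
 * page_cache_release().  The final argument ("cold") is zero because the
 * pages are presumed cache-warm:
 *
 *      down_read(&current->mm->mmap_sem);
 *      npages = get_user_pages(current, current->mm, start, nr,
 *                              0, 0, pages, NULL);
 *      up_read(&current->mm->mmap_sem);
 *      ... use the pages ...
 *      if (npages > 0)
 *              release_pages(pages, npages, 0);
 */
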
/*
 * The pages which we're about to release may be in the deferred lru-addition
 * queues.  That would prevent them from really being freed right now.  That's
 * OK from a correctness point of view but is inefficient - those pages may be
 * cache-warm and we want to give them back to the page allocator ASAP.
 *
 * So __pagevec_release() will drain those queues here.  __pagevec_lru_add()
 * and __pagevec_lru_add_active() call release_pages() directly to avoid
 * mutual recursion.
 */
void __pagevec_release(struct pagevec *pvec)
{
        lru_add_drain();
        release_pages(pvec->pages, pagevec_count(pvec), pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_release);

/*
 * pagevec_release() for pages which are known to not be on the LRU
 *
 * This function reinitialises the caller's pagevec.
 */
void __pagevec_release_nonlru(struct pagevec *pvec)
{
        int i;
        struct pagevec pages_to_free;

        pagevec_init(&pages_to_free, pvec->cold);
        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                VM_BUG_ON(PageLRU(page));
                if (put_page_testzero(page))
                        pagevec_add(&pages_to_free, page);
        }
        pagevec_free(&pages_to_free);
        pagevec_reinit(pvec);
}

/*
 * Add the passed pages to the LRU, then drop the caller's refcount
 * on them.  Reinitialises the caller's pagevec.
 */
void __pagevec_lru_add(struct pagevec *pvec)
{
        int i;
        struct zone *zone = NULL;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                add_page_to_inactive_list(zone, page);
        }
        if (zone)
                spin_unlock_irq(&zone->lru_lock);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

EXPORT_SYMBOL(__pagevec_lru_add);

void __pagevec_lru_add_active(struct pagevec *pvec)
{
        int i;
        struct zone *zone = NULL;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];
                struct zone *pagezone = page_zone(page);

                if (pagezone != zone) {
                        if (zone)
                                spin_unlock_irq(&zone->lru_lock);
                        zone = pagezone;
                        spin_lock_irq(&zone->lru_lock);
                }
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                VM_BUG_ON(PageActive(page));
                SetPageActive(page);
                add_page_to_active_list(zone, page);
        }
        if (zone)
                spin_unlock_irq(&zone->lru_lock);
        release_pages(pvec->pages, pvec->nr, pvec->cold);
        pagevec_reinit(pvec);
}

/*
 * Try to drop buffers from the pages in a pagevec
 */
void pagevec_strip(struct pagevec *pvec)
{
        int i;

        for (i = 0; i < pagevec_count(pvec); i++) {
                struct page *page = pvec->pages[i];

                if (PagePrivate(page) && !TestSetPageLocked(page)) {
                        if (PagePrivate(page))
                                try_to_release_page(page, 0);
                        unlock_page(page);
                }
        }
}

/**
 * pagevec_lookup - gang pagecache lookup
 * @pvec:       Where the resulting pages are placed
 * @mapping:    The address_space to search
 * @start:      The starting page index
 * @nr_pages:   The maximum number of pages
 *
 * pagevec_lookup() will search for and return a group of up to @nr_pages pages
 * in the mapping.  The pages are placed in @pvec.  pagevec_lookup() takes a
 * reference against the pages in @pvec.
 *
 * The search returns a group of mapping-contiguous pages with ascending
 * indexes.  There may be holes in the indices due to not-present pages.
 *
 * pagevec_lookup() returns the number of pages which were found.
 */
unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t start, unsigned nr_pages)
{
        pvec->nr = find_get_pages(mapping, start, nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup);

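/*
 * Illustrative sketch (not from this file): the usual consumer walks a
 * mapping in PAGEVEC_SIZE batches, does its per-page work, and then drops
 * the whole gang of references at once:
 *
 *      struct pagevec pvec;
 *      pgoff_t next = 0;
 *      int i;
 *
 *      pagevec_init(&pvec, 0);
 *      while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 *              for (i = 0; i < pagevec_count(&pvec); i++) {
 *                      struct page *page = pvec.pages[i];
 *
 *                      next = page->index + 1;
 *                      ... lock, inspect or invalidate the page ...
 *              }
 *              pagevec_release(&pvec);
 *      }
 */
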
unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping,
                pgoff_t *index, int tag, unsigned nr_pages)
{
        pvec->nr = find_get_pages_tag(mapping, index, tag,
                                        nr_pages, pvec->pages);
        return pagevec_count(pvec);
}

EXPORT_SYMBOL(pagevec_lookup_tag);

#ifdef CONFIG_SMP
/*
 * We tolerate a little inaccuracy to avoid ping-ponging the counter between
 * CPUs
 */
#define ACCT_THRESHOLD  max(16, NR_CPUS * 2)

static DEFINE_PER_CPU(long, committed_space) = 0;

void vm_acct_memory(long pages)
{
        long *local;

        preempt_disable();
        local = &__get_cpu_var(committed_space);
        *local += pages;
        if (*local > ACCT_THRESHOLD || *local < -ACCT_THRESHOLD) {
                atomic_add(*local, &vm_committed_space);
                *local = 0;
        }
        preempt_enable();
}

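/*
 * Illustrative note (not from this file): the accounting entry points are
 * thin wrappers around vm_acct_memory(); vm_unacct_memory() in
 * include/linux/mman.h is essentially
 *
 *      static inline void vm_unacct_memory(long pages)
 *      {
 *              vm_acct_memory(-pages);
 *      }
 *
 * so charging, say, 64 pages and later uncharging them only touches the
 * global vm_committed_space atomic once a CPU's local delta drifts past
 * ACCT_THRESHOLD in either direction.
 */
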
#ifdef CONFIG_HOTPLUG_CPU

/* Drop the CPU's cached committed space back into the central pool. */
static int cpu_swap_callback(struct notifier_block *nfb,
                             unsigned long action,
                             void *hcpu)
{
        long *committed;

        committed = &per_cpu(committed_space, (long)hcpu);
        if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
                atomic_add(*committed, &vm_committed_space);
                *committed = 0;
                drain_cpu_pagevecs((long)hcpu);
        }
        return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
#endif /* CONFIG_SMP */

/*
 * Perform any setup for the swap system
 */
void __init swap_setup(void)
{
        unsigned long megs = num_physpages >> (20 - PAGE_SHIFT);

#ifdef CONFIG_SWAP
        bdi_init(swapper_space.backing_dev_info);
#endif

        /* Use a smaller cluster for small-memory machines */
        if (megs < 16)
                page_cluster = 2;
        else
                page_cluster = 3;
        /*
         * Right now other parts of the system mean that we
         * _really_ don't want to cluster much more
         */
#ifdef CONFIG_HOTPLUG_CPU
        hotcpu_notifier(cpu_swap_callback, 0);
#endif
}

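/*
 * Illustrative note (not from this file): page_cluster is exposed to
 * userspace as /proc/sys/vm/page-cluster and is used as a power of two, so
 * the defaults chosen above mean swapin readahead operates on clusters of
 * up to 1 << page_cluster pages:
 *
 *      page_cluster = 2  ->  up to 4 pages per cluster  (machines under 16 MB)
 *      page_cluster = 3  ->  up to 8 pages per cluster  (everything else)
 */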