/*
 *  linux/mm/swap_state.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *
 *  Rewritten to use page cache, (C) 1998 Stephen Tweedie
 */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/backing-dev.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/migrate.h>
#include <linux/page_cgroup.h>

#include <asm/pgtable.h>

/*
 * swapper_space is a fiction, retained to simplify the path through
 * vmscan's shrink_page_list.
 */
static const struct address_space_operations swap_aops = {
        .writepage      = swap_writepage,
        .set_page_dirty = swap_set_page_dirty,
        .migratepage    = migrate_page,
};

static struct backing_dev_info swap_backing_dev_info = {
        .name           = "swap",
        .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
};

struct address_space swapper_spaces[MAX_SWAPFILES] = {
        [0 ... MAX_SWAPFILES - 1] = {
                .page_tree      = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
                .a_ops          = &swap_aops,
                .backing_dev_info = &swap_backing_dev_info,
        }
};
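
/*
 * Each swap type gets its own address_space above, spreading radix-tree
 * lookups and tree_lock contention across swap areas.  A minimal sketch of
 * how a swp_entry_t is routed to its address_space; the real helper,
 * swap_address_space(), lives in <linux/swap.h>, so this is an illustration
 * rather than the authoritative definition:
 *
 *      static inline struct address_space *example_swap_space(swp_entry_t entry)
 *      {
 *              // the swap type selects the address_space; the swap offset
 *              // (entry.val here) is used as the radix-tree index
 *              return &swapper_spaces[swp_type(entry)];
 *      }
 */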

#define INC_CACHE_INFO(x)       do { swap_cache_info.x++; } while (0)

static struct {
        unsigned long add_total;
        unsigned long del_total;
        unsigned long find_success;
        unsigned long find_total;
} swap_cache_info;

unsigned long total_swapcache_pages(void)
{
        int i;
        unsigned long ret = 0;

        for (i = 0; i < MAX_SWAPFILES; i++)
                ret += swapper_spaces[i].nrpages;
        return ret;
}

void show_swap_cache_info(void)
{
        printk("%lu pages in swap cache\n", total_swapcache_pages());
        printk("Swap cache stats: add %lu, delete %lu, find %lu/%lu\n",
                swap_cache_info.add_total, swap_cache_info.del_total,
                swap_cache_info.find_success, swap_cache_info.find_total);
        printk("Free swap  = %ldkB\n",
                get_nr_swap_pages() << (PAGE_SHIFT - 10));
        printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}

/*
 * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
 * but sets SwapCache flag and private instead of mapping and index.
 */
int __add_to_swap_cache(struct page *page, swp_entry_t entry)
{
        int error;
        struct address_space *address_space;

        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(PageSwapCache(page));
        VM_BUG_ON(!PageSwapBacked(page));

        page_cache_get(page);
        SetPageSwapCache(page);
        set_page_private(page, entry.val);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        error = radix_tree_insert(&address_space->page_tree,
                                        entry.val, page);
        if (likely(!error)) {
                address_space->nrpages++;
                __inc_zone_page_state(page, NR_FILE_PAGES);
                INC_CACHE_INFO(add_total);
        }
        spin_unlock_irq(&address_space->tree_lock);

        if (unlikely(error)) {
                /*
                 * Only a context that has set the SWAP_HAS_CACHE flag
                 * calls add_to_swap_cache(), so add_to_swap_cache()
                 * never returns -EEXIST.
                 */
                VM_BUG_ON(error == -EEXIST);
                set_page_private(page, 0UL);
                ClearPageSwapCache(page);
                page_cache_release(page);
        }

        return error;
}


int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
{
        int error;

        error = radix_tree_preload(gfp_mask);
        if (!error) {
                error = __add_to_swap_cache(page, entry);
                radix_tree_preload_end();
        }
        return error;
}
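
/*
 * Calling convention, as the users below illustrate: the caller must already
 * hold the SWAP_HAS_CACHE claim on the entry (read_swap_cache_async() takes
 * it with swapcache_prepare(); add_to_swap() relies on get_swap_page()
 * handing back an entry that is already claimed) and must drop the claim
 * with swapcache_free() if insertion fails.  A minimal, non-authoritative
 * sketch:
 *
 *      if (!swapcache_prepare(entry)) {                // claim SWAP_HAS_CACHE
 *              if (add_to_swap_cache(page, entry, GFP_KERNEL))
 *                      swapcache_free(entry, NULL);    // insertion failed
 *      }
 */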

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache.
 */
void __delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;
        struct address_space *address_space;

        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(!PageSwapCache(page));
        VM_BUG_ON(PageWriteback(page));

        entry.val = page_private(page);
        address_space = swap_address_space(entry);
        radix_tree_delete(&address_space->page_tree, page_private(page));
        set_page_private(page, 0);
        ClearPageSwapCache(page);
        address_space->nrpages--;
        __dec_zone_page_state(page, NR_FILE_PAGES);
        INC_CACHE_INFO(del_total);
}

/**
 * add_to_swap - allocate swap space for a page
 * @page: page we want to move to swap
 * @list: list passed to split_huge_page_to_list() if @page is a
 *        transparent huge page that has to be split first
 *
 * Allocate swap space for the page and add the page to the
 * swap cache.  Caller needs to hold the page lock.
 */
int add_to_swap(struct page *page, struct list_head *list)
{
        swp_entry_t entry;
        int err;

        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(!PageUptodate(page));

        entry = get_swap_page();
        if (!entry.val)
                return 0;

        if (unlikely(PageTransHuge(page)))
                if (unlikely(split_huge_page_to_list(page, list))) {
                        swapcache_free(entry, NULL);
                        return 0;
                }

        /*
         * Radix-tree node allocations from PF_MEMALLOC contexts could
         * completely exhaust the page allocator. __GFP_NOMEMALLOC
         * stops emergency reserves from being allocated.
         *
         * TODO: this could cause a theoretical memory reclaim
         * deadlock in the swap out path.
         */
        /*
         * Add it to the swap cache and mark it dirty
         */
        err = add_to_swap_cache(page, entry,
                        __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);

        if (!err) {     /* Success */
                SetPageDirty(page);
                return 1;
        } else {        /* -ENOMEM radix-tree allocation failure */
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                swapcache_free(entry, NULL);
                return 0;
        }
}
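
/*
 * For orientation only: the main user of add_to_swap() is vmscan's
 * shrink_page_list().  A simplified, hypothetical caller looks roughly like
 * this (names and error handling trimmed; not the actual reclaim code):
 *
 *      if (PageAnon(page) && !PageSwapCache(page)) {
 *              if (!add_to_swap(page, page_list))
 *                      goto activate_locked;   // no swap space, keep page
 *              // page is now dirty and in the swap cache; pageout() can
 *              // write it out through swap_writepage()
 *      }
 */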

/*
 * This must be called only on pages that have
 * been verified to be in the swap cache and locked.
 * It will never put the page into the free list,
 * the caller has a reference on the page.
 */
void delete_from_swap_cache(struct page *page)
{
        swp_entry_t entry;
        struct address_space *address_space;

        entry.val = page_private(page);

        address_space = swap_address_space(entry);
        spin_lock_irq(&address_space->tree_lock);
        __delete_from_swap_cache(page);
        spin_unlock_irq(&address_space->tree_lock);

        swapcache_free(entry, page);
        page_cache_release(page);
}

/*
 * If we are the only user, then try to free up the swap cache.
 *
 * It's OK to check for PageSwapCache without the page lock
 * here because we are going to recheck again inside
 * try_to_free_swap() _with_ the lock.
 *                                      - Marcelo
 */
static inline void free_swap_cache(struct page *page)
{
        if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
                try_to_free_swap(page);
                unlock_page(page);
        }
}

/*
 * Perform a free_page(), also freeing any swap cache associated with
 * this page if it is the last user of the page.
 */
void free_page_and_swap_cache(struct page *page)
{
        free_swap_cache(page);
        page_cache_release(page);
}

/*
 * Passed an array of pages, drop them all from swapcache and then release
 * them.  They are removed from the LRU and freed if this is their last use.
 */
void free_pages_and_swap_cache(struct page **pages, int nr)
{
        struct page **pagep = pages;

        lru_add_drain();
        while (nr) {
                int todo = min(nr, PAGEVEC_SIZE);
                int i;

                for (i = 0; i < todo; i++)
                        free_swap_cache(pagep[i]);
                release_pages(pagep, todo, 0);
                pagep += todo;
                nr -= todo;
        }
}

/*
 * Lookup a swap entry in the swap cache. A found page will be returned
 * unlocked and with its refcount incremented - we rely on the kernel
 * lock getting page table operations atomic even if we drop the page
 * lock before returning.
 */
struct page * lookup_swap_cache(swp_entry_t entry)
{
        struct page *page;

        page = find_get_page(swap_address_space(entry), entry.val);

        if (page)
                INC_CACHE_INFO(find_success);

        INC_CACHE_INFO(find_total);
        return page;
}
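
/*
 * For orientation only: the typical consumer is the page-fault path
 * (do_swap_page() in mm/memory.c), which tries the swap cache first and
 * falls back to readahead.  A simplified, hypothetical sketch:
 *
 *      page = lookup_swap_cache(entry);
 *      if (!page)
 *              page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
 *                                      vma, address);
 *      if (!page)
 *              ...     // entry may have been freed meanwhile, or we are OOM
 */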

/*
 * Locate a page of swap in physical memory, reserving swap cache space
 * and reading the disk if it is not already cached.
 * A failure return means that either the page allocation failed or that
 * the swap entry is no longer in use.
 */
struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *found_page, *new_page = NULL;
        int err;

        do {
                /*
                 * First check the swap cache.  Since this is normally
                 * called after lookup_swap_cache() failed, re-calling
                 * that would confuse statistics.
                 */
                found_page = find_get_page(swap_address_space(entry),
                                        entry.val);
                if (found_page)
                        break;

                /*
                 * Get a new page to read into from swap.
                 */
                if (!new_page) {
                        new_page = alloc_page_vma(gfp_mask, vma, addr);
                        if (!new_page)
                                break;          /* Out of memory */
                }

                /*
                 * call radix_tree_preload() while we can wait.
                 */
                err = radix_tree_preload(gfp_mask & GFP_KERNEL);
                if (err)
                        break;

                /*
                 * Swap entry may have been freed since our caller observed it.
                 */
                err = swapcache_prepare(entry);
                if (err == -EEXIST) {
                        radix_tree_preload_end();
                        /*
                         * We might race against get_swap_page() and stumble
                         * across a SWAP_HAS_CACHE swap_map entry whose page
                         * has not been brought into the swapcache yet, while
                         * the other end is scheduled away waiting on discard
                         * I/O completion at scan_swap_map().
                         *
                         * In order to avoid turning this transitory state
                         * into a permanent loop around this -EEXIST case
                         * if !CONFIG_PREEMPT and the I/O completion happens
                         * to be waiting on the CPU waitqueue where we are now
                         * busy looping, we just conditionally invoke the
                         * scheduler here, if there are some more important
                         * tasks to run.
                         */
                        cond_resched();
                        continue;
                }
                if (err) {              /* swp entry is obsolete ? */
                        radix_tree_preload_end();
                        break;
                }

                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
                __set_page_locked(new_page);
                SetPageSwapBacked(new_page);
                err = __add_to_swap_cache(new_page, entry);
                if (likely(!err)) {
                        radix_tree_preload_end();
                        /*
                         * Initiate read into locked page and return.
                         */
                        lru_cache_add_anon(new_page);
                        swap_readpage(new_page);
                        return new_page;
                }
                radix_tree_preload_end();
                ClearPageSwapBacked(new_page);
                __clear_page_locked(new_page);
                /*
                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
                 * clear SWAP_HAS_CACHE flag.
                 */
                swapcache_free(entry, NULL);
        } while (err != -ENOMEM);

        if (new_page)
                page_cache_release(new_page);
        return found_page;
}

/**
 * swapin_readahead - swap in pages in hope we need them soon
 * @entry: swap entry of this memory
 * @gfp_mask: memory allocation flags
 * @vma: user vma this address belongs to
 * @addr: target address for mempolicy
 *
 * Returns the struct page for entry and addr, after queueing swapin.
 *
 * Primitive swap readahead code. We simply read an aligned block of
 * (1 << page_cluster) entries in the swap area. This method is chosen
 * because it doesn't cost us any seek time.  We also make sure to queue
 * the 'original' request together with the readahead ones...
 *
 * This has been extended to use the NUMA policies from the mm triggering
 * the readahead.
 *
 * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
 */
struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
                        struct vm_area_struct *vma, unsigned long addr)
{
        struct page *page;
        unsigned long offset = swp_offset(entry);
        unsigned long start_offset, end_offset;
        unsigned long mask = (1UL << page_cluster) - 1;
        struct blk_plug plug;

        /* Read a page_cluster sized and aligned cluster around offset. */
        start_offset = offset & ~mask;
        end_offset = offset | mask;
        if (!start_offset)      /* First page is swap header. */
                start_offset++;

        blk_start_plug(&plug);
        for (offset = start_offset; offset <= end_offset ; offset++) {
                /* Ok, do the async read-ahead now */
                page = read_swap_cache_async(swp_entry(swp_type(entry), offset),
                                                gfp_mask, vma, addr);
                if (!page)
                        continue;
                page_cache_release(page);
        }
        blk_finish_plug(&plug);

        lru_add_drain();        /* Push any new pages onto the LRU now */
        return read_swap_cache_async(entry, gfp_mask, vma, addr);
}
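
/*
 * Worked example of the window computed above (illustrative values only):
 * with page_cluster == 3, mask == (1 << 3) - 1 == 7.  For a fault at swap
 * offset 100, start_offset == (100 & ~7) == 96 and end_offset == (100 | 7)
 * == 103, so offsets 96..103 are read in one plugged batch.  If the fault
 * hit offset 5 instead, start_offset would come out as 0 and is bumped to 1,
 * because offset 0 holds the swap header.
 */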