linux/mm/frontswap.c
/*
 * Frontswap frontend
 *
 * This code provides the generic "frontend" layer to call a matching
 * "backend" driver implementation of frontswap.  See
 * Documentation/vm/frontswap.txt for more information.
 *
 * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
 * Author: Dan Magenheimer
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>

/*
 * frontswap_ops is set by frontswap_register_ops to contain the pointers
 * to the frontswap "backend" implementation functions.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

/*
 * If enabled, frontswap_store will return failure even on success.  As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache.  In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increased control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get, must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;

#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured).  These are for information only so are not protected
 * against increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void) {
        frontswap_loads++;
}
static inline void inc_frontswap_succ_stores(void) {
        frontswap_succ_stores++;
}
static inline void inc_frontswap_failed_stores(void) {
        frontswap_failed_stores++;
}
static inline void inc_frontswap_invalidates(void) {
        frontswap_invalidates++;
}
#else
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif

/*
 * Due to the asynchronous nature of the backends loading potentially
 * _after_ the swap system has been activated, we have chokepoints
 * on all frontswap functions so that the backend is not called until
 * it has registered.
 *
 * Specifically when no backend is registered (nobody called
 * frontswap_register_ops) all calls to frontswap_init (which is done via
 * swapon -> enable_swap_info -> frontswap_init) are registered and remembered
 * (via the setting of the need_init bitmap) but fail to create tmem_pools.
 * When a backend registers with frontswap at some later point the previous
 * calls to frontswap_init are executed (by iterating over the need_init
 * bitmap) to create tmem_pools and set the respective poolids. All of that is
 * guarded by us using atomic bit operations on the 'need_init' bitmap.
 *
 * This does not guard us against the user deciding to call swapoff right as
 * we are calling the backend to initialize (so swapon is in action).
 * Fortunately for us, the swapon_mutex has been taken by the callee so we are
 * OK. The other scenario where calls to frontswap_store (called via
 * swap_writepage) race with frontswap_invalidate_area (called via
 * swapoff) is again guarded by the swap subsystem.
 *
 * While no backend is registered all calls to frontswap_[store|load|
 * invalidate_area|invalidate_page] are ignored or fail.
 *
 * The time between the backend being registered and the swap file system
 * calling the backend (via the frontswap_* functions) is indeterminate as
 * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
 * That is OK as we are comfortable missing some of these calls to the newly
 * registered backend.
 *
 * Obviously the opposite (unloading the backend) must be done after all
 * the frontswap_[store|load|invalidate_area|invalidate_page] calls start
 * ignoring or failing the requests - at which point frontswap_ops
 * would have to be made atomic in some fashion.
 */
static DECLARE_BITMAP(need_init, MAX_SWAPFILES);

/*
 * Register operations for frontswap, returning the previous ops to allow
 * detection of multiple backends and possible nesting.
 */
struct frontswap_ops *frontswap_register_ops(struct frontswap_ops *ops)
{
        struct frontswap_ops *old = frontswap_ops;
        int i;

        for (i = 0; i < MAX_SWAPFILES; i++) {
                if (test_and_clear_bit(i, need_init)) {
                        struct swap_info_struct *sis = swap_info[i];
                        /* __frontswap_init _should_ have set it! */
                        if (!sis->frontswap_map)
                                return ERR_PTR(-EINVAL);
                        ops->init(i);
                }
        }
        /*
         * We MUST have frontswap_ops set _after_ the frontswap_init's
         * have been called. Otherwise __frontswap_store might fail. Hence
         * the barrier to make sure the compiler does not re-order us.
         */
        barrier();
        frontswap_ops = ops;
        return old;
}
EXPORT_SYMBOL(frontswap_register_ops);
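
/*
 * Example: a minimal sketch of how a backend is expected to register with
 * this interface.  The my_* names are hypothetical; the callback signatures
 * are inferred from how frontswap_ops is invoked elsewhere in this file
 * (init/store/load/invalidate_page/invalidate_area).  This "null" backend
 * rejects every store, so pages still go to the real swap device:
 *
 *	static void my_init(unsigned type) { }
 *	static int my_store(unsigned type, pgoff_t offset, struct page *page)
 *	{
 *		return -1;	(reject: page falls through to the swap device)
 *	}
 *	static int my_load(unsigned type, pgoff_t offset, struct page *page)
 *	{
 *		return -1;
 *	}
 *	static void my_invalidate_page(unsigned type, pgoff_t offset) { }
 *	static void my_invalidate_area(unsigned type) { }
 *
 *	static struct frontswap_ops my_ops = {
 *		.init = my_init,
 *		.store = my_store,
 *		.load = my_load,
 *		.invalidate_page = my_invalidate_page,
 *		.invalidate_area = my_invalidate_area,
 *	};
 *
 *	static int __init my_backend_init(void)
 *	{
 *		struct frontswap_ops *old = frontswap_register_ops(&my_ops);
 *
 *		if (IS_ERR(old))
 *			return PTR_ERR(old);
 *		return 0;
 *	}
 *	module_init(my_backend_init);
 */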

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
        frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
        frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
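
/*
 * Example (hypothetical policy): a backend that can drop its pages at any
 * time, such as a compressed-memory cache under pressure, might flip both
 * of the knobs above from its init path:
 *
 *	frontswap_writethrough(true);		(every store also hits the disk)
 *	frontswap_tmem_exclusive_gets(true);	(a load removes the page from tmem)
 *
 * Both default to false; they are optional hints a backend sets once.
 */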

/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
{
        struct swap_info_struct *sis = swap_info[type];

        BUG_ON(sis == NULL);

        /*
         * p->frontswap is a bitmap that we MUST have to figure out which page
         * has gone into frontswap. Without it there is no point in continuing.
         */
        if (WARN_ON(!map))
                return;
        /*
         * Regardless of whether the frontswap backend has been loaded
         * before this function or will be later, we _MUST_ have the
         * p->frontswap set to something valid to work properly.
         */
        frontswap_map_set(sis, map);
        if (frontswap_ops)
                frontswap_ops->init(type);
        else {
                BUG_ON(type >= MAX_SWAPFILES);
                set_bit(type, need_init);
        }
}
EXPORT_SYMBOL(__frontswap_init);

bool __frontswap_test(struct swap_info_struct *sis,
                                pgoff_t offset)
{
        bool ret = false;

        if (frontswap_ops && sis->frontswap_map)
                ret = test_bit(offset, sis->frontswap_map);
        return ret;
}
EXPORT_SYMBOL(__frontswap_test);

static inline void __frontswap_clear(struct swap_info_struct *sis,
                                pgoff_t offset)
{
        clear_bit(offset, sis->frontswap_map);
        atomic_dec(&sis->frontswap_pages);
}

/*
 * "Store" data from a page to frontswap and associate it with the page's
 * swaptype and offset.  Page must be locked and in the swap cache.
 * If frontswap already contains a page with matching swaptype and
 * offset, the frontswap implementation may either overwrite the data and
 * return success or invalidate the page from frontswap and return failure.
 */
int __frontswap_store(struct page *page)
{
        int ret = -1, dup = 0;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);

        /*
         * Return if no backend is registered.
         * Don't need to inc frontswap_failed_stores here.
         */
        if (!frontswap_ops)
                return ret;

        BUG_ON(!PageLocked(page));
        BUG_ON(sis == NULL);
        if (__frontswap_test(sis, offset))
                dup = 1;
        ret = frontswap_ops->store(type, offset, page);
        if (ret == 0) {
                set_bit(offset, sis->frontswap_map);
                inc_frontswap_succ_stores();
                if (!dup)
                        atomic_inc(&sis->frontswap_pages);
        } else {
                /*
                 * A failed dup always results in automatic invalidation of
                 * the (older) page from frontswap.
                 */
                inc_frontswap_failed_stores();
                if (dup)
                        __frontswap_clear(sis, offset);
        }
        if (frontswap_writethrough_enabled)
                /* report failure so swap also writes to swap device */
                ret = -1;
        return ret;
}
EXPORT_SYMBOL(__frontswap_store);
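
/*
 * Example: a sketch of how the swap writeback path is expected to consume
 * the return value (this is illustrative, not a copy of mm/page_io.c;
 * swap_writepage_to_device is a hypothetical name for the device path):
 *
 *	if (frontswap_store(page) == 0) {
 *		set_page_writeback(page);
 *		unlock_page(page);
 *		end_page_writeback(page);
 *		return 0;	(page captured by frontswap, no bio submitted)
 *	}
 *	return swap_writepage_to_device(page, wbc);
 */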

/*
 * "Get" data from frontswap associated with swaptype and offset that were
 * specified when the data was put to frontswap and use it to fill the
 * specified page with data. Page must be locked and in the swap cache.
 */
int __frontswap_load(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);

        BUG_ON(!PageLocked(page));
        BUG_ON(sis == NULL);
        /*
         * __frontswap_test() will check whether there is a backend registered.
         */
        if (__frontswap_test(sis, offset))
                ret = frontswap_ops->load(type, offset, page);
        if (ret == 0) {
                inc_frontswap_loads();
                if (frontswap_tmem_exclusive_gets_enabled) {
                        SetPageDirty(page);
                        __frontswap_clear(sis, offset);
                }
        }
        return ret;
}
EXPORT_SYMBOL(__frontswap_load);
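
/*
 * Example: a sketch of the caller side on the read path (illustrative only;
 * submit_swap_bio_read is a hypothetical name for the device fallback).  On
 * success the page is already filled, so no device I/O is issued:
 *
 *	if (frontswap_load(page) == 0) {
 *		SetPageUptodate(page);
 *		unlock_page(page);
 *		return 0;
 *	}
 *	return submit_swap_bio_read(page);
 */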

/*
 * Invalidate any data from frontswap associated with the specified swaptype
 * and offset so that a subsequent "get" will fail.
 */
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];

        BUG_ON(sis == NULL);
        /*
         * __frontswap_test() will check whether there is a backend registered.
         */
        if (__frontswap_test(sis, offset)) {
                frontswap_ops->invalidate_page(type, offset);
                __frontswap_clear(sis, offset);
                inc_frontswap_invalidates();
        }
}
EXPORT_SYMBOL(__frontswap_invalidate_page);

/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
        struct swap_info_struct *sis = swap_info[type];

        if (frontswap_ops) {
                BUG_ON(sis == NULL);
                if (sis->frontswap_map == NULL)
                        return;
                frontswap_ops->invalidate_area(type);
                atomic_set(&sis->frontswap_pages, 0);
                bitmap_zero(sis->frontswap_map, sis->max);
        }
        clear_bit(type, need_init);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);

static unsigned long __frontswap_curr_pages(void)
{
        int type;
        unsigned long totalpages = 0;
        struct swap_info_struct *si = NULL;

        assert_spin_locked(&swap_lock);
        for (type = swap_list.head; type >= 0; type = si->next) {
                si = swap_info[type];
                totalpages += atomic_read(&si->frontswap_pages);
        }
        return totalpages;
}

static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
                                        int *swapid)
{
        int ret = -EINVAL;
        struct swap_info_struct *si = NULL;
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;
        int type;

        assert_spin_locked(&swap_lock);
        for (type = swap_list.head; type >= 0; type = si->next) {
                si = swap_info[type];
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
                } else {
                        pages = si_frontswap_pages;
                        pages_to_unuse = 0; /* unuse all */
                }
                /* ensure there is enough RAM to fetch pages from frontswap */
                if (security_vm_enough_memory_mm(current->mm, pages)) {
                        ret = -ENOMEM;
                        continue;
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
                *swapid = type;
                ret = 0;
                break;
        }

        return ret;
}

/*
 * Used to check if it's necessary and feasible to unuse pages.
 * Return 1 when there is nothing to do, 0 when pages need to be shrunk,
 * or an error code when there is an error.
 */
static int __frontswap_shrink(unsigned long target_pages,
                                unsigned long *pages_to_unuse,
                                int *type)
{
        unsigned long total_pages = 0, total_pages_to_unuse;

        assert_spin_locked(&swap_lock);

        total_pages = __frontswap_curr_pages();
        if (total_pages <= target_pages) {
                /* Nothing to do */
                *pages_to_unuse = 0;
                return 1;
        }
        total_pages_to_unuse = total_pages - target_pages;
        return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrink" frontswap is essentially a
 * "partial swapoff".  It works by calling try_to_unuse to attempt to
 * unuse enough frontswap pages to reduce, subject to memory
 * constraints, the number of pages in frontswap to the
 * number given in the parameter target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
        unsigned long pages_to_unuse = 0;
        int uninitialized_var(type), ret;

        /*
         * we don't want to hold swap_lock while doing a very
         * lengthy try_to_unuse, but swap_list may change
         * so restart scan from swap_list.head each time
         */
        spin_lock(&swap_lock);
        ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
        spin_unlock(&swap_lock);
        if (ret == 0)
                try_to_unuse(type, true, pages_to_unuse);
        return;
}
EXPORT_SYMBOL(frontswap_shrink);
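
/*
 * Example: a backend under its own memory pressure might combine
 * frontswap_curr_pages() and frontswap_shrink(); a sketch, where
 * my_target_pages is a hypothetical policy value chosen by the backend:
 *
 *	unsigned long cur = frontswap_curr_pages();
 *
 *	if (cur > my_target_pages)
 *		frontswap_shrink(my_target_pages);
 *
 * frontswap_shrink() then pushes the excess pages back into the swap
 * device via try_to_unuse(), as described above.
 */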

/*
 * Count and return the number of frontswap pages across all
 * swap devices.  This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;

        spin_lock(&swap_lock);
        totalpages = __frontswap_curr_pages();
        spin_unlock(&swap_lock);

        return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);

static int __init init_frontswap(void)
{
#ifdef CONFIG_DEBUG_FS
        struct dentry *root = debugfs_create_dir("frontswap", NULL);
        if (root == NULL)
                return -ENXIO;
        debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
        debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
        debugfs_create_u64("failed_stores", S_IRUGO, root,
                                &frontswap_failed_stores);
        debugfs_create_u64("invalidates", S_IRUGO,
                                root, &frontswap_invalidates);
#endif
        return 0;
}

module_init(init_frontswap);