/*
 * Frontswap frontend
 *
 * This code provides the generic "frontend" layer to call a matching
 * "backend" driver implementation of frontswap.  See
 * Documentation/vm/frontswap.txt for more information.
 *
 * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
 * Author: Dan Magenheimer
 *
 * This work is licensed under the terms of the GNU GPL, version 2.
 */

#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>

DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

/*
 * frontswap_ops are added by frontswap_register_ops, and provide the
 * frontswap "backend" implementation functions.  Multiple implementations
 * may be registered, but implementations can never deregister.  This
 * is a simple singly-linked list of all registered implementations.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

#define for_each_frontswap_ops(ops)             \
        for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)

/*
 * If enabled, frontswap_store will return failure even on success.  As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache.  In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increased control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get, must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;

#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured).  These are for information only so are not protected
 * against increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void) {
        frontswap_loads++;
}
static inline void inc_frontswap_succ_stores(void) {
        frontswap_succ_stores++;
}
static inline void inc_frontswap_failed_stores(void) {
        frontswap_failed_stores++;
}
static inline void inc_frontswap_invalidates(void) {
        frontswap_invalidates++;
}
#else
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif

/*
 * Due to the asynchronous nature of the backends loading potentially
 * _after_ the swap system has been activated, we have chokepoints
 * on all frontswap functions to not call the backend until the backend
 * has registered.
 *
 * This does not guard us against the user deciding to call swapoff right
 * as we are calling the backend to initialize (so swapon is in action).
 * Fortunately for us, the swapon_mutex has been taken by the callee so we
 * are OK.  The other scenario, where a call to frontswap_store (called via
 * swap_writepage) races with frontswap_invalidate_area (called via
 * swapoff), is again guarded by the swap subsystem.
 *
 * While no backend is registered, all calls to frontswap_[store|load|
 * invalidate_area|invalidate_page] are ignored or fail.
 *
 * The time between the backend being registered and the swap file system
 * calling the backend (via the frontswap_* functions) is indeterminate as
 * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
 * That is OK as we are comfortable missing some of these calls to the newly
 * registered backend.
 *
 * Obviously the opposite (unloading the backend) must be done after all
 * the frontswap_[store|load|invalidate_area|invalidate_page] calls start
 * ignoring or failing the requests.  However, there is currently no way
 * to unload a backend once it is registered.
 */

/*
 * Register operations for frontswap
 */
void frontswap_register_ops(struct frontswap_ops *ops)
{
        DECLARE_BITMAP(a, MAX_SWAPFILES);
        DECLARE_BITMAP(b, MAX_SWAPFILES);
        struct swap_info_struct *si;
        unsigned int i;

        bitmap_zero(a, MAX_SWAPFILES);
        bitmap_zero(b, MAX_SWAPFILES);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (!WARN_ON(!si->frontswap_map))
                        set_bit(si->type, a);
        }
        spin_unlock(&swap_lock);

        /* the new ops needs to know the currently active swap devices */
        for_each_set_bit(i, a, MAX_SWAPFILES)
                ops->init(i);

        /*
         * Setting frontswap_ops must happen after the ops->init() calls
         * above; cmpxchg implies smp_mb() which will ensure the init is
         * complete at this point.
         */
        do {
                ops->next = frontswap_ops;
        } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);

        static_branch_inc(&frontswap_enabled_key);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (si->frontswap_map)
                        set_bit(si->type, b);
        }
        spin_unlock(&swap_lock);

        /*
         * On the very unlikely chance that a swap device was added or
         * removed between setting the "a" list bits and the ops init
         * calls, we re-check and do init or invalidate for any changed
         * bits.
         */
        if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
                for (i = 0; i < MAX_SWAPFILES; i++) {
                        if (!test_bit(i, a) && test_bit(i, b))
                                ops->init(i);
                        else if (test_bit(i, a) && !test_bit(i, b))
                                ops->invalidate_area(i);
                }
        }
}
EXPORT_SYMBOL(frontswap_register_ops);
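
/*
 * Example: a minimal sketch of how a backend hooks in.  The example_*
 * names below are hypothetical and for illustration only; a real backend
 * (zswap, for instance) supplies its own callbacks and typically registers
 * them from its module init:
 *
 *      static struct frontswap_ops example_ops = {
 *              .init            = example_init,
 *              .store           = example_store,
 *              .load            = example_load,
 *              .invalidate_page = example_invalidate_page,
 *              .invalidate_area = example_invalidate_area,
 *      };
 *
 *      static int __init example_backend_init(void)
 *      {
 *              frontswap_register_ops(&example_ops);
 *              return 0;
 *      }
 *      module_init(example_backend_init);
 */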

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
        frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);
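
/*
 * Example (hypothetical): a backend that prefers to keep the swap device
 * authoritative, so it can drop its copies at any time, would enable
 * writethrough from its own init path:
 *
 *      frontswap_writethrough(true);
 */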

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
        frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
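
/*
 * Example (hypothetical): a tmem-style backend whose "get" also removes
 * the object from the backend would advertise that at init time:
 *
 *      frontswap_tmem_exclusive_gets(true);
 *
 * __frontswap_load() then marks each successfully loaded page dirty and
 * clears its frontswap_map bit, so the data survives a later reclaim.
 */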

/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(sis == NULL);

        /*
         * p->frontswap is a bitmap that we MUST have to figure out which page
         * has gone into frontswap.  Without it there is no point in continuing.
         */
        if (WARN_ON(!map))
                return;
        /*
         * Regardless of whether the frontswap backend has been loaded
         * before this function or will be loaded later, we _MUST_ have
         * p->frontswap set to something valid to work properly.
         */
        frontswap_map_set(sis, map);

        for_each_frontswap_ops(ops)
                ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);

bool __frontswap_test(struct swap_info_struct *sis,
                                pgoff_t offset)
{
        if (sis->frontswap_map)
                return test_bit(offset, sis->frontswap_map);
        return false;
}
EXPORT_SYMBOL(__frontswap_test);

static inline void __frontswap_set(struct swap_info_struct *sis,
                                   pgoff_t offset)
{
        set_bit(offset, sis->frontswap_map);
        atomic_inc(&sis->frontswap_pages);
}

static inline void __frontswap_clear(struct swap_info_struct *sis,
                                     pgoff_t offset)
{
        clear_bit(offset, sis->frontswap_map);
        atomic_dec(&sis->frontswap_pages);
}

/*
 * "Store" data from a page to frontswap and associate it with the page's
 * swaptype and offset.  Page must be locked and in the swap cache.
 * If frontswap already contains a page with matching swaptype and
 * offset, the frontswap implementation may either overwrite the data and
 * return success or invalidate the page from frontswap and return failure.
 */
int __frontswap_store(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        /*
         * If a dup, we must remove the old page first; the old page cannot
         * be left behind whether the store of the new page succeeds or
         * fails, and we can't rely on the new page replacing the old page
         * as we may not store to the same implementation that contains the
         * old page.
         */
        if (__frontswap_test(sis, offset)) {
                __frontswap_clear(sis, offset);
                for_each_frontswap_ops(ops)
                        ops->invalidate_page(type, offset);
        }

        /* Try to store in each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->store(type, offset, page);
                if (!ret) /* successful store */
                        break;
        }
        if (ret == 0) {
                __frontswap_set(sis, offset);
                inc_frontswap_succ_stores();
        } else {
                inc_frontswap_failed_stores();
        }
        if (frontswap_writethrough_enabled)
                /* report failure so swap also writes to swap device */
                ret = -1;
        return ret;
}
EXPORT_SYMBOL(__frontswap_store);
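
/*
 * The caller side lives in the swap I/O path; roughly (a simplified sketch
 * of mm/page_io.c), swap_writepage() tries frontswap first and only falls
 * back to the block device when the store is rejected:
 *
 *      if (frontswap_store(page) == 0) {
 *              set_page_writeback(page);
 *              unlock_page(page);
 *              end_page_writeback(page);
 *              return 0;
 *      }
 *      return __swap_writepage(page, wbc, end_swap_bio_write);
 */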

/*
 * "Get" data from frontswap associated with swaptype and offset that were
 * specified when the data was put to frontswap and use it to fill the
 * specified page with data.  Page must be locked and in the swap cache.
 */
int __frontswap_load(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return -1;

        /* Try loading from each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->load(type, offset, page);
                if (!ret) /* successful load */
                        break;
        }
        if (ret == 0) {
                inc_frontswap_loads();
                if (frontswap_tmem_exclusive_gets_enabled) {
                        SetPageDirty(page);
                        __frontswap_clear(sis, offset);
                }
        }
        return ret;
}
EXPORT_SYMBOL(__frontswap_load);
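
/*
 * Correspondingly, the read side (a simplified sketch of swap_readpage()
 * in mm/page_io.c) consults frontswap before issuing a bio; a hit
 * completes the read without touching the swap device:
 *
 *      if (frontswap_load(page) == 0) {
 *              SetPageUptodate(page);
 *              unlock_page(page);
 *              return 0;
 *      }
 */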

/*
 * Invalidate any data from frontswap associated with the specified swaptype
 * and offset so that a subsequent "get" will fail.
 */
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_page(type, offset);
        __frontswap_clear(sis, offset);
        inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);

/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (sis->frontswap_map == NULL)
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_area(type);
        atomic_set(&sis->frontswap_pages, 0);
        bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
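
/*
 * The expected caller is the swapoff path; roughly (a simplified sketch of
 * the swapoff teardown in mm/swapfile.c), the whole area is invalidated and
 * the bitmap installed by __frontswap_init() is detached and freed:
 *
 *      frontswap_map = frontswap_map_get(p);
 *      ...
 *      frontswap_invalidate_area(p->type);
 *      frontswap_map_set(p, NULL);
 *      ...
 *      vfree(frontswap_map);
 */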

static unsigned long __frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;
        struct swap_info_struct *si = NULL;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list)
                totalpages += atomic_read(&si->frontswap_pages);
        return totalpages;
}

static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
                                        int *swapid)
{
        int ret = -EINVAL;
        struct swap_info_struct *si = NULL;
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
                } else {
                        pages = si_frontswap_pages;
                        pages_to_unuse = 0; /* unuse all */
                }
                /* ensure there is enough RAM to fetch pages from frontswap */
                if (security_vm_enough_memory_mm(current->mm, pages)) {
                        ret = -ENOMEM;
                        continue;
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
                *swapid = si->type;
                ret = 0;
                break;
        }

        return ret;
}

/*
 * Used to check if it is necessary and feasible to unuse pages.
 * Return 1 when there is nothing to do, 0 when pages need to be shrunk,
 * or an error code when there is an error.
 */
static int __frontswap_shrink(unsigned long target_pages,
                                unsigned long *pages_to_unuse,
                                int *type)
{
        unsigned long total_pages = 0, total_pages_to_unuse;

        assert_spin_locked(&swap_lock);

        total_pages = __frontswap_curr_pages();
        if (total_pages <= target_pages) {
                /* Nothing to do */
                *pages_to_unuse = 0;
                return 1;
        }
        total_pages_to_unuse = total_pages - target_pages;
        return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrinking" frontswap is essentially a
 * "partial swapoff" that works by calling try_to_unuse to unuse enough
 * frontswap pages to reduce, subject to memory constraints, the number
 * of pages in frontswap to the number given in the parameter
 * target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
        unsigned long pages_to_unuse = 0;
        int uninitialized_var(type), ret;

        /*
         * We don't want to hold swap_lock while doing a very
         * lengthy try_to_unuse, but swap_list may change
         * so restart the scan from swap_active_head each time.
         */
        spin_lock(&swap_lock);
        ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
        spin_unlock(&swap_lock);
        if (ret == 0)
                try_to_unuse(type, true, pages_to_unuse);
        return;
}
EXPORT_SYMBOL(frontswap_shrink);
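
/*
 * Example (hypothetical): a backend that wants to cap its footprint could
 * pair frontswap_curr_pages() (below) with frontswap_shrink(), e.g. from a
 * worker thread, where example_max_pages is an assumed backend-specific
 * limit:
 *
 *      unsigned long cur = frontswap_curr_pages();
 *
 *      if (cur > example_max_pages)
 *              frontswap_shrink(example_max_pages);
 */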

/*
 * Count and return the number of frontswap pages across all
 * swap devices.  This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;

        spin_lock(&swap_lock);
        totalpages = __frontswap_curr_pages();
        spin_unlock(&swap_lock);

        return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);

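/*
 * Boot-time setup.  With CONFIG_DEBUG_FS enabled, the counters defined
 * above are exposed read-only as /sys/kernel/debug/frontswap/{loads,
 * succ_stores, failed_stores, invalidates}, assuming debugfs is mounted
 * at the usual /sys/kernel/debug.
 */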
static int __init init_frontswap(void)
{
#ifdef CONFIG_DEBUG_FS
        struct dentry *root = debugfs_create_dir("frontswap", NULL);
        if (root == NULL)
                return -ENXIO;
        debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads);
        debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores);
        debugfs_create_u64("failed_stores", S_IRUGO, root,
                                &frontswap_failed_stores);
        debugfs_create_u64("invalidates", S_IRUGO,
                                root, &frontswap_invalidates);
#endif
        return 0;
}

module_init(init_frontswap);