linux/mm/frontswap.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Frontswap frontend
 *
 * This code provides the generic "frontend" layer to call a matching
 * "backend" driver implementation of frontswap.  See
 * Documentation/vm/frontswap.rst for more information.
 *
 * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
 * Author: Dan Magenheimer
 */

#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>

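/*
 * This static key backs the frontswap_enabled() test in
 * include/linux/frontswap.h, so the swap paths pay only a patched-out
 * branch until frontswap_register_ops() below flips the key.
 */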
DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

/*
 * frontswap_ops are added by frontswap_register_ops, and provide the
 * frontswap "backend" implementation functions.  Multiple implementations
 * may be registered, but implementations can never deregister.  This
 * is a simple singly-linked list of all registered implementations.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

#define for_each_frontswap_ops(ops)             \
        for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)

/*
 * If enabled, frontswap_store will return failure even on success.  As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache.  In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increased control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;
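/*
 * Illustrative only: a backend that wants writethrough behaviour would
 * enable it from its own init code, e.g.
 *
 *      frontswap_writethrough(true);
 */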

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get, must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
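/*
 * Illustrative only: a tmem-style backend would opt in with
 *
 *      frontswap_tmem_exclusive_gets(true);
 */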

#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured).  These are for information only so are not protected
 * against increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void)
{
        data_race(frontswap_loads++);
}
static inline void inc_frontswap_succ_stores(void)
{
        data_race(frontswap_succ_stores++);
}
static inline void inc_frontswap_failed_stores(void)
{
        data_race(frontswap_failed_stores++);
}
static inline void inc_frontswap_invalidates(void)
{
        data_race(frontswap_invalidates++);
}
#else
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif

/*
 * Due to the asynchronous nature of the backends loading potentially
 * _after_ the swap system has been activated, we have chokepoints
 * on all frontswap functions to not call the backend until the backend
 * has registered.
 *
 * This would not guard us against the user deciding to call swapoff right as
 * we are calling the backend to initialize (so swapon is in action).
 * Fortunately for us, the swapon_mutex has been taken by the callee so we are
 * OK. The other scenario, where calls to frontswap_store (called via
 * swap_writepage) race with frontswap_invalidate_area (called via
 * swapoff), is again guarded by the swap subsystem.
 *
 * While no backend is registered, all calls to frontswap_[store|load|
 * invalidate_area|invalidate_page] are ignored or fail.
 *
 * The time between the backend being registered and the swap subsystem
 * calling the backend (via the frontswap_* functions) is indeterminate as
 * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
 * That is OK as we are comfortable missing some of these calls to the newly
 * registered backend.
 *
 * Obviously the opposite (unloading the backend) must be done after all
 * the frontswap_[store|load|invalidate_area|invalidate_page] start
 * ignoring or failing the requests.  However, there is currently no way
 * to unload a backend once it is registered.
 */

/*
 * Register operations for frontswap
 */
void frontswap_register_ops(struct frontswap_ops *ops)
{
        DECLARE_BITMAP(a, MAX_SWAPFILES);
        DECLARE_BITMAP(b, MAX_SWAPFILES);
        struct swap_info_struct *si;
        unsigned int i;

        bitmap_zero(a, MAX_SWAPFILES);
        bitmap_zero(b, MAX_SWAPFILES);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (!WARN_ON(!si->frontswap_map))
                        set_bit(si->type, a);
        }
        spin_unlock(&swap_lock);

        /* the new ops needs to know the currently active swap devices */
        for_each_set_bit(i, a, MAX_SWAPFILES)
                ops->init(i);

        /*
         * Setting frontswap_ops must happen after the ops->init() calls
         * above; cmpxchg implies smp_mb() which will ensure the init is
         * complete at this point.
         */
        do {
                ops->next = frontswap_ops;
        } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);

        static_branch_inc(&frontswap_enabled_key);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (si->frontswap_map)
                        set_bit(si->type, b);
        }
        spin_unlock(&swap_lock);

        /*
         * On the very unlikely chance that a swap device was added or
         * removed between setting the "a" list bits and the ops init
         * calls, we re-check and do init or invalidate for any changed
         * bits.
         */
        if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
                for (i = 0; i < MAX_SWAPFILES; i++) {
                        if (!test_bit(i, a) && test_bit(i, b))
                                ops->init(i);
                        else if (test_bit(i, a) && !test_bit(i, b))
                                ops->invalidate_area(i);
                }
        }
}
EXPORT_SYMBOL(frontswap_register_ops);
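/*
 * Illustrative sketch only: a backend fills in a struct frontswap_ops and
 * registers it once from its own init code.  The my_* callback names below
 * are hypothetical, not an in-tree backend.
 *
 *      static struct frontswap_ops my_frontswap_ops = {
 *              .init            = my_init,
 *              .store           = my_store,
 *              .load            = my_load,
 *              .invalidate_page = my_invalidate_page,
 *              .invalidate_area = my_invalidate_area,
 *      };
 *
 *      frontswap_register_ops(&my_frontswap_ops);
 */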

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
        frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
        frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);

/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(sis == NULL);

        /*
         * p->frontswap is a bitmap that we MUST have to figure out which
         * pages have gone into frontswap. Without it there is no point in
         * continuing.
         */
        if (WARN_ON(!map))
                return;
        /*
         * Regardless of whether the frontswap backend has been loaded
         * before this function or it will be later, we _MUST_ have the
         * p->frontswap set to something valid to work properly.
         */
        frontswap_map_set(sis, map);

        for_each_frontswap_ops(ops)
                ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
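/*
 * For reference: the frontswap_map bitmap handed in above is allocated by
 * the swapon path in mm/swapfile.c, which calls here (via the
 * frontswap_init() wrapper in include/linux/frontswap.h) once the rest of
 * the swap_info_struct has been set up.
 */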

bool __frontswap_test(struct swap_info_struct *sis,
                                pgoff_t offset)
{
        if (sis->frontswap_map)
                return test_bit(offset, sis->frontswap_map);
        return false;
}
EXPORT_SYMBOL(__frontswap_test);

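/*
 * Bitmap/counter helpers: keep the per-device "page is in frontswap"
 * bitmap and the frontswap_pages counter in sync with each other.
 */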
static inline void __frontswap_set(struct swap_info_struct *sis,
                                   pgoff_t offset)
{
        set_bit(offset, sis->frontswap_map);
        atomic_inc(&sis->frontswap_pages);
}

static inline void __frontswap_clear(struct swap_info_struct *sis,
                                     pgoff_t offset)
{
        clear_bit(offset, sis->frontswap_map);
        atomic_dec(&sis->frontswap_pages);
}

/*
 * "Store" data from a page to frontswap and associate it with the page's
 * swaptype and offset.  Page must be locked and in the swap cache.
 * If frontswap already contains a page with matching swaptype and
 * offset, the frontswap implementation may either overwrite the data and
 * return success or invalidate the page from frontswap and return failure.
 */
int __frontswap_store(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        /*
         * If a dup, we must remove the old page first; we can't leave the
         * old page no matter if the store of the new page succeeds or fails,
         * and we can't rely on the new page replacing the old page as we may
         * not store to the same implementation that contains the old page.
         */
        if (__frontswap_test(sis, offset)) {
                __frontswap_clear(sis, offset);
                for_each_frontswap_ops(ops)
                        ops->invalidate_page(type, offset);
        }

        /* Try to store in each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->store(type, offset, page);
                if (!ret) /* successful store */
                        break;
        }
        if (ret == 0) {
                __frontswap_set(sis, offset);
                inc_frontswap_succ_stores();
        } else {
                inc_frontswap_failed_stores();
        }
        if (frontswap_writethrough_enabled)
                /* report failure so swap also writes to swap device */
                ret = -1;
        return ret;
}
EXPORT_SYMBOL(__frontswap_store);
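/*
 * For reference (paraphrased, not verbatim): the swap write path in
 * mm/page_io.c calls the frontswap_store() wrapper and only falls through
 * to real block I/O when the store is rejected, roughly:
 *
 *      if (frontswap_store(page) == 0) {
 *              set_page_writeback(page);
 *              unlock_page(page);
 *              end_page_writeback(page);
 *              goto out;
 *      }
 */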

/*
 * "Get" data from frontswap associated with swaptype and offset that were
 * specified when the data was put to frontswap and use it to fill the
 * specified page with data. Page must be locked and in the swap cache.
 */
int __frontswap_load(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return -1;

        /* Try loading from each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->load(type, offset, page);
                if (!ret) /* successful load */
                        break;
        }
        if (ret == 0) {
                inc_frontswap_loads();
                if (frontswap_tmem_exclusive_gets_enabled) {
                        SetPageDirty(page);
                        __frontswap_clear(sis, offset);
                }
        }
        return ret;
}
EXPORT_SYMBOL(__frontswap_load);

/*
 * Invalidate any data from frontswap associated with the specified swaptype
 * and offset so that a subsequent "get" will fail.
 */
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_page(type, offset);
        __frontswap_clear(sis, offset);
        inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);

/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (sis->frontswap_map == NULL)
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_area(type);
        atomic_set(&sis->frontswap_pages, 0);
        bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);

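/*
 * Total number of pages currently stored in frontswap across all active
 * swap devices.  Caller must hold swap_lock.
 */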
static unsigned long __frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;
        struct swap_info_struct *si = NULL;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list)
                totalpages += atomic_read(&si->frontswap_pages);
        return totalpages;
}

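/*
 * Pick one swap device to shrink.  Walks the active devices, checks that
 * enough memory can be accounted to pull that device's frontswap pages back
 * in (moving on with -ENOMEM if not), and on success returns 0 with *swapid
 * set to the chosen swap type and *unused set to the number of pages
 * try_to_unuse() should process (0 meaning "unuse everything on that
 * device").  Caller must hold swap_lock.
 */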
static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
                                        int *swapid)
{
        int ret = -EINVAL;
        struct swap_info_struct *si = NULL;
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
                } else {
                        pages = si_frontswap_pages;
                        pages_to_unuse = 0; /* unuse all */
                }
                /* ensure there is enough RAM to fetch pages from frontswap */
                if (security_vm_enough_memory_mm(current->mm, pages)) {
                        ret = -ENOMEM;
                        continue;
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
                *swapid = si->type;
                ret = 0;
                break;
        }

        return ret;
}

/*
 * Used to check if it's necessary and feasible to unuse pages.
 * Returns 1 when there is nothing to do, 0 when pages should be unused,
 * or a negative error code on error.
 */
static int __frontswap_shrink(unsigned long target_pages,
                                unsigned long *pages_to_unuse,
                                int *type)
{
        unsigned long total_pages = 0, total_pages_to_unuse;

        assert_spin_locked(&swap_lock);

        total_pages = __frontswap_curr_pages();
        if (total_pages <= target_pages) {
                /* Nothing to do */
                *pages_to_unuse = 0;
                return 1;
        }
        total_pages_to_unuse = total_pages - target_pages;
        return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrink" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to attempt to --
 * subject to memory constraints -- reduce the number of pages in
 * frontswap to the number given in the parameter target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
        unsigned long pages_to_unuse = 0;
        int type, ret;

        /*
         * we don't want to hold swap_lock while doing a very
         * lengthy try_to_unuse, but swap_list may change
         * so restart scan from swap_active_head each time
         */
        spin_lock(&swap_lock);
        ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
        spin_unlock(&swap_lock);
        if (ret == 0)
                try_to_unuse(type, true, pages_to_unuse);
        return;
}
EXPORT_SYMBOL(frontswap_shrink);
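/*
 * Illustrative only: a backend under its own memory pressure might call,
 * for example,
 *
 *      frontswap_shrink(frontswap_curr_pages() / 2);
 *
 * to push roughly half of the stored pages back out to the real swap
 * devices.
 */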

/*
 * Count and return the number of frontswap pages across all
 * swap devices.  This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;

        spin_lock(&swap_lock);
        totalpages = __frontswap_curr_pages();
        spin_unlock(&swap_lock);

        return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);

static int __init init_frontswap(void)
{
#ifdef CONFIG_DEBUG_FS
        struct dentry *root = debugfs_create_dir("frontswap", NULL);

        if (root == NULL)
                return -ENXIO;
        debugfs_create_u64("loads", 0444, root, &frontswap_loads);
        debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
        debugfs_create_u64("failed_stores", 0444, root,
                           &frontswap_failed_stores);
        debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
#endif
        return 0;
}

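/*
 * With debugfs mounted (typically at /sys/kernel/debug), the counters
 * created above can be inspected from userspace, e.g.:
 *
 *      cat /sys/kernel/debug/frontswap/succ_stores
 */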
module_init(init_frontswap);