linux/mm/frontswap.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Frontswap frontend
 *
 * This code provides the generic "frontend" layer to call a matching
 * "backend" driver implementation of frontswap.  See
 * Documentation/vm/frontswap.rst for more information.
 *
 * Copyright (C) 2009-2012 Oracle Corp.  All rights reserved.
 * Author: Dan Magenheimer
 */

#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/security.h>
#include <linux/module.h>
#include <linux/debugfs.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>

DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);

/*
 * frontswap_ops are added by frontswap_register_ops, and provide the
 * frontswap "backend" implementation functions.  Multiple implementations
 * may be registered, but implementations can never deregister.  This
 * is a simple singly-linked list of all registered implementations.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

#define for_each_frontswap_ops(ops)             \
        for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)

/*
 * If enabled, frontswap_store will return failure even on success.  As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache.  In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increased control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get, must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;

#ifdef CONFIG_DEBUG_FS
/*
 * Counters available via /sys/kernel/debug/frontswap (if debugfs is
 * properly configured).  These are for information only so are not protected
 * against increment races.
 */
static u64 frontswap_loads;
static u64 frontswap_succ_stores;
static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;

static inline void inc_frontswap_loads(void) {
        frontswap_loads++;
}
static inline void inc_frontswap_succ_stores(void) {
        frontswap_succ_stores++;
}
static inline void inc_frontswap_failed_stores(void) {
        frontswap_failed_stores++;
}
static inline void inc_frontswap_invalidates(void) {
        frontswap_invalidates++;
}
#else
static inline void inc_frontswap_loads(void) { }
static inline void inc_frontswap_succ_stores(void) { }
static inline void inc_frontswap_failed_stores(void) { }
static inline void inc_frontswap_invalidates(void) { }
#endif

/*
 * Because the backends may load asynchronously, potentially _after_ the
 * swap system has been activated, all frontswap functions have chokepoints
 * that avoid calling a backend until it has registered.
 *
 * This does not guard us against the user deciding to call swapoff right as
 * we are calling the backend to initialize (so swapon is in action).
 * Fortunately for us, the swapon_mutex has been taken by the caller so we
 * are OK. The other scenario, where calls to frontswap_store (called via
 * swap_writepage) race with frontswap_invalidate_area (called via swapoff),
 * is again guarded by the swap subsystem.
 *
 * While no backend is registered all calls to frontswap_[store|load|
 * invalidate_area|invalidate_page] are ignored or fail.
 *
 * The time between the backend being registered and the swap file system
 * calling the backend (via the frontswap_* functions) is indeterminate as
 * frontswap_ops is not atomic_t (or a value guarded by a spinlock).
 * That is OK as we are comfortable missing some of these calls to the newly
 * registered backend.
 *
 * Obviously the opposite (unloading the backend) must be done only after all
 * the frontswap_[store|load|invalidate_area|invalidate_page] calls start
 * ignoring or failing requests.  However, there is currently no way
 * to unload a backend once it is registered.
 */

/*
 * Register operations for frontswap
 */
void frontswap_register_ops(struct frontswap_ops *ops)
{
        DECLARE_BITMAP(a, MAX_SWAPFILES);
        DECLARE_BITMAP(b, MAX_SWAPFILES);
        struct swap_info_struct *si;
        unsigned int i;

        bitmap_zero(a, MAX_SWAPFILES);
        bitmap_zero(b, MAX_SWAPFILES);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (!WARN_ON(!si->frontswap_map))
                        set_bit(si->type, a);
        }
        spin_unlock(&swap_lock);

        /* the new ops needs to know the currently active swap devices */
        for_each_set_bit(i, a, MAX_SWAPFILES)
                ops->init(i);

        /*
         * Setting frontswap_ops must happen after the ops->init() calls
         * above; cmpxchg implies smp_mb() which will ensure the init is
         * complete at this point.
         */
        do {
                ops->next = frontswap_ops;
        } while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);

        static_branch_inc(&frontswap_enabled_key);

        spin_lock(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                if (si->frontswap_map)
                        set_bit(si->type, b);
        }
        spin_unlock(&swap_lock);

        /*
         * On the very unlikely chance that a swap device was added or
         * removed between setting the "a" list bits and the ops init
         * calls, we re-check and do init or invalidate for any changed
         * bits.
         */
        if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
                for (i = 0; i < MAX_SWAPFILES; i++) {
                        if (!test_bit(i, a) && test_bit(i, b))
                                ops->init(i);
                        else if (test_bit(i, a) && !test_bit(i, b))
                                ops->invalidate_area(i);
                }
        }
}
EXPORT_SYMBOL(frontswap_register_ops);
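
/*
 * Illustrative sketch (not part of this file): a backend registers its
 * callbacks once, typically from its own module init.  The "example_*"
 * names below are hypothetical; the callback slots are those of struct
 * frontswap_ops in include/linux/frontswap.h.
 *
 *        static struct frontswap_ops example_frontswap_ops = {
 *                .init            = example_init,
 *                .store           = example_store,
 *                .load            = example_load,
 *                .invalidate_page = example_invalidate_page,
 *                .invalidate_area = example_invalidate_area,
 *        };
 *
 *        static int __init example_backend_init(void)
 *        {
 *                frontswap_register_ops(&example_frontswap_ops);
 *                return 0;
 *        }
 *        module_init(example_backend_init);
 */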

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
        frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);
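
/*
 * Illustrative sketch (not part of this file): a backend that keeps the
 * copy on the real swap device valid at all times may request writethrough
 * mode when it registers, so it can later drop ("reclaim") any cached page
 * without data loss.  The "example_*" names are hypothetical.
 *
 *        static int __init example_backend_init(void)
 *        {
 *                frontswap_writethrough(true);
 *                frontswap_register_ops(&example_frontswap_ops);
 *                return 0;
 *        }
 */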

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
        frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);

/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(sis == NULL);

        /*
         * p->frontswap is a bitmap that we MUST have to figure out which page
         * has gone into frontswap. Without it there is no point in continuing.
         */
        if (WARN_ON(!map))
                return;
        /*
         * Regardless of whether the frontswap backend has already been loaded
         * or will be loaded later, we _MUST_ have p->frontswap set to
         * something valid to work properly.
         */
        frontswap_map_set(sis, map);

        for_each_frontswap_ops(ops)
                ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
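
/*
 * Illustrative sketch (not part of this file): the expected caller is the
 * swapon path, which allocates the per-device bitmap and hands it over via
 * the frontswap_init() wrapper from include/linux/frontswap.h.  This is a
 * simplified sketch of mm/swapfile.c, not a verbatim copy.
 *
 *        unsigned long *frontswap_map = NULL;
 *
 *        if (IS_ENABLED(CONFIG_FRONTSWAP))
 *                frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) *
 *                                        sizeof(long));
 *        ...
 *        frontswap_init(p->type, frontswap_map);
 */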

bool __frontswap_test(struct swap_info_struct *sis,
                                pgoff_t offset)
{
        if (sis->frontswap_map)
                return test_bit(offset, sis->frontswap_map);
        return false;
}
EXPORT_SYMBOL(__frontswap_test);

static inline void __frontswap_set(struct swap_info_struct *sis,
                                   pgoff_t offset)
{
        set_bit(offset, sis->frontswap_map);
        atomic_inc(&sis->frontswap_pages);
}

static inline void __frontswap_clear(struct swap_info_struct *sis,
                                     pgoff_t offset)
{
        clear_bit(offset, sis->frontswap_map);
        atomic_dec(&sis->frontswap_pages);
}

/*
 * "Store" data from a page to frontswap and associate it with the page's
 * swaptype and offset.  Page must be locked and in the swap cache.
 * If frontswap already contains a page with matching swaptype and
 * offset, the frontswap implementation may either overwrite the data and
 * return success or invalidate the page from frontswap and return failure.
 */
int __frontswap_store(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        /*
         * If a dup, we must remove the old page first; we can't leave the
         * old page around regardless of whether the store of the new page
         * succeeds or fails, and we can't rely on the new page replacing
         * the old page as we may not store to the same implementation that
         * contains the old page.
         */
        if (__frontswap_test(sis, offset)) {
                __frontswap_clear(sis, offset);
                for_each_frontswap_ops(ops)
                        ops->invalidate_page(type, offset);
        }

        /* Try to store in each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->store(type, offset, page);
                if (!ret) /* successful store */
                        break;
        }
        if (ret == 0) {
                __frontswap_set(sis, offset);
                inc_frontswap_succ_stores();
        } else {
                inc_frontswap_failed_stores();
        }
        if (frontswap_writethrough_enabled)
                /* report failure so swap also writes to swap device */
                ret = -1;
        return ret;
}
EXPORT_SYMBOL(__frontswap_store);
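
/*
 * Illustrative sketch (not part of this file): the swap writeout path is
 * expected to try frontswap first and only fall back to real I/O when the
 * store fails (or when writethrough mode forces a failure return).  This is
 * a simplified sketch of how swap_writepage() in mm/page_io.c uses the
 * frontswap_store() wrapper from include/linux/frontswap.h, not a verbatim
 * copy.
 *
 *        if (frontswap_store(page) == 0) {
 *                set_page_writeback(page);
 *                unlock_page(page);
 *                end_page_writeback(page);
 *                return 0;
 *        }
 *        return __swap_writepage(page, wbc, end_swap_bio_write);
 */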

/*
 * "Get" data from frontswap associated with swaptype and offset that were
 * specified when the data was put to frontswap and use it to fill the
 * specified page with data. Page must be locked and in the swap cache.
 */
int __frontswap_load(struct page *page)
{
        int ret = -1;
        swp_entry_t entry = { .val = page_private(page), };
        int type = swp_type(entry);
        struct swap_info_struct *sis = swap_info[type];
        pgoff_t offset = swp_offset(entry);
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(!PageLocked(page));
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return -1;

        /* Try loading from each implementation, until one succeeds. */
        for_each_frontswap_ops(ops) {
                ret = ops->load(type, offset, page);
                if (!ret) /* successful load */
                        break;
        }
        if (ret == 0) {
                inc_frontswap_loads();
                if (frontswap_tmem_exclusive_gets_enabled) {
                        SetPageDirty(page);
                        __frontswap_clear(sis, offset);
                }
        }
        return ret;
}
EXPORT_SYMBOL(__frontswap_load);
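
/*
 * Illustrative sketch (not part of this file): the swap read path is
 * expected to try frontswap before issuing device I/O.  This is a
 * simplified sketch of how swap_readpage() in mm/page_io.c uses the
 * frontswap_load() wrapper from include/linux/frontswap.h, not a verbatim
 * copy.
 *
 *        if (frontswap_load(page) == 0) {
 *                SetPageUptodate(page);
 *                unlock_page(page);
 *                return 0;
 *        }
 *        ... otherwise fall through and read the page from the swap device ...
 */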

/*
 * Invalidate any data from frontswap associated with the specified swaptype
 * and offset so that a subsequent "get" will fail.
 */
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (!__frontswap_test(sis, offset))
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_page(type, offset);
        __frontswap_clear(sis, offset);
        inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);
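
/*
 * Illustrative sketch (not part of this file): when the swap layer releases
 * a swap slot back to the allocator (mm/swapfile.c), it is expected to drop
 * any frontswap copy through the frontswap_invalidate_page() wrapper from
 * include/linux/frontswap.h, so a stale copy can never be loaded for a
 * reused slot.  Simplified sketch, not a verbatim copy of the swapfile code:
 *
 *        frontswap_invalidate_page(p->type, offset);
 */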

/*
 * Invalidate all data from frontswap associated with all offsets for the
 * specified swaptype.
 */
void __frontswap_invalidate_area(unsigned type)
{
        struct swap_info_struct *sis = swap_info[type];
        struct frontswap_ops *ops;

        VM_BUG_ON(!frontswap_ops);
        VM_BUG_ON(sis == NULL);

        if (sis->frontswap_map == NULL)
                return;

        for_each_frontswap_ops(ops)
                ops->invalidate_area(type);
        atomic_set(&sis->frontswap_pages, 0);
        bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);

static unsigned long __frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;
        struct swap_info_struct *si = NULL;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list)
                totalpages += atomic_read(&si->frontswap_pages);
        return totalpages;
}

static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
                                        int *swapid)
{
        int ret = -EINVAL;
        struct swap_info_struct *si = NULL;
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;

        assert_spin_locked(&swap_lock);
        plist_for_each_entry(si, &swap_active_head, list) {
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
                } else {
                        pages = si_frontswap_pages;
                        pages_to_unuse = 0; /* unuse all */
                }
                /* ensure there is enough RAM to fetch pages from frontswap */
                if (security_vm_enough_memory_mm(current->mm, pages)) {
                        ret = -ENOMEM;
                        continue;
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
                *swapid = si->type;
                ret = 0;
                break;
        }

        return ret;
}

/*
 * Used to check whether it is necessary and feasible to unuse pages.
 * Returns 1 when there is nothing to do, 0 when pages need to be unused,
 * or an error code when there is an error.
 */
static int __frontswap_shrink(unsigned long target_pages,
                                unsigned long *pages_to_unuse,
                                int *type)
{
        unsigned long total_pages = 0, total_pages_to_unuse;

        assert_spin_locked(&swap_lock);

        total_pages = __frontswap_curr_pages();
        if (total_pages <= target_pages) {
                /* Nothing to do */
                *pages_to_unuse = 0;
                return 1;
        }
        total_pages_to_unuse = total_pages - target_pages;
        return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrinking" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to unuse enough
 * frontswap pages to reduce -- subject to memory constraints -- the
 * number of pages in frontswap to the number given in the parameter
 * target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
        unsigned long pages_to_unuse = 0;
        int uninitialized_var(type), ret;

        /*
         * we don't want to hold swap_lock while doing a very
         * lengthy try_to_unuse, but swap_list may change
         * so restart scan from swap_active_head each time
         */
        spin_lock(&swap_lock);
        ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
        spin_unlock(&swap_lock);
        if (ret == 0)
                try_to_unuse(type, true, pages_to_unuse);
        return;
}
EXPORT_SYMBOL(frontswap_shrink);

/*
 * Count and return the number of frontswap pages across all
 * swap devices.  This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
        unsigned long totalpages = 0;

        spin_lock(&swap_lock);
        totalpages = __frontswap_curr_pages();
        spin_unlock(&swap_lock);

        return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);
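
/*
 * Illustrative sketch (not part of this file): a backend under its own
 * memory pressure could combine frontswap_curr_pages() with
 * frontswap_shrink() to push excess pages back to the real swap device.
 * The "example_" name and the limit policy are hypothetical.
 *
 *        static void example_backend_under_pressure(unsigned long limit)
 *        {
 *                if (frontswap_curr_pages() > limit)
 *                        frontswap_shrink(limit);
 *        }
 */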

static int __init init_frontswap(void)
{
#ifdef CONFIG_DEBUG_FS
        struct dentry *root = debugfs_create_dir("frontswap", NULL);
        if (root == NULL)
                return -ENXIO;
        debugfs_create_u64("loads", 0444, root, &frontswap_loads);
        debugfs_create_u64("succ_stores", 0444, root, &frontswap_succ_stores);
        debugfs_create_u64("failed_stores", 0444, root,
                           &frontswap_failed_stores);
        debugfs_create_u64("invalidates", 0444, root, &frontswap_invalidates);
#endif
        return 0;
}

module_init(init_frontswap);