linux/mm/zswap.c
   1/*
   2 * zswap.c - zswap driver file
   3 *
   4 * zswap is a backend for frontswap that takes pages that are in the process
   5 * of being swapped out and attempts to compress and store them in a
   6 * RAM-based memory pool.  This can result in a significant I/O reduction on
   7 * the swap device and, in the case where decompressing from RAM is faster
   8 * than reading from the swap device, can also improve workload performance.
   9 *
  10 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
  11 *
  12 * This program is free software; you can redistribute it and/or
  13 * modify it under the terms of the GNU General Public License
  14 * as published by the Free Software Foundation; either version 2
  15 * of the License, or (at your option) any later version.
  16 *
  17 * This program is distributed in the hope that it will be useful,
  18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 * GNU General Public License for more details.
  21*/
  22
  23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/cpu.h>
  27#include <linux/highmem.h>
  28#include <linux/slab.h>
  29#include <linux/spinlock.h>
  30#include <linux/types.h>
  31#include <linux/atomic.h>
  32#include <linux/frontswap.h>
  33#include <linux/rbtree.h>
  34#include <linux/swap.h>
  35#include <linux/crypto.h>
  36#include <linux/mempool.h>
  37#include <linux/zpool.h>
  38
  39#include <linux/mm_types.h>
  40#include <linux/page-flags.h>
  41#include <linux/swapops.h>
  42#include <linux/writeback.h>
  43#include <linux/pagemap.h>
  44
  45/*********************************
  46* statistics
  47**********************************/
  48/* Total bytes used by the compressed storage */
  49static u64 zswap_pool_total_size;
  50/* The number of compressed pages currently stored in zswap */
  51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
  52
/*
 * The statistics below are not protected from concurrent access for
 * performance reasons, so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event occurs.
 */
  59
  60/* Pool limit was hit (see zswap_max_pool_percent) */
  61static u64 zswap_pool_limit_hit;
  62/* Pages written back when pool limit was reached */
  63static u64 zswap_written_back_pages;
  64/* Store failed due to a reclaim failure after pool limit was reached */
  65static u64 zswap_reject_reclaim_fail;
  66/* Compressed page was too big for the allocator to (optimally) store */
  67static u64 zswap_reject_compress_poor;
  68/* Store failed because underlying allocator could not get memory */
  69static u64 zswap_reject_alloc_fail;
  70/* Store failed because the entry metadata could not be allocated (rare) */
  71static u64 zswap_reject_kmemcache_fail;
  72/* Duplicate store was encountered (rare) */
  73static u64 zswap_duplicate_entry;
  74
  75/*********************************
  76* tunables
  77**********************************/
  78
  79/* Enable/disable zswap (disabled by default) */
  80static bool zswap_enabled;
  81module_param_named(enabled, zswap_enabled, bool, 0644);
  82
  83/* Crypto compressor to use */
  84#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
  85static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  86static int zswap_compressor_param_set(const char *,
  87                                      const struct kernel_param *);
  88static struct kernel_param_ops zswap_compressor_param_ops = {
  89        .set =          zswap_compressor_param_set,
  90        .get =          param_get_charp,
  91        .free =         param_free_charp,
  92};
  93module_param_cb(compressor, &zswap_compressor_param_ops,
  94                &zswap_compressor, 0644);
  95
  96/* Compressed storage zpool to use */
  97#define ZSWAP_ZPOOL_DEFAULT "zbud"
  98static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  99static int zswap_zpool_param_set(const char *, const struct kernel_param *);
 100static struct kernel_param_ops zswap_zpool_param_ops = {
 101        .set =          zswap_zpool_param_set,
 102        .get =          param_get_charp,
 103        .free =         param_free_charp,
 104};
 105module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
 106
 107/* The maximum percentage of memory that the compressed pool can occupy */
 108static unsigned int zswap_max_pool_percent = 20;
 109module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
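
/*
 * Illustrative usage, not part of the driver itself: with the 0644
 * permissions above, these tunables are expected to appear under
 * /sys/module/zswap/parameters/ at runtime and can also be set on the
 * kernel command line, e.g.
 *
 *   echo 1  > /sys/module/zswap/parameters/enabled
 *   echo 25 > /sys/module/zswap/parameters/max_pool_percent
 *
 * or booting with zswap.enabled=1 zswap.compressor=lzo.  The values shown
 * are examples only; compressor accepts any algorithm the crypto API
 * provides and zpool any allocator registered with the zpool layer.
 */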
 110
 111/*********************************
 112* data structures
 113**********************************/
 114
 115struct zswap_pool {
 116        struct zpool *zpool;
 117        struct crypto_comp * __percpu *tfm;
 118        struct kref kref;
 119        struct list_head list;
 120        struct work_struct work;
 121        struct notifier_block notifier;
 122        char tfm_name[CRYPTO_MAX_ALG_NAME];
 123};
 124
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into the red-black tree for the appropriate
 *          swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry.  This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 */
 144struct zswap_entry {
 145        struct rb_node rbnode;
 146        pgoff_t offset;
 147        int refcount;
 148        unsigned int length;
 149        struct zswap_pool *pool;
 150        unsigned long handle;
 151};
 152
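/*
 * Each compressed object is prefixed with this header inside the zpool so
 * that the writeback path, which is handed only a zpool handle, can
 * recover the swap entry (and therefore the tree and offset) it belongs to.
 */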
 153struct zswap_header {
 154        swp_entry_t swpentry;
 155};
 156
 157/*
 158 * The tree lock in the zswap_tree struct protects a few things:
 159 * - the rbtree
 160 * - the refcount field of each entry in the tree
 161 */
 162struct zswap_tree {
 163        struct rb_root rbroot;
 164        spinlock_t lock;
 165};
 166
 167static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 168
 169/* RCU-protected iteration */
 170static LIST_HEAD(zswap_pools);
 171/* protects zswap_pools list modification */
 172static DEFINE_SPINLOCK(zswap_pools_lock);
 173/* pool counter to provide unique names to zpool */
 174static atomic_t zswap_pools_count = ATOMIC_INIT(0);
 175
 176/* used by param callback function */
 177static bool zswap_init_started;
 178
 179/*********************************
 180* helpers and fwd declarations
 181**********************************/
 182
 183#define zswap_pool_debug(msg, p)                                \
 184        pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,         \
 185                 zpool_get_type((p)->zpool))
 186
 187static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
 188static int zswap_pool_get(struct zswap_pool *pool);
 189static void zswap_pool_put(struct zswap_pool *pool);
 190
 191static const struct zpool_ops zswap_zpool_ops = {
 192        .evict = zswap_writeback_entry
 193};
 194
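/*
 * Worked example for the check below: with 2097152 total RAM pages
 * (8GB of 4K pages) and the default max_pool_percent of 20, zswap is
 * considered full once the compressed pool needs more than
 * 2097152 * 20 / 100 = 419430 pages, i.e. roughly 1.6GB.
 */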
 195static bool zswap_is_full(void)
 196{
 197        return totalram_pages * zswap_max_pool_percent / 100 <
 198                DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 199}
 200
 201static void zswap_update_total_size(void)
 202{
 203        struct zswap_pool *pool;
 204        u64 total = 0;
 205
 206        rcu_read_lock();
 207
 208        list_for_each_entry_rcu(pool, &zswap_pools, list)
 209                total += zpool_get_total_size(pool->zpool);
 210
 211        rcu_read_unlock();
 212
 213        zswap_pool_total_size = total;
 214}
 215
 216/*********************************
 217* zswap entry functions
 218**********************************/
 219static struct kmem_cache *zswap_entry_cache;
 220
 221static int __init zswap_entry_cache_create(void)
 222{
 223        zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
 224        return zswap_entry_cache == NULL;
 225}
 226
 227static void __init zswap_entry_cache_destroy(void)
 228{
 229        kmem_cache_destroy(zswap_entry_cache);
 230}
 231
 232static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 233{
 234        struct zswap_entry *entry;
 235        entry = kmem_cache_alloc(zswap_entry_cache, gfp);
 236        if (!entry)
 237                return NULL;
 238        entry->refcount = 1;
 239        RB_CLEAR_NODE(&entry->rbnode);
 240        return entry;
 241}
 242
 243static void zswap_entry_cache_free(struct zswap_entry *entry)
 244{
 245        kmem_cache_free(zswap_entry_cache, entry);
 246}
 247
 248/*********************************
 249* rbtree functions
 250**********************************/
 251static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
 252{
 253        struct rb_node *node = root->rb_node;
 254        struct zswap_entry *entry;
 255
 256        while (node) {
 257                entry = rb_entry(node, struct zswap_entry, rbnode);
 258                if (entry->offset > offset)
 259                        node = node->rb_left;
 260                else if (entry->offset < offset)
 261                        node = node->rb_right;
 262                else
 263                        return entry;
 264        }
 265        return NULL;
 266}
 267
/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
 272static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
 273                        struct zswap_entry **dupentry)
 274{
 275        struct rb_node **link = &root->rb_node, *parent = NULL;
 276        struct zswap_entry *myentry;
 277
 278        while (*link) {
 279                parent = *link;
 280                myentry = rb_entry(parent, struct zswap_entry, rbnode);
 281                if (myentry->offset > entry->offset)
 282                        link = &(*link)->rb_left;
 283                else if (myentry->offset < entry->offset)
 284                        link = &(*link)->rb_right;
 285                else {
 286                        *dupentry = myentry;
 287                        return -EEXIST;
 288                }
 289        }
 290        rb_link_node(&entry->rbnode, parent, link);
 291        rb_insert_color(&entry->rbnode, root);
 292        return 0;
 293}
 294
 295static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 296{
 297        if (!RB_EMPTY_NODE(&entry->rbnode)) {
 298                rb_erase(&entry->rbnode, root);
 299                RB_CLEAR_NODE(&entry->rbnode);
 300        }
 301}
 302
/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
 307static void zswap_free_entry(struct zswap_entry *entry)
 308{
 309        zpool_free(entry->pool->zpool, entry->handle);
 310        zswap_pool_put(entry->pool);
 311        zswap_entry_cache_free(entry);
 312        atomic_dec(&zswap_stored_pages);
 313        zswap_update_total_size();
 314}
 315
 316/* caller must hold the tree lock */
 317static void zswap_entry_get(struct zswap_entry *entry)
 318{
 319        entry->refcount++;
 320}
 321
/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
 325static void zswap_entry_put(struct zswap_tree *tree,
 326                        struct zswap_entry *entry)
 327{
 328        int refcount = --entry->refcount;
 329
 330        BUG_ON(refcount < 0);
 331        if (refcount == 0) {
 332                zswap_rb_erase(&tree->rbroot, entry);
 333                zswap_free_entry(entry);
 334        }
 335}
 336
 337/* caller must hold the tree lock */
 338static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 339                                pgoff_t offset)
 340{
 341        struct zswap_entry *entry;
 342
 343        entry = zswap_rb_search(root, offset);
 344        if (entry)
 345                zswap_entry_get(entry);
 346
 347        return entry;
 348}
 349
 350/*********************************
 351* per-cpu code
 352**********************************/
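/*
 * Each CPU gets a two-page scratch buffer for compression output: a page
 * that compresses poorly can expand beyond PAGE_SIZE, so a single page
 * would not cover the worst case.
 */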
 353static DEFINE_PER_CPU(u8 *, zswap_dstmem);
 354
 355static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
 356{
 357        u8 *dst;
 358
 359        switch (action) {
 360        case CPU_UP_PREPARE:
 361                dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
 362                if (!dst) {
 363                        pr_err("can't allocate compressor buffer\n");
 364                        return NOTIFY_BAD;
 365                }
 366                per_cpu(zswap_dstmem, cpu) = dst;
 367                break;
 368        case CPU_DEAD:
 369        case CPU_UP_CANCELED:
 370                dst = per_cpu(zswap_dstmem, cpu);
 371                kfree(dst);
 372                per_cpu(zswap_dstmem, cpu) = NULL;
 373                break;
 374        default:
 375                break;
 376        }
 377        return NOTIFY_OK;
 378}
 379
 380static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
 381                                     unsigned long action, void *pcpu)
 382{
 383        return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
 384}
 385
 386static struct notifier_block zswap_dstmem_notifier = {
 387        .notifier_call =        zswap_cpu_dstmem_notifier,
 388};
 389
 390static int __init zswap_cpu_dstmem_init(void)
 391{
 392        unsigned long cpu;
 393
 394        cpu_notifier_register_begin();
 395        for_each_online_cpu(cpu)
 396                if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
 397                    NOTIFY_BAD)
 398                        goto cleanup;
 399        __register_cpu_notifier(&zswap_dstmem_notifier);
 400        cpu_notifier_register_done();
 401        return 0;
 402
 403cleanup:
 404        for_each_online_cpu(cpu)
 405                __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
 406        cpu_notifier_register_done();
 407        return -ENOMEM;
 408}
 409
 410static void zswap_cpu_dstmem_destroy(void)
 411{
 412        unsigned long cpu;
 413
 414        cpu_notifier_register_begin();
 415        for_each_online_cpu(cpu)
 416                __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
 417        __unregister_cpu_notifier(&zswap_dstmem_notifier);
 418        cpu_notifier_register_done();
 419}
 420
 421static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
 422                                     unsigned long action, unsigned long cpu)
 423{
 424        struct crypto_comp *tfm;
 425
 426        switch (action) {
 427        case CPU_UP_PREPARE:
 428                if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
 429                        break;
 430                tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
 431                if (IS_ERR_OR_NULL(tfm)) {
 432                        pr_err("could not alloc crypto comp %s : %ld\n",
 433                               pool->tfm_name, PTR_ERR(tfm));
 434                        return NOTIFY_BAD;
 435                }
 436                *per_cpu_ptr(pool->tfm, cpu) = tfm;
 437                break;
 438        case CPU_DEAD:
 439        case CPU_UP_CANCELED:
 440                tfm = *per_cpu_ptr(pool->tfm, cpu);
 441                if (!IS_ERR_OR_NULL(tfm))
 442                        crypto_free_comp(tfm);
 443                *per_cpu_ptr(pool->tfm, cpu) = NULL;
 444                break;
 445        default:
 446                break;
 447        }
 448        return NOTIFY_OK;
 449}
 450
 451static int zswap_cpu_comp_notifier(struct notifier_block *nb,
 452                                   unsigned long action, void *pcpu)
 453{
 454        unsigned long cpu = (unsigned long)pcpu;
 455        struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
 456
 457        return __zswap_cpu_comp_notifier(pool, action, cpu);
 458}
 459
 460static int zswap_cpu_comp_init(struct zswap_pool *pool)
 461{
 462        unsigned long cpu;
 463
 464        memset(&pool->notifier, 0, sizeof(pool->notifier));
 465        pool->notifier.notifier_call = zswap_cpu_comp_notifier;
 466
 467        cpu_notifier_register_begin();
 468        for_each_online_cpu(cpu)
 469                if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
 470                    NOTIFY_BAD)
 471                        goto cleanup;
 472        __register_cpu_notifier(&pool->notifier);
 473        cpu_notifier_register_done();
 474        return 0;
 475
 476cleanup:
 477        for_each_online_cpu(cpu)
 478                __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
 479        cpu_notifier_register_done();
 480        return -ENOMEM;
 481}
 482
 483static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
 484{
 485        unsigned long cpu;
 486
 487        cpu_notifier_register_begin();
 488        for_each_online_cpu(cpu)
 489                __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
 490        __unregister_cpu_notifier(&pool->notifier);
 491        cpu_notifier_register_done();
 492}
 493
 494/*********************************
 495* pool functions
 496**********************************/
 497
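/*
 * Pool lifetime, as implemented below: zswap_pools is an RCU list whose
 * first entry is the current pool.  zswap_pool_current() requires
 * zswap_pools_lock to be held, while zswap_pool_current_get() only takes
 * the RCU read lock plus a kref so the pool can be used after the
 * critical section.  When the last reference is dropped,
 * __zswap_pool_empty() unlinks the pool and __zswap_pool_release()
 * destroys it after an RCU grace period.
 */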
 498static struct zswap_pool *__zswap_pool_current(void)
 499{
 500        struct zswap_pool *pool;
 501
 502        pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
 503        WARN_ON(!pool);
 504
 505        return pool;
 506}
 507
 508static struct zswap_pool *zswap_pool_current(void)
 509{
 510        assert_spin_locked(&zswap_pools_lock);
 511
 512        return __zswap_pool_current();
 513}
 514
 515static struct zswap_pool *zswap_pool_current_get(void)
 516{
 517        struct zswap_pool *pool;
 518
 519        rcu_read_lock();
 520
 521        pool = __zswap_pool_current();
 522        if (!pool || !zswap_pool_get(pool))
 523                pool = NULL;
 524
 525        rcu_read_unlock();
 526
 527        return pool;
 528}
 529
 530static struct zswap_pool *zswap_pool_last_get(void)
 531{
 532        struct zswap_pool *pool, *last = NULL;
 533
 534        rcu_read_lock();
 535
 536        list_for_each_entry_rcu(pool, &zswap_pools, list)
 537                last = pool;
 538        if (!WARN_ON(!last) && !zswap_pool_get(last))
 539                last = NULL;
 540
 541        rcu_read_unlock();
 542
 543        return last;
 544}
 545
 546/* type and compressor must be null-terminated */
 547static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
 548{
 549        struct zswap_pool *pool;
 550
 551        assert_spin_locked(&zswap_pools_lock);
 552
 553        list_for_each_entry_rcu(pool, &zswap_pools, list) {
 554                if (strcmp(pool->tfm_name, compressor))
 555                        continue;
 556                if (strcmp(zpool_get_type(pool->zpool), type))
 557                        continue;
 558                /* if we can't get it, it's about to be destroyed */
 559                if (!zswap_pool_get(pool))
 560                        continue;
 561                return pool;
 562        }
 563
 564        return NULL;
 565}
 566
 567static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
 568{
 569        struct zswap_pool *pool;
 570        char name[38]; /* 'zswap' + 32 char (max) num + \0 */
 571        gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
 572
 573        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
 574        if (!pool) {
 575                pr_err("pool alloc failed\n");
 576                return NULL;
 577        }
 578
 579        /* unique name for each pool specifically required by zsmalloc */
 580        snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
 581
 582        pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
 583        if (!pool->zpool) {
 584                pr_err("%s zpool not available\n", type);
 585                goto error;
 586        }
 587        pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
 588
 589        strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
 590        pool->tfm = alloc_percpu(struct crypto_comp *);
 591        if (!pool->tfm) {
 592                pr_err("percpu alloc failed\n");
 593                goto error;
 594        }
 595
 596        if (zswap_cpu_comp_init(pool))
 597                goto error;
 598        pr_debug("using %s compressor\n", pool->tfm_name);
 599
 600        /* being the current pool takes 1 ref; this func expects the
 601         * caller to always add the new pool as the current pool
 602         */
 603        kref_init(&pool->kref);
 604        INIT_LIST_HEAD(&pool->list);
 605
 606        zswap_pool_debug("created", pool);
 607
 608        return pool;
 609
 610error:
 611        free_percpu(pool->tfm);
 612        if (pool->zpool)
 613                zpool_destroy_pool(pool->zpool);
 614        kfree(pool);
 615        return NULL;
 616}
 617
 618static __init struct zswap_pool *__zswap_pool_create_fallback(void)
 619{
 620        if (!crypto_has_comp(zswap_compressor, 0, 0)) {
 621                if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
 622                        pr_err("default compressor %s not available\n",
 623                               zswap_compressor);
 624                        return NULL;
 625                }
 626                pr_err("compressor %s not available, using default %s\n",
 627                       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
 628                param_free_charp(&zswap_compressor);
 629                zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
 630        }
 631        if (!zpool_has_pool(zswap_zpool_type)) {
 632                if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
 633                        pr_err("default zpool %s not available\n",
 634                               zswap_zpool_type);
 635                        return NULL;
 636                }
 637                pr_err("zpool %s not available, using default %s\n",
 638                       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
 639                param_free_charp(&zswap_zpool_type);
 640                zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
 641        }
 642
 643        return zswap_pool_create(zswap_zpool_type, zswap_compressor);
 644}
 645
 646static void zswap_pool_destroy(struct zswap_pool *pool)
 647{
 648        zswap_pool_debug("destroying", pool);
 649
 650        zswap_cpu_comp_destroy(pool);
 651        free_percpu(pool->tfm);
 652        zpool_destroy_pool(pool->zpool);
 653        kfree(pool);
 654}
 655
 656static int __must_check zswap_pool_get(struct zswap_pool *pool)
 657{
 658        return kref_get_unless_zero(&pool->kref);
 659}
 660
 661static void __zswap_pool_release(struct work_struct *work)
 662{
 663        struct zswap_pool *pool = container_of(work, typeof(*pool), work);
 664
 665        synchronize_rcu();
 666
 667        /* nobody should have been able to get a kref... */
 668        WARN_ON(kref_get_unless_zero(&pool->kref));
 669
 670        /* pool is now off zswap_pools list and has no references. */
 671        zswap_pool_destroy(pool);
 672}
 673
 674static void __zswap_pool_empty(struct kref *kref)
 675{
 676        struct zswap_pool *pool;
 677
 678        pool = container_of(kref, typeof(*pool), kref);
 679
 680        spin_lock(&zswap_pools_lock);
 681
 682        WARN_ON(pool == zswap_pool_current());
 683
 684        list_del_rcu(&pool->list);
 685
 686        INIT_WORK(&pool->work, __zswap_pool_release);
 687        schedule_work(&pool->work);
 688
 689        spin_unlock(&zswap_pools_lock);
 690}
 691
 692static void zswap_pool_put(struct zswap_pool *pool)
 693{
 694        kref_put(&pool->kref, __zswap_pool_empty);
 695}
 696
 697/*********************************
 698* param callbacks
 699**********************************/
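/*
 * Note on runtime parameter changes: writing a new compressor or zpool
 * adds a new pool at the head of zswap_pools (making it the current pool)
 * and drops the previous pool's "current" reference.  Entries that were
 * already stored keep their own reference to the pool they were compressed
 * with, so loads and writebacks keep working; the old pool is destroyed
 * only once its last reference is dropped, typically when its last stored
 * entry is freed.
 */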
 700
 701/* val must be a null-terminated string */
 702static int __zswap_param_set(const char *val, const struct kernel_param *kp,
 703                             char *type, char *compressor)
 704{
 705        struct zswap_pool *pool, *put_pool = NULL;
 706        char *s = strstrip((char *)val);
 707        int ret;
 708
 709        /* no change required */
 710        if (!strcmp(s, *(char **)kp->arg))
 711                return 0;
 712
 713        /* if this is load-time (pre-init) param setting,
 714         * don't create a pool; that's done during init.
 715         */
 716        if (!zswap_init_started)
 717                return param_set_charp(s, kp);
 718
 719        if (!type) {
 720                if (!zpool_has_pool(s)) {
 721                        pr_err("zpool %s not available\n", s);
 722                        return -ENOENT;
 723                }
 724                type = s;
 725        } else if (!compressor) {
 726                if (!crypto_has_comp(s, 0, 0)) {
 727                        pr_err("compressor %s not available\n", s);
 728                        return -ENOENT;
 729                }
 730                compressor = s;
 731        } else {
 732                WARN_ON(1);
 733                return -EINVAL;
 734        }
 735
 736        spin_lock(&zswap_pools_lock);
 737
 738        pool = zswap_pool_find_get(type, compressor);
 739        if (pool) {
 740                zswap_pool_debug("using existing", pool);
 741                list_del_rcu(&pool->list);
 742        } else {
 743                spin_unlock(&zswap_pools_lock);
 744                pool = zswap_pool_create(type, compressor);
 745                spin_lock(&zswap_pools_lock);
 746        }
 747
 748        if (pool)
 749                ret = param_set_charp(s, kp);
 750        else
 751                ret = -EINVAL;
 752
 753        if (!ret) {
 754                put_pool = zswap_pool_current();
 755                list_add_rcu(&pool->list, &zswap_pools);
 756        } else if (pool) {
 757                /* add the possibly pre-existing pool to the end of the pools
 758                 * list; if it's new (and empty) then it'll be removed and
 759                 * destroyed by the put after we drop the lock
 760                 */
 761                list_add_tail_rcu(&pool->list, &zswap_pools);
 762                put_pool = pool;
 763        }
 764
 765        spin_unlock(&zswap_pools_lock);
 766
 767        /* drop the ref from either the old current pool,
 768         * or the new pool we failed to add
 769         */
 770        if (put_pool)
 771                zswap_pool_put(put_pool);
 772
 773        return ret;
 774}
 775
 776static int zswap_compressor_param_set(const char *val,
 777                                      const struct kernel_param *kp)
 778{
 779        return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
 780}
 781
 782static int zswap_zpool_param_set(const char *val,
 783                                 const struct kernel_param *kp)
 784{
 785        return __zswap_param_set(val, kp, NULL, zswap_compressor);
 786}
 787
 788/*********************************
 789* writeback code
 790**********************************/
 791/* return enum for zswap_get_swap_cache_page */
 792enum zswap_get_swap_ret {
 793        ZSWAP_SWAPCACHE_NEW,
 794        ZSWAP_SWAPCACHE_EXIST,
 795        ZSWAP_SWAPCACHE_FAIL,
 796};
 797
/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
 814static int zswap_get_swap_cache_page(swp_entry_t entry,
 815                                struct page **retpage)
 816{
 817        bool page_was_allocated;
 818
 819        *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
 820                        NULL, 0, &page_was_allocated);
 821        if (page_was_allocated)
 822                return ZSWAP_SWAPCACHE_NEW;
 823        if (!*retpage)
 824                return ZSWAP_SWAPCACHE_FAIL;
 825        return ZSWAP_SWAPCACHE_EXIST;
 826}
 827
 828/*
 829 * Attempts to free an entry by adding a page to the swap cache,
 830 * decompressing the entry data into the page, and issuing a
 831 * bio write to write the page back to the swap device.
 832 *
 833 * This can be thought of as a "resumed writeback" of the page
 834 * to the swap device.  We are basically resuming the same swap
 835 * writeback path that was intercepted with the frontswap_store()
 836 * in the first place.  After the page has been decompressed into
 837 * the swap cache, the compressed version stored by zswap can be
 838 * freed.
 839 */
 840static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 841{
 842        struct zswap_header *zhdr;
 843        swp_entry_t swpentry;
 844        struct zswap_tree *tree;
 845        pgoff_t offset;
 846        struct zswap_entry *entry;
 847        struct page *page;
 848        struct crypto_comp *tfm;
 849        u8 *src, *dst;
 850        unsigned int dlen;
 851        int ret;
 852        struct writeback_control wbc = {
 853                .sync_mode = WB_SYNC_NONE,
 854        };
 855
 856        /* extract swpentry from data */
 857        zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 858        swpentry = zhdr->swpentry; /* here */
 859        zpool_unmap_handle(pool, handle);
 860        tree = zswap_trees[swp_type(swpentry)];
 861        offset = swp_offset(swpentry);
 862
 863        /* find and ref zswap entry */
 864        spin_lock(&tree->lock);
 865        entry = zswap_entry_find_get(&tree->rbroot, offset);
 866        if (!entry) {
 867                /* entry was invalidated */
 868                spin_unlock(&tree->lock);
 869                return 0;
 870        }
 871        spin_unlock(&tree->lock);
 872        BUG_ON(offset != entry->offset);
 873
 874        /* try to allocate swap cache page */
 875        switch (zswap_get_swap_cache_page(swpentry, &page)) {
 876        case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
 877                ret = -ENOMEM;
 878                goto fail;
 879
 880        case ZSWAP_SWAPCACHE_EXIST:
 881                /* page is already in the swap cache, ignore for now */
 882                put_page(page);
 883                ret = -EEXIST;
 884                goto fail;
 885
 886        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 887                /* decompress */
 888                dlen = PAGE_SIZE;
 889                src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
 890                                ZPOOL_MM_RO) + sizeof(struct zswap_header);
 891                dst = kmap_atomic(page);
 892                tfm = *get_cpu_ptr(entry->pool->tfm);
 893                ret = crypto_comp_decompress(tfm, src, entry->length,
 894                                             dst, &dlen);
 895                put_cpu_ptr(entry->pool->tfm);
 896                kunmap_atomic(dst);
 897                zpool_unmap_handle(entry->pool->zpool, entry->handle);
 898                BUG_ON(ret);
 899                BUG_ON(dlen != PAGE_SIZE);
 900
 901                /* page is up to date */
 902                SetPageUptodate(page);
 903        }
 904
 905        /* move it to the tail of the inactive list after end_writeback */
 906        SetPageReclaim(page);
 907
 908        /* start writeback */
 909        __swap_writepage(page, &wbc, end_swap_bio_write);
 910        put_page(page);
 911        zswap_written_back_pages++;
 912
 913        spin_lock(&tree->lock);
 914        /* drop local reference */
 915        zswap_entry_put(tree, entry);
 916
        /*
         * There are two possible situations for the entry here:
         * (1) refcount is 1 (normal case), entry is valid and on the tree
         * (2) refcount is 0, entry is freed and not on the tree
         *     because invalidate happened during writeback
         * Search the tree and drop the entry's base reference (freeing it)
         * only if it is still present.
         */
 924        if (entry == zswap_rb_search(&tree->rbroot, offset))
 925                zswap_entry_put(tree, entry);
 926        spin_unlock(&tree->lock);
 927
 928        goto end;
 929
        /*
         * If we get here because of ZSWAP_SWAPCACHE_EXIST, a load may be
         * happening concurrently.  It is safe and okay not to free the
         * entry here; it is also okay to return !0 even though the
         * following put may free it.
         */
 937fail:
 938        spin_lock(&tree->lock);
 939        zswap_entry_put(tree, entry);
 940        spin_unlock(&tree->lock);
 941
 942end:
 943        return ret;
 944}
 945
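/*
 * Ask the oldest pool on the list to reclaim one compressed page.  The
 * zpool does this via the ->evict hook registered in zswap_zpool_ops,
 * i.e. zswap_writeback_entry(), which decompresses the page and writes it
 * back to the swap device.
 */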
 946static int zswap_shrink(void)
 947{
 948        struct zswap_pool *pool;
 949        int ret;
 950
 951        pool = zswap_pool_last_get();
 952        if (!pool)
 953                return -ENOENT;
 954
 955        ret = zpool_shrink(pool->zpool, 1, NULL);
 956
 957        zswap_pool_put(pool);
 958
 959        return ret;
 960}
 961
 962/*********************************
 963* frontswap hooks
 964**********************************/
/* attempts to compress and store a single page */
 966static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 967                                struct page *page)
 968{
 969        struct zswap_tree *tree = zswap_trees[type];
 970        struct zswap_entry *entry, *dupentry;
 971        struct crypto_comp *tfm;
 972        int ret;
 973        unsigned int dlen = PAGE_SIZE, len;
 974        unsigned long handle;
 975        char *buf;
 976        u8 *src, *dst;
 977        struct zswap_header *zhdr;
 978
 979        if (!zswap_enabled || !tree) {
 980                ret = -ENODEV;
 981                goto reject;
 982        }
 983
 984        /* reclaim space if needed */
 985        if (zswap_is_full()) {
 986                zswap_pool_limit_hit++;
 987                if (zswap_shrink()) {
 988                        zswap_reject_reclaim_fail++;
 989                        ret = -ENOMEM;
 990                        goto reject;
 991                }
 992        }
 993
 994        /* allocate entry */
 995        entry = zswap_entry_cache_alloc(GFP_KERNEL);
 996        if (!entry) {
 997                zswap_reject_kmemcache_fail++;
 998                ret = -ENOMEM;
 999                goto reject;
1000        }
1001
1002        /* if entry is successfully added, it keeps the reference */
1003        entry->pool = zswap_pool_current_get();
1004        if (!entry->pool) {
1005                ret = -EINVAL;
1006                goto freepage;
1007        }
1008
1009        /* compress */
1010        dst = get_cpu_var(zswap_dstmem);
1011        tfm = *get_cpu_ptr(entry->pool->tfm);
1012        src = kmap_atomic(page);
1013        ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
1014        kunmap_atomic(src);
1015        put_cpu_ptr(entry->pool->tfm);
1016        if (ret) {
1017                ret = -EINVAL;
1018                goto put_dstmem;
1019        }
1020
1021        /* store */
1022        len = dlen + sizeof(struct zswap_header);
1023        ret = zpool_malloc(entry->pool->zpool, len,
1024                           __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
1025                           &handle);
1026        if (ret == -ENOSPC) {
1027                zswap_reject_compress_poor++;
1028                goto put_dstmem;
1029        }
1030        if (ret) {
1031                zswap_reject_alloc_fail++;
1032                goto put_dstmem;
1033        }
1034        zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
1035        zhdr->swpentry = swp_entry(type, offset);
1036        buf = (u8 *)(zhdr + 1);
1037        memcpy(buf, dst, dlen);
1038        zpool_unmap_handle(entry->pool->zpool, handle);
1039        put_cpu_var(zswap_dstmem);
1040
1041        /* populate entry */
1042        entry->offset = offset;
1043        entry->handle = handle;
1044        entry->length = dlen;
1045
1046        /* map */
1047        spin_lock(&tree->lock);
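        /*
         * If an entry already exists for this offset, remove it from the
         * tree and drop its base reference before retrying the insert, so
         * that the entry holding the new data replaces it.
         */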
1048        do {
1049                ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
1050                if (ret == -EEXIST) {
1051                        zswap_duplicate_entry++;
1052                        /* remove from rbtree */
1053                        zswap_rb_erase(&tree->rbroot, dupentry);
1054                        zswap_entry_put(tree, dupentry);
1055                }
1056        } while (ret == -EEXIST);
1057        spin_unlock(&tree->lock);
1058
1059        /* update stats */
1060        atomic_inc(&zswap_stored_pages);
1061        zswap_update_total_size();
1062
1063        return 0;
1064
1065put_dstmem:
1066        put_cpu_var(zswap_dstmem);
1067        zswap_pool_put(entry->pool);
1068freepage:
1069        zswap_entry_cache_free(entry);
1070reject:
1071        return ret;
1072}
1073
/*
 * returns 0 if the page was successfully decompressed
 * returns -1 on entry not found or error
 */
1078static int zswap_frontswap_load(unsigned type, pgoff_t offset,
1079                                struct page *page)
1080{
1081        struct zswap_tree *tree = zswap_trees[type];
1082        struct zswap_entry *entry;
1083        struct crypto_comp *tfm;
1084        u8 *src, *dst;
1085        unsigned int dlen;
1086        int ret;
1087
1088        /* find */
1089        spin_lock(&tree->lock);
1090        entry = zswap_entry_find_get(&tree->rbroot, offset);
1091        if (!entry) {
1092                /* entry was written back */
1093                spin_unlock(&tree->lock);
1094                return -1;
1095        }
1096        spin_unlock(&tree->lock);
1097
1098        /* decompress */
1099        dlen = PAGE_SIZE;
1100        src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
1101                        ZPOOL_MM_RO) + sizeof(struct zswap_header);
1102        dst = kmap_atomic(page);
1103        tfm = *get_cpu_ptr(entry->pool->tfm);
1104        ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
1105        put_cpu_ptr(entry->pool->tfm);
1106        kunmap_atomic(dst);
1107        zpool_unmap_handle(entry->pool->zpool, entry->handle);
1108        BUG_ON(ret);
1109
1110        spin_lock(&tree->lock);
1111        zswap_entry_put(tree, entry);
1112        spin_unlock(&tree->lock);
1113
1114        return 0;
1115}
1116
1117/* frees an entry in zswap */
1118static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
1119{
1120        struct zswap_tree *tree = zswap_trees[type];
1121        struct zswap_entry *entry;
1122
1123        /* find */
1124        spin_lock(&tree->lock);
1125        entry = zswap_rb_search(&tree->rbroot, offset);
1126        if (!entry) {
1127                /* entry was written back */
1128                spin_unlock(&tree->lock);
1129                return;
1130        }
1131
1132        /* remove from rbtree */
1133        zswap_rb_erase(&tree->rbroot, entry);
1134
1135        /* drop the initial reference from entry creation */
1136        zswap_entry_put(tree, entry);
1137
1138        spin_unlock(&tree->lock);
1139}
1140
1141/* frees all zswap entries for the given swap type */
1142static void zswap_frontswap_invalidate_area(unsigned type)
1143{
1144        struct zswap_tree *tree = zswap_trees[type];
1145        struct zswap_entry *entry, *n;
1146
1147        if (!tree)
1148                return;
1149
1150        /* walk the tree and free everything */
1151        spin_lock(&tree->lock);
1152        rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
1153                zswap_free_entry(entry);
1154        tree->rbroot = RB_ROOT;
1155        spin_unlock(&tree->lock);
1156        kfree(tree);
1157        zswap_trees[type] = NULL;
1158}
1159
1160static void zswap_frontswap_init(unsigned type)
1161{
1162        struct zswap_tree *tree;
1163
1164        tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
1165        if (!tree) {
1166                pr_err("alloc failed, zswap disabled for swap type %d\n", type);
1167                return;
1168        }
1169
1170        tree->rbroot = RB_ROOT;
1171        spin_lock_init(&tree->lock);
1172        zswap_trees[type] = tree;
1173}
1174
1175static struct frontswap_ops zswap_frontswap_ops = {
1176        .store = zswap_frontswap_store,
1177        .load = zswap_frontswap_load,
1178        .invalidate_page = zswap_frontswap_invalidate_page,
1179        .invalidate_area = zswap_frontswap_invalidate_area,
1180        .init = zswap_frontswap_init
1181};
1182
1183/*********************************
1184* debugfs functions
1185**********************************/
1186#ifdef CONFIG_DEBUG_FS
1187#include <linux/debugfs.h>
1188
1189static struct dentry *zswap_debugfs_root;
1190
1191static int __init zswap_debugfs_init(void)
1192{
1193        if (!debugfs_initialized())
1194                return -ENODEV;
1195
1196        zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
1197        if (!zswap_debugfs_root)
1198                return -ENOMEM;
1199
1200        debugfs_create_u64("pool_limit_hit", S_IRUGO,
1201                        zswap_debugfs_root, &zswap_pool_limit_hit);
1202        debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
1203                        zswap_debugfs_root, &zswap_reject_reclaim_fail);
1204        debugfs_create_u64("reject_alloc_fail", S_IRUGO,
1205                        zswap_debugfs_root, &zswap_reject_alloc_fail);
1206        debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
1207                        zswap_debugfs_root, &zswap_reject_kmemcache_fail);
1208        debugfs_create_u64("reject_compress_poor", S_IRUGO,
1209                        zswap_debugfs_root, &zswap_reject_compress_poor);
1210        debugfs_create_u64("written_back_pages", S_IRUGO,
1211                        zswap_debugfs_root, &zswap_written_back_pages);
1212        debugfs_create_u64("duplicate_entry", S_IRUGO,
1213                        zswap_debugfs_root, &zswap_duplicate_entry);
1214        debugfs_create_u64("pool_total_size", S_IRUGO,
1215                        zswap_debugfs_root, &zswap_pool_total_size);
1216        debugfs_create_atomic_t("stored_pages", S_IRUGO,
1217                        zswap_debugfs_root, &zswap_stored_pages);
1218
1219        return 0;
1220}
1221
1222static void __exit zswap_debugfs_exit(void)
1223{
1224        debugfs_remove_recursive(zswap_debugfs_root);
1225}
1226#else
1227static int __init zswap_debugfs_init(void)
1228{
1229        return 0;
1230}
1231
1232static void __exit zswap_debugfs_exit(void) { }
1233#endif
1234
1235/*********************************
1236* module init and exit
1237**********************************/
1238static int __init init_zswap(void)
1239{
1240        struct zswap_pool *pool;
1241
1242        zswap_init_started = true;
1243
1244        if (zswap_entry_cache_create()) {
1245                pr_err("entry cache creation failed\n");
1246                goto cache_fail;
1247        }
1248
1249        if (zswap_cpu_dstmem_init()) {
1250                pr_err("dstmem alloc failed\n");
1251                goto dstmem_fail;
1252        }
1253
1254        pool = __zswap_pool_create_fallback();
1255        if (!pool) {
1256                pr_err("pool creation failed\n");
1257                goto pool_fail;
1258        }
1259        pr_info("loaded using pool %s/%s\n", pool->tfm_name,
1260                zpool_get_type(pool->zpool));
1261
1262        list_add(&pool->list, &zswap_pools);
1263
1264        frontswap_register_ops(&zswap_frontswap_ops);
1265        if (zswap_debugfs_init())
1266                pr_warn("debugfs initialization failed\n");
1267        return 0;
1268
1269pool_fail:
1270        zswap_cpu_dstmem_destroy();
1271dstmem_fail:
1272        zswap_entry_cache_destroy();
1273cache_fail:
1274        return -ENOMEM;
1275}
1276/* must be late so crypto has time to come up */
1277late_initcall(init_zswap);
1278
1279MODULE_LICENSE("GPL");
1280MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
1281MODULE_DESCRIPTION("Compressed cache for swap pages");
1282