linux/mm/zswap.c
<<
>>
Prefs
   1/*
   2 * zswap.c - zswap driver file
   3 *
   4 * zswap is a backend for frontswap that takes pages that are in the process
   5 * of being swapped out and attempts to compress and store them in a
   6 * RAM-based memory pool.  This can result in a significant I/O reduction on
   7 * the swap device and, in the case where decompressing from RAM is faster
   8 * than reading from the swap device, can also improve workload performance.
   9 *
  10 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
  11 *
  12 * This program is free software; you can redistribute it and/or
  13 * modify it under the terms of the GNU General Public License
  14 * as published by the Free Software Foundation; either version 2
  15 * of the License, or (at your option) any later version.
  16 *
  17 * This program is distributed in the hope that it will be useful,
  18 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 * GNU General Public License for more details.
  21*/
  22
  23#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  24
  25#include <linux/module.h>
  26#include <linux/cpu.h>
  27#include <linux/highmem.h>
  28#include <linux/slab.h>
  29#include <linux/spinlock.h>
  30#include <linux/types.h>
  31#include <linux/atomic.h>
  32#include <linux/frontswap.h>
  33#include <linux/rbtree.h>
  34#include <linux/swap.h>
  35#include <linux/crypto.h>
  36#include <linux/mempool.h>
  37#include <linux/zpool.h>
  38
  39#include <linux/mm_types.h>
  40#include <linux/page-flags.h>
  41#include <linux/swapops.h>
  42#include <linux/writeback.h>
  43#include <linux/pagemap.h>
  44
  45/*********************************
  46* statistics
  47**********************************/
  48/* Total bytes used by the compressed storage */
  49static u64 zswap_pool_total_size;
  50/* The number of compressed pages currently stored in zswap */
  51static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
  52
  53/*
  54 * The statistics below are not protected from concurrent access for
  55 * performance reasons so they may not be a 100% accurate.  However,
  56 * they do provide useful information on roughly how many times a
  57 * certain event is occurring.
  58*/
  59
  60/* Pool limit was hit (see zswap_max_pool_percent) */
  61static u64 zswap_pool_limit_hit;
  62/* Pages written back when pool limit was reached */
  63static u64 zswap_written_back_pages;
  64/* Store failed due to a reclaim failure after pool limit was reached */
  65static u64 zswap_reject_reclaim_fail;
  66/* Compressed page was too big for the allocator to (optimally) store */
  67static u64 zswap_reject_compress_poor;
  68/* Store failed because underlying allocator could not get memory */
  69static u64 zswap_reject_alloc_fail;
  70/* Store failed because the entry metadata could not be allocated (rare) */
  71static u64 zswap_reject_kmemcache_fail;
  72/* Duplicate store was encountered (rare) */
  73static u64 zswap_duplicate_entry;
  74
  75/*********************************
  76* tunables
  77**********************************/
  78/* Enable/disable zswap (disabled by default, fixed at boot for now) */
  79static bool zswap_enabled __read_mostly;
  80module_param_named(enabled, zswap_enabled, bool, 0444);
  81
  82/* Compressor to be used by zswap (fixed at boot for now) */
  83#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
  84static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  85module_param_named(compressor, zswap_compressor, charp, 0444);
  86
  87/* The maximum percentage of memory that the compressed pool can occupy */
  88static unsigned int zswap_max_pool_percent = 20;
  89module_param_named(max_pool_percent,
  90                        zswap_max_pool_percent, uint, 0644);
  91
  92/* Compressed storage to use */
  93#define ZSWAP_ZPOOL_DEFAULT "zbud"
  94static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  95module_param_named(zpool, zswap_zpool_type, charp, 0444);
  96
  97/* zpool is shared by all of zswap backend  */
  98static struct zpool *zswap_pool;
  99
 100/*********************************
 101* compression functions
 102**********************************/
 103/* per-cpu compression transforms */
 104static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms;
 105
 106enum comp_op {
 107        ZSWAP_COMPOP_COMPRESS,
 108        ZSWAP_COMPOP_DECOMPRESS
 109};
 110
 111static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen,
 112                                u8 *dst, unsigned int *dlen)
 113{
 114        struct crypto_comp *tfm;
 115        int ret;
 116
 117        tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu());
 118        switch (op) {
 119        case ZSWAP_COMPOP_COMPRESS:
 120                ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
 121                break;
 122        case ZSWAP_COMPOP_DECOMPRESS:
 123                ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
 124                break;
 125        default:
 126                ret = -EINVAL;
 127        }
 128
 129        put_cpu();
 130        return ret;
 131}
 132
 133static int __init zswap_comp_init(void)
 134{
 135        if (!crypto_has_comp(zswap_compressor, 0, 0)) {
 136                pr_info("%s compressor not available\n", zswap_compressor);
 137                /* fall back to default compressor */
 138                zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
 139                if (!crypto_has_comp(zswap_compressor, 0, 0))
 140                        /* can't even load the default compressor */
 141                        return -ENODEV;
 142        }
 143        pr_info("using %s compressor\n", zswap_compressor);
 144
 145        /* alloc percpu transforms */
 146        zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
 147        if (!zswap_comp_pcpu_tfms)
 148                return -ENOMEM;
 149        return 0;
 150}
 151
 152static void zswap_comp_exit(void)
 153{
 154        /* free percpu transforms */
 155        if (zswap_comp_pcpu_tfms)
 156                free_percpu(zswap_comp_pcpu_tfms);
 157}
 158
 159/*********************************
 160* data structures
 161**********************************/
 162/*
 163 * struct zswap_entry
 164 *
 165 * This structure contains the metadata for tracking a single compressed
 166 * page within zswap.
 167 *
 168 * rbnode - links the entry into red-black tree for the appropriate swap type
 169 * refcount - the number of outstanding reference to the entry. This is needed
 170 *            to protect against premature freeing of the entry by code
 171 *            concurrent calls to load, invalidate, and writeback.  The lock
 172 *            for the zswap_tree structure that contains the entry must
 173 *            be held while changing the refcount.  Since the lock must
 174 *            be held, there is no reason to also make refcount atomic.
 175 * offset - the swap offset for the entry.  Index into the red-black tree.
 176 * handle - zpool allocation handle that stores the compressed page data
 177 * length - the length in bytes of the compressed page data.  Needed during
 178 *          decompression
 179 */
 180struct zswap_entry {
 181        struct rb_node rbnode;
 182        pgoff_t offset;
 183        int refcount;
 184        unsigned int length;
 185        unsigned long handle;
 186};
 187
 188struct zswap_header {
 189        swp_entry_t swpentry;
 190};
 191
 192/*
 193 * The tree lock in the zswap_tree struct protects a few things:
 194 * - the rbtree
 195 * - the refcount field of each entry in the tree
 196 */
 197struct zswap_tree {
 198        struct rb_root rbroot;
 199        spinlock_t lock;
 200};
 201
 202static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
 203
 204/*********************************
 205* zswap entry functions
 206**********************************/
 207static struct kmem_cache *zswap_entry_cache;
 208
 209static int zswap_entry_cache_create(void)
 210{
 211        zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
 212        return zswap_entry_cache == NULL;
 213}
 214
 215static void __init zswap_entry_cache_destroy(void)
 216{
 217        kmem_cache_destroy(zswap_entry_cache);
 218}
 219
 220static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 221{
 222        struct zswap_entry *entry;
 223        entry = kmem_cache_alloc(zswap_entry_cache, gfp);
 224        if (!entry)
 225                return NULL;
 226        entry->refcount = 1;
 227        RB_CLEAR_NODE(&entry->rbnode);
 228        return entry;
 229}
 230
 231static void zswap_entry_cache_free(struct zswap_entry *entry)
 232{
 233        kmem_cache_free(zswap_entry_cache, entry);
 234}
 235
 236/*********************************
 237* rbtree functions
 238**********************************/
 239static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
 240{
 241        struct rb_node *node = root->rb_node;
 242        struct zswap_entry *entry;
 243
 244        while (node) {
 245                entry = rb_entry(node, struct zswap_entry, rbnode);
 246                if (entry->offset > offset)
 247                        node = node->rb_left;
 248                else if (entry->offset < offset)
 249                        node = node->rb_right;
 250                else
 251                        return entry;
 252        }
 253        return NULL;
 254}
 255
 256/*
 257 * In the case that a entry with the same offset is found, a pointer to
 258 * the existing entry is stored in dupentry and the function returns -EEXIST
 259 */
 260static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
 261                        struct zswap_entry **dupentry)
 262{
 263        struct rb_node **link = &root->rb_node, *parent = NULL;
 264        struct zswap_entry *myentry;
 265
 266        while (*link) {
 267                parent = *link;
 268                myentry = rb_entry(parent, struct zswap_entry, rbnode);
 269                if (myentry->offset > entry->offset)
 270                        link = &(*link)->rb_left;
 271                else if (myentry->offset < entry->offset)
 272                        link = &(*link)->rb_right;
 273                else {
 274                        *dupentry = myentry;
 275                        return -EEXIST;
 276                }
 277        }
 278        rb_link_node(&entry->rbnode, parent, link);
 279        rb_insert_color(&entry->rbnode, root);
 280        return 0;
 281}
 282
 283static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
 284{
 285        if (!RB_EMPTY_NODE(&entry->rbnode)) {
 286                rb_erase(&entry->rbnode, root);
 287                RB_CLEAR_NODE(&entry->rbnode);
 288        }
 289}
 290
 291/*
 292 * Carries out the common pattern of freeing and entry's zpool allocation,
 293 * freeing the entry itself, and decrementing the number of stored pages.
 294 */
 295static void zswap_free_entry(struct zswap_entry *entry)
 296{
 297        zpool_free(zswap_pool, entry->handle);
 298        zswap_entry_cache_free(entry);
 299        atomic_dec(&zswap_stored_pages);
 300        zswap_pool_total_size = zpool_get_total_size(zswap_pool);
 301}
 302
 303/* caller must hold the tree lock */
 304static void zswap_entry_get(struct zswap_entry *entry)
 305{
 306        entry->refcount++;
 307}
 308
 309/* caller must hold the tree lock
 310* remove from the tree and free it, if nobody reference the entry
 311*/
 312static void zswap_entry_put(struct zswap_tree *tree,
 313                        struct zswap_entry *entry)
 314{
 315        int refcount = --entry->refcount;
 316
 317        BUG_ON(refcount < 0);
 318        if (refcount == 0) {
 319                zswap_rb_erase(&tree->rbroot, entry);
 320                zswap_free_entry(entry);
 321        }
 322}
 323
 324/* caller must hold the tree lock */
 325static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
 326                                pgoff_t offset)
 327{
 328        struct zswap_entry *entry = NULL;
 329
 330        entry = zswap_rb_search(root, offset);
 331        if (entry)
 332                zswap_entry_get(entry);
 333
 334        return entry;
 335}
 336
 337/*********************************
 338* per-cpu code
 339**********************************/
 340static DEFINE_PER_CPU(u8 *, zswap_dstmem);
 341
 342static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu)
 343{
 344        struct crypto_comp *tfm;
 345        u8 *dst;
 346
 347        switch (action) {
 348        case CPU_UP_PREPARE:
 349                tfm = crypto_alloc_comp(zswap_compressor, 0, 0);
 350                if (IS_ERR(tfm)) {
 351                        pr_err("can't allocate compressor transform\n");
 352                        return NOTIFY_BAD;
 353                }
 354                *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm;
 355                dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
 356                if (!dst) {
 357                        pr_err("can't allocate compressor buffer\n");
 358                        crypto_free_comp(tfm);
 359                        *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
 360                        return NOTIFY_BAD;
 361                }
 362                per_cpu(zswap_dstmem, cpu) = dst;
 363                break;
 364        case CPU_DEAD:
 365        case CPU_UP_CANCELED:
 366                tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu);
 367                if (tfm) {
 368                        crypto_free_comp(tfm);
 369                        *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL;
 370                }
 371                dst = per_cpu(zswap_dstmem, cpu);
 372                kfree(dst);
 373                per_cpu(zswap_dstmem, cpu) = NULL;
 374                break;
 375        default:
 376                break;
 377        }
 378        return NOTIFY_OK;
 379}
 380
 381static int zswap_cpu_notifier(struct notifier_block *nb,
 382                                unsigned long action, void *pcpu)
 383{
 384        unsigned long cpu = (unsigned long)pcpu;
 385        return __zswap_cpu_notifier(action, cpu);
 386}
 387
 388static struct notifier_block zswap_cpu_notifier_block = {
 389        .notifier_call = zswap_cpu_notifier
 390};
 391
 392static int zswap_cpu_init(void)
 393{
 394        unsigned long cpu;
 395
 396        cpu_notifier_register_begin();
 397        for_each_online_cpu(cpu)
 398                if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK)
 399                        goto cleanup;
 400        __register_cpu_notifier(&zswap_cpu_notifier_block);
 401        cpu_notifier_register_done();
 402        return 0;
 403
 404cleanup:
 405        for_each_online_cpu(cpu)
 406                __zswap_cpu_notifier(CPU_UP_CANCELED, cpu);
 407        cpu_notifier_register_done();
 408        return -ENOMEM;
 409}
 410
 411/*********************************
 412* helpers
 413**********************************/
 414static bool zswap_is_full(void)
 415{
 416        return totalram_pages * zswap_max_pool_percent / 100 <
 417                DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
 418}
 419
 420/*********************************
 421* writeback code
 422**********************************/
 423/* return enum for zswap_get_swap_cache_page */
 424enum zswap_get_swap_ret {
 425        ZSWAP_SWAPCACHE_NEW,
 426        ZSWAP_SWAPCACHE_EXIST,
 427        ZSWAP_SWAPCACHE_FAIL,
 428};
 429
 430/*
 431 * zswap_get_swap_cache_page
 432 *
 433 * This is an adaption of read_swap_cache_async()
 434 *
 435 * This function tries to find a page with the given swap entry
 436 * in the swapper_space address space (the swap cache).  If the page
 437 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 438 * added to the swap cache, and returned in retpage.
 439 *
 440 * If success, the swap cache page is returned in retpage
 441 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 442 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 443 *     the new page is added to swapcache and locked
 444 * Returns ZSWAP_SWAPCACHE_FAIL on error
 445 */
 446static int zswap_get_swap_cache_page(swp_entry_t entry,
 447                                struct page **retpage)
 448{
 449        bool page_was_allocated;
 450
 451        *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
 452                        NULL, 0, &page_was_allocated);
 453        if (page_was_allocated)
 454                return ZSWAP_SWAPCACHE_NEW;
 455        if (!*retpage)
 456                return ZSWAP_SWAPCACHE_FAIL;
 457        return ZSWAP_SWAPCACHE_EXIST;
 458}
 459
 460/*
 461 * Attempts to free an entry by adding a page to the swap cache,
 462 * decompressing the entry data into the page, and issuing a
 463 * bio write to write the page back to the swap device.
 464 *
 465 * This can be thought of as a "resumed writeback" of the page
 466 * to the swap device.  We are basically resuming the same swap
 467 * writeback path that was intercepted with the frontswap_store()
 468 * in the first place.  After the page has been decompressed into
 469 * the swap cache, the compressed version stored by zswap can be
 470 * freed.
 471 */
 472static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
 473{
 474        struct zswap_header *zhdr;
 475        swp_entry_t swpentry;
 476        struct zswap_tree *tree;
 477        pgoff_t offset;
 478        struct zswap_entry *entry;
 479        struct page *page;
 480        u8 *src, *dst;
 481        unsigned int dlen;
 482        int ret;
 483        struct writeback_control wbc = {
 484                .sync_mode = WB_SYNC_NONE,
 485        };
 486
 487        /* extract swpentry from data */
 488        zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
 489        swpentry = zhdr->swpentry; /* here */
 490        zpool_unmap_handle(pool, handle);
 491        tree = zswap_trees[swp_type(swpentry)];
 492        offset = swp_offset(swpentry);
 493
 494        /* find and ref zswap entry */
 495        spin_lock(&tree->lock);
 496        entry = zswap_entry_find_get(&tree->rbroot, offset);
 497        if (!entry) {
 498                /* entry was invalidated */
 499                spin_unlock(&tree->lock);
 500                return 0;
 501        }
 502        spin_unlock(&tree->lock);
 503        BUG_ON(offset != entry->offset);
 504
 505        /* try to allocate swap cache page */
 506        switch (zswap_get_swap_cache_page(swpentry, &page)) {
 507        case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
 508                ret = -ENOMEM;
 509                goto fail;
 510
 511        case ZSWAP_SWAPCACHE_EXIST:
 512                /* page is already in the swap cache, ignore for now */
 513                page_cache_release(page);
 514                ret = -EEXIST;
 515                goto fail;
 516
 517        case ZSWAP_SWAPCACHE_NEW: /* page is locked */
 518                /* decompress */
 519                dlen = PAGE_SIZE;
 520                src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
 521                                ZPOOL_MM_RO) + sizeof(struct zswap_header);
 522                dst = kmap_atomic(page);
 523                ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src,
 524                                entry->length, dst, &dlen);
 525                kunmap_atomic(dst);
 526                zpool_unmap_handle(zswap_pool, entry->handle);
 527                BUG_ON(ret);
 528                BUG_ON(dlen != PAGE_SIZE);
 529
 530                /* page is up to date */
 531                SetPageUptodate(page);
 532        }
 533
 534        /* move it to the tail of the inactive list after end_writeback */
 535        SetPageReclaim(page);
 536
 537        /* start writeback */
 538        __swap_writepage(page, &wbc, end_swap_bio_write);
 539        page_cache_release(page);
 540        zswap_written_back_pages++;
 541
 542        spin_lock(&tree->lock);
 543        /* drop local reference */
 544        zswap_entry_put(tree, entry);
 545
 546        /*
 547        * There are two possible situations for entry here:
 548        * (1) refcount is 1(normal case),  entry is valid and on the tree
 549        * (2) refcount is 0, entry is freed and not on the tree
 550        *     because invalidate happened during writeback
 551        *  search the tree and free the entry if find entry
 552        */
 553        if (entry == zswap_rb_search(&tree->rbroot, offset))
 554                zswap_entry_put(tree, entry);
 555        spin_unlock(&tree->lock);
 556
 557        goto end;
 558
 559        /*
 560        * if we get here due to ZSWAP_SWAPCACHE_EXIST
 561        * a load may happening concurrently
 562        * it is safe and okay to not free the entry
 563        * if we free the entry in the following put
 564        * it it either okay to return !0
 565        */
 566fail:
 567        spin_lock(&tree->lock);
 568        zswap_entry_put(tree, entry);
 569        spin_unlock(&tree->lock);
 570
 571end:
 572        return ret;
 573}
 574
 575/*********************************
 576* frontswap hooks
 577**********************************/
 578/* attempts to compress and store an single page */
 579static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 580                                struct page *page)
 581{
 582        struct zswap_tree *tree = zswap_trees[type];
 583        struct zswap_entry *entry, *dupentry;
 584        int ret;
 585        unsigned int dlen = PAGE_SIZE, len;
 586        unsigned long handle;
 587        char *buf;
 588        u8 *src, *dst;
 589        struct zswap_header *zhdr;
 590
 591        if (!tree) {
 592                ret = -ENODEV;
 593                goto reject;
 594        }
 595
 596        /* reclaim space if needed */
 597        if (zswap_is_full()) {
 598                zswap_pool_limit_hit++;
 599                if (zpool_shrink(zswap_pool, 1, NULL)) {
 600                        zswap_reject_reclaim_fail++;
 601                        ret = -ENOMEM;
 602                        goto reject;
 603                }
 604        }
 605
 606        /* allocate entry */
 607        entry = zswap_entry_cache_alloc(GFP_KERNEL);
 608        if (!entry) {
 609                zswap_reject_kmemcache_fail++;
 610                ret = -ENOMEM;
 611                goto reject;
 612        }
 613
 614        /* compress */
 615        dst = get_cpu_var(zswap_dstmem);
 616        src = kmap_atomic(page);
 617        ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen);
 618        kunmap_atomic(src);
 619        if (ret) {
 620                ret = -EINVAL;
 621                goto freepage;
 622        }
 623
 624        /* store */
 625        len = dlen + sizeof(struct zswap_header);
 626        ret = zpool_malloc(zswap_pool, len, __GFP_NORETRY | __GFP_NOWARN,
 627                &handle);
 628        if (ret == -ENOSPC) {
 629                zswap_reject_compress_poor++;
 630                goto freepage;
 631        }
 632        if (ret) {
 633                zswap_reject_alloc_fail++;
 634                goto freepage;
 635        }
 636        zhdr = zpool_map_handle(zswap_pool, handle, ZPOOL_MM_RW);
 637        zhdr->swpentry = swp_entry(type, offset);
 638        buf = (u8 *)(zhdr + 1);
 639        memcpy(buf, dst, dlen);
 640        zpool_unmap_handle(zswap_pool, handle);
 641        put_cpu_var(zswap_dstmem);
 642
 643        /* populate entry */
 644        entry->offset = offset;
 645        entry->handle = handle;
 646        entry->length = dlen;
 647
 648        /* map */
 649        spin_lock(&tree->lock);
 650        do {
 651                ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
 652                if (ret == -EEXIST) {
 653                        zswap_duplicate_entry++;
 654                        /* remove from rbtree */
 655                        zswap_rb_erase(&tree->rbroot, dupentry);
 656                        zswap_entry_put(tree, dupentry);
 657                }
 658        } while (ret == -EEXIST);
 659        spin_unlock(&tree->lock);
 660
 661        /* update stats */
 662        atomic_inc(&zswap_stored_pages);
 663        zswap_pool_total_size = zpool_get_total_size(zswap_pool);
 664
 665        return 0;
 666
 667freepage:
 668        put_cpu_var(zswap_dstmem);
 669        zswap_entry_cache_free(entry);
 670reject:
 671        return ret;
 672}
 673
 674/*
 675 * returns 0 if the page was successfully decompressed
 676 * return -1 on entry not found or error
 677*/
 678static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 679                                struct page *page)
 680{
 681        struct zswap_tree *tree = zswap_trees[type];
 682        struct zswap_entry *entry;
 683        u8 *src, *dst;
 684        unsigned int dlen;
 685        int ret;
 686
 687        /* find */
 688        spin_lock(&tree->lock);
 689        entry = zswap_entry_find_get(&tree->rbroot, offset);
 690        if (!entry) {
 691                /* entry was written back */
 692                spin_unlock(&tree->lock);
 693                return -1;
 694        }
 695        spin_unlock(&tree->lock);
 696
 697        /* decompress */
 698        dlen = PAGE_SIZE;
 699        src = (u8 *)zpool_map_handle(zswap_pool, entry->handle,
 700                        ZPOOL_MM_RO) + sizeof(struct zswap_header);
 701        dst = kmap_atomic(page);
 702        ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length,
 703                dst, &dlen);
 704        kunmap_atomic(dst);
 705        zpool_unmap_handle(zswap_pool, entry->handle);
 706        BUG_ON(ret);
 707
 708        spin_lock(&tree->lock);
 709        zswap_entry_put(tree, entry);
 710        spin_unlock(&tree->lock);
 711
 712        return 0;
 713}
 714
 715/* frees an entry in zswap */
 716static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 717{
 718        struct zswap_tree *tree = zswap_trees[type];
 719        struct zswap_entry *entry;
 720
 721        /* find */
 722        spin_lock(&tree->lock);
 723        entry = zswap_rb_search(&tree->rbroot, offset);
 724        if (!entry) {
 725                /* entry was written back */
 726                spin_unlock(&tree->lock);
 727                return;
 728        }
 729
 730        /* remove from rbtree */
 731        zswap_rb_erase(&tree->rbroot, entry);
 732
 733        /* drop the initial reference from entry creation */
 734        zswap_entry_put(tree, entry);
 735
 736        spin_unlock(&tree->lock);
 737}
 738
 739/* frees all zswap entries for the given swap type */
 740static void zswap_frontswap_invalidate_area(unsigned type)
 741{
 742        struct zswap_tree *tree = zswap_trees[type];
 743        struct zswap_entry *entry, *n;
 744
 745        if (!tree)
 746                return;
 747
 748        /* walk the tree and free everything */
 749        spin_lock(&tree->lock);
 750        rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
 751                zswap_free_entry(entry);
 752        tree->rbroot = RB_ROOT;
 753        spin_unlock(&tree->lock);
 754        kfree(tree);
 755        zswap_trees[type] = NULL;
 756}
 757
 758static struct zpool_ops zswap_zpool_ops = {
 759        .evict = zswap_writeback_entry
 760};
 761
 762static void zswap_frontswap_init(unsigned type)
 763{
 764        struct zswap_tree *tree;
 765
 766        tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
 767        if (!tree) {
 768                pr_err("alloc failed, zswap disabled for swap type %d\n", type);
 769                return;
 770        }
 771
 772        tree->rbroot = RB_ROOT;
 773        spin_lock_init(&tree->lock);
 774        zswap_trees[type] = tree;
 775}
 776
 777static struct frontswap_ops zswap_frontswap_ops = {
 778        .store = zswap_frontswap_store,
 779        .load = zswap_frontswap_load,
 780        .invalidate_page = zswap_frontswap_invalidate_page,
 781        .invalidate_area = zswap_frontswap_invalidate_area,
 782        .init = zswap_frontswap_init
 783};
 784
 785/*********************************
 786* debugfs functions
 787**********************************/
 788#ifdef CONFIG_DEBUG_FS
 789#include <linux/debugfs.h>
 790
 791static struct dentry *zswap_debugfs_root;
 792
 793static int __init zswap_debugfs_init(void)
 794{
 795        if (!debugfs_initialized())
 796                return -ENODEV;
 797
 798        zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
 799        if (!zswap_debugfs_root)
 800                return -ENOMEM;
 801
 802        debugfs_create_u64("pool_limit_hit", S_IRUGO,
 803                        zswap_debugfs_root, &zswap_pool_limit_hit);
 804        debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
 805                        zswap_debugfs_root, &zswap_reject_reclaim_fail);
 806        debugfs_create_u64("reject_alloc_fail", S_IRUGO,
 807                        zswap_debugfs_root, &zswap_reject_alloc_fail);
 808        debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
 809                        zswap_debugfs_root, &zswap_reject_kmemcache_fail);
 810        debugfs_create_u64("reject_compress_poor", S_IRUGO,
 811                        zswap_debugfs_root, &zswap_reject_compress_poor);
 812        debugfs_create_u64("written_back_pages", S_IRUGO,
 813                        zswap_debugfs_root, &zswap_written_back_pages);
 814        debugfs_create_u64("duplicate_entry", S_IRUGO,
 815                        zswap_debugfs_root, &zswap_duplicate_entry);
 816        debugfs_create_u64("pool_total_size", S_IRUGO,
 817                        zswap_debugfs_root, &zswap_pool_total_size);
 818        debugfs_create_atomic_t("stored_pages", S_IRUGO,
 819                        zswap_debugfs_root, &zswap_stored_pages);
 820
 821        return 0;
 822}
 823
 824static void __exit zswap_debugfs_exit(void)
 825{
 826        debugfs_remove_recursive(zswap_debugfs_root);
 827}
 828#else
 829static int __init zswap_debugfs_init(void)
 830{
 831        return 0;
 832}
 833
 834static void __exit zswap_debugfs_exit(void) { }
 835#endif
 836
 837/*********************************
 838* module init and exit
 839**********************************/
 840static int __init init_zswap(void)
 841{
 842        gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
 843
 844        if (!zswap_enabled)
 845                return 0;
 846
 847        pr_info("loading zswap\n");
 848
 849        zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
 850                                        &zswap_zpool_ops);
 851        if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
 852                pr_info("%s zpool not available\n", zswap_zpool_type);
 853                zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
 854                zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
 855                                        &zswap_zpool_ops);
 856        }
 857        if (!zswap_pool) {
 858                pr_err("%s zpool not available\n", zswap_zpool_type);
 859                pr_err("zpool creation failed\n");
 860                goto error;
 861        }
 862        pr_info("using %s pool\n", zswap_zpool_type);
 863
 864        if (zswap_entry_cache_create()) {
 865                pr_err("entry cache creation failed\n");
 866                goto cachefail;
 867        }
 868        if (zswap_comp_init()) {
 869                pr_err("compressor initialization failed\n");
 870                goto compfail;
 871        }
 872        if (zswap_cpu_init()) {
 873                pr_err("per-cpu initialization failed\n");
 874                goto pcpufail;
 875        }
 876
 877        frontswap_register_ops(&zswap_frontswap_ops);
 878        if (zswap_debugfs_init())
 879                pr_warn("debugfs initialization failed\n");
 880        return 0;
 881pcpufail:
 882        zswap_comp_exit();
 883compfail:
 884        zswap_entry_cache_destroy();
 885cachefail:
 886        zpool_destroy_pool(zswap_pool);
 887error:
 888        return -ENOMEM;
 889}
 890/* must be late so crypto has time to come up */
 891late_initcall(init_zswap);
 892
 893MODULE_LICENSE("GPL");
 894MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
 895MODULE_DESCRIPTION("Compressed cache for swap pages");
 896