linux/drivers/staging/zcache/zcache-main.c
   1/*
   2 * zcache-main.c
   3 *
   4 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
   5 * Copyright (c) 2010,2011, Nitin Gupta
   6 *
   7 * Zcache provides an in-kernel "host implementation" for transcendent memory
   8 * ("tmem") and, thus indirectly, for cleancache and frontswap.  Zcache uses
   9 * lzo1x compression to improve density and an embedded allocator called
  10 * "zbud" which "buddies" two compressed pages semi-optimally in each physical
  11 * pageframe.  Zbud is integrally tied into tmem to allow pageframes to
  12 * be "reclaimed" efficiently.
  13 */
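/*
 * Rough data flow, as implemented in this file: the cleancache and
 * frontswap hooks call the zcache shims (e.g. zcache_cleancache_put_page()),
 * which call zcache_put_page()/zcache_get_page()/zcache_flush_*(); those in
 * turn call into tmem (tmem_put()/tmem_get()/tmem_flush_*()), and tmem stores
 * the compressed data in zbud-managed pageframes via the pamops defined
 * below (zcache_pampd_create() and friends).
 */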
  14
  15#include <linux/module.h>
  16#include <linux/cpu.h>
  17#include <linux/highmem.h>
  18#include <linux/list.h>
  19#include <linux/slab.h>
  20#include <linux/spinlock.h>
  21#include <linux/types.h>
  22#include <linux/string.h>
  23#include <linux/atomic.h>
  24#include <linux/math64.h>
  25#include <linux/crypto.h>
  26#include <linux/swap.h>
  27#include <linux/swapops.h>
  28#include <linux/pagemap.h>
  29#include <linux/writeback.h>
  30
  31#include <linux/cleancache.h>
  32#include <linux/frontswap.h>
  33#include "tmem.h"
  34#include "zcache.h"
  35#include "zbud.h"
  36#include "ramster.h"
  37#include "debug.h"
  38#ifdef CONFIG_RAMSTER
  39static bool ramster_enabled __read_mostly;
  40static int disable_frontswap_selfshrink;
  41#else
  42#define ramster_enabled false
  43#define disable_frontswap_selfshrink 0
  44#endif
  45
  46#ifndef __PG_WAS_ACTIVE
  47static inline bool PageWasActive(struct page *page)
  48{
  49        return true;
  50}
  51
  52static inline void SetPageWasActive(struct page *page)
  53{
  54}
  55#endif
  56
  57#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS
  58static bool frontswap_has_exclusive_gets __read_mostly = true;
  59#else
  60static bool frontswap_has_exclusive_gets __read_mostly;
  61static inline void frontswap_tmem_exclusive_gets(bool b)
  62{
  63}
  64#endif
  65
  66/*
   67 * Mark the pampd with a special value so that a later retrieval
   68 * can identify zero-filled pages.
  69 */
  70#define ZERO_FILLED 0x2
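/*
 * Note: when a put finds a zero-filled page, the pampd pointer itself is
 * set to (void *)ZERO_FILLED instead of pointing to a zbudref; the pamops
 * below (create_finish, get_data, get_data_and_free, free) compare against
 * this sentinel and skip zbud entirely, re-zeroing the destination page on
 * a get via handle_zero_filled_page().
 */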
  71
  72/* enable (or fix code) when Seth's patches are accepted upstream */
  73#define zcache_writeback_enabled 0
  74
  75static bool zcache_enabled __read_mostly;
  76static bool disable_cleancache __read_mostly;
  77static bool disable_frontswap __read_mostly;
  78static bool disable_frontswap_ignore_nonactive __read_mostly;
  79static bool disable_cleancache_ignore_nonactive __read_mostly;
  80static char *namestr __read_mostly = "zcache";
  81
  82#define ZCACHE_GFP_MASK \
  83        (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
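/*
 * These allocations are opportunistic: __GFP_NORETRY and __GFP_NOWARN make
 * them fail fast and quietly, and __GFP_NOMEMALLOC keeps zcache out of the
 * emergency reserves.  Callers handle failure themselves, e.g. by falling
 * back to zcache_evict_eph_pageframe() when zcache_alloc_page() returns NULL.
 */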
  84
  85/* crypto API for zcache  */
  86#ifdef CONFIG_ZCACHE_MODULE
  87static char *zcache_comp_name = "lzo";
  88#else
  89#define ZCACHE_COMP_NAME_SZ CRYPTO_MAX_ALG_NAME
  90static char zcache_comp_name[ZCACHE_COMP_NAME_SZ] __read_mostly;
  91#endif
  92static struct crypto_comp * __percpu *zcache_comp_pcpu_tfms __read_mostly;
  93
  94enum comp_op {
  95        ZCACHE_COMPOP_COMPRESS,
  96        ZCACHE_COMPOP_DECOMPRESS
  97};
  98
  99static inline int zcache_comp_op(enum comp_op op,
 100                                const u8 *src, unsigned int slen,
 101                                u8 *dst, unsigned int *dlen)
 102{
 103        struct crypto_comp *tfm;
 104        int ret = -1;
 105
 106        BUG_ON(!zcache_comp_pcpu_tfms);
 107        tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, get_cpu());
 108        BUG_ON(!tfm);
 109        switch (op) {
 110        case ZCACHE_COMPOP_COMPRESS:
 111                ret = crypto_comp_compress(tfm, src, slen, dst, dlen);
 112                break;
 113        case ZCACHE_COMPOP_DECOMPRESS:
 114                ret = crypto_comp_decompress(tfm, src, slen, dst, dlen);
 115                break;
 116        default:
 117                ret = -EINVAL;
 118        }
 119        put_cpu();
 120        return ret;
 121}
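/*
 * Illustrative use (this is how zcache_compress()/zcache_decompress() below
 * call it): dlen must be initialized to the size of the destination buffer
 * and is updated to the actual output length on success.
 *
 *	unsigned int dlen = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
 *	ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE,
 *				dmem, &dlen);
 */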
 122
 123/*
 124 * policy parameters
 125 */
 126
 127/*
 128 * byte count defining poor compression; pages with greater zsize will be
 129 * rejected
 130 */
 131static unsigned int zbud_max_zsize __read_mostly = (PAGE_SIZE / 8) * 7;
 132/*
 133 * byte count defining poor *mean* compression; pages with greater zsize
 134 * will be rejected until sufficient better-compressed pages are accepted
 135 * driving the mean below this threshold
 136 */
 137static unsigned int zbud_max_mean_zsize __read_mostly = (PAGE_SIZE / 8) * 5;
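/*
 * For example, with 4 KiB pages these defaults work out to
 * zbud_max_zsize = 3584 bytes (7/8 of a page) and
 * zbud_max_mean_zsize = 2560 bytes (5/8 of a page).
 */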
 138
 139/*
  140 * For now, use named slabs so we can easily track usage; later we can
  141 * either just use kmalloc, or perhaps add a slab-like allocator
  142 * to more carefully manage total memory utilization.
 143 */
 144static struct kmem_cache *zcache_objnode_cache;
 145static struct kmem_cache *zcache_obj_cache;
 146
 147static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
 148
 149/* Used by debug.c */
 150ssize_t zcache_pers_zpages;
 151u64 zcache_pers_zbytes;
 152ssize_t zcache_eph_pageframes;
 153ssize_t zcache_pers_pageframes;
 154
 155/* Used by this code. */
 156ssize_t zcache_last_active_file_pageframes;
 157ssize_t zcache_last_inactive_file_pageframes;
 158ssize_t zcache_last_active_anon_pageframes;
 159ssize_t zcache_last_inactive_anon_pageframes;
 160#ifdef CONFIG_ZCACHE_WRITEBACK
 161ssize_t zcache_writtenback_pages;
 162ssize_t zcache_outstanding_writeback_pages;
 163#endif
 164/*
 165 * zcache core code starts here
 166 */
 167
 168static struct zcache_client zcache_host;
 169static struct zcache_client zcache_clients[MAX_CLIENTS];
 170
 171static inline bool is_local_client(struct zcache_client *cli)
 172{
 173        return cli == &zcache_host;
 174}
 175
 176static struct zcache_client *zcache_get_client_by_id(uint16_t cli_id)
 177{
 178        struct zcache_client *cli = &zcache_host;
 179
 180        if (cli_id != LOCAL_CLIENT) {
 181                if (cli_id >= MAX_CLIENTS)
 182                        goto out;
 183                cli = &zcache_clients[cli_id];
 184        }
 185out:
 186        return cli;
 187}
 188
 189/*
 190 * Tmem operations assume the poolid implies the invoking client.
 191 * Zcache only has one client (the kernel itself): LOCAL_CLIENT.
 192 * RAMster has each client numbered by cluster node, and a KVM version
 193 * of zcache would have one client per guest and each client might
 194 * have a poolid==N.
 195 */
 196struct tmem_pool *zcache_get_pool_by_id(uint16_t cli_id, uint16_t poolid)
 197{
 198        struct tmem_pool *pool = NULL;
 199        struct zcache_client *cli = NULL;
 200
 201        cli = zcache_get_client_by_id(cli_id);
 202        if (cli == NULL)
 203                goto out;
 204        if (!is_local_client(cli))
 205                atomic_inc(&cli->refcount);
 206        if (poolid < MAX_POOLS_PER_CLIENT) {
 207                pool = cli->tmem_pools[poolid];
 208                if (pool != NULL)
 209                        atomic_inc(&pool->refcount);
 210        }
 211out:
 212        return pool;
 213}
 214
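/*
 * zcache_put_pool() drops the references taken by zcache_get_pool_by_id().
 * The typical caller pattern (see zcache_put_page() and friends below) is:
 *
 *	pool = zcache_get_pool_by_id(cli_id, pool_id);
 *	if (pool != NULL) {
 *		...
 *		zcache_put_pool(pool);
 *	}
 */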
 215void zcache_put_pool(struct tmem_pool *pool)
 216{
 217        struct zcache_client *cli = NULL;
 218
 219        if (pool == NULL)
 220                BUG();
 221        cli = pool->client;
 222        atomic_dec(&pool->refcount);
 223        if (!is_local_client(cli))
 224                atomic_dec(&cli->refcount);
 225}
 226
 227int zcache_new_client(uint16_t cli_id)
 228{
 229        struct zcache_client *cli;
 230        int ret = -1;
 231
 232        cli = zcache_get_client_by_id(cli_id);
 233        if (cli == NULL)
 234                goto out;
 235        if (cli->allocated)
 236                goto out;
 237        cli->allocated = 1;
 238        ret = 0;
 239out:
 240        return ret;
 241}
 242
 243/*
 244 * zcache implementation for tmem host ops
 245 */
 246
 247static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
 248{
 249        struct tmem_objnode *objnode = NULL;
 250        struct zcache_preload *kp;
 251        int i;
 252
 253        kp = &__get_cpu_var(zcache_preloads);
 254        for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
 255                objnode = kp->objnodes[i];
 256                if (objnode != NULL) {
 257                        kp->objnodes[i] = NULL;
 258                        break;
 259                }
 260        }
 261        BUG_ON(objnode == NULL);
 262        inc_zcache_objnode_count();
 263        return objnode;
 264}
 265
 266static void zcache_objnode_free(struct tmem_objnode *objnode,
 267                                        struct tmem_pool *pool)
 268{
 269        dec_zcache_objnode_count();
 270        kmem_cache_free(zcache_objnode_cache, objnode);
 271}
 272
 273static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
 274{
 275        struct tmem_obj *obj = NULL;
 276        struct zcache_preload *kp;
 277
 278        kp = &__get_cpu_var(zcache_preloads);
 279        obj = kp->obj;
 280        BUG_ON(obj == NULL);
 281        kp->obj = NULL;
 282        inc_zcache_obj_count();
 283        return obj;
 284}
 285
 286static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
 287{
 288        dec_zcache_obj_count();
 289        kmem_cache_free(zcache_obj_cache, obj);
 290}
 291
 292/*
  293 * Compressing zero-filled pages would waste memory and introduce
  294 * serious fragmentation, so skip compression for them to avoid the overhead.
 295 */
 296static bool page_is_zero_filled(struct page *p)
 297{
 298        unsigned int pos;
 299        char *page;
 300
 301        page = kmap_atomic(p);
 302        for (pos = 0; pos < PAGE_SIZE / sizeof(*page); pos++) {
 303                if (page[pos]) {
 304                        kunmap_atomic(page);
 305                        return false;
 306                }
 307        }
 308        kunmap_atomic(page);
 309
 310        return true;
 311}
 312
 313static void handle_zero_filled_page(void *p)
 314{
 315        void *user_mem;
 316        struct page *page = (struct page *)p;
 317
 318        user_mem = kmap_atomic(page);
 319        memset(user_mem, 0, PAGE_SIZE);
 320        kunmap_atomic(user_mem);
 321
 322        flush_dcache_page(page);
 323}
 324
 325static struct tmem_hostops zcache_hostops = {
 326        .obj_alloc = zcache_obj_alloc,
 327        .obj_free = zcache_obj_free,
 328        .objnode_alloc = zcache_objnode_alloc,
 329        .objnode_free = zcache_objnode_free,
 330};
 331
 332static struct page *zcache_alloc_page(void)
 333{
 334        struct page *page = alloc_page(ZCACHE_GFP_MASK);
 335
 336        if (page != NULL)
 337                inc_zcache_pageframes_alloced();
 338        return page;
 339}
 340
 341static void zcache_free_page(struct page *page)
 342{
 343        long curr_pageframes;
 344        static long max_pageframes, min_pageframes;
 345
 346        if (page == NULL)
 347                BUG();
 348        __free_page(page);
 349        inc_zcache_pageframes_freed();
 350        curr_pageframes = curr_pageframes_count();
 351        if (curr_pageframes > max_pageframes)
 352                max_pageframes = curr_pageframes;
 353        if (curr_pageframes < min_pageframes)
 354                min_pageframes = curr_pageframes;
 355#ifdef CONFIG_ZCACHE_DEBUG
 356        if (curr_pageframes > 2L || curr_pageframes < -2L) {
 357                /* pr_info here */
 358        }
 359#endif
 360}
 361
 362/*
 363 * zcache implementations for PAM page descriptor ops
 364 */
 365
 366/* forward reference */
 367static void zcache_compress(struct page *from,
 368                                void **out_va, unsigned *out_len);
 369
 370static struct page *zcache_evict_eph_pageframe(void);
 371
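/*
 * Ephemeral (cleancache) pampd creation, roughly: zero-filled pages are
 * recorded with the ZERO_FILLED sentinel; otherwise the page is compressed
 * and rejected if it exceeds zbud_max_buddy_size().  The zpage is then
 * buddied into an existing partially-filled zbud pageframe if one matches
 * (zbud_match_prep()); failing that, a new pageframe is allocated, and as
 * a last resort an ephemeral pageframe is evicted from the LRU to make room.
 */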
 372static void *zcache_pampd_eph_create(char *data, size_t size, bool raw,
 373                                        struct tmem_handle *th)
 374{
 375        void *pampd = NULL, *cdata = data;
 376        unsigned clen = size;
 377        bool zero_filled = false;
 378        struct page *page = (struct page *)(data), *newpage;
 379
 380        if (page_is_zero_filled(page)) {
 381                clen = 0;
 382                zero_filled = true;
 383                inc_zcache_zero_filled_pages();
 384                goto got_pampd;
 385        }
 386
 387        if (!raw) {
 388                zcache_compress(page, &cdata, &clen);
 389                if (clen > zbud_max_buddy_size()) {
 390                        inc_zcache_compress_poor();
 391                        goto out;
 392                }
 393        } else {
 394                BUG_ON(clen > zbud_max_buddy_size());
 395        }
 396
 397        /* look for space via an existing match first */
 398        pampd = (void *)zbud_match_prep(th, true, cdata, clen);
 399        if (pampd != NULL)
 400                goto got_pampd;
 401
 402        /* no match, now we need to find (or free up) a full page */
 403        newpage = zcache_alloc_page();
 404        if (newpage != NULL)
 405                goto create_in_new_page;
 406
 407        inc_zcache_failed_getfreepages();
 408        /* can't allocate a page, evict an ephemeral page via LRU */
 409        newpage = zcache_evict_eph_pageframe();
 410        if (newpage == NULL) {
 411                inc_zcache_eph_ate_tail_failed();
 412                goto out;
 413        }
 414        inc_zcache_eph_ate_tail();
 415
 416create_in_new_page:
 417        pampd = (void *)zbud_create_prep(th, true, cdata, clen, newpage);
 418        BUG_ON(pampd == NULL);
 419        inc_zcache_eph_pageframes();
 420
 421got_pampd:
 422        inc_zcache_eph_zbytes(clen);
 423        inc_zcache_eph_zpages();
 424        if (ramster_enabled && raw && !zero_filled)
 425                ramster_count_foreign_pages(true, 1);
 426        if (zero_filled)
 427                pampd = (void *)ZERO_FILLED;
 428out:
 429        return pampd;
 430}
 431
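/*
 * Persistent (frontswap) pampd creation follows the same fallback chain as
 * the ephemeral path, but applies the policy parameters above: a zpage is
 * rejected outright if clen > zbud_max_zsize, and also rejected if
 * clen > zbud_max_mean_zsize while the current mean zsize already exceeds
 * zbud_max_mean_zsize.  For example (4 KiB pages), a 3000-byte zpage is
 * still accepted while 1000 existing zpages occupy 2 MiB (mean ~2097 bytes),
 * but is rejected once the mean itself climbs above 2560 bytes.  When no
 * free pageframe is available, an ephemeral pageframe may be evicted
 * ("pers_ate_eph") to hold the persistent zpage.
 */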
 432static void *zcache_pampd_pers_create(char *data, size_t size, bool raw,
 433                                        struct tmem_handle *th)
 434{
 435        void *pampd = NULL, *cdata = data;
 436        unsigned clen = size;
 437        bool zero_filled = false;
 438        struct page *page = (struct page *)(data), *newpage;
 439        unsigned long zbud_mean_zsize;
 440        unsigned long curr_pers_zpages, total_zsize;
 441
 442        if (data == NULL) {
 443                BUG_ON(!ramster_enabled);
 444                goto create_pampd;
 445        }
 446
 447        if (page_is_zero_filled(page)) {
 448                clen = 0;
 449                zero_filled = true;
 450                inc_zcache_zero_filled_pages();
 451                goto got_pampd;
 452        }
 453
 454        curr_pers_zpages = zcache_pers_zpages;
 455/* FIXME CONFIG_RAMSTER... subtract atomic remote_pers_pages here? */
 456        if (!raw)
 457                zcache_compress(page, &cdata, &clen);
 458        /* reject if compression is too poor */
 459        if (clen > zbud_max_zsize) {
 460                inc_zcache_compress_poor();
 461                goto out;
 462        }
 463        /* reject if mean compression is too poor */
 464        if ((clen > zbud_max_mean_zsize) && (curr_pers_zpages > 0)) {
 465                total_zsize = zcache_pers_zbytes;
 466                if ((long)total_zsize < 0)
 467                        total_zsize = 0;
 468                zbud_mean_zsize = div_u64(total_zsize,
 469                                        curr_pers_zpages);
 470                if (zbud_mean_zsize > zbud_max_mean_zsize) {
 471                        inc_zcache_mean_compress_poor();
 472                        goto out;
 473                }
 474        }
 475
 476create_pampd:
 477        /* look for space via an existing match first */
 478        pampd = (void *)zbud_match_prep(th, false, cdata, clen);
 479        if (pampd != NULL)
 480                goto got_pampd;
 481
 482        /* no match, now we need to find (or free up) a full page */
 483        newpage = zcache_alloc_page();
 484        if (newpage != NULL)
 485                goto create_in_new_page;
 486        /*
 487         * FIXME do the following only if eph is oversized?
 488         * if (zcache_eph_pageframes >
 489         * (global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE) +
 490         * global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE)))
 491         */
 492        inc_zcache_failed_getfreepages();
 493        /* can't allocate a page, evict an ephemeral page via LRU */
 494        newpage = zcache_evict_eph_pageframe();
 495        if (newpage == NULL) {
 496                inc_zcache_pers_ate_eph_failed();
 497                goto out;
 498        }
 499        inc_zcache_pers_ate_eph();
 500
 501create_in_new_page:
 502        pampd = (void *)zbud_create_prep(th, false, cdata, clen, newpage);
 503        BUG_ON(pampd == NULL);
 504        inc_zcache_pers_pageframes();
 505
 506got_pampd:
 507        inc_zcache_pers_zpages();
 508        inc_zcache_pers_zbytes(clen);
 509        if (ramster_enabled && raw && !zero_filled)
 510                ramster_count_foreign_pages(false, 1);
 511        if (zero_filled)
 512                pampd = (void *)ZERO_FILLED;
 513out:
 514        return pampd;
 515}
 516
 517/*
 518 * This is called directly from zcache_put_page to pre-allocate space
 519 * to store a zpage.
 520 */
 521void *zcache_pampd_create(char *data, unsigned int size, bool raw,
 522                                        int eph, struct tmem_handle *th)
 523{
 524        void *pampd = NULL;
 525        struct zcache_preload *kp;
 526        struct tmem_objnode *objnode;
 527        struct tmem_obj *obj;
 528        int i;
 529
 530        BUG_ON(!irqs_disabled());
 531        /* pre-allocate per-cpu metadata */
 532        BUG_ON(zcache_objnode_cache == NULL);
 533        BUG_ON(zcache_obj_cache == NULL);
 534        kp = &__get_cpu_var(zcache_preloads);
 535        for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
 536                objnode = kp->objnodes[i];
 537                if (objnode == NULL) {
 538                        objnode = kmem_cache_alloc(zcache_objnode_cache,
 539                                                        ZCACHE_GFP_MASK);
 540                        if (unlikely(objnode == NULL)) {
 541                                inc_zcache_failed_alloc();
 542                                goto out;
 543                        }
 544                        kp->objnodes[i] = objnode;
 545                }
 546        }
 547        if (kp->obj == NULL) {
 548                obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
 549                kp->obj = obj;
 550        }
 551        if (unlikely(kp->obj == NULL)) {
 552                inc_zcache_failed_alloc();
 553                goto out;
 554        }
 555        /*
 556         * ok, have all the metadata pre-allocated, now do the data
 557         * but since how we allocate the data is dependent on ephemeral
 558         * or persistent, we split the call here to different sub-functions
 559         */
 560        if (eph)
 561                pampd = zcache_pampd_eph_create(data, size, raw, th);
 562        else
 563                pampd = zcache_pampd_pers_create(data, size, raw, th);
 564out:
 565        return pampd;
 566}
 567
 568/*
 569 * This is a pamops called via tmem_put and is necessary to "finish"
 570 * a pampd creation.
 571 */
 572void zcache_pampd_create_finish(void *pampd, bool eph)
 573{
 574        if (pampd != (void *)ZERO_FILLED)
 575                zbud_create_finish((struct zbudref *)pampd, eph);
 576}
 577
 578/*
 579 * This is passed as a function parameter to zbud_decompress so that
 580 * zbud need not be familiar with the details of crypto. It assumes that
 581 * the bytes from_va and to_va through from_va+size-1 and to_va+size-1 are
 582 * kmapped.  It must be successful, else there is a logic bug somewhere.
 583 */
 584static void zcache_decompress(char *from_va, unsigned int size, char *to_va)
 585{
 586        int ret;
 587        unsigned int outlen = PAGE_SIZE;
 588
 589        ret = zcache_comp_op(ZCACHE_COMPOP_DECOMPRESS, from_va, size,
 590                                to_va, &outlen);
 591        BUG_ON(ret);
 592        BUG_ON(outlen != PAGE_SIZE);
 593}
 594
 595/*
 596 * Decompress from the kernel va to a pageframe
 597 */
 598void zcache_decompress_to_page(char *from_va, unsigned int size,
 599                                        struct page *to_page)
 600{
 601        char *to_va = kmap_atomic(to_page);
 602        zcache_decompress(from_va, size, to_va);
 603        kunmap_atomic(to_va);
 604}
 605
 606/*
 607 * fill the pageframe corresponding to the struct page with the data
 608 * from the passed pampd
 609 */
 610static int zcache_pampd_get_data(char *data, size_t *sizep, bool raw,
 611                                        void *pampd, struct tmem_pool *pool,
 612                                        struct tmem_oid *oid, uint32_t index)
 613{
 614        int ret;
 615        bool eph = !is_persistent(pool);
 616
 617        BUG_ON(preemptible());
 618        BUG_ON(eph);    /* fix later if shared pools get implemented */
 619        BUG_ON(pampd_is_remote(pampd));
 620
 621        if (pampd == (void *)ZERO_FILLED) {
 622                handle_zero_filled_page(data);
 623                if (!raw)
 624                        *sizep = PAGE_SIZE;
 625                return 0;
 626        }
 627
 628        if (raw)
 629                ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd,
 630                                                sizep, eph);
 631        else {
 632                ret = zbud_decompress((struct page *)(data),
 633                                        (struct zbudref *)pampd, false,
 634                                        zcache_decompress);
 635                *sizep = PAGE_SIZE;
 636        }
 637        return ret;
 638}
 639
 640/*
 641 * fill the pageframe corresponding to the struct page with the data
 642 * from the passed pampd
 643 */
 644static int zcache_pampd_get_data_and_free(char *data, size_t *sizep, bool raw,
 645                                        void *pampd, struct tmem_pool *pool,
 646                                        struct tmem_oid *oid, uint32_t index)
 647{
 648        int ret = 0;
 649        bool eph = !is_persistent(pool), zero_filled = false;
 650        struct page *page = NULL;
 651        unsigned int zsize, zpages;
 652
 653        BUG_ON(preemptible());
 654        BUG_ON(pampd_is_remote(pampd));
 655
 656        if (pampd == (void *)ZERO_FILLED) {
 657                handle_zero_filled_page(data);
 658                zero_filled = true;
 659                zsize = 0;
 660                zpages = 1;
 661                if (!raw)
 662                        *sizep = PAGE_SIZE;
 663                dec_zcache_zero_filled_pages();
 664                goto zero_fill;
 665        }
 666
 667        if (raw)
 668                ret = zbud_copy_from_zbud(data, (struct zbudref *)pampd,
 669                                                sizep, eph);
 670        else {
 671                ret = zbud_decompress((struct page *)(data),
 672                                        (struct zbudref *)pampd, eph,
 673                                        zcache_decompress);
 674                *sizep = PAGE_SIZE;
 675        }
 676        page = zbud_free_and_delist((struct zbudref *)pampd, eph,
 677                                        &zsize, &zpages);
 678zero_fill:
 679        if (eph) {
 680                if (page)
 681                        dec_zcache_eph_pageframes();
 682                dec_zcache_eph_zpages(zpages);
 683                dec_zcache_eph_zbytes(zsize);
 684        } else {
 685                if (page)
 686                        dec_zcache_pers_pageframes();
 687                dec_zcache_pers_zpages(zpages);
 688                dec_zcache_pers_zbytes(zsize);
 689        }
 690        if (!is_local_client(pool->client) && !zero_filled)
 691                ramster_count_foreign_pages(eph, -1);
 692        if (page && !zero_filled)
 693                zcache_free_page(page);
 694        return ret;
 695}
 696
 697/*
 698 * free the pampd and remove it from any zcache lists
 699 * pampd must no longer be pointed to from any tmem data structures!
 700 */
 701static void zcache_pampd_free(void *pampd, struct tmem_pool *pool,
 702                              struct tmem_oid *oid, uint32_t index, bool acct)
 703{
 704        struct page *page = NULL;
 705        unsigned int zsize, zpages;
 706        bool zero_filled = false;
 707
 708        BUG_ON(preemptible());
 709
 710        if (pampd == (void *)ZERO_FILLED) {
 711                zero_filled = true;
 712                zsize = 0;
 713                zpages = 1;
 714                dec_zcache_zero_filled_pages();
 715        }
 716
 717        if (pampd_is_remote(pampd) && !zero_filled) {
 718                BUG_ON(!ramster_enabled);
 719                pampd = ramster_pampd_free(pampd, pool, oid, index, acct);
 720                if (pampd == NULL)
 721                        return;
 722        }
 723        if (is_ephemeral(pool)) {
 724                if (!zero_filled)
 725                        page = zbud_free_and_delist((struct zbudref *)pampd,
 726                                                true, &zsize, &zpages);
 727                if (page)
 728                        dec_zcache_eph_pageframes();
 729                dec_zcache_eph_zpages(zpages);
 730                dec_zcache_eph_zbytes(zsize);
 731                /* FIXME CONFIG_RAMSTER... check acct parameter? */
 732        } else {
 733                if (!zero_filled)
 734                        page = zbud_free_and_delist((struct zbudref *)pampd,
 735                                                false, &zsize, &zpages);
 736                if (page)
 737                        dec_zcache_pers_pageframes();
 738                dec_zcache_pers_zpages(zpages);
 739                dec_zcache_pers_zbytes(zsize);
 740        }
 741        if (!is_local_client(pool->client) && !zero_filled)
 742                ramster_count_foreign_pages(is_ephemeral(pool), -1);
 743        if (page && !zero_filled)
 744                zcache_free_page(page);
 745}
 746
 747static struct tmem_pamops zcache_pamops = {
 748        .create_finish = zcache_pampd_create_finish,
 749        .get_data = zcache_pampd_get_data,
 750        .get_data_and_free = zcache_pampd_get_data_and_free,
 751        .free = zcache_pampd_free,
 752};
 753
 754/*
 755 * zcache compression/decompression and related per-cpu stuff
 756 */
 757
 758static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
 759#define ZCACHE_DSTMEM_ORDER 1
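/*
 * The per-cpu destination buffer is two pages (order 1), presumably so that
 * a compression pass which happens to expand the data still fits;
 * zcache_compress() below initializes *out_len to the full buffer size
 * before invoking the compressor.
 */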
 760
 761static void zcache_compress(struct page *from, void **out_va, unsigned *out_len)
 762{
 763        int ret;
 764        unsigned char *dmem = __get_cpu_var(zcache_dstmem);
 765        char *from_va;
 766
 767        BUG_ON(!irqs_disabled());
 768        /* no buffer or no compressor so can't compress */
 769        BUG_ON(dmem == NULL);
 770        *out_len = PAGE_SIZE << ZCACHE_DSTMEM_ORDER;
 771        from_va = kmap_atomic(from);
 772        mb();
 773        ret = zcache_comp_op(ZCACHE_COMPOP_COMPRESS, from_va, PAGE_SIZE, dmem,
 774                                out_len);
 775        BUG_ON(ret);
 776        *out_va = dmem;
 777        kunmap_atomic(from_va);
 778}
 779
 780static int zcache_comp_cpu_up(int cpu)
 781{
 782        struct crypto_comp *tfm;
 783
 784        tfm = crypto_alloc_comp(zcache_comp_name, 0, 0);
 785        if (IS_ERR(tfm))
 786                return NOTIFY_BAD;
 787        *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = tfm;
 788        return NOTIFY_OK;
 789}
 790
 791static void zcache_comp_cpu_down(int cpu)
 792{
 793        struct crypto_comp *tfm;
 794
 795        tfm = *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu);
 796        crypto_free_comp(tfm);
 797        *per_cpu_ptr(zcache_comp_pcpu_tfms, cpu) = NULL;
 798}
 799
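/*
 * CPU hotplug callback: CPU_UP_PREPARE allocates the per-cpu crypto
 * transform and the order-1 dstmem buffer for the incoming CPU;
 * CPU_DEAD/CPU_UP_CANCELED frees them again and releases any objnodes/obj
 * still sitting in that CPU's zcache_preloads.
 */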
 800static int zcache_cpu_notifier(struct notifier_block *nb,
 801                                unsigned long action, void *pcpu)
 802{
 803        int ret, i, cpu = (long)pcpu;
 804        struct zcache_preload *kp;
 805
 806        switch (action) {
 807        case CPU_UP_PREPARE:
 808                ret = zcache_comp_cpu_up(cpu);
 809                if (ret != NOTIFY_OK) {
 810                        pr_err("%s: can't allocate compressor xform\n",
 811                                namestr);
 812                        return ret;
 813                }
 814                per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
 815                        GFP_KERNEL | __GFP_REPEAT, ZCACHE_DSTMEM_ORDER);
 816                if (ramster_enabled)
 817                        ramster_cpu_up(cpu);
 818                break;
 819        case CPU_DEAD:
 820        case CPU_UP_CANCELED:
 821                zcache_comp_cpu_down(cpu);
 822                free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
 823                        ZCACHE_DSTMEM_ORDER);
 824                per_cpu(zcache_dstmem, cpu) = NULL;
 825                kp = &per_cpu(zcache_preloads, cpu);
 826                for (i = 0; i < ARRAY_SIZE(kp->objnodes); i++) {
 827                        if (kp->objnodes[i])
 828                                kmem_cache_free(zcache_objnode_cache,
 829                                                kp->objnodes[i]);
 830                }
 831                if (kp->obj) {
 832                        kmem_cache_free(zcache_obj_cache, kp->obj);
 833                        kp->obj = NULL;
 834                }
 835                if (ramster_enabled)
 836                        ramster_cpu_down(cpu);
 837                break;
 838        default:
 839                break;
 840        }
 841        return NOTIFY_OK;
 842}
 843
 844static struct notifier_block zcache_cpu_notifier_block = {
 845        .notifier_call = zcache_cpu_notifier
 846};
 847
 848/*
 849 * The following code interacts with the zbud eviction and zbud
 850 * zombify code to access LRU pages
 851 */
 852
 853static struct page *zcache_evict_eph_pageframe(void)
 854{
 855        struct page *page;
 856        unsigned int zsize = 0, zpages = 0;
 857
 858        page = zbud_evict_pageframe_lru(&zsize, &zpages);
 859        if (page == NULL)
 860                goto out;
 861        dec_zcache_eph_zbytes(zsize);
 862        dec_zcache_eph_zpages(zpages);
 863        inc_zcache_evicted_eph_zpages(zpages);
 864        dec_zcache_eph_pageframes();
 865        inc_zcache_evicted_eph_pageframes();
 866out:
 867        return page;
 868}
 869
 870#ifdef CONFIG_ZCACHE_WRITEBACK
 871
 872static atomic_t zcache_outstanding_writeback_pages_atomic = ATOMIC_INIT(0);
 873
 874static inline void inc_zcache_outstanding_writeback_pages(void)
 875{
 876        zcache_outstanding_writeback_pages =
 877            atomic_inc_return(&zcache_outstanding_writeback_pages_atomic);
 878}
 879static inline void dec_zcache_outstanding_writeback_pages(void)
 880{
 881        zcache_outstanding_writeback_pages =
 882          atomic_dec_return(&zcache_outstanding_writeback_pages_atomic);
  883}
 884static void unswiz(struct tmem_oid oid, u32 index,
 885                                unsigned *type, pgoff_t *offset);
 886
 887/*
 888 *  Choose an LRU persistent pageframe and attempt to write it back to
 889 *  the backing swap disk by calling frontswap_writeback on both zpages.
 890 *
 891 *  This is work-in-progress.
 892 */
 893
 894static void zcache_end_swap_write(struct bio *bio, int err)
 895{
 896        end_swap_bio_write(bio, err);
 897        dec_zcache_outstanding_writeback_pages();
 898        zcache_writtenback_pages++;
 899}
 900
 901/*
 902 * zcache_get_swap_cache_page
 903 *
  904 * This is an adaptation of read_swap_cache_async().
  905 *
  906 * Returns 0 if the page was already in the swap cache; new_page is
  907 *   unused and not locked.
  908 * Returns 1 if new_page must be populated by the caller; it is locked.
 909 */
 910static int zcache_get_swap_cache_page(int type, pgoff_t offset,
 911                                struct page *new_page)
 912{
 913        struct page *found_page;
 914        swp_entry_t entry = swp_entry(type, offset);
 915        int err;
 916
 917        BUG_ON(new_page == NULL);
 918        do {
 919                /*
 920                 * First check the swap cache.  Since this is normally
 921                 * called after lookup_swap_cache() failed, re-calling
 922                 * that would confuse statistics.
 923                 */
 924                found_page = find_get_page(&swapper_space, entry.val);
 925                if (found_page)
 926                        return 0;
 927
 928                /*
 929                 * call radix_tree_preload() while we can wait.
 930                 */
 931                err = radix_tree_preload(GFP_KERNEL);
 932                if (err)
 933                        break;
 934
 935                /*
 936                 * Swap entry may have been freed since our caller observed it.
 937                 */
 938                err = swapcache_prepare(entry);
 939                if (err == -EEXIST) { /* seems racy */
 940                        radix_tree_preload_end();
 941                        continue;
 942                }
 943                if (err) { /* swp entry is obsolete ? */
 944                        radix_tree_preload_end();
 945                        break;
 946                }
 947
 948                /* May fail (-ENOMEM) if radix-tree node allocation failed. */
 949                __set_page_locked(new_page);
 950                SetPageSwapBacked(new_page);
 951                err = __add_to_swap_cache(new_page, entry);
 952                if (likely(!err)) {
 953                        radix_tree_preload_end();
 954                        lru_cache_add_anon(new_page);
 955                        return 1;
 956                }
 957                radix_tree_preload_end();
 958                ClearPageSwapBacked(new_page);
 959                __clear_page_locked(new_page);
 960                /*
 961                 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 962                 * clear SWAP_HAS_CACHE flag.
 963                 */
 964                swapcache_free(entry, NULL);
 965                /* FIXME: is it possible to get here without err==-ENOMEM?
 966                 * If not, we can dispense with the do loop, use goto retry */
 967        } while (err != -ENOMEM);
 968
 969        return -ENOMEM;
 970}
 971
 972/*
 973 * Given a frontswap zpage in zcache (identified by type/offset) and
 974 * an empty page, put the page into the swap cache, use frontswap
 975 * to get the page from zcache into the empty page, then give it
 976 * to the swap subsystem to send to disk (carefully avoiding the
 977 * possibility that frontswap might snatch it back).
 978 * Returns < 0 if error, 0 if successful, and 1 if successful but
 979 * the newpage passed in not needed and should be freed.
 980 */
 981static int zcache_frontswap_writeback_zpage(int type, pgoff_t offset,
 982                                        struct page *newpage)
 983{
 984        struct page *page = newpage;
 985        int ret;
 986        struct writeback_control wbc = {
 987                .sync_mode = WB_SYNC_NONE,
 988        };
 989
 990        ret = zcache_get_swap_cache_page(type, offset, page);
 991        if (ret < 0)
 992                return ret;
 993        else if (ret == 0) {
 994                /* more uptodate page is already in swapcache */
 995                __frontswap_invalidate_page(type, offset);
 996                return 1;
 997        }
 998
 999        BUG_ON(!frontswap_has_exclusive_gets); /* load must also invalidate */
1000        /* FIXME: how is it possible to get here when page is unlocked? */
1001        __frontswap_load(page);
1002        SetPageUptodate(page);  /* above does SetPageDirty, is that enough? */
1003
1004        /* start writeback */
1005        SetPageReclaim(page);
1006        /*
1007         * Return value is ignored here because it doesn't change anything
1008         * for us.  Page is returned unlocked.
1009         */
1010        (void)__swap_writepage(page, &wbc, zcache_end_swap_write);
1011        page_cache_release(page);
1012        inc_zcache_outstanding_writeback_pages();
1013
1014        return 0;
1015}
1016
1017/*
1018 * The following is still a magic number... we want to allow forward progress
1019 * for writeback because it clears out needed RAM when under pressure, but
1020 * we don't want to allow writeback to absorb and queue too many GFP_KERNEL
1021 * pages if the swap device is very slow.
1022 */
1023#define ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES 6400
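/*
 * With 4 KiB pages this caps in-flight writeback at 6400 * 4 KiB = 25 MiB
 * of queued data.
 */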
1024
1025/*
1026 * Try to allocate two free pages, first using a non-aggressive alloc,
1027 * then by evicting zcache ephemeral (clean pagecache) pages, and last
1028 * by aggressive GFP_KERNEL alloc.  We allow zbud to choose a pageframe
1029 * consisting of 1-2 zbuds/zpages, then call the writeback_zpage helper
1030 * function above for each.
1031 */
1032static int zcache_frontswap_writeback(void)
1033{
1034        struct tmem_handle th[2];
1035        int ret = 0;
1036        int nzbuds, writeback_ret;
1037        unsigned type;
1038        struct page *znewpage1 = NULL, *znewpage2 = NULL;
1039        struct page *evictpage1 = NULL, *evictpage2 = NULL;
1040        struct page *newpage1 = NULL, *newpage2 = NULL;
1041        struct page *page1 = NULL, *page2 = NULL;
1042        pgoff_t offset;
1043
1044        znewpage1 = alloc_page(ZCACHE_GFP_MASK);
1045        znewpage2 = alloc_page(ZCACHE_GFP_MASK);
1046        if (znewpage1 == NULL)
1047                evictpage1 = zcache_evict_eph_pageframe();
1048        if (znewpage2 == NULL)
1049                evictpage2 = zcache_evict_eph_pageframe();
1050
1051        if ((evictpage1 == NULL || evictpage2 == NULL) &&
1052            atomic_read(&zcache_outstanding_writeback_pages_atomic) >
1053                                ZCACHE_MAX_OUTSTANDING_WRITEBACK_PAGES) {
1054                goto free_and_out;
1055        }
1056        if (znewpage1 == NULL && evictpage1 == NULL)
1057                newpage1 = alloc_page(GFP_KERNEL);
1058        if (znewpage2 == NULL && evictpage2 == NULL)
1059                newpage2 = alloc_page(GFP_KERNEL);
1060        if (newpage1 == NULL || newpage2 == NULL)
 1061                goto free_and_out;
1062
1063        /* ok, we have two pageframes pre-allocated, get a pair of zbuds */
1064        nzbuds = zbud_make_zombie_lru(&th[0], NULL, NULL, false);
1065        if (nzbuds == 0) {
1066                ret = -ENOENT;
1067                goto free_and_out;
1068        }
1069
1070        /* process the first zbud */
1071        unswiz(th[0].oid, th[0].index, &type, &offset);
1072        page1 = (znewpage1 != NULL) ? znewpage1 :
1073                        ((newpage1 != NULL) ? newpage1 : evictpage1);
1074        writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page1);
1075        if (writeback_ret < 0) {
1076                ret = -ENOMEM;
1077                goto free_and_out;
1078        }
1079        if (evictpage1 != NULL)
1080                zcache_pageframes_freed =
1081                        atomic_inc_return(&zcache_pageframes_freed_atomic);
1082        if (writeback_ret == 0) {
1083                /* zcache_get_swap_cache_page will free, don't double free */
1084                znewpage1 = NULL;
1085                newpage1 = NULL;
1086                evictpage1 = NULL;
1087        }
1088        if (nzbuds < 2)
1089                goto free_and_out;
1090
1091        /* if there is a second zbud, process it */
1092        unswiz(th[1].oid, th[1].index, &type, &offset);
1093        page2 = (znewpage2 != NULL) ? znewpage2 :
1094                        ((newpage2 != NULL) ? newpage2 : evictpage2);
1095        writeback_ret = zcache_frontswap_writeback_zpage(type, offset, page2);
1096        if (writeback_ret < 0) {
1097                ret = -ENOMEM;
1098                goto free_and_out;
1099        }
1100        if (evictpage2 != NULL)
1101                zcache_pageframes_freed =
1102                        atomic_inc_return(&zcache_pageframes_freed_atomic);
1103        if (writeback_ret == 0) {
1104                znewpage2 = NULL;
1105                newpage2 = NULL;
1106                evictpage2 = NULL;
1107        }
1108
1109free_and_out:
1110        if (znewpage1 != NULL)
1111                page_cache_release(znewpage1);
1112        if (znewpage2 != NULL)
1113                page_cache_release(znewpage2);
1114        if (newpage1 != NULL)
1115                page_cache_release(newpage1);
1116        if (newpage2 != NULL)
1117                page_cache_release(newpage2);
1118        if (evictpage1 != NULL)
1119                zcache_free_page(evictpage1);
1120        if (evictpage2 != NULL)
1121                zcache_free_page(evictpage2);
1122        return ret;
1123}
1124#endif /* CONFIG_ZCACHE_WRITEBACK */
1125
1126/*
1127 * When zcache is disabled ("frozen"), pools can be created and destroyed,
1128 * but all puts (and thus all other operations that require memory allocation)
 1129 * must fail.  If zcache is unfrozen, accepts some puts, and is then
 1130 * frozen again, data consistency requires that any put arriving while
 1131 * frozen be converted into a flush.
1132 */
1133static bool zcache_freeze;
1134
1135/*
1136 * This zcache shrinker interface reduces the number of ephemeral pageframes
1137 * used by zcache to approximately the same as the total number of LRU_FILE
1138 * pageframes in use, and now also reduces the number of persistent pageframes
1139 * used by zcache to approximately the same as the total number of LRU_ANON
1140 * pageframes in use.  FIXME POLICY: Probably the writeback should only occur
1141 * if the eviction doesn't free enough pages.
1142 */
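/*
 * For example, if zcache currently holds 10000 ephemeral pageframes while
 * the file LRUs (active + inactive) hold 8000 pages, nr_evict below becomes
 * 2000; the persistent/anon calculation that follows works the same way for
 * nr_writeback.
 */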
1143static int shrink_zcache_memory(struct shrinker *shrink,
1144                                struct shrink_control *sc)
1145{
1146        static bool in_progress;
1147        int ret = -1;
1148        int nr = sc->nr_to_scan;
1149        int nr_evict = 0;
1150        int nr_writeback = 0;
1151        struct page *page;
1152        int  file_pageframes_inuse, anon_pageframes_inuse;
1153
1154        if (nr <= 0)
1155                goto skip_evict;
1156
1157        /* don't allow more than one eviction thread at a time */
1158        if (in_progress)
1159                goto skip_evict;
1160
1161        in_progress = true;
1162
1163        /* we are going to ignore nr, and target a different value */
1164        zcache_last_active_file_pageframes =
1165                global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
1166        zcache_last_inactive_file_pageframes =
1167                global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
1168        file_pageframes_inuse = zcache_last_active_file_pageframes +
1169                                zcache_last_inactive_file_pageframes;
1170        if (zcache_eph_pageframes > file_pageframes_inuse)
1171                nr_evict = zcache_eph_pageframes - file_pageframes_inuse;
1172        else
1173                nr_evict = 0;
1174        while (nr_evict-- > 0) {
1175                page = zcache_evict_eph_pageframe();
1176                if (page == NULL)
1177                        break;
1178                zcache_free_page(page);
1179        }
1180
1181        zcache_last_active_anon_pageframes =
1182                global_page_state(NR_LRU_BASE + LRU_ACTIVE_ANON);
1183        zcache_last_inactive_anon_pageframes =
1184                global_page_state(NR_LRU_BASE + LRU_INACTIVE_ANON);
1185        anon_pageframes_inuse = zcache_last_active_anon_pageframes +
1186                                zcache_last_inactive_anon_pageframes;
1187        if (zcache_pers_pageframes > anon_pageframes_inuse)
1188                nr_writeback = zcache_pers_pageframes - anon_pageframes_inuse;
1189        else
1190                nr_writeback = 0;
1191        while (nr_writeback-- > 0) {
1192#ifdef CONFIG_ZCACHE_WRITEBACK
1193                int writeback_ret;
1194                writeback_ret = zcache_frontswap_writeback();
1195                if (writeback_ret == -ENOMEM)
1196#endif
1197                        break;
1198        }
1199        in_progress = false;
1200
1201skip_evict:
1202        /* resample: has changed, but maybe not all the way yet */
1203        zcache_last_active_file_pageframes =
1204                global_page_state(NR_LRU_BASE + LRU_ACTIVE_FILE);
1205        zcache_last_inactive_file_pageframes =
1206                global_page_state(NR_LRU_BASE + LRU_INACTIVE_FILE);
1207        ret = zcache_eph_pageframes - zcache_last_active_file_pageframes +
1208                zcache_last_inactive_file_pageframes;
1209        if (ret < 0)
1210                ret = 0;
1211        return ret;
1212}
1213
1214static struct shrinker zcache_shrinker = {
1215        .shrink = shrink_zcache_memory,
1216        .seeks = DEFAULT_SEEKS,
1217};
1218
1219/*
1220 * zcache shims between cleancache/frontswap ops and tmem
1221 */
1222
1223/* FIXME rename these core routines to zcache_tmemput etc? */
1224int zcache_put_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1225                                uint32_t index, void *page,
1226                                unsigned int size, bool raw, int ephemeral)
1227{
1228        struct tmem_pool *pool;
1229        struct tmem_handle th;
1230        int ret = -1;
1231        void *pampd = NULL;
1232
1233        BUG_ON(!irqs_disabled());
1234        pool = zcache_get_pool_by_id(cli_id, pool_id);
1235        if (unlikely(pool == NULL))
1236                goto out;
1237        if (!zcache_freeze) {
1238                ret = 0;
1239                th.client_id = cli_id;
1240                th.pool_id = pool_id;
1241                th.oid = *oidp;
1242                th.index = index;
1243                pampd = zcache_pampd_create((char *)page, size, raw,
1244                                ephemeral, &th);
1245                if (pampd == NULL) {
1246                        ret = -ENOMEM;
1247                        if (ephemeral)
1248                                inc_zcache_failed_eph_puts();
1249                        else
1250                                inc_zcache_failed_pers_puts();
1251                } else {
1252                        if (ramster_enabled)
1253                                ramster_do_preload_flnode(pool);
1254                        ret = tmem_put(pool, oidp, index, 0, pampd);
1255                        if (ret < 0)
1256                                BUG();
1257                }
1258                zcache_put_pool(pool);
1259        } else {
1260                inc_zcache_put_to_flush();
1261                if (ramster_enabled)
1262                        ramster_do_preload_flnode(pool);
1263                if (atomic_read(&pool->obj_count) > 0)
1264                        /* the put fails whether the flush succeeds or not */
1265                        (void)tmem_flush_page(pool, oidp, index);
1266                zcache_put_pool(pool);
1267        }
1268out:
1269        return ret;
1270}
1271
1272int zcache_get_page(int cli_id, int pool_id, struct tmem_oid *oidp,
1273                                uint32_t index, void *page,
1274                                size_t *sizep, bool raw, int get_and_free)
1275{
1276        struct tmem_pool *pool;
1277        int ret = -1;
1278        bool eph;
1279
1280        if (!raw) {
1281                BUG_ON(irqs_disabled());
1282                BUG_ON(in_softirq());
1283        }
 1284        pool = zcache_get_pool_by_id(cli_id, pool_id);
 1285        if (likely(pool != NULL)) {
 1286                eph = is_ephemeral(pool);
 1287                if (atomic_read(&pool->obj_count) > 0)
 1288                        ret = tmem_get(pool, oidp, index, (char *)(page),
 1289                                        sizep, raw, get_and_free);
 1290                zcache_put_pool(pool);
 1291                WARN_ONCE((!eph && (ret != 0)),
 1292                        "zcache_get fails on persistent pool, "
 1293                        "bad things are very likely to happen soon\n");
 1294        }
1295#ifdef RAMSTER_TESTING
1296        if (ret != 0 && ret != -1 && !(ret == -EINVAL && is_ephemeral(pool)))
1297                pr_err("TESTING zcache_get tmem_get returns ret=%d\n", ret);
1298#endif
1299        return ret;
1300}
1301
1302int zcache_flush_page(int cli_id, int pool_id,
1303                                struct tmem_oid *oidp, uint32_t index)
1304{
1305        struct tmem_pool *pool;
1306        int ret = -1;
1307        unsigned long flags;
1308
1309        local_irq_save(flags);
1310        inc_zcache_flush_total();
1311        pool = zcache_get_pool_by_id(cli_id, pool_id);
1312        if (ramster_enabled)
1313                ramster_do_preload_flnode(pool);
1314        if (likely(pool != NULL)) {
1315                if (atomic_read(&pool->obj_count) > 0)
1316                        ret = tmem_flush_page(pool, oidp, index);
1317                zcache_put_pool(pool);
1318        }
1319        if (ret >= 0)
1320                inc_zcache_flush_found();
1321        local_irq_restore(flags);
1322        return ret;
1323}
1324
1325int zcache_flush_object(int cli_id, int pool_id,
1326                                struct tmem_oid *oidp)
1327{
1328        struct tmem_pool *pool;
1329        int ret = -1;
1330        unsigned long flags;
1331
1332        local_irq_save(flags);
1333        inc_zcache_flobj_total();
1334        pool = zcache_get_pool_by_id(cli_id, pool_id);
1335        if (ramster_enabled)
1336                ramster_do_preload_flnode(pool);
1337        if (likely(pool != NULL)) {
1338                if (atomic_read(&pool->obj_count) > 0)
1339                        ret = tmem_flush_object(pool, oidp);
1340                zcache_put_pool(pool);
1341        }
1342        if (ret >= 0)
1343                inc_zcache_flobj_found();
1344        local_irq_restore(flags);
1345        return ret;
1346}
1347
1348static int zcache_client_destroy_pool(int cli_id, int pool_id)
1349{
1350        struct tmem_pool *pool = NULL;
1351        struct zcache_client *cli = NULL;
1352        int ret = -1;
1353
1354        if (pool_id < 0)
1355                goto out;
1356        if (cli_id == LOCAL_CLIENT)
1357                cli = &zcache_host;
1358        else if ((unsigned int)cli_id < MAX_CLIENTS)
1359                cli = &zcache_clients[cli_id];
1360        if (cli == NULL)
1361                goto out;
1362        atomic_inc(&cli->refcount);
1363        pool = cli->tmem_pools[pool_id];
1364        if (pool == NULL)
1365                goto out;
1366        cli->tmem_pools[pool_id] = NULL;
1367        /* wait for pool activity on other cpus to quiesce */
1368        while (atomic_read(&pool->refcount) != 0)
1369                ;
1370        atomic_dec(&cli->refcount);
1371        local_bh_disable();
1372        ret = tmem_destroy_pool(pool);
1373        local_bh_enable();
1374        kfree(pool);
1375        if (cli_id == LOCAL_CLIENT)
1376                pr_info("%s: destroyed local pool id=%d\n", namestr, pool_id);
1377        else
1378                pr_info("%s: destroyed pool id=%d, client=%d\n",
1379                                namestr, pool_id, cli_id);
1380out:
1381        return ret;
1382}
1383
1384int zcache_new_pool(uint16_t cli_id, uint32_t flags)
1385{
1386        int poolid = -1;
1387        struct tmem_pool *pool;
1388        struct zcache_client *cli = NULL;
1389
1390        if (cli_id == LOCAL_CLIENT)
1391                cli = &zcache_host;
1392        else if ((unsigned int)cli_id < MAX_CLIENTS)
1393                cli = &zcache_clients[cli_id];
1394        if (cli == NULL)
1395                goto out;
1396        atomic_inc(&cli->refcount);
1397        pool = kmalloc(sizeof(struct tmem_pool), GFP_ATOMIC);
1398        if (pool == NULL)
1399                goto out;
1400
1401        for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
1402                if (cli->tmem_pools[poolid] == NULL)
1403                        break;
1404        if (poolid >= MAX_POOLS_PER_CLIENT) {
1405                pr_info("%s: pool creation failed: max exceeded\n", namestr);
1406                kfree(pool);
1407                poolid = -1;
1408                goto out;
1409        }
1410        atomic_set(&pool->refcount, 0);
1411        pool->client = cli;
1412        pool->pool_id = poolid;
1413        tmem_new_pool(pool, flags);
1414        cli->tmem_pools[poolid] = pool;
1415        if (cli_id == LOCAL_CLIENT)
1416                pr_info("%s: created %s local tmem pool, id=%d\n", namestr,
1417                        flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1418                        poolid);
1419        else
1420                pr_info("%s: created %s tmem pool, id=%d, client=%d\n", namestr,
1421                        flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1422                        poolid, cli_id);
1423out:
1424        if (cli != NULL)
1425                atomic_dec(&cli->refcount);
1426        return poolid;
1427}
1428
1429static int zcache_local_new_pool(uint32_t flags)
1430{
1431        return zcache_new_pool(LOCAL_CLIENT, flags);
1432}
1433
1434int zcache_autocreate_pool(unsigned int cli_id, unsigned int pool_id, bool eph)
1435{
1436        struct tmem_pool *pool;
1437        struct zcache_client *cli = NULL;
1438        uint32_t flags = eph ? 0 : TMEM_POOL_PERSIST;
1439        int ret = -1;
1440
1441        BUG_ON(!ramster_enabled);
1442        if (cli_id == LOCAL_CLIENT)
1443                goto out;
1444        if (pool_id >= MAX_POOLS_PER_CLIENT)
1445                goto out;
1446        if (cli_id >= MAX_CLIENTS)
1447                goto out;
1448
1449        cli = &zcache_clients[cli_id];
1450        if ((eph && disable_cleancache) || (!eph && disable_frontswap)) {
1451                pr_err("zcache_autocreate_pool: pool type disabled\n");
1452                goto out;
1453        }
1454        if (!cli->allocated) {
1455                if (zcache_new_client(cli_id)) {
1456                        pr_err("zcache_autocreate_pool: can't create client\n");
1457                        goto out;
1458                }
1459                cli = &zcache_clients[cli_id];
1460        }
1461        atomic_inc(&cli->refcount);
1462        pool = cli->tmem_pools[pool_id];
1463        if (pool != NULL) {
1464                if (pool->persistent && eph) {
1465                        pr_err("zcache_autocreate_pool: type mismatch\n");
1466                        goto out;
1467                }
1468                ret = 0;
1469                goto out;
1470        }
1471        pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
1472        if (pool == NULL)
1473                goto out;
1474
1475        atomic_set(&pool->refcount, 0);
1476        pool->client = cli;
1477        pool->pool_id = pool_id;
1478        tmem_new_pool(pool, flags);
1479        cli->tmem_pools[pool_id] = pool;
1480        pr_info("%s: AUTOcreated %s tmem poolid=%d, for remote client=%d\n",
1481                namestr, flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1482                pool_id, cli_id);
1483        ret = 0;
1484out:
1485        if (cli != NULL)
1486                atomic_dec(&cli->refcount);
1487        return ret;
1488}
1489
1490/**********
1491 * Two kernel functionalities currently can be layered on top of tmem.
1492 * These are "cleancache" which is used as a second-chance cache for clean
1493 * page cache pages; and "frontswap" which is used for swap pages
1494 * to avoid writes to disk.  A generic "shim" is provided here for each
1495 * to translate in-kernel semantics to zcache semantics.
1496 */
1497
1498static void zcache_cleancache_put_page(int pool_id,
1499                                        struct cleancache_filekey key,
1500                                        pgoff_t index, struct page *page)
1501{
1502        u32 ind = (u32) index;
1503        struct tmem_oid oid = *(struct tmem_oid *)&key;
1504
1505        if (!disable_cleancache_ignore_nonactive && !PageWasActive(page)) {
1506                inc_zcache_eph_nonactive_puts_ignored();
1507                return;
1508        }
1509        if (likely(ind == index))
1510                (void)zcache_put_page(LOCAL_CLIENT, pool_id, &oid, index,
1511                                        page, PAGE_SIZE, false, 1);
1512}
1513
1514static int zcache_cleancache_get_page(int pool_id,
1515                                        struct cleancache_filekey key,
1516                                        pgoff_t index, struct page *page)
1517{
1518        u32 ind = (u32) index;
1519        struct tmem_oid oid = *(struct tmem_oid *)&key;
1520        size_t size;
1521        int ret = -1;
1522
1523        if (likely(ind == index)) {
1524                ret = zcache_get_page(LOCAL_CLIENT, pool_id, &oid, index,
1525                                        page, &size, false, 0);
1526                BUG_ON(ret >= 0 && size != PAGE_SIZE);
1527                if (ret == 0)
1528                        SetPageWasActive(page);
1529        }
1530        return ret;
1531}
1532
1533static void zcache_cleancache_flush_page(int pool_id,
1534                                        struct cleancache_filekey key,
1535                                        pgoff_t index)
1536{
1537        u32 ind = (u32) index;
1538        struct tmem_oid oid = *(struct tmem_oid *)&key;
1539
1540        if (likely(ind == index))
1541                (void)zcache_flush_page(LOCAL_CLIENT, pool_id, &oid, ind);
1542}
1543
1544static void zcache_cleancache_flush_inode(int pool_id,
1545                                        struct cleancache_filekey key)
1546{
1547        struct tmem_oid oid = *(struct tmem_oid *)&key;
1548
1549        (void)zcache_flush_object(LOCAL_CLIENT, pool_id, &oid);
1550}
1551
1552static void zcache_cleancache_flush_fs(int pool_id)
1553{
1554        if (pool_id >= 0)
1555                (void)zcache_client_destroy_pool(LOCAL_CLIENT, pool_id);
1556}
1557
1558static int zcache_cleancache_init_fs(size_t pagesize)
1559{
1560        BUG_ON(sizeof(struct cleancache_filekey) !=
1561                                sizeof(struct tmem_oid));
1562        BUG_ON(pagesize != PAGE_SIZE);
1563        return zcache_local_new_pool(0);
1564}
1565
1566static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1567{
1568        /* shared pools are unsupported and map to private */
1569        BUG_ON(sizeof(struct cleancache_filekey) !=
1570                                sizeof(struct tmem_oid));
1571        BUG_ON(pagesize != PAGE_SIZE);
1572        return zcache_local_new_pool(0);
1573}
1574
1575static struct cleancache_ops zcache_cleancache_ops = {
1576        .put_page = zcache_cleancache_put_page,
1577        .get_page = zcache_cleancache_get_page,
1578        .invalidate_page = zcache_cleancache_flush_page,
1579        .invalidate_inode = zcache_cleancache_flush_inode,
1580        .invalidate_fs = zcache_cleancache_flush_fs,
1581        .init_shared_fs = zcache_cleancache_init_shared_fs,
1582        .init_fs = zcache_cleancache_init_fs
1583};
1584
1585struct cleancache_ops *zcache_cleancache_register_ops(void)
1586{
1587        struct cleancache_ops *old_ops =
1588                cleancache_register_ops(&zcache_cleancache_ops);
1589
1590        return old_ops;
1591}
1592
1593/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1594static int zcache_frontswap_poolid __read_mostly = -1;
1595
1596/*
1597 * Swizzling spreads the pages of each swaptype across more tmem objects,
1598 * increasing tmem concurrency under heavy swap loads.  Larger nr_cpus may
1599 * later justify a larger SWIZ_BITS; 27 would essentially reconstruct the
1600 * swap entry in frontswap_get_page() but has side-effects, so 8 is used.
1601 */
1602#define SWIZ_BITS               8
1603#define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
1604#define _oswiz(_type, _ind)     (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
1605#define iswiz(_ind)             ((_ind) >> SWIZ_BITS)
1606
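    /*
     * Worked example (illustrative values only): with SWIZ_BITS == 8, a page
     * at swaptype 2, offset 0x12345 is stored under oid.oid[0] ==
     * _oswiz(2, 0x12345) == (2 << 8) | 0x45 == 0x245 with tmem index
     * iswiz(0x12345) == 0x123; unswiz() recovers type 0x245 >> 8 == 2 and
     * offset (0x123 << 8) | 0x45 == 0x12345.  Each swaptype is thus spread
     * over up to 2^SWIZ_BITS == 256 tmem objects, which is why
     * zcache_frontswap_flush_area() must walk all SWIZ_MASK+1 oids per type.
     */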
1607static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1608{
1609        struct tmem_oid oid = { .oid = { 0 } };
1610        oid.oid[0] = _oswiz(type, ind);
1611        return oid;
1612}
1613
1614#ifdef CONFIG_ZCACHE_WRITEBACK
1615static void unswiz(struct tmem_oid oid, u32 index,
1616                                unsigned *type, pgoff_t *offset)
1617{
1618        *type = (unsigned)(oid.oid[0] >> SWIZ_BITS);
1619        *offset = (pgoff_t)((index << SWIZ_BITS) |
1620                        (oid.oid[0] & SWIZ_MASK));
1621}
1622#endif
1623
1624static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1625                                        struct page *page)
1626{
1627        u64 ind64 = (u64)offset;
1628        u32 ind = (u32)offset;
1629        struct tmem_oid oid = oswiz(type, ind);
1630        int ret = -1;
1631        unsigned long flags;
1632
1633        BUG_ON(!PageLocked(page));
1634        if (!disable_frontswap_ignore_nonactive && !PageWasActive(page)) {
1635                inc_zcache_pers_nonactive_puts_ignored();
1636                ret = -ERANGE;
1637                goto out;
1638        }
1639        if (likely(ind64 == ind)) {
1640                local_irq_save(flags);
1641                ret = zcache_put_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1642                                        &oid, iswiz(ind),
1643                                        page, PAGE_SIZE, false, 0);
1644                local_irq_restore(flags);
1645        }
1646out:
1647        return ret;
1648}
1649
1650/* Returns 0 if the page was successfully gotten from frontswap, or -1 if
1651 * it was not present (which should never happen). */
1652static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1653                                        struct page *page)
1654{
1655        u64 ind64 = (u64)offset;
1656        u32 ind = (u32)offset;
1657        struct tmem_oid oid = oswiz(type, ind);
1658        size_t size;
1659        int ret = -1, get_and_free;
1660
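            /*
             * get_and_free == 1 asks tmem to drop its copy after a successful
             * get (exclusive load); -1 leaves tmem's default behaviour for
             * the pool type.
             */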
1661        if (frontswap_has_exclusive_gets)
1662                get_and_free = 1;
1663        else
1664                get_and_free = -1;
1665        BUG_ON(!PageLocked(page));
1666        if (likely(ind64 == ind)) {
1667                ret = zcache_get_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1668                                        &oid, iswiz(ind),
1669                                        page, &size, false, get_and_free);
1670                BUG_ON(ret >= 0 && size != PAGE_SIZE);
1671        }
1672        return ret;
1673}
1674
1675/* flush a single page from frontswap */
1676static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1677{
1678        u64 ind64 = (u64)offset;
1679        u32 ind = (u32)offset;
1680        struct tmem_oid oid = oswiz(type, ind);
1681
1682        if (likely(ind64 == ind))
1683                (void)zcache_flush_page(LOCAL_CLIENT, zcache_frontswap_poolid,
1684                                        &oid, iswiz(ind));
1685}
1686
1687/* flush all pages from the passed swaptype */
1688static void zcache_frontswap_flush_area(unsigned type)
1689{
1690        struct tmem_oid oid;
1691        int ind;
1692
1693        for (ind = SWIZ_MASK; ind >= 0; ind--) {
1694                oid = oswiz(type, ind);
1695                (void)zcache_flush_object(LOCAL_CLIENT,
1696                                                zcache_frontswap_poolid, &oid);
1697        }
1698}
1699
1700static void zcache_frontswap_init(unsigned ignored)
1701{
1702        /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1703        if (zcache_frontswap_poolid < 0)
1704                zcache_frontswap_poolid =
1705                        zcache_local_new_pool(TMEM_POOL_PERSIST);
1706}
1707
1708static struct frontswap_ops zcache_frontswap_ops = {
1709        .store = zcache_frontswap_put_page,
1710        .load = zcache_frontswap_get_page,
1711        .invalidate_page = zcache_frontswap_flush_page,
1712        .invalidate_area = zcache_frontswap_flush_area,
1713        .init = zcache_frontswap_init
1714};
1715
1716struct frontswap_ops *zcache_frontswap_register_ops(void)
1717{
1718        struct frontswap_ops *old_ops =
1719                frontswap_register_ops(&zcache_frontswap_ops);
1720
1721        return old_ops;
1722}
1723
1724/*
1725 * zcache initialization
1726 * NOTE: for now, unless zcache is built as a module, "zcache" or "ramster"
1727 * MUST be provided as a kernel boot parameter or nothing happens!
1728 */
1729
1730#ifndef CONFIG_ZCACHE_MODULE
1731static int __init enable_zcache(char *s)
1732{
1733        zcache_enabled = true;
1734        return 1;
1735}
1736__setup("zcache", enable_zcache);
1737
1738static int __init enable_ramster(char *s)
1739{
1740        zcache_enabled = true;
1741#ifdef CONFIG_RAMSTER
1742        ramster_enabled = true;
1743#endif
1744        return 1;
1745}
1746__setup("ramster", enable_ramster);
1747
1748/* allow independent boot-time disabling of cleancache and frontswap */
1749
1750static int __init no_cleancache(char *s)
1751{
1752        disable_cleancache = true;
1753        return 1;
1754}
1755
1756__setup("nocleancache", no_cleancache);
1757
1758static int __init no_frontswap(char *s)
1759{
1760        disable_frontswap = true;
1761        return 1;
1762}
1763
1764__setup("nofrontswap", no_frontswap);
1765
1766static int __init no_frontswap_exclusive_gets(char *s)
1767{
1768        frontswap_has_exclusive_gets = false;
1769        return 1;
1770}
1771
1772__setup("nofrontswapexclusivegets", no_frontswap_exclusive_gets);
1773
1774static int __init no_frontswap_ignore_nonactive(char *s)
1775{
1776        disable_frontswap_ignore_nonactive = true;
1777        return 1;
1778}
1779
1780__setup("nofrontswapignorenonactive", no_frontswap_ignore_nonactive);
1781
1782static int __init no_cleancache_ignore_nonactive(char *s)
1783{
1784        disable_cleancache_ignore_nonactive = true;
1785        return 1;
1786}
1787
1788__setup("nocleancacheignorenonactive", no_cleancache_ignore_nonactive);
1789
1790static int __init enable_zcache_compressor(char *s)
1791{
1792        strlcpy(zcache_comp_name, s, sizeof(zcache_comp_name));
1793        zcache_enabled = true;
1794        return 1;
1795}
1796__setup("zcache=", enable_zcache_compressor);
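    /*
     * Illustrative boot-parameter usage for the handlers above (the compressor
     * name is only an example; any name known to crypto_has_comp() may be used):
     *
     *   zcache                   enable zcache with the default compressor
     *   zcache=deflate           enable zcache using the "deflate" compressor
     *   ramster                  enable zcache in ramster mode
     *   zcache nofrontswap       enable zcache, but leave frontswap disabled
     */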
1797#endif
1798
1799
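    /*
     * Check that the selected compression algorithm is available from the
     * crypto API (falling back to "lzo" in the built-in case when none was
     * given on the command line) and allocate the per-cpu array of
     * crypto_comp transform pointers.  The transforms themselves are expected
     * to be instantiated per CPU later (via the CPU notifier called from
     * zcache_init()).
     */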
1800static int zcache_comp_init(void)
1801{
1802        int ret = 0;
1803
1804        /* check crypto algorithm */
1805#ifdef CONFIG_ZCACHE_MODULE
1806        ret = crypto_has_comp(zcache_comp_name, 0, 0);
1807        if (!ret) {
1808                ret = -1;
1809                goto out;
1810        }
1811#else
1812        if (*zcache_comp_name != '\0') {
1813                ret = crypto_has_comp(zcache_comp_name, 0, 0);
1814                if (!ret)
1815                        pr_info("zcache: %s not supported\n",
1816                                        zcache_comp_name);
1818        }
1819        if (!ret)
1820                strcpy(zcache_comp_name, "lzo");
1821        ret = crypto_has_comp(zcache_comp_name, 0, 0);
1822        if (!ret) {
1823                ret = 1;
1824                goto out;
1825        }
1826#endif
1827        pr_info("zcache: using %s compressor\n", zcache_comp_name);
1828
1829        /* alloc percpu transforms */
1830        ret = 0;
1831        zcache_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *);
1832        if (!zcache_comp_pcpu_tfms)
1833                ret = 1;
1834out:
1835        return ret;
1836}
1837
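    /*
     * Bring-up sequence: register host and pam ops with tmem, set up the CPU
     * notifier, the compressor and the debugfs counters, create the kmem
     * caches and the LOCAL_CLIENT, initialize zbud, then hook into
     * cleancache and/or frontswap (and ramster when enabled).
     */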
1838static int zcache_init(void)
1839{
1840        int ret = 0;
1841
1842#ifdef CONFIG_ZCACHE_MODULE
1843        zcache_enabled = 1;
1844#endif
1845        if (ramster_enabled) {
1846                namestr = "ramster";
1847                ramster_register_pamops(&zcache_pamops);
1848        }
1849        zcache_debugfs_init();
1850        if (zcache_enabled) {
1851                unsigned int cpu;
1852
1853                tmem_register_hostops(&zcache_hostops);
1854                tmem_register_pamops(&zcache_pamops);
1855                ret = register_cpu_notifier(&zcache_cpu_notifier_block);
1856                if (ret) {
1857                        pr_err("%s: can't register cpu notifier\n", namestr);
1858                        goto out;
1859                }
1860                ret = zcache_comp_init();
1861                if (ret) {
1862                        pr_err("%s: compressor initialization failed\n",
1863                                namestr);
1864                        goto out;
1865                }
1866                for_each_online_cpu(cpu) {
1867                        void *pcpu = (void *)(long)cpu;
1868                        zcache_cpu_notifier(&zcache_cpu_notifier_block,
1869                                CPU_UP_PREPARE, pcpu);
1870                }
1871        }
1872        zcache_objnode_cache = kmem_cache_create("zcache_objnode",
1873                                sizeof(struct tmem_objnode), 0, 0, NULL);
1874        zcache_obj_cache = kmem_cache_create("zcache_obj",
1875                                sizeof(struct tmem_obj), 0, 0, NULL);
1876        ret = zcache_new_client(LOCAL_CLIENT);
1877        if (ret) {
1878                pr_err("%s: can't create client\n", namestr);
1879                goto out;
1880        }
1881        zbud_init();
1882        if (zcache_enabled && !disable_cleancache) {
1883                struct cleancache_ops *old_ops;
1884
1885                register_shrinker(&zcache_shrinker);
1886                old_ops = zcache_cleancache_register_ops();
1887                pr_info("%s: cleancache enabled using kernel transcendent "
1888                        "memory and compression buddies\n", namestr);
1889#ifdef CONFIG_ZCACHE_DEBUG
1890                pr_info("%s: cleancache: ignorenonactive = %d\n",
1891                        namestr, !disable_cleancache_ignore_nonactive);
1892#endif
1893                if (old_ops != NULL)
1894                        pr_warn("%s: cleancache_ops overridden\n", namestr);
1895        }
1896        if (zcache_enabled && !disable_frontswap) {
1897                struct frontswap_ops *old_ops;
1898
1899                old_ops = zcache_frontswap_register_ops();
1900                if (frontswap_has_exclusive_gets)
1901                        frontswap_tmem_exclusive_gets(true);
1902                pr_info("%s: frontswap enabled using kernel transcendent "
1903                        "memory and compression buddies\n", namestr);
1904#ifdef CONFIG_ZCACHE_DEBUG
1905                pr_info("%s: frontswap: excl gets = %d active only = %d\n",
1906                        namestr, frontswap_has_exclusive_gets,
1907                        !disable_frontswap_ignore_nonactive);
1908#endif
1909                if (IS_ERR(old_ops) || old_ops) {
1910                        if (IS_ERR(old_ops))
1911                                return PTR_RET(old_ops);
1912                        pr_warn("%s: frontswap_ops overridden\n", namestr);
1913                }
1914        }
1915        if (ramster_enabled)
1916                ramster_init(!disable_cleancache, !disable_frontswap,
1917                                frontswap_has_exclusive_gets,
1918                                !disable_frontswap_selfshrink);
1919out:
1920        return ret;
1921}
1922
1923#ifdef CONFIG_ZCACHE_MODULE
1924#ifdef CONFIG_RAMSTER
1925module_param(ramster_enabled, bool, S_IRUGO);
1926module_param(disable_frontswap_selfshrink, int, S_IRUGO);
1927#endif
1928module_param(disable_cleancache, bool, S_IRUGO);
1929module_param(disable_frontswap, bool, S_IRUGO);
1930#ifdef FRONTSWAP_HAS_EXCLUSIVE_GETS
1931module_param(frontswap_has_exclusive_gets, bool, S_IRUGO);
1932#endif
1933module_param(disable_frontswap_ignore_nonactive, bool, S_IRUGO);
1934module_param(zcache_comp_name, charp, S_IRUGO);
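    /*
     * Hypothetical module load line (module and parameter names as declared
     * above; the compressor is just an example):
     *
     *   modprobe zcache zcache_comp_name=deflate disable_cleancache=1
     */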
1935module_init(zcache_init);
1936MODULE_LICENSE("GPL");
1937MODULE_AUTHOR("Dan Magenheimer <dan.magenheimer@oracle.com>");
1938MODULE_DESCRIPTION("In-kernel compression of cleancache/frontswap pages");
1939#else
1940late_initcall(zcache_init);
1941#endif
1942