linux/drivers/staging/zcache/zcache.c
   1/*
   2 * zcache.c
   3 *
   4 * Copyright (c) 2010,2011, Dan Magenheimer, Oracle Corp.
   5 * Copyright (c) 2010,2011, Nitin Gupta
   6 *
   7 * Zcache provides an in-kernel "host implementation" for transcendent memory
   8 * and, thus indirectly, for cleancache and frontswap.  Zcache includes two
   9 * page-accessible memory [1] interfaces, both utilizing lzo1x compression:
  10 * 1) "compression buddies" ("zbud") is used for ephemeral pages
  11 * 2) xvmalloc is used for persistent pages.
  12 * Xvmalloc (based on the TLSF allocator) has very low fragmentation
  13 * so maximizes space efficiency, while zbud allows pairs (and potentially,
  14 * in the future, more than a pair of) compressed pages to be closely linked
  15 * so that reclaiming can be done via the kernel's physical-page-oriented
  16 * "shrinker" interface.
  17 *
  18 * [1] For a definition of page-accessible memory (aka PAM), see:
  19 *   http://marc.info/?l=linux-mm&m=127811271605009
  20 */
  21
  22#include <linux/cpu.h>
  23#include <linux/highmem.h>
  24#include <linux/list.h>
  25#include <linux/lzo.h>
  26#include <linux/slab.h>
  27#include <linux/spinlock.h>
  28#include <linux/types.h>
  29#include <linux/atomic.h>
  30#include "tmem.h"
  31
  32#include "../zram/xvmalloc.h" /* if built in drivers/staging */
  33
  34#if (!defined(CONFIG_CLEANCACHE) && !defined(CONFIG_FRONTSWAP))
  35#error "zcache is useless without CONFIG_CLEANCACHE or CONFIG_FRONTSWAP"
  36#endif
  37#ifdef CONFIG_CLEANCACHE
  38#include <linux/cleancache.h>
  39#endif
  40#ifdef CONFIG_FRONTSWAP
  41#include <linux/frontswap.h>
  42#endif
  43
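/*
 * Allocation flags used for zcache's internal allocations: the mask
 * chosen below (the #else branch) never retries hard, never warns on
 * failure, and never dips into the emergency reserves; the disabled
 * GFP_ATOMIC variant would permit high-priority allocations that can
 * consume those reserves.
 */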
  44#if 0
   45/* this is more aggressive but can deplete emergency reserves */
  46#define ZCACHE_GFP_MASK (GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN)
  47#else
  48#define ZCACHE_GFP_MASK \
  49        (__GFP_FS | __GFP_NORETRY | __GFP_NOWARN | __GFP_NOMEMALLOC)
  50#endif
  51
  52/**********
  53 * Compression buddies ("zbud") provides for packing two (or, possibly
  54 * in the future, more) compressed ephemeral pages into a single "raw"
  55 * (physical) page and tracking them with data structures so that
  56 * the raw pages can be easily reclaimed.
  57 *
  58 * A zbud page ("zbpg") is an aligned page containing a list_head,
  59 * a lock, and two "zbud headers".  The remainder of the physical
  60 * page is divided up into aligned 64-byte "chunks" which contain
  61 * the compressed data for zero, one, or two zbuds.  Each zbpg
  62 * resides on: (1) an "unused list" if it has no zbuds; (2) a
  63 * "buddied" list if it is fully populated  with two zbuds; or
  64 * (3) one of PAGE_SIZE/64 "unbuddied" lists indexed by how many chunks
  65 * the one unbuddied zbud uses.  The data inside a zbpg cannot be
  66 * read or written unless the zbpg's lock is held.
  67 */
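/*
 * Rough layout of a zbud page (see zbud_data() below): buddy[0]'s data
 * starts at the first chunk boundary after the struct zbud_page header
 * and buddy[1]'s data is packed against the end of the page, leaving
 * any unused chunks in the middle:
 *
 *   +------------------+----------------+.. unused ..+----------------+
 *   | struct zbud_page | buddy[0] data  |            | buddy[1] data  |
 *   +------------------+----------------+.. unused ..+----------------+
 */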
  68
  69#define ZBH_SENTINEL  0x43214321
  70#define ZBPG_SENTINEL  0xdeadbeef
  71
  72#define ZBUD_MAX_BUDS 2
  73
  74struct zbud_hdr {
  75        uint32_t pool_id;
  76        struct tmem_oid oid;
  77        uint32_t index;
  78        uint16_t size; /* compressed size in bytes, zero means unused */
  79        DECL_SENTINEL
  80};
  81
  82struct zbud_page {
  83        struct list_head bud_list;
  84        spinlock_t lock;
  85        struct zbud_hdr buddy[ZBUD_MAX_BUDS];
  86        DECL_SENTINEL
  87        /* followed by NUM_CHUNK aligned CHUNK_SIZE-byte chunks */
  88};
  89
  90#define CHUNK_SHIFT     6
  91#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
  92#define CHUNK_MASK      (~(CHUNK_SIZE-1))
  93#define NCHUNKS         (((PAGE_SIZE - sizeof(struct zbud_page)) & \
  94                                CHUNK_MASK) >> CHUNK_SHIFT)
  95#define MAX_CHUNK       (NCHUNKS-1)
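/*
 * NCHUNKS is the number of whole CHUNK_SIZE-byte chunks that fit in a
 * page after the struct zbud_page header; e.g. a 100-byte compressed
 * page occupies (100 + 63) >> 6 == 2 chunks.
 */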
  96
  97static struct {
  98        struct list_head list;
  99        unsigned count;
 100} zbud_unbuddied[NCHUNKS];
 101/* list N contains pages with N chunks USED and NCHUNKS-N unused */
 102/* element 0 is never used but optimizing that isn't worth it */
 103static unsigned long zbud_cumul_chunk_counts[NCHUNKS];
 104
  105static struct list_head zbud_buddied_list;
 106static unsigned long zcache_zbud_buddied_count;
 107
 108/* protects the buddied list and all unbuddied lists */
 109static DEFINE_SPINLOCK(zbud_budlists_spinlock);
 110
 111static LIST_HEAD(zbpg_unused_list);
 112static unsigned long zcache_zbpg_unused_list_count;
 113
 114/* protects the unused page list */
 115static DEFINE_SPINLOCK(zbpg_unused_list_spinlock);
 116
 117static atomic_t zcache_zbud_curr_raw_pages;
 118static atomic_t zcache_zbud_curr_zpages;
 119static unsigned long zcache_zbud_curr_zbytes;
 120static unsigned long zcache_zbud_cumul_zpages;
 121static unsigned long zcache_zbud_cumul_zbytes;
 122static unsigned long zcache_compress_poor;
 123
 124/* forward references */
 125static void *zcache_get_free_page(void);
 126static void zcache_free_page(void *p);
 127
 128/*
 129 * zbud helper functions
 130 */
 131
 132static inline unsigned zbud_max_buddy_size(void)
 133{
 134        return MAX_CHUNK << CHUNK_SHIFT;
 135}
 136
 137static inline unsigned zbud_size_to_chunks(unsigned size)
 138{
 139        BUG_ON(size == 0 || size > zbud_max_buddy_size());
 140        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 141}
 142
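/*
 * Recover which buddy slot (0 or 1) a zbud_hdr occupies purely from its
 * offset within the containing zbud page.
 */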
 143static inline int zbud_budnum(struct zbud_hdr *zh)
 144{
 145        unsigned offset = (unsigned long)zh & (PAGE_SIZE - 1);
 146        struct zbud_page *zbpg = NULL;
 147        unsigned budnum = -1U;
 148        int i;
 149
 150        for (i = 0; i < ZBUD_MAX_BUDS; i++)
 151                if (offset == offsetof(typeof(*zbpg), buddy[i])) {
 152                        budnum = i;
 153                        break;
 154                }
 155        BUG_ON(budnum == -1U);
 156        return budnum;
 157}
 158
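/*
 * Return a pointer to the data area for the given buddy: buddy[0]'s data
 * begins at the first chunk boundary past the zbud_page header, while
 * buddy[1]'s data is placed so that it ends exactly at the end of the
 * page.  The zbpg lock must be held by the caller.
 */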
 159static char *zbud_data(struct zbud_hdr *zh, unsigned size)
 160{
 161        struct zbud_page *zbpg;
 162        char *p;
 163        unsigned budnum;
 164
 165        ASSERT_SENTINEL(zh, ZBH);
 166        budnum = zbud_budnum(zh);
 167        BUG_ON(size == 0 || size > zbud_max_buddy_size());
 168        zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
 169        ASSERT_SPINLOCK(&zbpg->lock);
 170        p = (char *)zbpg;
 171        if (budnum == 0)
 172                p += ((sizeof(struct zbud_page) + CHUNK_SIZE - 1) &
 173                                                        CHUNK_MASK);
 174        else if (budnum == 1)
 175                p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
 176        return p;
 177}
 178
 179/*
 180 * zbud raw page management
 181 */
 182
 183static struct zbud_page *zbud_alloc_raw_page(void)
 184{
 185        struct zbud_page *zbpg = NULL;
 186        struct zbud_hdr *zh0, *zh1;
  187        bool recycled = false;
 188
 189        /* if any pages on the zbpg list, use one */
 190        spin_lock(&zbpg_unused_list_spinlock);
 191        if (!list_empty(&zbpg_unused_list)) {
 192                zbpg = list_first_entry(&zbpg_unused_list,
 193                                struct zbud_page, bud_list);
 194                list_del_init(&zbpg->bud_list);
 195                zcache_zbpg_unused_list_count--;
  196                recycled = true;
 197        }
 198        spin_unlock(&zbpg_unused_list_spinlock);
 199        if (zbpg == NULL)
 200                /* none on zbpg list, try to get a kernel page */
 201                zbpg = zcache_get_free_page();
 202        if (likely(zbpg != NULL)) {
 203                INIT_LIST_HEAD(&zbpg->bud_list);
 204                zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
 205                spin_lock_init(&zbpg->lock);
 206                if (recycled) {
 207                        ASSERT_INVERTED_SENTINEL(zbpg, ZBPG);
 208                        SET_SENTINEL(zbpg, ZBPG);
 209                        BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
 210                        BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
 211                } else {
 212                        atomic_inc(&zcache_zbud_curr_raw_pages);
 213                        INIT_LIST_HEAD(&zbpg->bud_list);
 214                        SET_SENTINEL(zbpg, ZBPG);
 215                        zh0->size = 0; zh1->size = 0;
 216                        tmem_oid_set_invalid(&zh0->oid);
 217                        tmem_oid_set_invalid(&zh1->oid);
 218                }
 219        }
 220        return zbpg;
 221}
 222
 223static void zbud_free_raw_page(struct zbud_page *zbpg)
 224{
 225        struct zbud_hdr *zh0 = &zbpg->buddy[0], *zh1 = &zbpg->buddy[1];
 226
 227        ASSERT_SENTINEL(zbpg, ZBPG);
 228        BUG_ON(!list_empty(&zbpg->bud_list));
 229        ASSERT_SPINLOCK(&zbpg->lock);
 230        BUG_ON(zh0->size != 0 || tmem_oid_valid(&zh0->oid));
 231        BUG_ON(zh1->size != 0 || tmem_oid_valid(&zh1->oid));
 232        INVERT_SENTINEL(zbpg, ZBPG);
 233        spin_unlock(&zbpg->lock);
 234        spin_lock(&zbpg_unused_list_spinlock);
 235        list_add(&zbpg->bud_list, &zbpg_unused_list);
 236        zcache_zbpg_unused_list_count++;
 237        spin_unlock(&zbpg_unused_list_spinlock);
 238}
 239
 240/*
 241 * core zbud handling routines
 242 */
 243
 244static unsigned zbud_free(struct zbud_hdr *zh)
 245{
 246        unsigned size;
 247
 248        ASSERT_SENTINEL(zh, ZBH);
 249        BUG_ON(!tmem_oid_valid(&zh->oid));
 250        size = zh->size;
 251        BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
 252        zh->size = 0;
 253        tmem_oid_set_invalid(&zh->oid);
 254        INVERT_SENTINEL(zh, ZBH);
 255        zcache_zbud_curr_zbytes -= size;
 256        atomic_dec(&zcache_zbud_curr_zpages);
 257        return size;
 258}
 259
 260static void zbud_free_and_delist(struct zbud_hdr *zh)
 261{
 262        unsigned chunks;
 263        struct zbud_hdr *zh_other;
 264        unsigned budnum = zbud_budnum(zh), size;
 265        struct zbud_page *zbpg =
 266                container_of(zh, struct zbud_page, buddy[budnum]);
 267
 268        spin_lock(&zbpg->lock);
 269        if (list_empty(&zbpg->bud_list)) {
 270                /* ignore zombie page... see zbud_evict_pages() */
 271                spin_unlock(&zbpg->lock);
 272                return;
 273        }
 274        size = zbud_free(zh);
 275        ASSERT_SPINLOCK(&zbpg->lock);
 276        zh_other = &zbpg->buddy[(budnum == 0) ? 1 : 0];
 277        if (zh_other->size == 0) { /* was unbuddied: unlist and free */
  278                chunks = zbud_size_to_chunks(size);
 279                spin_lock(&zbud_budlists_spinlock);
 280                BUG_ON(list_empty(&zbud_unbuddied[chunks].list));
 281                list_del_init(&zbpg->bud_list);
 282                zbud_unbuddied[chunks].count--;
 283                spin_unlock(&zbud_budlists_spinlock);
 284                zbud_free_raw_page(zbpg);
 285        } else { /* was buddied: move remaining buddy to unbuddied list */
  286                chunks = zbud_size_to_chunks(zh_other->size);
 287                spin_lock(&zbud_budlists_spinlock);
 288                list_del_init(&zbpg->bud_list);
 289                zcache_zbud_buddied_count--;
 290                list_add_tail(&zbpg->bud_list, &zbud_unbuddied[chunks].list);
 291                zbud_unbuddied[chunks].count++;
 292                spin_unlock(&zbud_budlists_spinlock);
 293                spin_unlock(&zbpg->lock);
 294        }
 295}
 296
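/*
 * Find a home for a newly compressed ephemeral page: scan the unbuddied
 * lists starting with the fullest pages that can still hold 'size' bytes
 * (best fit) and work toward emptier ones; if no partially used page can
 * be trylocked, fall back to allocating a fresh raw page.
 */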
 297static struct zbud_hdr *zbud_create(uint32_t pool_id, struct tmem_oid *oid,
 298                                        uint32_t index, struct page *page,
 299                                        void *cdata, unsigned size)
 300{
 301        struct zbud_hdr *zh0, *zh1, *zh = NULL;
 302        struct zbud_page *zbpg = NULL, *ztmp;
 303        unsigned nchunks;
 304        char *to;
 305        int i, found_good_buddy = 0;
 306
  307        nchunks = zbud_size_to_chunks(size);
 308        for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
 309                spin_lock(&zbud_budlists_spinlock);
 310                if (!list_empty(&zbud_unbuddied[i].list)) {
 311                        list_for_each_entry_safe(zbpg, ztmp,
 312                                    &zbud_unbuddied[i].list, bud_list) {
 313                                if (spin_trylock(&zbpg->lock)) {
 314                                        found_good_buddy = i;
 315                                        goto found_unbuddied;
 316                                }
 317                        }
 318                }
 319                spin_unlock(&zbud_budlists_spinlock);
 320        }
 321        /* didn't find a good buddy, try allocating a new page */
 322        zbpg = zbud_alloc_raw_page();
 323        if (unlikely(zbpg == NULL))
 324                goto out;
  325        /* ok, have a new page; add it to the proper unbuddied list */
 326        spin_lock(&zbpg->lock);
 327        spin_lock(&zbud_budlists_spinlock);
 328        list_add_tail(&zbpg->bud_list, &zbud_unbuddied[nchunks].list);
 329        zbud_unbuddied[nchunks].count++;
 330        zh = &zbpg->buddy[0];
 331        goto init_zh;
 332
 333found_unbuddied:
 334        ASSERT_SPINLOCK(&zbpg->lock);
 335        zh0 = &zbpg->buddy[0]; zh1 = &zbpg->buddy[1];
 336        BUG_ON(!((zh0->size == 0) ^ (zh1->size == 0)));
 337        if (zh0->size != 0) { /* buddy0 in use, buddy1 is vacant */
 338                ASSERT_SENTINEL(zh0, ZBH);
 339                zh = zh1;
 340        } else if (zh1->size != 0) { /* buddy1 in use, buddy0 is vacant */
 341                ASSERT_SENTINEL(zh1, ZBH);
 342                zh = zh0;
 343        } else
 344                BUG();
 345        list_del_init(&zbpg->bud_list);
 346        zbud_unbuddied[found_good_buddy].count--;
 347        list_add_tail(&zbpg->bud_list, &zbud_buddied_list);
 348        zcache_zbud_buddied_count++;
 349
 350init_zh:
 351        SET_SENTINEL(zh, ZBH);
 352        zh->size = size;
 353        zh->index = index;
 354        zh->oid = *oid;
 355        zh->pool_id = pool_id;
 356        /* can wait to copy the data until the list locks are dropped */
 357        spin_unlock(&zbud_budlists_spinlock);
 358
 359        to = zbud_data(zh, size);
 360        memcpy(to, cdata, size);
 361        spin_unlock(&zbpg->lock);
 362        zbud_cumul_chunk_counts[nchunks]++;
 363        atomic_inc(&zcache_zbud_curr_zpages);
 364        zcache_zbud_cumul_zpages++;
 365        zcache_zbud_curr_zbytes += size;
 366        zcache_zbud_cumul_zbytes += size;
 367out:
 368        return zh;
 369}
 370
 371static int zbud_decompress(struct page *page, struct zbud_hdr *zh)
 372{
 373        struct zbud_page *zbpg;
 374        unsigned budnum = zbud_budnum(zh);
 375        size_t out_len = PAGE_SIZE;
 376        char *to_va, *from_va;
 377        unsigned size;
 378        int ret = 0;
 379
 380        zbpg = container_of(zh, struct zbud_page, buddy[budnum]);
 381        spin_lock(&zbpg->lock);
 382        if (list_empty(&zbpg->bud_list)) {
 383                /* ignore zombie page... see zbud_evict_pages() */
 384                ret = -EINVAL;
 385                goto out;
 386        }
 387        ASSERT_SENTINEL(zh, ZBH);
 388        BUG_ON(zh->size == 0 || zh->size > zbud_max_buddy_size());
 389        to_va = kmap_atomic(page, KM_USER0);
 390        size = zh->size;
 391        from_va = zbud_data(zh, size);
 392        ret = lzo1x_decompress_safe(from_va, size, to_va, &out_len);
 393        BUG_ON(ret != LZO_E_OK);
 394        BUG_ON(out_len != PAGE_SIZE);
 395        kunmap_atomic(to_va, KM_USER0);
 396out:
 397        spin_unlock(&zbpg->lock);
 398        return ret;
 399}
 400
 401/*
 402 * The following routines handle shrinking of ephemeral pages by evicting
 403 * pages "least valuable" first.
 404 */
 405
 406static unsigned long zcache_evicted_raw_pages;
 407static unsigned long zcache_evicted_buddied_pages;
 408static unsigned long zcache_evicted_unbuddied_pages;
 409
 410static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid);
 411static void zcache_put_pool(struct tmem_pool *pool);
 412
 413/*
 414 * Flush and free all zbuds in a zbpg, then free the pageframe
 415 */
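/*
 * The caller must hold zbpg->lock and must already have removed zbpg
 * from all zbud lists; concurrent zbud_free_and_delist() and
 * zbud_decompress() callers then see an empty bud_list and skip the
 * page as a "zombie".
 */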
 416static void zbud_evict_zbpg(struct zbud_page *zbpg)
 417{
 418        struct zbud_hdr *zh;
 419        int i, j;
 420        uint32_t pool_id[ZBUD_MAX_BUDS], index[ZBUD_MAX_BUDS];
 421        struct tmem_oid oid[ZBUD_MAX_BUDS];
 422        struct tmem_pool *pool;
 423
 424        ASSERT_SPINLOCK(&zbpg->lock);
 425        BUG_ON(!list_empty(&zbpg->bud_list));
 426        for (i = 0, j = 0; i < ZBUD_MAX_BUDS; i++) {
 427                zh = &zbpg->buddy[i];
 428                if (zh->size) {
 429                        pool_id[j] = zh->pool_id;
 430                        oid[j] = zh->oid;
 431                        index[j] = zh->index;
 432                        j++;
 433                        zbud_free(zh);
 434                }
 435        }
 436        spin_unlock(&zbpg->lock);
 437        for (i = 0; i < j; i++) {
 438                pool = zcache_get_pool_by_id(pool_id[i]);
 439                if (pool != NULL) {
 440                        tmem_flush_page(pool, &oid[i], index[i]);
 441                        zcache_put_pool(pool);
 442                }
 443        }
 444        ASSERT_SENTINEL(zbpg, ZBPG);
 445        spin_lock(&zbpg->lock);
 446        zbud_free_raw_page(zbpg);
 447}
 448
 449/*
 450 * Free nr pages.  This code is funky because we want to hold the locks
 451 * protecting various lists for as short a time as possible, and in some
 452 * circumstances the list may change asynchronously when the list lock is
 453 * not held.  In some cases we also trylock not only to avoid waiting on a
 454 * page in use by another cpu, but also to avoid potential deadlock due to
 455 * lock inversion.
 456 */
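/*
 * Note on bottom halves: the list locks below are taken with
 * spin_lock_bh() but dropped with plain spin_unlock() before evicting,
 * so BHs stay disabled across zbud_evict_zbpg() and are re-enabled
 * explicitly with local_bh_enable() afterwards.
 */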
 457static void zbud_evict_pages(int nr)
 458{
 459        struct zbud_page *zbpg;
 460        int i;
 461
 462        /* first try freeing any pages on unused list */
 463retry_unused_list:
 464        spin_lock_bh(&zbpg_unused_list_spinlock);
 465        if (!list_empty(&zbpg_unused_list)) {
 466                /* can't walk list here, since it may change when unlocked */
 467                zbpg = list_first_entry(&zbpg_unused_list,
 468                                struct zbud_page, bud_list);
 469                list_del_init(&zbpg->bud_list);
 470                zcache_zbpg_unused_list_count--;
 471                atomic_dec(&zcache_zbud_curr_raw_pages);
 472                spin_unlock_bh(&zbpg_unused_list_spinlock);
 473                zcache_free_page(zbpg);
 474                zcache_evicted_raw_pages++;
 475                if (--nr <= 0)
 476                        goto out;
 477                goto retry_unused_list;
 478        }
 479        spin_unlock_bh(&zbpg_unused_list_spinlock);
 480
 481        /* now try freeing unbuddied pages, starting with least space avail */
 482        for (i = 0; i < MAX_CHUNK; i++) {
 483retry_unbud_list_i:
 484                spin_lock_bh(&zbud_budlists_spinlock);
 485                if (list_empty(&zbud_unbuddied[i].list)) {
 486                        spin_unlock_bh(&zbud_budlists_spinlock);
 487                        continue;
 488                }
 489                list_for_each_entry(zbpg, &zbud_unbuddied[i].list, bud_list) {
 490                        if (unlikely(!spin_trylock(&zbpg->lock)))
 491                                continue;
 492                        list_del_init(&zbpg->bud_list);
 493                        zbud_unbuddied[i].count--;
 494                        spin_unlock(&zbud_budlists_spinlock);
 495                        zcache_evicted_unbuddied_pages++;
 496                        /* want budlists unlocked when doing zbpg eviction */
 497                        zbud_evict_zbpg(zbpg);
 498                        local_bh_enable();
 499                        if (--nr <= 0)
 500                                goto out;
 501                        goto retry_unbud_list_i;
 502                }
 503                spin_unlock_bh(&zbud_budlists_spinlock);
 504        }
 505
 506        /* as a last resort, free buddied pages */
 507retry_bud_list:
 508        spin_lock_bh(&zbud_budlists_spinlock);
 509        if (list_empty(&zbud_buddied_list)) {
 510                spin_unlock_bh(&zbud_budlists_spinlock);
 511                goto out;
 512        }
 513        list_for_each_entry(zbpg, &zbud_buddied_list, bud_list) {
 514                if (unlikely(!spin_trylock(&zbpg->lock)))
 515                        continue;
 516                list_del_init(&zbpg->bud_list);
 517                zcache_zbud_buddied_count--;
 518                spin_unlock(&zbud_budlists_spinlock);
 519                zcache_evicted_buddied_pages++;
 520                /* want budlists unlocked when doing zbpg eviction */
 521                zbud_evict_zbpg(zbpg);
 522                local_bh_enable();
 523                if (--nr <= 0)
 524                        goto out;
 525                goto retry_bud_list;
 526        }
 527        spin_unlock_bh(&zbud_budlists_spinlock);
 528out:
 529        return;
 530}
 531
 532static void zbud_init(void)
 533{
 534        int i;
 535
 536        INIT_LIST_HEAD(&zbud_buddied_list);
 537        zcache_zbud_buddied_count = 0;
 538        for (i = 0; i < NCHUNKS; i++) {
 539                INIT_LIST_HEAD(&zbud_unbuddied[i].list);
 540                zbud_unbuddied[i].count = 0;
 541        }
 542}
 543
 544#ifdef CONFIG_SYSFS
 545/*
 546 * These sysfs routines show a nice distribution of how many zbpg's are
 547 * currently (and have ever been placed) in each unbuddied list.  It's fun
 548 * to watch but can probably go away before final merge.
 549 */
 550static int zbud_show_unbuddied_list_counts(char *buf)
 551{
 552        int i;
 553        char *p = buf;
 554
 555        for (i = 0; i < NCHUNKS - 1; i++)
 556                p += sprintf(p, "%u ", zbud_unbuddied[i].count);
  557        p += sprintf(p, "%u\n", zbud_unbuddied[i].count);
 558        return p - buf;
 559}
 560
 561static int zbud_show_cumul_chunk_counts(char *buf)
 562{
 563        unsigned long i, chunks = 0, total_chunks = 0, sum_total_chunks = 0;
 564        unsigned long total_chunks_lte_21 = 0, total_chunks_lte_32 = 0;
 565        unsigned long total_chunks_lte_42 = 0;
 566        char *p = buf;
 567
 568        for (i = 0; i < NCHUNKS; i++) {
 569                p += sprintf(p, "%lu ", zbud_cumul_chunk_counts[i]);
 570                chunks += zbud_cumul_chunk_counts[i];
 571                total_chunks += zbud_cumul_chunk_counts[i];
 572                sum_total_chunks += i * zbud_cumul_chunk_counts[i];
 573                if (i == 21)
 574                        total_chunks_lte_21 = total_chunks;
 575                if (i == 32)
 576                        total_chunks_lte_32 = total_chunks;
 577                if (i == 42)
 578                        total_chunks_lte_42 = total_chunks;
 579        }
 580        p += sprintf(p, "<=21:%lu <=32:%lu <=42:%lu, mean:%lu\n",
 581                total_chunks_lte_21, total_chunks_lte_32, total_chunks_lte_42,
 582                chunks == 0 ? 0 : sum_total_chunks / chunks);
 583        return p - buf;
 584}
 585#endif
 586
 587/**********
 588 * This "zv" PAM implementation combines the TLSF-based xvMalloc
 589 * with lzo1x compression to maximize the amount of data that can
 590 * be packed into a physical page.
 591 *
  592 * Zv represents a PAM page with the index and object id immediately
  593 * preceding the compressed data; the size is recovered from xvmalloc.
 594 */
 595
 596#define ZVH_SENTINEL  0x43214321
 597
 598struct zv_hdr {
 599        uint32_t pool_id;
 600        struct tmem_oid oid;
 601        uint32_t index;
 602        DECL_SENTINEL
 603};
 604
 605static const int zv_max_page_size = (PAGE_SIZE / 8) * 7;
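/*
 * Persistent pages that compress to more than 7/8 of PAGE_SIZE are not
 * worth keeping; zcache_pampd_create() rejects them and counts them in
 * zcache_compress_poor.
 */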
 606
 607static struct zv_hdr *zv_create(struct xv_pool *xvpool, uint32_t pool_id,
 608                                struct tmem_oid *oid, uint32_t index,
 609                                void *cdata, unsigned clen)
 610{
 611        struct page *page;
 612        struct zv_hdr *zv = NULL;
 613        uint32_t offset;
 614        int ret;
 615
 616        BUG_ON(!irqs_disabled());
 617        ret = xv_malloc(xvpool, clen + sizeof(struct zv_hdr),
 618                        &page, &offset, ZCACHE_GFP_MASK);
 619        if (unlikely(ret))
 620                goto out;
 621        zv = kmap_atomic(page, KM_USER0) + offset;
 622        zv->index = index;
 623        zv->oid = *oid;
 624        zv->pool_id = pool_id;
 625        SET_SENTINEL(zv, ZVH);
 626        memcpy((char *)zv + sizeof(struct zv_hdr), cdata, clen);
 627        kunmap_atomic(zv, KM_USER0);
 628out:
 629        return zv;
 630}
 631
 632static void zv_free(struct xv_pool *xvpool, struct zv_hdr *zv)
 633{
 634        unsigned long flags;
 635        struct page *page;
 636        uint32_t offset;
 637        uint16_t size;
 638
 639        ASSERT_SENTINEL(zv, ZVH);
 640        size = xv_get_object_size(zv) - sizeof(*zv);
 641        BUG_ON(size == 0 || size > zv_max_page_size);
 642        INVERT_SENTINEL(zv, ZVH);
 643        page = virt_to_page(zv);
 644        offset = (unsigned long)zv & ~PAGE_MASK;
 645        local_irq_save(flags);
 646        xv_free(xvpool, page, offset);
 647        local_irq_restore(flags);
 648}
 649
 650static void zv_decompress(struct page *page, struct zv_hdr *zv)
 651{
 652        size_t clen = PAGE_SIZE;
 653        char *to_va;
 654        unsigned size;
 655        int ret;
 656
 657        ASSERT_SENTINEL(zv, ZVH);
 658        size = xv_get_object_size(zv) - sizeof(*zv);
 659        BUG_ON(size == 0 || size > zv_max_page_size);
 660        to_va = kmap_atomic(page, KM_USER0);
 661        ret = lzo1x_decompress_safe((char *)zv + sizeof(*zv),
 662                                        size, to_va, &clen);
 663        kunmap_atomic(to_va, KM_USER0);
 664        BUG_ON(ret != LZO_E_OK);
 665        BUG_ON(clen != PAGE_SIZE);
 666}
 667
 668/*
 669 * zcache core code starts here
 670 */
 671
 672/* useful stats not collected by cleancache or frontswap */
 673static unsigned long zcache_flush_total;
 674static unsigned long zcache_flush_found;
 675static unsigned long zcache_flobj_total;
 676static unsigned long zcache_flobj_found;
 677static unsigned long zcache_failed_eph_puts;
 678static unsigned long zcache_failed_pers_puts;
 679
 680#define MAX_POOLS_PER_CLIENT 16
 681
 682static struct {
 683        struct tmem_pool *tmem_pools[MAX_POOLS_PER_CLIENT];
 684        struct xv_pool *xvpool;
 685} zcache_client;
 686
 687/*
 688 * Tmem operations assume the poolid implies the invoking client.
 689 * Zcache only has one client (the kernel itself), so translate
 690 * the poolid into the tmem_pool allocated for it.  A KVM version
 691 * of zcache would have one client per guest and each client might
 692 * have a poolid==N.
 693 */
 694static struct tmem_pool *zcache_get_pool_by_id(uint32_t poolid)
 695{
 696        struct tmem_pool *pool = NULL;
 697
  698        if (poolid < MAX_POOLS_PER_CLIENT) {
 699                pool = zcache_client.tmem_pools[poolid];
 700                if (pool != NULL)
 701                        atomic_inc(&pool->refcount);
 702        }
 703        return pool;
 704}
 705
 706static void zcache_put_pool(struct tmem_pool *pool)
 707{
 708        if (pool != NULL)
 709                atomic_dec(&pool->refcount);
 710}
 711
 712/* counters for debugging */
 713static unsigned long zcache_failed_get_free_pages;
 714static unsigned long zcache_failed_alloc;
 715static unsigned long zcache_put_to_flush;
 716static unsigned long zcache_aborted_preload;
 717static unsigned long zcache_aborted_shrink;
 718
 719/*
 720 * Ensure that memory allocation requests in zcache don't result
 721 * in direct reclaim requests via the shrinker, which would cause
 722 * an infinite loop.  Maybe a GFP flag would be better?
 723 */
 724static DEFINE_SPINLOCK(zcache_direct_reclaim_lock);
 725
 726/*
  727 * for now, use named slabs so we can easily track usage; later we can
 728 * either just use kmalloc, or perhaps add a slab-like allocator
 729 * to more carefully manage total memory utilization
 730 */
 731static struct kmem_cache *zcache_objnode_cache;
 732static struct kmem_cache *zcache_obj_cache;
 733static atomic_t zcache_curr_obj_count = ATOMIC_INIT(0);
 734static unsigned long zcache_curr_obj_count_max;
 735static atomic_t zcache_curr_objnode_count = ATOMIC_INIT(0);
 736static unsigned long zcache_curr_objnode_count_max;
 737
 738/*
 739 * to avoid memory allocation recursion (e.g. due to direct reclaim), we
 740 * preload all necessary data structures so the hostops callbacks never
 741 * actually do a malloc
 742 */
 743struct zcache_preload {
 744        void *page;
 745        struct tmem_obj *obj;
 746        int nr;
 747        struct tmem_objnode *objnodes[OBJNODE_TREE_MAX_PATH];
 748};
 749static DEFINE_PER_CPU(struct zcache_preload, zcache_preloads) = { 0, };
 750
 751static int zcache_do_preload(struct tmem_pool *pool)
 752{
 753        struct zcache_preload *kp;
 754        struct tmem_objnode *objnode;
 755        struct tmem_obj *obj;
 756        void *page;
 757        int ret = -ENOMEM;
 758
 759        if (unlikely(zcache_objnode_cache == NULL))
 760                goto out;
 761        if (unlikely(zcache_obj_cache == NULL))
 762                goto out;
 763        if (!spin_trylock(&zcache_direct_reclaim_lock)) {
 764                zcache_aborted_preload++;
 765                goto out;
 766        }
 767        preempt_disable();
 768        kp = &__get_cpu_var(zcache_preloads);
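        /*
         * Each allocation below runs with preemption re-enabled, so the
         * task may migrate to another cpu; re-fetch the per-cpu pointer
         * and re-check the slot count after disabling preemption again.
         */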
 769        while (kp->nr < ARRAY_SIZE(kp->objnodes)) {
 770                preempt_enable_no_resched();
 771                objnode = kmem_cache_alloc(zcache_objnode_cache,
 772                                ZCACHE_GFP_MASK);
 773                if (unlikely(objnode == NULL)) {
 774                        zcache_failed_alloc++;
 775                        goto unlock_out;
 776                }
 777                preempt_disable();
 778                kp = &__get_cpu_var(zcache_preloads);
 779                if (kp->nr < ARRAY_SIZE(kp->objnodes))
 780                        kp->objnodes[kp->nr++] = objnode;
 781                else
 782                        kmem_cache_free(zcache_objnode_cache, objnode);
 783        }
 784        preempt_enable_no_resched();
 785        obj = kmem_cache_alloc(zcache_obj_cache, ZCACHE_GFP_MASK);
 786        if (unlikely(obj == NULL)) {
 787                zcache_failed_alloc++;
 788                goto unlock_out;
 789        }
 790        page = (void *)__get_free_page(ZCACHE_GFP_MASK);
 791        if (unlikely(page == NULL)) {
 792                zcache_failed_get_free_pages++;
 793                kmem_cache_free(zcache_obj_cache, obj);
 794                goto unlock_out;
 795        }
 796        preempt_disable();
 797        kp = &__get_cpu_var(zcache_preloads);
 798        if (kp->obj == NULL)
 799                kp->obj = obj;
 800        else
 801                kmem_cache_free(zcache_obj_cache, obj);
 802        if (kp->page == NULL)
 803                kp->page = page;
 804        else
 805                free_page((unsigned long)page);
 806        ret = 0;
 807unlock_out:
 808        spin_unlock(&zcache_direct_reclaim_lock);
 809out:
 810        return ret;
 811}
 812
 813static void *zcache_get_free_page(void)
 814{
 815        struct zcache_preload *kp;
 816        void *page;
 817
 818        kp = &__get_cpu_var(zcache_preloads);
 819        page = kp->page;
 820        BUG_ON(page == NULL);
 821        kp->page = NULL;
 822        return page;
 823}
 824
 825static void zcache_free_page(void *p)
 826{
 827        free_page((unsigned long)p);
 828}
 829
 830/*
 831 * zcache implementation for tmem host ops
 832 */
 833
 834static struct tmem_objnode *zcache_objnode_alloc(struct tmem_pool *pool)
 835{
 836        struct tmem_objnode *objnode = NULL;
 837        unsigned long count;
 838        struct zcache_preload *kp;
 839
 840        kp = &__get_cpu_var(zcache_preloads);
 841        if (kp->nr <= 0)
 842                goto out;
 843        objnode = kp->objnodes[kp->nr - 1];
 844        BUG_ON(objnode == NULL);
 845        kp->objnodes[kp->nr - 1] = NULL;
 846        kp->nr--;
 847        count = atomic_inc_return(&zcache_curr_objnode_count);
 848        if (count > zcache_curr_objnode_count_max)
 849                zcache_curr_objnode_count_max = count;
 850out:
 851        return objnode;
 852}
 853
 854static void zcache_objnode_free(struct tmem_objnode *objnode,
 855                                        struct tmem_pool *pool)
 856{
 857        atomic_dec(&zcache_curr_objnode_count);
 858        BUG_ON(atomic_read(&zcache_curr_objnode_count) < 0);
 859        kmem_cache_free(zcache_objnode_cache, objnode);
 860}
 861
 862static struct tmem_obj *zcache_obj_alloc(struct tmem_pool *pool)
 863{
 864        struct tmem_obj *obj = NULL;
 865        unsigned long count;
 866        struct zcache_preload *kp;
 867
 868        kp = &__get_cpu_var(zcache_preloads);
 869        obj = kp->obj;
 870        BUG_ON(obj == NULL);
 871        kp->obj = NULL;
 872        count = atomic_inc_return(&zcache_curr_obj_count);
 873        if (count > zcache_curr_obj_count_max)
 874                zcache_curr_obj_count_max = count;
 875        return obj;
 876}
 877
 878static void zcache_obj_free(struct tmem_obj *obj, struct tmem_pool *pool)
 879{
 880        atomic_dec(&zcache_curr_obj_count);
 881        BUG_ON(atomic_read(&zcache_curr_obj_count) < 0);
 882        kmem_cache_free(zcache_obj_cache, obj);
 883}
 884
 885static struct tmem_hostops zcache_hostops = {
 886        .obj_alloc = zcache_obj_alloc,
 887        .obj_free = zcache_obj_free,
 888        .objnode_alloc = zcache_objnode_alloc,
 889        .objnode_free = zcache_objnode_free,
 890};
 891
 892/*
 893 * zcache implementations for PAM page descriptor ops
 894 */
 895
 896static atomic_t zcache_curr_eph_pampd_count = ATOMIC_INIT(0);
 897static unsigned long zcache_curr_eph_pampd_count_max;
 898static atomic_t zcache_curr_pers_pampd_count = ATOMIC_INIT(0);
 899static unsigned long zcache_curr_pers_pampd_count_max;
 900
 901/* forward reference */
 902static int zcache_compress(struct page *from, void **out_va, size_t *out_len);
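/* N.B. zcache_compress() returns 1 on success and 0 on failure */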
 903
 904static void *zcache_pampd_create(struct tmem_pool *pool, struct tmem_oid *oid,
 905                                 uint32_t index, struct page *page)
 906{
 907        void *pampd = NULL, *cdata;
 908        size_t clen;
 909        int ret;
 910        bool ephemeral = is_ephemeral(pool);
 911        unsigned long count;
 912
 913        if (ephemeral) {
 914                ret = zcache_compress(page, &cdata, &clen);
 915                if (ret == 0)
 917                        goto out;
 918                if (clen == 0 || clen > zbud_max_buddy_size()) {
 919                        zcache_compress_poor++;
 920                        goto out;
 921                }
 922                pampd = (void *)zbud_create(pool->pool_id, oid, index,
 923                                                page, cdata, clen);
 924                if (pampd != NULL) {
 925                        count = atomic_inc_return(&zcache_curr_eph_pampd_count);
 926                        if (count > zcache_curr_eph_pampd_count_max)
 927                                zcache_curr_eph_pampd_count_max = count;
 928                }
 929        } else {
 930                /*
 931                 * FIXME: This is all the "policy" there is for now.
 932                 * 3/4 totpages should allow ~37% of RAM to be filled with
 933                 * compressed frontswap pages
 934                 */
 935                if (atomic_read(&zcache_curr_pers_pampd_count) >
 936                                                        3 * totalram_pages / 4)
 937                        goto out;
 938                ret = zcache_compress(page, &cdata, &clen);
 939                if (ret == 0)
 940                        goto out;
 941                if (clen > zv_max_page_size) {
 942                        zcache_compress_poor++;
 943                        goto out;
 944                }
 945                pampd = (void *)zv_create(zcache_client.xvpool, pool->pool_id,
 946                                                oid, index, cdata, clen);
 947                if (pampd == NULL)
 948                        goto out;
 949                count = atomic_inc_return(&zcache_curr_pers_pampd_count);
 950                if (count > zcache_curr_pers_pampd_count_max)
 951                        zcache_curr_pers_pampd_count_max = count;
 952        }
 953out:
 954        return pampd;
 955}
 956
 957/*
 958 * fill the pageframe corresponding to the struct page with the data
 959 * from the passed pampd
 960 */
 961static int zcache_pampd_get_data(struct page *page, void *pampd,
 962                                                struct tmem_pool *pool)
 963{
 964        int ret = 0;
 965
 966        if (is_ephemeral(pool))
 967                ret = zbud_decompress(page, pampd);
 968        else
 969                zv_decompress(page, pampd);
 970        return ret;
 971}
 972
 973/*
 974 * free the pampd and remove it from any zcache lists
 975 * pampd must no longer be pointed to from any tmem data structures!
 976 */
 977static void zcache_pampd_free(void *pampd, struct tmem_pool *pool)
 978{
 979        if (is_ephemeral(pool)) {
 980                zbud_free_and_delist((struct zbud_hdr *)pampd);
 981                atomic_dec(&zcache_curr_eph_pampd_count);
 982                BUG_ON(atomic_read(&zcache_curr_eph_pampd_count) < 0);
 983        } else {
 984                zv_free(zcache_client.xvpool, (struct zv_hdr *)pampd);
 985                atomic_dec(&zcache_curr_pers_pampd_count);
 986                BUG_ON(atomic_read(&zcache_curr_pers_pampd_count) < 0);
 987        }
 988}
 989
 990static struct tmem_pamops zcache_pamops = {
 991        .create = zcache_pampd_create,
 992        .get_data = zcache_pampd_get_data,
 993        .free = zcache_pampd_free,
 994};
 995
 996/*
 997 * zcache compression/decompression and related per-cpu stuff
 998 */
 999
1000#define LZO_WORKMEM_BYTES LZO1X_1_MEM_COMPRESS
1001#define LZO_DSTMEM_PAGE_ORDER 1
1002static DEFINE_PER_CPU(unsigned char *, zcache_workmem);
1003static DEFINE_PER_CPU(unsigned char *, zcache_dstmem);
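/*
 * zcache_dstmem is two pages (order 1) per cpu because lzo1x output for
 * an incompressible page can exceed PAGE_SIZE; zcache_workmem holds the
 * compressor's scratch state.
 */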
1004
1005static int zcache_compress(struct page *from, void **out_va, size_t *out_len)
1006{
1007        int ret = 0;
1008        unsigned char *dmem = __get_cpu_var(zcache_dstmem);
1009        unsigned char *wmem = __get_cpu_var(zcache_workmem);
1010        char *from_va;
1011
1012        BUG_ON(!irqs_disabled());
1013        if (unlikely(dmem == NULL || wmem == NULL))
1014                goto out;  /* no buffer, so can't compress */
1015        from_va = kmap_atomic(from, KM_USER0);
1016        mb();
1017        ret = lzo1x_1_compress(from_va, PAGE_SIZE, dmem, out_len, wmem);
1018        BUG_ON(ret != LZO_E_OK);
1019        *out_va = dmem;
1020        kunmap_atomic(from_va, KM_USER0);
1021        ret = 1;
1022out:
1023        return ret;
1024}
1025
1026
1027static int zcache_cpu_notifier(struct notifier_block *nb,
1028                                unsigned long action, void *pcpu)
1029{
1030        int cpu = (long)pcpu;
1031        struct zcache_preload *kp;
1032
1033        switch (action) {
1034        case CPU_UP_PREPARE:
1035                per_cpu(zcache_dstmem, cpu) = (void *)__get_free_pages(
1036                        GFP_KERNEL | __GFP_REPEAT,
1037                        LZO_DSTMEM_PAGE_ORDER);
1038                per_cpu(zcache_workmem, cpu) =
1039                        kzalloc(LZO_WORKMEM_BYTES,
1040                                GFP_KERNEL | __GFP_REPEAT);
1041                break;
1042        case CPU_DEAD:
1043        case CPU_UP_CANCELED:
1044                free_pages((unsigned long)per_cpu(zcache_dstmem, cpu),
1045                                LZO_DSTMEM_PAGE_ORDER);
1046                per_cpu(zcache_dstmem, cpu) = NULL;
1047                kfree(per_cpu(zcache_workmem, cpu));
1048                per_cpu(zcache_workmem, cpu) = NULL;
1049                kp = &per_cpu(zcache_preloads, cpu);
1050                while (kp->nr) {
1051                        kmem_cache_free(zcache_objnode_cache,
1052                                        kp->objnodes[kp->nr - 1]);
1053                        kp->objnodes[kp->nr - 1] = NULL;
1054                        kp->nr--;
1055                }
1056                if (kp->obj != NULL)
                        kmem_cache_free(zcache_obj_cache, kp->obj);
1057                free_page((unsigned long)kp->page);
1058                break;
1059        default:
1060                break;
1061        }
1062        return NOTIFY_OK;
1063}
1064
1065static struct notifier_block zcache_cpu_notifier_block = {
1066        .notifier_call = zcache_cpu_notifier
1067};
1068
1069#ifdef CONFIG_SYSFS
1070#define ZCACHE_SYSFS_RO(_name) \
1071        static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1072                                struct kobj_attribute *attr, char *buf) \
1073        { \
1074                return sprintf(buf, "%lu\n", zcache_##_name); \
1075        } \
1076        static struct kobj_attribute zcache_##_name##_attr = { \
1077                .attr = { .name = __stringify(_name), .mode = 0444 }, \
1078                .show = zcache_##_name##_show, \
1079        }
1080
1081#define ZCACHE_SYSFS_RO_ATOMIC(_name) \
1082        static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1083                                struct kobj_attribute *attr, char *buf) \
1084        { \
1085            return sprintf(buf, "%d\n", atomic_read(&zcache_##_name)); \
1086        } \
1087        static struct kobj_attribute zcache_##_name##_attr = { \
1088                .attr = { .name = __stringify(_name), .mode = 0444 }, \
1089                .show = zcache_##_name##_show, \
1090        }
1091
1092#define ZCACHE_SYSFS_RO_CUSTOM(_name, _func) \
1093        static ssize_t zcache_##_name##_show(struct kobject *kobj, \
1094                                struct kobj_attribute *attr, char *buf) \
1095        { \
1096            return _func(buf); \
1097        } \
1098        static struct kobj_attribute zcache_##_name##_attr = { \
1099                .attr = { .name = __stringify(_name), .mode = 0444 }, \
1100                .show = zcache_##_name##_show, \
1101        }
1102
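/*
 * For example, ZCACHE_SYSFS_RO(flush_total) defines a read-only (0444)
 * attribute named "flush_total" whose show routine prints the
 * zcache_flush_total counter; all of these attributes appear under
 * /sys/kernel/mm/zcache/ via zcache_attr_group below.
 */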
1103ZCACHE_SYSFS_RO(curr_obj_count_max);
1104ZCACHE_SYSFS_RO(curr_objnode_count_max);
1105ZCACHE_SYSFS_RO(flush_total);
1106ZCACHE_SYSFS_RO(flush_found);
1107ZCACHE_SYSFS_RO(flobj_total);
1108ZCACHE_SYSFS_RO(flobj_found);
1109ZCACHE_SYSFS_RO(failed_eph_puts);
1110ZCACHE_SYSFS_RO(failed_pers_puts);
1111ZCACHE_SYSFS_RO(zbud_curr_zbytes);
1112ZCACHE_SYSFS_RO(zbud_cumul_zpages);
1113ZCACHE_SYSFS_RO(zbud_cumul_zbytes);
1114ZCACHE_SYSFS_RO(zbud_buddied_count);
1115ZCACHE_SYSFS_RO(zbpg_unused_list_count);
1116ZCACHE_SYSFS_RO(evicted_raw_pages);
1117ZCACHE_SYSFS_RO(evicted_unbuddied_pages);
1118ZCACHE_SYSFS_RO(evicted_buddied_pages);
1119ZCACHE_SYSFS_RO(failed_get_free_pages);
1120ZCACHE_SYSFS_RO(failed_alloc);
1121ZCACHE_SYSFS_RO(put_to_flush);
1122ZCACHE_SYSFS_RO(aborted_preload);
1123ZCACHE_SYSFS_RO(aborted_shrink);
1124ZCACHE_SYSFS_RO(compress_poor);
1125ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_raw_pages);
1126ZCACHE_SYSFS_RO_ATOMIC(zbud_curr_zpages);
1127ZCACHE_SYSFS_RO_ATOMIC(curr_obj_count);
1128ZCACHE_SYSFS_RO_ATOMIC(curr_objnode_count);
1129ZCACHE_SYSFS_RO_CUSTOM(zbud_unbuddied_list_counts,
1130                        zbud_show_unbuddied_list_counts);
1131ZCACHE_SYSFS_RO_CUSTOM(zbud_cumul_chunk_counts,
1132                        zbud_show_cumul_chunk_counts);
1133
1134static struct attribute *zcache_attrs[] = {
1135        &zcache_curr_obj_count_attr.attr,
1136        &zcache_curr_obj_count_max_attr.attr,
1137        &zcache_curr_objnode_count_attr.attr,
1138        &zcache_curr_objnode_count_max_attr.attr,
1139        &zcache_flush_total_attr.attr,
1140        &zcache_flobj_total_attr.attr,
1141        &zcache_flush_found_attr.attr,
1142        &zcache_flobj_found_attr.attr,
1143        &zcache_failed_eph_puts_attr.attr,
1144        &zcache_failed_pers_puts_attr.attr,
1145        &zcache_compress_poor_attr.attr,
1146        &zcache_zbud_curr_raw_pages_attr.attr,
1147        &zcache_zbud_curr_zpages_attr.attr,
1148        &zcache_zbud_curr_zbytes_attr.attr,
1149        &zcache_zbud_cumul_zpages_attr.attr,
1150        &zcache_zbud_cumul_zbytes_attr.attr,
1151        &zcache_zbud_buddied_count_attr.attr,
1152        &zcache_zbpg_unused_list_count_attr.attr,
1153        &zcache_evicted_raw_pages_attr.attr,
1154        &zcache_evicted_unbuddied_pages_attr.attr,
1155        &zcache_evicted_buddied_pages_attr.attr,
1156        &zcache_failed_get_free_pages_attr.attr,
1157        &zcache_failed_alloc_attr.attr,
1158        &zcache_put_to_flush_attr.attr,
1159        &zcache_aborted_preload_attr.attr,
1160        &zcache_aborted_shrink_attr.attr,
1161        &zcache_zbud_unbuddied_list_counts_attr.attr,
1162        &zcache_zbud_cumul_chunk_counts_attr.attr,
1163        NULL,
1164};
1165
1166static struct attribute_group zcache_attr_group = {
1167        .attrs = zcache_attrs,
1168        .name = "zcache",
1169};
1170
1171#endif /* CONFIG_SYSFS */
1172/*
1173 * When zcache is disabled ("frozen"), pools can be created and destroyed,
1174 * but all puts (and thus all other operations that require memory allocation)
1175 * must fail.  If zcache is unfrozen, accepts puts, then frozen again,
1176 * data consistency requires all puts while frozen to be converted into
1177 * flushes.
1178 */
1179static bool zcache_freeze;
1180
1181/*
1182 * zcache shrinker interface (only useful for ephemeral pages, so zbud only)
1183 */
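/*
 * shrink_zcache_memory() only trylocks zcache_direct_reclaim_lock: if
 * reclaim was entered from within zcache's own allocation paths (which
 * hold that lock), the request is skipped and counted in
 * zcache_aborted_shrink rather than recursing.
 */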
1184static int shrink_zcache_memory(struct shrinker *shrink,
1185                                struct shrink_control *sc)
1186{
1187        int ret = -1;
1188        int nr = sc->nr_to_scan;
1189        gfp_t gfp_mask = sc->gfp_mask;
1190
1191        if (nr >= 0) {
1192                if (!(gfp_mask & __GFP_FS))
1193                        /* does this case really need to be skipped? */
1194                        goto out;
1195                if (spin_trylock(&zcache_direct_reclaim_lock)) {
1196                        zbud_evict_pages(nr);
1197                        spin_unlock(&zcache_direct_reclaim_lock);
1198                } else
1199                        zcache_aborted_shrink++;
1200        }
1201        ret = (int)atomic_read(&zcache_zbud_curr_raw_pages);
1202out:
1203        return ret;
1204}
1205
1206static struct shrinker zcache_shrinker = {
1207        .shrink = shrink_zcache_memory,
1208        .seeks = DEFAULT_SEEKS,
1209};
1210
1211/*
1212 * zcache shims between cleancache/frontswap ops and tmem
1213 */
1214
1215static int zcache_put_page(int pool_id, struct tmem_oid *oidp,
1216                                uint32_t index, struct page *page)
1217{
1218        struct tmem_pool *pool;
1219        int ret = -1;
1220
1221        BUG_ON(!irqs_disabled());
1222        pool = zcache_get_pool_by_id(pool_id);
1223        if (unlikely(pool == NULL))
1224                goto out;
1225        if (!zcache_freeze && zcache_do_preload(pool) == 0) {
1226                /* preload does preempt_disable on success */
1227                ret = tmem_put(pool, oidp, index, page);
1228                if (ret < 0) {
1229                        if (is_ephemeral(pool))
1230                                zcache_failed_eph_puts++;
1231                        else
1232                                zcache_failed_pers_puts++;
1233                }
1234                zcache_put_pool(pool);
1235                preempt_enable_no_resched();
1236        } else {
1237                zcache_put_to_flush++;
1238                if (atomic_read(&pool->obj_count) > 0)
1239                        /* the put fails whether the flush succeeds or not */
1240                        (void)tmem_flush_page(pool, oidp, index);
1241                zcache_put_pool(pool);
1242        }
1243out:
1244        return ret;
1245}
1246
1247static int zcache_get_page(int pool_id, struct tmem_oid *oidp,
1248                                uint32_t index, struct page *page)
1249{
1250        struct tmem_pool *pool;
1251        int ret = -1;
1252        unsigned long flags;
1253
1254        local_irq_save(flags);
1255        pool = zcache_get_pool_by_id(pool_id);
1256        if (likely(pool != NULL)) {
1257                if (atomic_read(&pool->obj_count) > 0)
1258                        ret = tmem_get(pool, oidp, index, page);
1259                zcache_put_pool(pool);
1260        }
1261        local_irq_restore(flags);
1262        return ret;
1263}
1264
1265static int zcache_flush_page(int pool_id, struct tmem_oid *oidp, uint32_t index)
1266{
1267        struct tmem_pool *pool;
1268        int ret = -1;
1269        unsigned long flags;
1270
1271        local_irq_save(flags);
1272        zcache_flush_total++;
1273        pool = zcache_get_pool_by_id(pool_id);
1274        if (likely(pool != NULL)) {
1275                if (atomic_read(&pool->obj_count) > 0)
1276                        ret = tmem_flush_page(pool, oidp, index);
1277                zcache_put_pool(pool);
1278        }
1279        if (ret >= 0)
1280                zcache_flush_found++;
1281        local_irq_restore(flags);
1282        return ret;
1283}
1284
1285static int zcache_flush_object(int pool_id, struct tmem_oid *oidp)
1286{
1287        struct tmem_pool *pool;
1288        int ret = -1;
1289        unsigned long flags;
1290
1291        local_irq_save(flags);
1292        zcache_flobj_total++;
1293        pool = zcache_get_pool_by_id(pool_id);
1294        if (likely(pool != NULL)) {
1295                if (atomic_read(&pool->obj_count) > 0)
1296                        ret = tmem_flush_object(pool, oidp);
1297                zcache_put_pool(pool);
1298        }
1299        if (ret >= 0)
1300                zcache_flobj_found++;
1301        local_irq_restore(flags);
1302        return ret;
1303}
1304
1305static int zcache_destroy_pool(int pool_id)
1306{
1307        struct tmem_pool *pool = NULL;
1308        int ret = -1;
1309
1310        if (pool_id < 0)
1311                goto out;
1312        pool = zcache_client.tmem_pools[pool_id];
1313        if (pool == NULL)
1314                goto out;
1315        zcache_client.tmem_pools[pool_id] = NULL;
1316        /* wait for pool activity on other cpus to quiesce */
1317        while (atomic_read(&pool->refcount) != 0)
1318                ;
1319        local_bh_disable();
1320        ret = tmem_destroy_pool(pool);
1321        local_bh_enable();
1322        kfree(pool);
1323        pr_info("zcache: destroyed pool id=%d\n", pool_id);
1324out:
1325        return ret;
1326}
1327
1328static int zcache_new_pool(uint32_t flags)
1329{
1330        int poolid = -1;
1331        struct tmem_pool *pool;
1332
1333        pool = kmalloc(sizeof(struct tmem_pool), GFP_KERNEL);
1334        if (pool == NULL) {
1335                pr_info("zcache: pool creation failed: out of memory\n");
1336                goto out;
1337        }
1338
1339        for (poolid = 0; poolid < MAX_POOLS_PER_CLIENT; poolid++)
1340                if (zcache_client.tmem_pools[poolid] == NULL)
1341                        break;
1342        if (poolid >= MAX_POOLS_PER_CLIENT) {
1343                pr_info("zcache: pool creation failed: max exceeded\n");
1344                kfree(pool);
1345                poolid = -1;
1346                goto out;
1347        }
1348        atomic_set(&pool->refcount, 0);
1349        pool->client = &zcache_client;
1350        pool->pool_id = poolid;
1351        tmem_new_pool(pool, flags);
1352        zcache_client.tmem_pools[poolid] = pool;
1353        pr_info("zcache: created %s tmem pool, id=%d\n",
1354                flags & TMEM_POOL_PERSIST ? "persistent" : "ephemeral",
1355                poolid);
1356out:
1357        return poolid;
1358}
1359
1360/**********
1361 * Two kernel functionalities currently can be layered on top of tmem.
1362 * These are "cleancache" which is used as a second-chance cache for clean
1363 * page cache pages; and "frontswap" which is used for swap pages
1364 * to avoid writes to disk.  A generic "shim" is provided here for each
1365 * to translate in-kernel semantics to zcache semantics.
1366 */
1367
1368#ifdef CONFIG_CLEANCACHE
1369static void zcache_cleancache_put_page(int pool_id,
1370                                        struct cleancache_filekey key,
1371                                        pgoff_t index, struct page *page)
1372{
1373        u32 ind = (u32) index;
1374        struct tmem_oid oid = *(struct tmem_oid *)&key;
1375
1376        if (likely(ind == index))
1377                (void)zcache_put_page(pool_id, &oid, index, page);
1378}
1379
1380static int zcache_cleancache_get_page(int pool_id,
1381                                        struct cleancache_filekey key,
1382                                        pgoff_t index, struct page *page)
1383{
1384        u32 ind = (u32) index;
1385        struct tmem_oid oid = *(struct tmem_oid *)&key;
1386        int ret = -1;
1387
1388        if (likely(ind == index))
1389                ret = zcache_get_page(pool_id, &oid, index, page);
1390        return ret;
1391}
1392
1393static void zcache_cleancache_flush_page(int pool_id,
1394                                        struct cleancache_filekey key,
1395                                        pgoff_t index)
1396{
1397        u32 ind = (u32) index;
1398        struct tmem_oid oid = *(struct tmem_oid *)&key;
1399
1400        if (likely(ind == index))
1401                (void)zcache_flush_page(pool_id, &oid, ind);
1402}
1403
1404static void zcache_cleancache_flush_inode(int pool_id,
1405                                        struct cleancache_filekey key)
1406{
1407        struct tmem_oid oid = *(struct tmem_oid *)&key;
1408
1409        (void)zcache_flush_object(pool_id, &oid);
1410}
1411
1412static void zcache_cleancache_flush_fs(int pool_id)
1413{
1414        if (pool_id >= 0)
1415                (void)zcache_destroy_pool(pool_id);
1416}
1417
1418static int zcache_cleancache_init_fs(size_t pagesize)
1419{
1420        BUG_ON(sizeof(struct cleancache_filekey) !=
1421                                sizeof(struct tmem_oid));
1422        BUG_ON(pagesize != PAGE_SIZE);
1423        return zcache_new_pool(0);
1424}
1425
1426static int zcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
1427{
1428        /* shared pools are unsupported and map to private */
1429        BUG_ON(sizeof(struct cleancache_filekey) !=
1430                                sizeof(struct tmem_oid));
1431        BUG_ON(pagesize != PAGE_SIZE);
1432        return zcache_new_pool(0);
1433}
1434
1435static struct cleancache_ops zcache_cleancache_ops = {
1436        .put_page = zcache_cleancache_put_page,
1437        .get_page = zcache_cleancache_get_page,
1438        .flush_page = zcache_cleancache_flush_page,
1439        .flush_inode = zcache_cleancache_flush_inode,
1440        .flush_fs = zcache_cleancache_flush_fs,
1441        .init_shared_fs = zcache_cleancache_init_shared_fs,
1442        .init_fs = zcache_cleancache_init_fs
1443};
1444
1445struct cleancache_ops zcache_cleancache_register_ops(void)
1446{
1447        struct cleancache_ops old_ops =
1448                cleancache_register_ops(&zcache_cleancache_ops);
1449
1450        return old_ops;
1451}
1452#endif
1453
1454#ifdef CONFIG_FRONTSWAP
1455/* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1456static int zcache_frontswap_poolid = -1;
1457
1458/*
1459 * Swizzling increases objects per swaptype, increasing tmem concurrency
1460 * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
1461 */
1462#define SWIZ_BITS               4
1463#define SWIZ_MASK               ((1 << SWIZ_BITS) - 1)
1464#define _oswiz(_type, _ind)     (((_type) << SWIZ_BITS) | ((_ind) & SWIZ_MASK))
1465#define iswiz(_ind)             ((_ind) >> SWIZ_BITS)
1466
1467static inline struct tmem_oid oswiz(unsigned type, u32 ind)
1468{
1469        struct tmem_oid oid = { .oid = { 0 } };
1470        oid.oid[0] = _oswiz(type, ind);
1471        return oid;
1472}
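/*
 * Example: type == 1, offset == 0x12345 gives an oid with oid[0] ==
 * (1 << 4) | (0x12345 & 0xf) == 0x15, and iswiz(0x12345) == 0x1234 is
 * then used as the page index within that tmem object.
 */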
1473
1474static int zcache_frontswap_put_page(unsigned type, pgoff_t offset,
1475                                   struct page *page)
1476{
1477        u64 ind64 = (u64)offset;
1478        u32 ind = (u32)offset;
1479        struct tmem_oid oid = oswiz(type, ind);
1480        int ret = -1;
1481        unsigned long flags;
1482
1483        BUG_ON(!PageLocked(page));
1484        if (likely(ind64 == ind)) {
1485                local_irq_save(flags);
1486                ret = zcache_put_page(zcache_frontswap_poolid, &oid,
1487                                        iswiz(ind), page);
1488                local_irq_restore(flags);
1489        }
1490        return ret;
1491}
1492
1493/* returns 0 if the page was successfully retrieved from frontswap, or -1
1494 * if it was not present (should never happen!) */
1495static int zcache_frontswap_get_page(unsigned type, pgoff_t offset,
1496                                   struct page *page)
1497{
1498        u64 ind64 = (u64)offset;
1499        u32 ind = (u32)offset;
1500        struct tmem_oid oid = oswiz(type, ind);
1501        int ret = -1;
1502
1503        BUG_ON(!PageLocked(page));
1504        if (likely(ind64 == ind))
1505                ret = zcache_get_page(zcache_frontswap_poolid, &oid,
1506                                        iswiz(ind), page);
1507        return ret;
1508}
1509
1510/* flush a single page from frontswap */
1511static void zcache_frontswap_flush_page(unsigned type, pgoff_t offset)
1512{
1513        u64 ind64 = (u64)offset;
1514        u32 ind = (u32)offset;
1515        struct tmem_oid oid = oswiz(type, ind);
1516
1517        if (likely(ind64 == ind))
1518                (void)zcache_flush_page(zcache_frontswap_poolid, &oid,
1519                                        iswiz(ind));
1520}
1521
1522/* flush all pages from the passed swaptype */
1523static void zcache_frontswap_flush_area(unsigned type)
1524{
1525        struct tmem_oid oid;
1526        int ind;
1527
1528        for (ind = SWIZ_MASK; ind >= 0; ind--) {
1529                oid = oswiz(type, ind);
1530                (void)zcache_flush_object(zcache_frontswap_poolid, &oid);
1531        }
1532}
1533
1534static void zcache_frontswap_init(unsigned ignored)
1535{
1536        /* a single tmem poolid is used for all frontswap "types" (swapfiles) */
1537        if (zcache_frontswap_poolid < 0)
1538                zcache_frontswap_poolid = zcache_new_pool(TMEM_POOL_PERSIST);
1539}
1540
1541static struct frontswap_ops zcache_frontswap_ops = {
1542        .put_page = zcache_frontswap_put_page,
1543        .get_page = zcache_frontswap_get_page,
1544        .flush_page = zcache_frontswap_flush_page,
1545        .flush_area = zcache_frontswap_flush_area,
1546        .init = zcache_frontswap_init
1547};
1548
1549struct frontswap_ops zcache_frontswap_register_ops(void)
1550{
1551        struct frontswap_ops old_ops =
1552                frontswap_register_ops(&zcache_frontswap_ops);
1553
1554        return old_ops;
1555}
1556#endif
1557
1558/*
1559 * zcache initialization
1560 * NOTE FOR NOW zcache MUST BE PROVIDED AS A KERNEL BOOT PARAMETER OR
1561 * NOTHING HAPPENS!
1562 */
1563
1564static int zcache_enabled;
1565
1566static int __init enable_zcache(char *s)
1567{
1568        zcache_enabled = 1;
1569        return 1;
1570}
1571__setup("zcache", enable_zcache);
1572
1573/* allow independent dynamic disabling of cleancache and frontswap */
1574
1575static int use_cleancache = 1;
1576
1577static int __init no_cleancache(char *s)
1578{
1579        use_cleancache = 0;
1580        return 1;
1581}
1582
1583__setup("nocleancache", no_cleancache);
1584
1585static int use_frontswap = 1;
1586
1587static int __init no_frontswap(char *s)
1588{
1589        use_frontswap = 0;
1590        return 1;
1591}
1592
1593__setup("nofrontswap", no_frontswap);
1594
1595static int __init zcache_init(void)
1596{
1597        int ret = 0;
1598
1599#ifdef CONFIG_SYSFS
1600        ret = sysfs_create_group(mm_kobj, &zcache_attr_group);
1601        if (ret) {
1602                pr_err("zcache: can't create sysfs\n");
1603                goto out;
1604        }
1605#endif /* CONFIG_SYSFS */
1606#if defined(CONFIG_CLEANCACHE) || defined(CONFIG_FRONTSWAP)
1607        if (zcache_enabled) {
1608                unsigned int cpu;
1609
1610                tmem_register_hostops(&zcache_hostops);
1611                tmem_register_pamops(&zcache_pamops);
1612                ret = register_cpu_notifier(&zcache_cpu_notifier_block);
1613                if (ret) {
1614                        pr_err("zcache: can't register cpu notifier\n");
1615                        goto out;
1616                }
1617                for_each_online_cpu(cpu) {
1618                        void *pcpu = (void *)(long)cpu;
1619                        zcache_cpu_notifier(&zcache_cpu_notifier_block,
1620                                CPU_UP_PREPARE, pcpu);
1621                }
1622        }
1623        zcache_objnode_cache = kmem_cache_create("zcache_objnode",
1624                                sizeof(struct tmem_objnode), 0, 0, NULL);
1625        zcache_obj_cache = kmem_cache_create("zcache_obj",
1626                                sizeof(struct tmem_obj), 0, 0, NULL);
1627#endif
1628#ifdef CONFIG_CLEANCACHE
1629        if (zcache_enabled && use_cleancache) {
1630                struct cleancache_ops old_ops;
1631
1632                zbud_init();
1633                register_shrinker(&zcache_shrinker);
1634                old_ops = zcache_cleancache_register_ops();
1635                pr_info("zcache: cleancache enabled using kernel "
1636                        "transcendent memory and compression buddies\n");
1637                if (old_ops.init_fs != NULL)
1638                        pr_warning("zcache: cleancache_ops overridden\n");
1639        }
1640#endif
1641#ifdef CONFIG_FRONTSWAP
1642        if (zcache_enabled && use_frontswap) {
1643                struct frontswap_ops old_ops;
1644
1645                zcache_client.xvpool = xv_create_pool();
1646                if (zcache_client.xvpool == NULL) {
1647                        pr_err("zcache: can't create xvpool\n");
1648                        goto out;
1649                }
1650                old_ops = zcache_frontswap_register_ops();
1651                pr_info("zcache: frontswap enabled using kernel "
1652                        "transcendent memory and xvmalloc\n");
1653                if (old_ops.init != NULL)
1654                        pr_warning("zcache: frontswap_ops overridden\n");
1655        }
1656#endif
1657out:
1658        return ret;
1659}
1660
1661module_init(zcache_init)
1662