linux/drivers/staging/zcache/zbud.c
   1/*
   2 * zbud.c - Compression buddies allocator
   3 *
   4 * Copyright (c) 2010-2012, Dan Magenheimer, Oracle Corp.
   5 *
   6 * Compression buddies ("zbud") provides for efficiently packing two
   7 * (or, possibly in the future, more) compressed pages ("zpages") into
   8 * a single "raw" pageframe and for tracking both zpages and pageframes
   9 * so that whole pageframes can be easily reclaimed in LRU-like order.
  10 * It is designed to be used in conjunction with transcendent memory
  11 * ("tmem"); for example, separate LRU lists are maintained for persistent
  12 * vs. ephemeral pages.
  13 *
  14 * A zbudpage is an overlay for a struct page and thus each zbudpage
  15 * refers to a physical pageframe of RAM.  When the caller passes a
  16 * struct page from the kernel's page allocator, zbud "transforms" it
  17 * to a zbudpage which sets/uses a different set of fields than the
  18 * struct-page and thus must "untransform" it back by reinitializing
  19 * certain fields before the struct-page can be freed.  The fields
  20 * of a zbudpage include a page lock for controlling access to the
  21 * corresponding pageframe, and there is a size field for each zpage.
  22 * Each zbudpage also lives on two linked lists: a "budlist" which is
  23 * used to support efficient buddying of zpages; and an "lru" which
  24 * is used for reclaiming pageframes in approximately least-recently-used
  25 * order.
  26 *
  27 * A zbudpageframe is a pageframe divided up into aligned 64-byte "chunks"
  28 * which contain the compressed data for zero, one, or two zbuds.  Contained
  29 * with the compressed data is a tmem_handle, which is a key that allows
  30 * the same data to be found via the tmem interface so the zpage can
  31 * be invalidated (for ephemeral pages) or repatriated to the swap cache
  32 * (for persistent pages).  The contents of a zbudpageframe must never
  33 * be accessed without holding the page lock for the corresponding
  34 * zbudpage and, to accommodate highmem machines, the contents may
  35 * only be examined or changed when kmapped.  Thus, when in use, a
  36 * kmapped zbudpageframe is referred to in the zbud code as "void *zbpg".
  37 *
  38 * Note that the term "zbud" refers to the combination of a zpage and
  39 * a tmem_handle that is stored as one of possibly two "buddied" zpages;
  40 * it also generically refers to this allocator... sorry for any confusion.
  41 *
  42 * A zbudref is a pointer to a struct zbudpage (which can be cast to a
  43 * struct page), with the LSB either cleared or set to indicate, respectively,
  44 * the first or second zpage in the zbudpageframe. Since a zbudref can be
  45 * cast to a pointer, it is used as the tmem "pampd" pointer and uniquely
  46 * references a stored tmem page and so is the only zbud data structure
  47 * externally visible to zbud.c/zbud.h.
  48 *
  49 * Since we wish to reclaim entire pageframes but zpages may be randomly
  50 * added to and deleted from any given pageframe, we approximate LRU by
  51 * promoting a pageframe to MRU when a zpage is added to it, but
  52 * leaving it at the current place in the list when a zpage is deleted
  53 * from it.  As a side effect, zpages that are difficult to buddy (e.g.
  54 * very large pages) will be reclaimed faster than average, which seems
  55 * reasonable.
  56 *
  57 * In the current implementation, no more than two zpages may be stored in
  58 * any pageframe and no zpage ever crosses a pageframe boundary.  While
  59 * other zpage allocation mechanisms may allow greater density, this two
  60 * zpage-per-pageframe limit both ensures simple reclaim of pageframes
  61 * (including garbage collection of references to the contents of those
  62 * pageframes from tmem data structures) AND avoids the need for compaction.
  63 * With additional complexity, zbud could be modified to support storing
  64 * up to three zpages per pageframe or, to handle larger average zpages,
  65 * up to three zpages per pair of pageframes, but it is not clear if the
  66 * additional complexity would be worth it.  So consider it an exercise
  67 * for future developers.
  68 *
  69 * Note also that zbud does no page allocation or freeing.  This is so
  70 * that the caller has complete control over and, for accounting, visibility
  71 * into if/when pages are allocated and freed.
  72 *
  73 * Finally, note that zbud limits the size of zpages it can store; the
  74 * caller must check the zpage size with zbud_max_buddy_size before
  75 * storing it, else BUGs will result.  User beware.
  76 */
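
    /*
     * For illustration, assuming a pointer-aligned struct page (so bit 0 of
     * its address is always clear), the zbudref encoding described above
     * works like this for a hypothetical zbudpage at 0xffffea0000123440:
     *
     *   budnum 0  ->  zbudref 0xffffea0000123440
     *   budnum 1  ->  zbudref 0xffffea0000123441
     *
     * Masking off bit 0 recovers the zbudpage pointer, and bit 0 alone
     * recovers the buddy number; see zbudref_to_zbudpage() and
     * zbudref_budnum() below.
     */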
  77
  78#include <linux/module.h>
  79#include <linux/highmem.h>
  80#include <linux/list.h>
  81#include <linux/spinlock.h>
  82#include <linux/pagemap.h>
  83#include <linux/atomic.h>
  84#include <linux/bug.h>
  85#include "tmem.h"
  86#include "zcache.h"
  87#include "zbud.h"
  88
  89/*
  90 * We need to ensure that a struct zbudpage is never larger than a
  91 * struct page.  This is checked with a BUG_ON in zbud_init.
  92 *
  93 * The unevictable field indicates that a zbud is being added to the
  94 * zbudpage.  Since this is a two-phase process (due to tmem locking),
  95 * this field locks the zbudpage against eviction when a zbud match
  96 * or creation is in process.  Since this addition process may occur
  97 * in parallel for two zbuds in one zbudpage, the field is a counter
  98 * that must not exceed two.
  99 */
 100struct zbudpage {
 101        union {
 102                struct page page;
 103                struct {
 104                        unsigned long space_for_flags;
 105                        struct {
 106                                unsigned zbud0_size:PAGE_SHIFT;
 107                                unsigned zbud1_size:PAGE_SHIFT;
 108                                unsigned unevictable:2;
 109                        };
 110                        struct list_head budlist;
 111                        struct list_head lru;
 112                };
 113        };
 114};
 115#if (PAGE_SHIFT * 2) + 2 > BITS_PER_LONG
 116#error "zbud won't work for this arch, PAGE_SIZE is too large"
 117#endif
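
    /*
     * For example, with PAGE_SHIFT == 12 (4KB pages) the three bitfields
     * above need 12 + 12 + 2 == 26 bits, comfortably under BITS_PER_LONG
     * on both 32-bit and 64-bit machines; the #error above fires for any
     * configuration where 2 * PAGE_SHIFT + 2 exceeds BITS_PER_LONG, e.g.
     * a hypothetical 32-bit build with 64KB pages (2 * 16 + 2 == 34 > 32).
     */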
 118
 119struct zbudref {
 120        union {
 121                struct zbudpage *zbudpage;
 122                unsigned long zbudref;
 123        };
 124};
 125
 126#define CHUNK_SHIFT     6
 127#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
 128#define CHUNK_MASK      (~(CHUNK_SIZE-1))
 129#define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT)
 130#define MAX_CHUNK       (NCHUNKS-1)
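
    /*
     * For example, with 4KB pages and CHUNK_SHIFT == 6: CHUNK_SIZE == 64,
     * NCHUNKS == 64 and MAX_CHUNK == 63, so zbud_max_size() below is
     * 63 * 64 == 4032 bytes and a single zpage (including its tmem_handle)
     * can occupy at most 63 of the 64 chunks in a pageframe.
     */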
 131
 132/*
 133 * The following functions deal with the difference between struct
 134 * page and struct zbudpage.  Note the hack of using the pageflags
 135 * from struct page; this is to avoid duplicating all the complex
 136 * pageflag macros.
 137 */
 138static inline void zbudpage_spin_lock(struct zbudpage *zbudpage)
 139{
 140        struct page *page = (struct page *)zbudpage;
 141
 142        while (unlikely(test_and_set_bit_lock(PG_locked, &page->flags))) {
 143                do {
 144                        cpu_relax();
 145                } while (test_bit(PG_locked, &page->flags));
 146        }
 147}
 148
 149static inline void zbudpage_spin_unlock(struct zbudpage *zbudpage)
 150{
 151        struct page *page = (struct page *)zbudpage;
 152
 153        clear_bit(PG_locked, &page->flags);
 154}
 155
 156static inline int zbudpage_spin_trylock(struct zbudpage *zbudpage)
 157{
 158        return trylock_page((struct page *)zbudpage);
 159}
 160
 161static inline int zbudpage_is_locked(struct zbudpage *zbudpage)
 162{
 163        return PageLocked((struct page *)zbudpage);
 164}
 165
 166static inline void *kmap_zbudpage_atomic(struct zbudpage *zbudpage)
 167{
 168        return kmap_atomic((struct page *)zbudpage);
 169}
 170
 171/*
 172 * A dying zbudpage is an ephemeral page in the process of being evicted.
 173 * Any data contained in the zbudpage is invalid and we are just waiting for
 174 * the tmem pampds to be invalidated before freeing the page
 175 */
 176static inline int zbudpage_is_dying(struct zbudpage *zbudpage)
 177{
 178        struct page *page = (struct page *)zbudpage;
 179
 180        return test_bit(PG_reclaim, &page->flags);
 181}
 182
 183static inline void zbudpage_set_dying(struct zbudpage *zbudpage)
 184{
 185        struct page *page = (struct page *)zbudpage;
 186
 187        set_bit(PG_reclaim, &page->flags);
 188}
 189
 190static inline void zbudpage_clear_dying(struct zbudpage *zbudpage)
 191{
 192        struct page *page = (struct page *)zbudpage;
 193
 194        clear_bit(PG_reclaim, &page->flags);
 195}
 196
 197/*
 198 * A zombie zbudpage is a persistent page in the process of being evicted.
 199 * The data contained in the zbudpage is valid and we are just waiting for
 200 * the tmem pampds to be invalidated before freeing the page
 201 */
 202static inline int zbudpage_is_zombie(struct zbudpage *zbudpage)
 203{
 204        struct page *page = (struct page *)zbudpage;
 205
 206        return test_bit(PG_dirty, &page->flags);
 207}
 208
 209static inline void zbudpage_set_zombie(struct zbudpage *zbudpage)
 210{
 211        struct page *page = (struct page *)zbudpage;
 212
 213        set_bit(PG_dirty, &page->flags);
 214}
 215
 216static inline void zbudpage_clear_zombie(struct zbudpage *zbudpage)
 217{
 218        struct page *page = (struct page *)zbudpage;
 219
 220        clear_bit(PG_dirty, &page->flags);
 221}
 222
 223static inline void kunmap_zbudpage_atomic(void *zbpg)
 224{
 225        kunmap_atomic(zbpg);
 226}
 227
 228/*
 229 * zbud "translation" and helper functions
 230 */
 231
 232static inline struct zbudpage *zbudref_to_zbudpage(struct zbudref *zref)
 233{
 234        unsigned long zbud = (unsigned long)zref;
 235        zbud &= ~1UL;
 236        return (struct zbudpage *)zbud;
 237}
 238
 239static inline struct zbudref *zbudpage_to_zbudref(struct zbudpage *zbudpage,
 240                                                        unsigned budnum)
 241{
 242        unsigned long zbud = (unsigned long)zbudpage;
 243        BUG_ON(budnum > 1);
 244        zbud |= budnum;
 245        return (struct zbudref *)zbud;
 246}
 247
 248static inline int zbudref_budnum(struct zbudref *zbudref)
 249{
 250        unsigned long zbud = (unsigned long)zbudref;
 251        return zbud & 1UL;
 252}
 253
 254static inline unsigned zbud_max_size(void)
 255{
 256        return MAX_CHUNK << CHUNK_SHIFT;
 257}
 258
 259static inline unsigned zbud_size_to_chunks(unsigned size)
 260{
 261        BUG_ON(size == 0 || size > zbud_max_size());
 262        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 263}
 264
 265/* can only be used between kmap_zbudpage_atomic/kunmap_zbudpage_atomic! */
 266static inline char *zbud_data(void *zbpg,
 267                        unsigned budnum, unsigned size)
 268{
 269        char *p;
 270
 271        BUG_ON(size == 0 || size > zbud_max_size());
 272        p = (char *)zbpg;
 273        if (budnum == 1)
 274                p += PAGE_SIZE - ((size + CHUNK_SIZE - 1) & CHUNK_MASK);
 275        return p;
 276}
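
    /*
     * For example, on a 4KB page a budnum-1 zbud of size 1000 rounds up to
     * 16 chunks (1024 bytes), so zbud_data() returns zbpg + 4096 - 1024 ==
     * zbpg + 3072, while a budnum-0 zbud always starts at offset 0: the
     * two buddies grow toward each other from opposite ends of the
     * pageframe.
     */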
 277
 278/*
 279 * These are all informative and exposed through debugfs... except for
 280 * the arrays... anyone know how to do that?  To avoid confusion for
 281 * debugfs viewers, some of these should also be atomic_long_t, but
 282 * I don't know how to expose atomics via debugfs either...
 283 */
 284static ssize_t zbud_eph_pageframes;
 285static ssize_t zbud_pers_pageframes;
 286static ssize_t zbud_eph_zpages;
 287static ssize_t zbud_pers_zpages;
 288static u64 zbud_eph_zbytes;
 289static u64 zbud_pers_zbytes;
 290static ssize_t zbud_eph_evicted_pageframes;
 291static ssize_t zbud_pers_evicted_pageframes;
 292static ssize_t zbud_eph_cumul_zpages;
 293static ssize_t zbud_pers_cumul_zpages;
 294static u64 zbud_eph_cumul_zbytes;
 295static u64 zbud_pers_cumul_zbytes;
 296static ssize_t zbud_eph_cumul_chunk_counts[NCHUNKS];
 297static ssize_t zbud_pers_cumul_chunk_counts[NCHUNKS];
 298static ssize_t zbud_eph_buddied_count;
 299static ssize_t zbud_pers_buddied_count;
 300static ssize_t zbud_eph_unbuddied_count;
 301static ssize_t zbud_pers_unbuddied_count;
 302static ssize_t zbud_eph_zombie_count;
 303static ssize_t zbud_pers_zombie_count;
 304static atomic_t zbud_eph_zombie_atomic;
 305static atomic_t zbud_pers_zombie_atomic;
 306
 307#ifdef CONFIG_DEBUG_FS
 308#include <linux/debugfs.h>
 309#define zdfs    debugfs_create_size_t
 310#define zdfs64  debugfs_create_u64
 311static int zbud_debugfs_init(void)
 312{
 313        struct dentry *root = debugfs_create_dir("zbud", NULL);
 314        if (root == NULL)
 315                return -ENXIO;
 316
 317        /*
 318         * would be nice to dump the sizes of the unbuddied
 319         * arrays, like was done with sysfs, but it doesn't
 320         * look like debugfs is flexible enough to do that
 321         */
 322        zdfs64("eph_zbytes", S_IRUGO, root, &zbud_eph_zbytes);
 323        zdfs64("eph_cumul_zbytes", S_IRUGO, root, &zbud_eph_cumul_zbytes);
 324        zdfs64("pers_zbytes", S_IRUGO, root, &zbud_pers_zbytes);
 325        zdfs64("pers_cumul_zbytes", S_IRUGO, root, &zbud_pers_cumul_zbytes);
 326        zdfs("eph_cumul_zpages", S_IRUGO, root, &zbud_eph_cumul_zpages);
 327        zdfs("eph_evicted_pageframes", S_IRUGO, root,
 328                                &zbud_eph_evicted_pageframes);
 329        zdfs("eph_zpages", S_IRUGO, root, &zbud_eph_zpages);
 330        zdfs("eph_pageframes", S_IRUGO, root, &zbud_eph_pageframes);
 331        zdfs("eph_buddied_count", S_IRUGO, root, &zbud_eph_buddied_count);
 332        zdfs("eph_unbuddied_count", S_IRUGO, root, &zbud_eph_unbuddied_count);
 333        zdfs("pers_cumul_zpages", S_IRUGO, root, &zbud_pers_cumul_zpages);
 334        zdfs("pers_evicted_pageframes", S_IRUGO, root,
 335                                &zbud_pers_evicted_pageframes);
 336        zdfs("pers_zpages", S_IRUGO, root, &zbud_pers_zpages);
 337        zdfs("pers_pageframes", S_IRUGO, root, &zbud_pers_pageframes);
 338        zdfs("pers_buddied_count", S_IRUGO, root, &zbud_pers_buddied_count);
 339        zdfs("pers_unbuddied_count", S_IRUGO, root, &zbud_pers_unbuddied_count);
 340        zdfs("pers_zombie_count", S_IRUGO, root, &zbud_pers_zombie_count);
 341        return 0;
 342}
 343#undef  zdfs
 344#undef  zdfs64
 345#else
 346static inline int zbud_debugfs_init(void)
 347{
 348        return 0;
 349}
 350#endif
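
    /*
     * With debugfs enabled (and assuming it is mounted at the conventional
     * /sys/kernel/debug), the counters above can be read directly, e.g.:
     *
     *   cat /sys/kernel/debug/zbud/eph_pageframes
     *   cat /sys/kernel/debug/zbud/pers_zbytes
     */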
 351
 352/* protects the buddied list and all unbuddied lists */
 353static DEFINE_SPINLOCK(zbud_eph_lists_lock);
 354static DEFINE_SPINLOCK(zbud_pers_lists_lock);
 355
 356struct zbud_unbuddied {
 357        struct list_head list;
 358        unsigned count;
 359};
 360
 361/* list N contains pages with N chunks USED and NCHUNKS-N unused */
 362/* element 0 is never used but optimizing that isn't worth it */
 363static struct zbud_unbuddied zbud_eph_unbuddied[NCHUNKS];
 364static struct zbud_unbuddied zbud_pers_unbuddied[NCHUNKS];
 365static LIST_HEAD(zbud_eph_lru_list);
 366static LIST_HEAD(zbud_pers_lru_list);
 367static LIST_HEAD(zbud_eph_buddied_list);
 368static LIST_HEAD(zbud_pers_buddied_list);
 369static LIST_HEAD(zbud_eph_zombie_list);
 370static LIST_HEAD(zbud_pers_zombie_list);
 371
 372/*
 373 * Given a struct page, transform it to a zbudpage so that it can be
 374 * used by zbud and initialize fields as necessary.
 375 */
 376static inline struct zbudpage *zbud_init_zbudpage(struct page *page, bool eph)
 377{
 378        struct zbudpage *zbudpage = (struct zbudpage *)page;
 379
 380        BUG_ON(page == NULL);
 381        INIT_LIST_HEAD(&zbudpage->budlist);
 382        INIT_LIST_HEAD(&zbudpage->lru);
 383        zbudpage->zbud0_size = 0;
 384        zbudpage->zbud1_size = 0;
 385        zbudpage->unevictable = 0;
 386        if (eph)
 387                zbud_eph_pageframes++;
 388        else
 389                zbud_pers_pageframes++;
 390        return zbudpage;
 391}
 392
 393/* "Transform" a zbudpage back to a struct page suitable to free. */
 394static inline struct page *zbud_unuse_zbudpage(struct zbudpage *zbudpage,
 395                                                                bool eph)
 396{
 397        struct page *page = (struct page *)zbudpage;
 398
 399        BUG_ON(!list_empty(&zbudpage->budlist));
 400        BUG_ON(!list_empty(&zbudpage->lru));
 401        BUG_ON(zbudpage->zbud0_size != 0);
 402        BUG_ON(zbudpage->zbud1_size != 0);
 403        BUG_ON(!PageLocked(page));
 404        BUG_ON(zbudpage->unevictable != 0);
 405        BUG_ON(zbudpage_is_dying(zbudpage));
 406        BUG_ON(zbudpage_is_zombie(zbudpage));
 407        if (eph)
 408                zbud_eph_pageframes--;
 409        else
 410                zbud_pers_pageframes--;
 411        zbudpage_spin_unlock(zbudpage);
 412        page_mapcount_reset(page);
 413        init_page_count(page);
 414        page->index = 0;
 415        return page;
 416}
 417
 418/* Mark a zbud as unused and do accounting */
 419static inline void zbud_unuse_zbud(struct zbudpage *zbudpage,
 420                                        int budnum, bool eph)
 421{
 422        unsigned size;
 423
 424        BUG_ON(!zbudpage_is_locked(zbudpage));
 425        if (budnum == 0) {
 426                size = zbudpage->zbud0_size;
 427                zbudpage->zbud0_size = 0;
 428        } else {
 429                size = zbudpage->zbud1_size;
 430                zbudpage->zbud1_size = 0;
 431        }
 432        if (eph) {
 433                zbud_eph_zbytes -= size;
 434                zbud_eph_zpages--;
 435        } else {
 436                zbud_pers_zbytes -= size;
 437                zbud_pers_zpages--;
 438        }
 439}
 440
 441/*
 442 * Given a zbudpage/budnum/size, a tmem handle, and a kmapped pointer
 443 * to some data, set up the zbud appropriately including data copying
 444 * and accounting.  Note that if cdata is NULL, the data copying is
 445 * skipped.  (This is useful for lazy writes such as for RAMster.)
 446 */
 447static void zbud_init_zbud(struct zbudpage *zbudpage, struct tmem_handle *th,
 448                                bool eph, void *cdata,
 449                                unsigned budnum, unsigned size)
 450{
 451        char *to;
 452        void *zbpg;
 453        struct tmem_handle *to_th;
 454        unsigned nchunks = zbud_size_to_chunks(size);
 455
 456        BUG_ON(!zbudpage_is_locked(zbudpage));
 457        zbpg = kmap_zbudpage_atomic(zbudpage);
 458        to = zbud_data(zbpg, budnum, size);
 459        to_th = (struct tmem_handle *)to;
 460        to_th->index = th->index;
 461        to_th->oid = th->oid;
 462        to_th->pool_id = th->pool_id;
 463        to_th->client_id = th->client_id;
 464        to += sizeof(struct tmem_handle);
 465        if (cdata != NULL)
 466                memcpy(to, cdata, size - sizeof(struct tmem_handle));
 467        kunmap_zbudpage_atomic(zbpg);
 468        if (budnum == 0)
 469                zbudpage->zbud0_size = size;
 470        else
 471                zbudpage->zbud1_size = size;
 472        if (eph) {
 473                zbud_eph_cumul_chunk_counts[nchunks]++;
 474                zbud_eph_zpages++;
 475                zbud_eph_cumul_zpages++;
 476                zbud_eph_zbytes += size;
 477                zbud_eph_cumul_zbytes += size;
 478        } else {
 479                zbud_pers_cumul_chunk_counts[nchunks]++;
 480                zbud_pers_zpages++;
 481                zbud_pers_cumul_zpages++;
 482                zbud_pers_zbytes += size;
 483                zbud_pers_cumul_zbytes += size;
 484        }
 485}
 486
 487/*
 488 * Given a locked dying zbudpage, read out the tmem handles from the data,
 489 * unlock the page, then use the handles to tell tmem to flush out its
 490 * references
 491 */
 492static void zbud_evict_tmem(struct zbudpage *zbudpage)
 493{
 494        int i, j;
 495        uint32_t pool_id[2], client_id[2];
 496        uint32_t index[2];
 497        struct tmem_oid oid[2];
 498        struct tmem_pool *pool;
 499        void *zbpg;
 500        struct tmem_handle *th;
 501        unsigned size;
 502
 503        /* read out the tmem handles from the data and set aside */
 504        zbpg = kmap_zbudpage_atomic(zbudpage);
 505        for (i = 0, j = 0; i < 2; i++) {
 506                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
 507                if (size) {
 508                        th = (struct tmem_handle *)zbud_data(zbpg, i, size);
 509                        client_id[j] = th->client_id;
 510                        pool_id[j] = th->pool_id;
 511                        oid[j] = th->oid;
 512                        index[j] = th->index;
 513                        j++;
 514                        zbud_unuse_zbud(zbudpage, i, true);
 515                }
 516        }
 517        kunmap_zbudpage_atomic(zbpg);
 518        zbudpage_spin_unlock(zbudpage);
 519        /* zbudpage is now an unlocked dying... tell tmem to flush pointers */
 520        for (i = 0; i < j; i++) {
 521                pool = zcache_get_pool_by_id(client_id[i], pool_id[i]);
 522                if (pool != NULL) {
 523                        tmem_flush_page(pool, &oid[i], index[i]);
 524                        zcache_put_pool(pool);
 525                }
 526        }
 527}
 528
 529/*
 530 * Externally callable zbud handling routines.
 531 */
 532
 533/*
 534 * Return the maximum size of a compressed page that can be stored
 535 * (secretly setting aside space for the tmem handle).
 536 */
 537unsigned int zbud_max_buddy_size(void)
 538{
 539        return zbud_max_size() - sizeof(struct tmem_handle);
 540}
 541
 542/*
 543 * Given a zbud reference, free the corresponding zbud from all lists,
 544 * mark it as unused, do accounting, and if the freeing of the zbud
 545 * frees up an entire pageframe, return it to the caller (else NULL).
 546 */
 547struct page *zbud_free_and_delist(struct zbudref *zref, bool eph,
 548                                  unsigned int *zsize, unsigned int *zpages)
 549{
 550        unsigned long budnum = zbudref_budnum(zref);
 551        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 552        struct page *page = NULL;
 553        unsigned chunks, bud_size, other_bud_size;
 554        spinlock_t *lists_lock =
 555                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 556        struct zbud_unbuddied *unbud =
 557                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 558
 559
 560        spin_lock(lists_lock);
 561        zbudpage_spin_lock(zbudpage);
 562        if (zbudpage_is_dying(zbudpage)) {
 563                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 564                zbudpage_spin_unlock(zbudpage);
 565                spin_unlock(lists_lock);
 566                *zpages = 0;
 567                *zsize = 0;
 568                goto out;
 569        }
 570        if (budnum == 0) {
 571                bud_size = zbudpage->zbud0_size;
 572                other_bud_size = zbudpage->zbud1_size;
 573        } else {
 574                bud_size = zbudpage->zbud1_size;
 575                other_bud_size = zbudpage->zbud0_size;
 576        }
 577        *zsize = bud_size - sizeof(struct tmem_handle);
 578        *zpages = 1;
 579        zbud_unuse_zbud(zbudpage, budnum, eph);
 580        if (other_bud_size == 0) { /* was unbuddied: unlist and free */
 581                chunks = zbud_size_to_chunks(bud_size);
 582                if (zbudpage_is_zombie(zbudpage)) {
 583                        if (eph)
 584                                zbud_eph_zombie_count =
 585                                  atomic_dec_return(&zbud_eph_zombie_atomic);
 586                        else
 587                                zbud_pers_zombie_count =
 588                                  atomic_dec_return(&zbud_pers_zombie_atomic);
 589                        zbudpage_clear_zombie(zbudpage);
 590                } else {
 591                        BUG_ON(list_empty(&unbud[chunks].list));
 592                        list_del_init(&zbudpage->budlist);
 593                        unbud[chunks].count--;
 594                }
 595                list_del_init(&zbudpage->lru);
 596                spin_unlock(lists_lock);
 597                if (eph)
 598                        zbud_eph_unbuddied_count--;
 599                else
 600                        zbud_pers_unbuddied_count--;
 601                page = zbud_unuse_zbudpage(zbudpage, eph);
 602        } else { /* was buddied: move remaining buddy to unbuddied list */
 603                chunks = zbud_size_to_chunks(other_bud_size);
 604                if (!zbudpage_is_zombie(zbudpage)) {
 605                        list_del_init(&zbudpage->budlist);
 606                        list_add_tail(&zbudpage->budlist, &unbud[chunks].list);
 607                        unbud[chunks].count++;
 608                }
 609                if (eph) {
 610                        zbud_eph_buddied_count--;
 611                        zbud_eph_unbuddied_count++;
 612                } else {
 613                        zbud_pers_unbuddied_count++;
 614                        zbud_pers_buddied_count--;
 615                }
 616                /* don't mess with lru, no need to move it */
 617                zbudpage_spin_unlock(zbudpage);
 618                spin_unlock(lists_lock);
 619        }
 620out:
 621        return page;
 622}
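
    /*
     * A minimal sketch of the expected caller pattern for the above
     * (zcache itself is the real caller; the function name here is
     * illustrative only): since zbud never frees pageframes itself, the
     * caller must release any struct page handed back once the last zbud
     * in it is freed, and would feed *zsize/*zpages into its accounting.
     */
    #if 0
    static void example_flush_zpage(struct zbudref *zref, bool eph)
    {
            unsigned int zsize, zpages;
            struct page *page;

            page = zbud_free_and_delist(zref, eph, &zsize, &zpages);
            if (page != NULL)
                    __free_page(page);      /* pageframe is now empty */
    }
    #endif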
 623
 624/*
 625 * Given a tmem handle, and a kmapped pointer to compressed data of
 626 * the given size, try to find an unbuddied zbudpage in which to
 627 * create a zbud. If found, put it there, mark the zbudpage unevictable,
 628 * and return a zbudref to it.  Else return NULL.
 629 */
 630struct zbudref *zbud_match_prep(struct tmem_handle *th, bool eph,
 631                                void *cdata, unsigned size)
 632{
 633        struct zbudpage *zbudpage = NULL, *zbudpage2;
 634        unsigned long budnum = 0UL;
 635        unsigned nchunks;
 636        int i, found_good_buddy = 0;
 637        spinlock_t *lists_lock =
 638                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 639        struct zbud_unbuddied *unbud =
 640                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 641
 642        size += sizeof(struct tmem_handle);
 643        nchunks = zbud_size_to_chunks(size);
 644        for (i = MAX_CHUNK - nchunks + 1; i > 0; i--) {
 645                spin_lock(lists_lock);
 646                if (!list_empty(&unbud[i].list)) {
 647                        list_for_each_entry_safe(zbudpage, zbudpage2,
 648                                    &unbud[i].list, budlist) {
 649                                if (zbudpage_spin_trylock(zbudpage)) {
 650                                        found_good_buddy = i;
 651                                        goto found_unbuddied;
 652                                }
 653                        }
 654                }
 655                spin_unlock(lists_lock);
 656        }
 657        zbudpage = NULL;
 658        goto out;
 659
 660found_unbuddied:
 661        BUG_ON(!zbudpage_is_locked(zbudpage));
 662        BUG_ON(!((zbudpage->zbud0_size == 0) ^ (zbudpage->zbud1_size == 0)));
 663        if (zbudpage->zbud0_size == 0)
 664                budnum = 0UL;
 665        else if (zbudpage->zbud1_size == 0)
 666                budnum = 1UL;
 667        list_del_init(&zbudpage->budlist);
 668        if (eph) {
 669                list_add_tail(&zbudpage->budlist, &zbud_eph_buddied_list);
 670                unbud[found_good_buddy].count--;
 671                zbud_eph_unbuddied_count--;
 672                zbud_eph_buddied_count++;
 673                /* "promote" raw zbudpage to most-recently-used */
 674                list_del_init(&zbudpage->lru);
 675                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
 676        } else {
 677                list_add_tail(&zbudpage->budlist, &zbud_pers_buddied_list);
 678                unbud[found_good_buddy].count--;
 679                zbud_pers_unbuddied_count--;
 680                zbud_pers_buddied_count++;
 681                /* "promote" raw zbudpage to most-recently-used */
 682                list_del_init(&zbudpage->lru);
 683                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
 684        }
 685        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
 686        zbudpage->unevictable++;
 687        BUG_ON(zbudpage->unevictable == 3);
 688        zbudpage_spin_unlock(zbudpage);
 689        spin_unlock(lists_lock);
 690out:
 691        return zbudpage_to_zbudref(zbudpage, budnum);
 692
 693}
 694
 695/*
 696 * Given a tmem handle, and a kmapped pointer to compressed data of
 697 * the given size, and a newly allocated struct page, create an unevictable
 698 * zbud in that new page and return a zbudref to it.
 699 */
 700struct zbudref *zbud_create_prep(struct tmem_handle *th, bool eph,
 701                                        void *cdata, unsigned size,
 702                                        struct page *newpage)
 703{
 704        struct zbudpage *zbudpage;
 705        unsigned long budnum = 0;
 706        unsigned nchunks;
 707        spinlock_t *lists_lock =
 708                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 709        struct zbud_unbuddied *unbud =
 710                eph ? zbud_eph_unbuddied : zbud_pers_unbuddied;
 711
 712#if 0
 713        /* this may be worth it later to support decompress-in-place? */
 714        static unsigned long counter;
 715        budnum = counter++ & 1; /* alternate using zbud0 and zbud1 */
 716#endif
 717
 718        if (size > zbud_max_buddy_size())
 719                return NULL;
 720        if (newpage == NULL)
 721                return NULL;
 722
 723        size += sizeof(struct tmem_handle);
 724        nchunks = zbud_size_to_chunks(size);
 725        spin_lock(lists_lock);
 726        zbudpage = zbud_init_zbudpage(newpage, eph);
 727        zbudpage_spin_lock(zbudpage);
 728        list_add_tail(&zbudpage->budlist, &unbud[nchunks].list);
 729        if (eph) {
 730                list_add_tail(&zbudpage->lru, &zbud_eph_lru_list);
 731                zbud_eph_unbuddied_count++;
 732        } else {
 733                list_add_tail(&zbudpage->lru, &zbud_pers_lru_list);
 734                zbud_pers_unbuddied_count++;
 735        }
 736        unbud[nchunks].count++;
 737        zbud_init_zbud(zbudpage, th, eph, cdata, budnum, size);
 738        zbudpage->unevictable++;
 739        BUG_ON(zbudpage->unevictable == 3);
 740        zbudpage_spin_unlock(zbudpage);
 741        spin_unlock(lists_lock);
 742        return zbudpage_to_zbudref(zbudpage, budnum);
 743}
 744
 745/*
 746 * Finish creation of a zbud by marking it evictable, assuming no other
 747 * zbud is still being created in it in parallel.
 748 */
 749void zbud_create_finish(struct zbudref *zref, bool eph)
 750{
 751        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 752        spinlock_t *lists_lock =
 753                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 754
 755        spin_lock(lists_lock);
 756        zbudpage_spin_lock(zbudpage);
 757        BUG_ON(zbudpage_is_dying(zbudpage));
 758        zbudpage->unevictable--;
 759        BUG_ON((int)zbudpage->unevictable < 0);
 760        zbudpage_spin_unlock(zbudpage);
 761        spin_unlock(lists_lock);
 762}
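
    /*
     * A minimal sketch of how a caller might store one compressed zpage
     * using the three-step protocol above: try to buddy it into an existing
     * unbuddied pageframe, otherwise create it in a caller-allocated page,
     * then finish to drop the unevictable hold taken by the *_prep() call.
     * The function name, GFP flags and error handling are illustrative,
     * not the actual zcache code.
     */
    #if 0
    static struct zbudref *example_store_zpage(struct tmem_handle *th, bool eph,
                                               void *cdata, unsigned size)
    {
            struct zbudref *zref;
            struct page *newpage;

            if (size > zbud_max_buddy_size())
                    return NULL;            /* too large for zbud to pack */
            zref = zbud_match_prep(th, eph, cdata, size);
            if (zref == NULL) {
                    newpage = alloc_page(GFP_ATOMIC);  /* caller owns pages */
                    zref = zbud_create_prep(th, eph, cdata, size, newpage);
                    if (zref == NULL && newpage != NULL)
                            __free_page(newpage);
            }
            if (zref != NULL)
                    zbud_create_finish(zref, eph);
            return zref;
    }
    #endif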
 763
 764/*
 765 * Given a zbudref and a struct page, decompress the data from
 766 * the zbud into the physical page represented by the struct page
 767 * by upcalling to zcache_decompress
 768 */
 769int zbud_decompress(struct page *data_page, struct zbudref *zref, bool eph,
 770                        void (*decompress)(char *, unsigned int, char *))
 771{
 772        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 773        unsigned long budnum = zbudref_budnum(zref);
 774        void *zbpg;
 775        char *to_va, *from_va;
 776        unsigned size;
 777        int ret = -1;
 778        spinlock_t *lists_lock =
 779                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 780
 781        spin_lock(lists_lock);
 782        zbudpage_spin_lock(zbudpage);
 783        if (zbudpage_is_dying(zbudpage)) {
 784                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 785                goto out;
 786        }
 787        zbpg = kmap_zbudpage_atomic(zbudpage);
 788        to_va = kmap_atomic(data_page);
 789        if (budnum == 0)
 790                size = zbudpage->zbud0_size;
 791        else
 792                size = zbudpage->zbud1_size;
 793        BUG_ON(size == 0 || size > zbud_max_size());
 794        from_va = zbud_data(zbpg, budnum, size);
 795        from_va += sizeof(struct tmem_handle);
 796        size -= sizeof(struct tmem_handle);
 797        decompress(from_va, size, to_va);
 798        kunmap_atomic(to_va);
 799        kunmap_zbudpage_atomic(zbpg);
 800        ret = 0;
 801out:
 802        zbudpage_spin_unlock(zbudpage);
 803        spin_unlock(lists_lock);
 804        return ret;
 805}
 806
 807/*
 808 * Given a zbudref and a kernel pointer, copy the data from
 809 * the zbud to the kernel pointer.
 810 */
 811int zbud_copy_from_zbud(char *to_va, struct zbudref *zref,
 812                                size_t *sizep, bool eph)
 813{
 814        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 815        unsigned long budnum = zbudref_budnum(zref);
 816        void *zbpg;
 817        char *from_va;
 818        unsigned size;
 819        int ret = -1;
 820        spinlock_t *lists_lock =
 821                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 822
 823        spin_lock(lists_lock);
 824        zbudpage_spin_lock(zbudpage);
 825        if (zbudpage_is_dying(zbudpage)) {
 826                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 827                goto out;
 828        }
 829        zbpg = kmap_zbudpage_atomic(zbudpage);
 830        if (budnum == 0)
 831                size = zbudpage->zbud0_size;
 832        else
 833                size = zbudpage->zbud1_size;
 834        BUG_ON(size == 0 || size > zbud_max_size());
 835        from_va = zbud_data(zbpg, budnum, size);
 836        from_va += sizeof(struct tmem_handle);
 837        size -= sizeof(struct tmem_handle);
 838        *sizep = size;
 839        memcpy(to_va, from_va, size);
 840
 841        kunmap_zbudpage_atomic(zbpg);
 842        ret = 0;
 843out:
 844        zbudpage_spin_unlock(zbudpage);
 845        spin_unlock(lists_lock);
 846        return ret;
 847}
 848
 849/*
 850 * Given a zbudref and a kernel pointer, copy the data from
 851 * the kernel pointer to the zbud.
 852 */
 853int zbud_copy_to_zbud(struct zbudref *zref, char *from_va, bool eph)
 854{
 855        struct zbudpage *zbudpage = zbudref_to_zbudpage(zref);
 856        unsigned long budnum = zbudref_budnum(zref);
 857        void *zbpg;
 858        char *to_va;
 859        unsigned size;
 860        int ret = -1;
 861        spinlock_t *lists_lock =
 862                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 863
 864        spin_lock(lists_lock);
 865        zbudpage_spin_lock(zbudpage);
 866        if (zbudpage_is_dying(zbudpage)) {
 867                /* ignore dying zbudpage... see zbud_evict_pageframe_lru() */
 868                goto out;
 869        }
 870        zbpg = kmap_zbudpage_atomic(zbudpage);
 871        if (budnum == 0)
 872                size = zbudpage->zbud0_size;
 873        else
 874                size = zbudpage->zbud1_size;
 875        BUG_ON(size == 0 || size > zbud_max_size());
 876        to_va = zbud_data(zbpg, budnum, size);
 877        to_va += sizeof(struct tmem_handle);
 878        size -= sizeof(struct tmem_handle);
 879        memcpy(to_va, from_va, size);
 880
 881        kunmap_zbudpage_atomic(zbpg);
 882        ret = 0;
 883out:
 884        zbudpage_spin_unlock(zbudpage);
 885        spin_unlock(lists_lock);
 886        return ret;
 887}
 888
 889/*
 890 * Choose an ephemeral LRU zbudpage that is evictable (not locked), ensure
 891 * there are no references to it remaining, and return the now unused
 892 * (and re-init'ed) struct page and the total amount of compressed
 893 * data that was evicted.
 894 */
 895struct page *zbud_evict_pageframe_lru(unsigned int *zsize, unsigned int *zpages)
 896{
 897        struct zbudpage *zbudpage = NULL, *zbudpage2;
 898        struct zbud_unbuddied *unbud = zbud_eph_unbuddied;
 899        struct page *page = NULL;
 900        bool irqs_disabled = irqs_disabled();
 901
 902        /*
 903         * Since this can be called indirectly from cleancache_put, which
 904         * has interrupts disabled, as well as frontswap_put, which does not,
 905         * we need to be able to handle both cases, even though it is ugly.
 906         */
 907        if (irqs_disabled)
 908                spin_lock(&zbud_eph_lists_lock);
 909        else
 910                spin_lock_bh(&zbud_eph_lists_lock);
 911        *zsize = 0;
 912        if (list_empty(&zbud_eph_lru_list))
 913                goto unlock_out;
 914        list_for_each_entry_safe(zbudpage, zbudpage2, &zbud_eph_lru_list, lru) {
 915                /* skip a locked zbudpage */
 916                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
 917                        continue;
 918                /* skip an unevictable zbudpage */
 919                if (unlikely(zbudpage->unevictable != 0)) {
 920                        zbudpage_spin_unlock(zbudpage);
 921                        continue;
 922                }
 923                /* got a locked evictable page */
 924                goto evict_page;
 925
 926        }
 927unlock_out:
 928        /* no unlocked evictable pages, give up */
 929        if (irqs_disabled)
 930                spin_unlock(&zbud_eph_lists_lock);
 931        else
 932                spin_unlock_bh(&zbud_eph_lists_lock);
 933        goto out;
 934
 935evict_page:
 936        list_del_init(&zbudpage->budlist);
 937        list_del_init(&zbudpage->lru);
 938        zbudpage_set_dying(zbudpage);
 939        /*
 940         * the zbudpage is now "dying" and attempts to read, write,
 941         * or delete data from it will be ignored
 942         */
 943        if (zbudpage->zbud0_size != 0 && zbudpage->zbud1_size != 0) {
 944                *zsize = zbudpage->zbud0_size + zbudpage->zbud1_size -
 945                                (2 * sizeof(struct tmem_handle));
 946                *zpages = 2;
 947        } else if (zbudpage->zbud0_size != 0) {
 948                unbud[zbud_size_to_chunks(zbudpage->zbud0_size)].count--;
 949                *zsize = zbudpage->zbud0_size - sizeof(struct tmem_handle);
 950                *zpages = 1;
 951        } else if (zbudpage->zbud1_size != 0) {
 952                unbud[zbud_size_to_chunks(zbudpage->zbud1_size)].count--;
 953                *zsize = zbudpage->zbud1_size - sizeof(struct tmem_handle);
 954                *zpages = 1;
 955        } else {
 956                BUG();
 957        }
 958        spin_unlock(&zbud_eph_lists_lock);
 959        zbud_eph_evicted_pageframes++;
 960        if (*zpages == 1)
 961                zbud_eph_unbuddied_count--;
 962        else
 963                zbud_eph_buddied_count--;
 964        zbud_evict_tmem(zbudpage);
 965        zbudpage_spin_lock(zbudpage);
 966        zbudpage_clear_dying(zbudpage);
 967        page = zbud_unuse_zbudpage(zbudpage, true);
 968        if (!irqs_disabled)
 969                local_bh_enable();
 970out:
 971        return page;
 972}
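
    /*
     * A minimal sketch of an eviction loop built on the routine above; the
     * function name is illustrative (in the real driver, zcache's reclaim
     * path plays this role).  Each successful call hands back one emptied,
     * re-initialized pageframe for the caller to free or reuse.
     */
    #if 0
    static int example_evict_ephemeral_pageframes(int nr)
    {
            unsigned int zsize, zpages;
            struct page *page;
            int evicted = 0;

            while (evicted < nr) {
                    page = zbud_evict_pageframe_lru(&zsize, &zpages);
                    if (page == NULL)
                            break;          /* nothing evictable right now */
                    __free_page(page);      /* or recycle it for a new store */
                    evicted++;
            }
            return evicted;
    }
    #endif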
 973
 974/*
 975 * Choose a persistent LRU zbudpage that is evictable (not locked), zombify it,
 976 * read the tmem_handle(s) out of it into the passed array, and return the
 977 * number of zbuds.  Caller must perform necessary tmem functions and,
 978 * indirectly, zbud functions to fetch any valid data and cause the
 979 * now-zombified zbudpage to eventually be freed.  We track the zombified
 980 * zbudpage count so it is possible to observe if there is a leak.
 981 * FIXME: describe (ramster) case where data pointers are passed in for memcpy
 982 */
 983unsigned int zbud_make_zombie_lru(struct tmem_handle *th, unsigned char **data,
 984                                        unsigned int *zsize, bool eph)
 985{
 986        struct zbudpage *zbudpage = NULL, *zbudpage2;
 987        struct tmem_handle *thfrom;
 988        char *from_va;
 989        void *zbpg;
 990        unsigned size;
 991        int ret = 0, i;
 992        spinlock_t *lists_lock =
 993                eph ? &zbud_eph_lists_lock : &zbud_pers_lists_lock;
 994        struct list_head *lru_list =
 995                eph ? &zbud_eph_lru_list : &zbud_pers_lru_list;
 996
 997        spin_lock_bh(lists_lock);
 998        if (list_empty(lru_list))
 999                goto out;
1000        list_for_each_entry_safe(zbudpage, zbudpage2, lru_list, lru) {
1001                /* skip a locked zbudpage */
1002                if (unlikely(!zbudpage_spin_trylock(zbudpage)))
1003                        continue;
1004                /* skip an unevictable zbudpage */
1005                if (unlikely(zbudpage->unevictable != 0)) {
1006                        zbudpage_spin_unlock(zbudpage);
1007                        continue;
1008                }
1009                /* got a locked evictable page */
1010                goto zombify_page;
1011        }
1012        /* no unlocked evictable pages, give up */
1013        goto out;
1014
1015zombify_page:
1016        /* got an unlocked evictable page, zombify it */
1017        list_del_init(&zbudpage->budlist);
1018        zbudpage_set_zombie(zbudpage);
1019        /* FIXME what accounting do I need to do here? */
1020        list_del_init(&zbudpage->lru);
1021        if (eph) {
1022                list_add_tail(&zbudpage->lru, &zbud_eph_zombie_list);
1023                zbud_eph_zombie_count =
1024                                atomic_inc_return(&zbud_eph_zombie_atomic);
1025        } else {
1026                list_add_tail(&zbudpage->lru, &zbud_pers_zombie_list);
1027                zbud_pers_zombie_count =
1028                                atomic_inc_return(&zbud_pers_zombie_atomic);
1029        }
1030        /* FIXME what accounting do I need to do here? */
1031        zbpg = kmap_zbudpage_atomic(zbudpage);
1032        for (i = 0; i < 2; i++) {
1033                size = (i == 0) ? zbudpage->zbud0_size : zbudpage->zbud1_size;
1034                if (size) {
1035                        from_va = zbud_data(zbpg, i, size);
1036                        thfrom = (struct tmem_handle *)from_va;
1037                        from_va += sizeof(struct tmem_handle);
1038                        size -= sizeof(struct tmem_handle);
1039                        if (th != NULL)
1040                                th[ret] = *thfrom;
1041                        if (data != NULL)
1042                                memcpy(data[ret], from_va, size);
1043                        if (zsize != NULL)
1044                                *zsize++ = size;
1045                        ret++;
1046                }
1047        }
1048        kunmap_zbudpage_atomic(zbpg);
1049        zbudpage_spin_unlock(zbudpage);
1050out:
1051        spin_unlock_bh(lists_lock);
1052        return ret;
1053}
1054
1055void zbud_init(void)
1056{
1057        int i;
1058
1059        zbud_debugfs_init();
1060        BUG_ON((sizeof(struct tmem_handle) * 2 > CHUNK_SIZE));
1061        BUG_ON(sizeof(struct zbudpage) > sizeof(struct page));
1062        for (i = 0; i < NCHUNKS; i++) {
1063                INIT_LIST_HEAD(&zbud_eph_unbuddied[i].list);
1064                INIT_LIST_HEAD(&zbud_pers_unbuddied[i].list);
1065        }
1066}
1067