linux/mm/z3fold.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * z3fold.c
   4 *
   5 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
   6 * Copyright (C) 2016, Sony Mobile Communications Inc.
   7 *
   8 * This implementation is based on zbud written by Seth Jennings.
   9 *
  10 * z3fold is a special purpose allocator for storing compressed pages. It
  11 * can store up to three compressed pages per page, which improves the
  12 * compression ratio of zbud while retaining its main concepts (e.g. always
  13 * storing an integral number of objects per page) and simplicity.
  14 * It still has simple and deterministic reclaim properties that make it
  15 * preferable to a higher density approach (with no requirement on integral
  16 * number of objects per page) when reclaim is used.
  17 *
  18 * As in zbud, pages are divided into "chunks".  The size of the chunks is
  19 * fixed at compile time and is determined by NCHUNKS_ORDER below.
  20 *
  21 * z3fold doesn't export any API and is meant to be used via zpool API.
  22 */
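/*
 * A usage sketch (not part of this file): z3fold is reached through zpool, so
 * a user such as zswap would, roughly, do the following (assuming the zpool
 * API of this kernel generation; "ops" is the caller's struct zpool_ops with
 * an evict callback):
 *
 *	struct zpool *zp = zpool_create_pool("z3fold", "name", GFP_KERNEL, &ops);
 *	unsigned long handle;
 *
 *	zpool_malloc(zp, size, GFP_KERNEL, &handle);
 *	...
 *	zpool_free(zp, handle);
 *	zpool_destroy_pool(zp);
 */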
  23
  24#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  25
  26#include <linux/atomic.h>
  27#include <linux/sched.h>
  28#include <linux/cpumask.h>
  29#include <linux/list.h>
  30#include <linux/mm.h>
  31#include <linux/module.h>
  32#include <linux/page-flags.h>
  33#include <linux/migrate.h>
  34#include <linux/node.h>
  35#include <linux/compaction.h>
  36#include <linux/percpu.h>
  37#include <linux/mount.h>
  38#include <linux/pseudo_fs.h>
  39#include <linux/fs.h>
  40#include <linux/preempt.h>
  41#include <linux/workqueue.h>
  42#include <linux/slab.h>
  43#include <linux/spinlock.h>
  44#include <linux/zpool.h>
  45#include <linux/magic.h>
  46#include <linux/kmemleak.h>
  47
  48/*
  49 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
  50 * adjusting internal fragmentation.  It also determines the number of
  51 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
  52 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
  53 * at the beginning of an allocated page are occupied by the z3fold header,
  54 * so NCHUNKS works out to 63 (or 62 when CONFIG_DEBUG_SPINLOCK=y), which is
  55 * the maximum number of free chunks in a z3fold page; there will likewise
  56 * be 63 (or 62, respectively) freelists per pool.
  57 */
  58#define NCHUNKS_ORDER   6
  59
  60#define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
  61#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
  62#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
  63#define ZHDR_CHUNKS     (ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
  64#define TOTAL_CHUNKS    (PAGE_SIZE >> CHUNK_SHIFT)
  65#define NCHUNKS         (TOTAL_CHUNKS - ZHDR_CHUNKS)
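/*
 * As an illustration (assuming 4 KiB pages, i.e. PAGE_SHIFT == 12):
 * CHUNK_SHIFT == 6, so CHUNK_SIZE == 64 bytes and TOTAL_CHUNKS == 64; if the
 * z3fold header fits in a single chunk, ZHDR_CHUNKS == 1 and NCHUNKS == 63.
 */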
  66
  67#define BUDDY_MASK      (0x3)
  68#define BUDDY_SHIFT     2
  69#define SLOTS_ALIGN     (0x40)
  70
  71/*****************
  72 * Structures
  73*****************/
  74struct z3fold_pool;
  75struct z3fold_ops {
  76        int (*evict)(struct z3fold_pool *pool, unsigned long handle);
  77};
  78
  79enum buddy {
  80        HEADLESS = 0,
  81        FIRST,
  82        MIDDLE,
  83        LAST,
  84        BUDDIES_MAX = LAST
  85};
  86
  87struct z3fold_buddy_slots {
  88        /*
  89         * we are using BUDDY_MASK in handle_to_buddy etc. so there should
  90         * be enough slots to hold all possible variants
  91         */
  92        unsigned long slot[BUDDY_MASK + 1];
  93        unsigned long pool; /* back link */
  94        rwlock_t lock;
  95};
  96#define HANDLE_FLAG_MASK        (0x03)
  97
  98/*
  99 * struct z3fold_header - z3fold page metadata occupying first chunks of each
 100 *                      z3fold page, except for HEADLESS pages
 101 * @buddy:              links the z3fold page into the relevant list in the
 102 *                      pool
 103 * @page_lock:          per-page lock
 104 * @refcount:           reference count for the z3fold page
 105 * @work:               work_struct for page layout optimization
 106 * @slots:              pointer to the structure holding buddy slots
 107 * @pool:               pointer to the containing pool
 108 * @cpu:                CPU which this page "belongs" to
 109 * @first_chunks:       the size of the first buddy in chunks, 0 if free
 110 * @middle_chunks:      the size of the middle buddy in chunks, 0 if free
 111 * @last_chunks:        the size of the last buddy in chunks, 0 if free
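 * @start_middle:       index of the first chunk occupied by the middle buddy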
 112 * @first_num:          the starting number (for the first handle)
 113 * @mapped_count:       the number of objects currently mapped
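 * @foreign_handles:    number of handles pointing into this page that live in
 *                      another page's slots structure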
 114 */
 115struct z3fold_header {
 116        struct list_head buddy;
 117        spinlock_t page_lock;
 118        struct kref refcount;
 119        struct work_struct work;
 120        struct z3fold_buddy_slots *slots;
 121        struct z3fold_pool *pool;
 122        short cpu;
 123        unsigned short first_chunks;
 124        unsigned short middle_chunks;
 125        unsigned short last_chunks;
 126        unsigned short start_middle;
 127        unsigned short first_num:2;
 128        unsigned short mapped_count:2;
 129        unsigned short foreign_handles:2;
 130};
 131
 132/**
 133 * struct z3fold_pool - stores metadata for each z3fold pool
 134 * @name:       pool name
 135 * @lock:       protects pool unbuddied/lru lists
 136 * @stale_lock: protects pool stale page list
 137 * @unbuddied:  per-cpu array of lists tracking z3fold pages that contain at
 138 *              most two buddies; the list each z3fold page is added to
 139 *              depends on the size of its free region.
 140 * @lru:        list tracking the z3fold pages in LRU order by most recently
 141 *              added buddy.
 142 * @stale:      list of pages marked for freeing
 143 * @pages_nr:   number of z3fold pages in the pool.
 144 * @c_handle:   cache for z3fold_buddy_slots allocation
 145 * @ops:        pointer to a structure of user defined operations specified at
 146 *              pool creation time.
 147 * @zpool:      zpool driver
 148 * @zpool_ops:  zpool operations structure with an evict callback
 149 * @compact_wq: workqueue for page layout background optimization
 150 * @release_wq: workqueue for safe page release
 151 * @work:       work_struct for safe page release
 152 * @inode:      inode for z3fold pseudo filesystem
 153 *
 154 * This structure is allocated at pool creation time and maintains metadata
 155 * pertaining to a particular z3fold pool.
 156 */
 157struct z3fold_pool {
 158        const char *name;
 159        spinlock_t lock;
 160        spinlock_t stale_lock;
 161        struct list_head *unbuddied;
 162        struct list_head lru;
 163        struct list_head stale;
 164        atomic64_t pages_nr;
 165        struct kmem_cache *c_handle;
 166        const struct z3fold_ops *ops;
 167        struct zpool *zpool;
 168        const struct zpool_ops *zpool_ops;
 169        struct workqueue_struct *compact_wq;
 170        struct workqueue_struct *release_wq;
 171        struct work_struct work;
 172        struct inode *inode;
 173};
 174
 175/*
 176 * Internal z3fold page flags
 177 */
 178enum z3fold_page_flags {
 179        PAGE_HEADLESS = 0,
 180        MIDDLE_CHUNK_MAPPED,
 181        NEEDS_COMPACTING,
 182        PAGE_STALE,
 183        PAGE_CLAIMED, /* by either reclaim or free */
 184};
 185
 186/*
 187 * handle flags, go under HANDLE_FLAG_MASK
 188 */
 189enum z3fold_handle_flags {
 190        HANDLES_NOFREE = 0,
 191};
 192
 193/*
 194 * Forward declarations
 195 */
 196static struct z3fold_header *__z3fold_alloc(struct z3fold_pool *, size_t, bool);
 197static void compact_page_work(struct work_struct *w);
 198
 199/*****************
 200 * Helpers
 201*****************/
 202
 203/* Converts an allocation size in bytes to size in z3fold chunks */
 204static int size_to_chunks(size_t size)
 205{
 206        return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
 207}
 208
 209#define for_each_unbuddied_list(_iter, _begin) \
 210        for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
 211
 212static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
 213                                                        gfp_t gfp)
 214{
 215        struct z3fold_buddy_slots *slots;
 216
 217        slots = kmem_cache_zalloc(pool->c_handle,
 218                                 (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
 219
 220        if (slots) {
 221                /* It will be freed separately in free_handle(). */
 222                kmemleak_not_leak(slots);
 223                slots->pool = (unsigned long)pool;
 224                rwlock_init(&slots->lock);
 225        }
 226
 227        return slots;
 228}
 229
 230static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
 231{
 232        return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
 233}
 234
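/*
 * A non-headless handle is the address of a slot within a z3fold_buddy_slots
 * structure; since those structures are allocated with SLOTS_ALIGN alignment,
 * masking off the low bits of the handle recovers the containing structure.
 */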
 235static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
 236{
 237        return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
 238}
 239
 240/* Lock a z3fold page */
 241static inline void z3fold_page_lock(struct z3fold_header *zhdr)
 242{
 243        spin_lock(&zhdr->page_lock);
 244}
 245
 246/* Try to lock a z3fold page */
 247static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
 248{
 249        return spin_trylock(&zhdr->page_lock);
 250}
 251
 252/* Unlock a z3fold page */
 253static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
 254{
 255        spin_unlock(&zhdr->page_lock);
 256}
 257
 258/* return the z3fold page header; the page is locked unless it is headless */
 259static inline struct z3fold_header *get_z3fold_header(unsigned long handle)
 260{
 261        struct z3fold_buddy_slots *slots;
 262        struct z3fold_header *zhdr;
 263        int locked = 0;
 264
 265        if (!(handle & (1 << PAGE_HEADLESS))) {
 266                slots = handle_to_slots(handle);
 267                do {
 268                        unsigned long addr;
 269
 270                        read_lock(&slots->lock);
 271                        addr = *(unsigned long *)handle;
 272                        zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
 273                        locked = z3fold_page_trylock(zhdr);
 274                        read_unlock(&slots->lock);
 275                        if (locked)
 276                                break;
 277                        cpu_relax();
 278                } while (true);
 279        } else {
 280                zhdr = (struct z3fold_header *)(handle & PAGE_MASK);
 281        }
 282
 283        return zhdr;
 284}
 285
 286static inline void put_z3fold_header(struct z3fold_header *zhdr)
 287{
 288        struct page *page = virt_to_page(zhdr);
 289
 290        if (!test_bit(PAGE_HEADLESS, &page->private))
 291                z3fold_page_unlock(zhdr);
 292}
 293
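/*
 * Clear the slot a handle refers to and, unless HANDLES_NOFREE is set, free
 * the whole slots structure once none of its slots is in use any more.
 */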
 294static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
 295{
 296        struct z3fold_buddy_slots *slots;
 297        int i;
 298        bool is_free;
 299
 300        if (handle & (1 << PAGE_HEADLESS))
 301                return;
 302
 303        if (WARN_ON(*(unsigned long *)handle == 0))
 304                return;
 305
 306        slots = handle_to_slots(handle);
 307        write_lock(&slots->lock);
 308        *(unsigned long *)handle = 0;
 309
 310        if (test_bit(HANDLES_NOFREE, &slots->pool)) {
 311                write_unlock(&slots->lock);
 312                return; /* simple case, nothing else to do */
 313        }
 314
 315        if (zhdr->slots != slots)
 316                zhdr->foreign_handles--;
 317
 318        is_free = true;
 319        for (i = 0; i <= BUDDY_MASK; i++) {
 320                if (slots->slot[i]) {
 321                        is_free = false;
 322                        break;
 323                }
 324        }
 325        write_unlock(&slots->lock);
 326
 327        if (is_free) {
 328                struct z3fold_pool *pool = slots_to_pool(slots);
 329
 330                if (zhdr->slots == slots)
 331                        zhdr->slots = NULL;
 332                kmem_cache_free(pool->c_handle, slots);
 333        }
 334}
 335
 336static int z3fold_init_fs_context(struct fs_context *fc)
 337{
 338        return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
 339}
 340
 341static struct file_system_type z3fold_fs = {
 342        .name           = "z3fold",
 343        .init_fs_context = z3fold_init_fs_context,
 344        .kill_sb        = kill_anon_super,
 345};
 346
 347static struct vfsmount *z3fold_mnt;
 348static int z3fold_mount(void)
 349{
 350        int ret = 0;
 351
 352        z3fold_mnt = kern_mount(&z3fold_fs);
 353        if (IS_ERR(z3fold_mnt))
 354                ret = PTR_ERR(z3fold_mnt);
 355
 356        return ret;
 357}
 358
 359static void z3fold_unmount(void)
 360{
 361        kern_unmount(z3fold_mnt);
 362}
 363
 364static const struct address_space_operations z3fold_aops;
 365static int z3fold_register_migration(struct z3fold_pool *pool)
 366{
 367        pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
 368        if (IS_ERR(pool->inode)) {
 369                pool->inode = NULL;
 370                return 1;
 371        }
 372
 373        pool->inode->i_mapping->private_data = pool;
 374        pool->inode->i_mapping->a_ops = &z3fold_aops;
 375        return 0;
 376}
 377
 378static void z3fold_unregister_migration(struct z3fold_pool *pool)
 379{
 380        if (pool->inode)
 381                iput(pool->inode);
 382}
 383
 384/* Initializes the z3fold header of a newly allocated z3fold page */
 385static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
 386                                        struct z3fold_pool *pool, gfp_t gfp)
 387{
 388        struct z3fold_header *zhdr = page_address(page);
 389        struct z3fold_buddy_slots *slots;
 390
 391        INIT_LIST_HEAD(&page->lru);
 392        clear_bit(PAGE_HEADLESS, &page->private);
 393        clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
 394        clear_bit(NEEDS_COMPACTING, &page->private);
 395        clear_bit(PAGE_STALE, &page->private);
 396        clear_bit(PAGE_CLAIMED, &page->private);
 397        if (headless)
 398                return zhdr;
 399
 400        slots = alloc_slots(pool, gfp);
 401        if (!slots)
 402                return NULL;
 403
 404        memset(zhdr, 0, sizeof(*zhdr));
 405        spin_lock_init(&zhdr->page_lock);
 406        kref_init(&zhdr->refcount);
 407        zhdr->cpu = -1;
 408        zhdr->slots = slots;
 409        zhdr->pool = pool;
 410        INIT_LIST_HEAD(&zhdr->buddy);
 411        INIT_WORK(&zhdr->work, compact_page_work);
 412        return zhdr;
 413}
 414
 415/* Resets the struct page fields and frees the page */
 416static void free_z3fold_page(struct page *page, bool headless)
 417{
 418        if (!headless) {
 419                lock_page(page);
 420                __ClearPageMovable(page);
 421                unlock_page(page);
 422        }
 423        ClearPagePrivate(page);
 424        __free_page(page);
 425}
 426
 427/* Helper function to build the index */
 428static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
 429{
 430        return (bud + zhdr->first_num) & BUDDY_MASK;
 431}
 432
 433/*
 434 * Encodes the handle of a particular buddy within a z3fold page
 435 * Pool lock should be held as this function accesses first_num
 436 */
 437static unsigned long __encode_handle(struct z3fold_header *zhdr,
 438                                struct z3fold_buddy_slots *slots,
 439                                enum buddy bud)
 440{
 441        unsigned long h = (unsigned long)zhdr;
 442        int idx = 0;
 443
 444        /*
 445         * For a headless page, its handle is its pointer with the extra
 446         * PAGE_HEADLESS bit set
 447         */
 448        if (bud == HEADLESS)
 449                return h | (1 << PAGE_HEADLESS);
 450
 451        /* otherwise, return pointer to encoded handle */
 452        idx = __idx(zhdr, bud);
 453        h += idx;
 454        if (bud == LAST)
 455                h |= (zhdr->last_chunks << BUDDY_SHIFT);
 456
 457        write_lock(&slots->lock);
 458        slots->slot[idx] = h;
 459        write_unlock(&slots->lock);
 460        return (unsigned long)&slots->slot[idx];
 461}
 462
 463static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
 464{
 465        return __encode_handle(zhdr, zhdr->slots, bud);
 466}
 467
 468/* only for LAST bud, returns zero otherwise */
 469static unsigned short handle_to_chunks(unsigned long handle)
 470{
 471        struct z3fold_buddy_slots *slots = handle_to_slots(handle);
 472        unsigned long addr;
 473
 474        read_lock(&slots->lock);
 475        addr = *(unsigned long *)handle;
 476        read_unlock(&slots->lock);
 477        return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
 478}
 479
 480/*
 481 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 482 *  but that doesn't matter, because the masking will result in the
 483 *  correct buddy number.
 484 */
 485static enum buddy handle_to_buddy(unsigned long handle)
 486{
 487        struct z3fold_header *zhdr;
 488        struct z3fold_buddy_slots *slots = handle_to_slots(handle);
 489        unsigned long addr;
 490
 491        read_lock(&slots->lock);
 492        WARN_ON(handle & (1 << PAGE_HEADLESS));
 493        addr = *(unsigned long *)handle;
 494        read_unlock(&slots->lock);
 495        zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
 496        return (addr - zhdr->first_num) & BUDDY_MASK;
 497}
 498
 499static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
 500{
 501        return zhdr->pool;
 502}
 503
 504static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
 505{
 506        struct page *page = virt_to_page(zhdr);
 507        struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 508
 509        WARN_ON(!list_empty(&zhdr->buddy));
 510        set_bit(PAGE_STALE, &page->private);
 511        clear_bit(NEEDS_COMPACTING, &page->private);
 512        spin_lock(&pool->lock);
 513        if (!list_empty(&page->lru))
 514                list_del_init(&page->lru);
 515        spin_unlock(&pool->lock);
 516
 517        if (locked)
 518                z3fold_page_unlock(zhdr);
 519
 520        spin_lock(&pool->stale_lock);
 521        list_add(&zhdr->buddy, &pool->stale);
 522        queue_work(pool->release_wq, &pool->work);
 523        spin_unlock(&pool->stale_lock);
 524}
 525
 526static void release_z3fold_page(struct kref *ref)
 527{
 528        struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 529                                                refcount);
 530        __release_z3fold_page(zhdr, false);
 531}
 532
 533static void release_z3fold_page_locked(struct kref *ref)
 534{
 535        struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 536                                                refcount);
 537        WARN_ON(z3fold_page_trylock(zhdr));
 538        __release_z3fold_page(zhdr, true);
 539}
 540
 541static void release_z3fold_page_locked_list(struct kref *ref)
 542{
 543        struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
 544                                               refcount);
 545        struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 546
 547        spin_lock(&pool->lock);
 548        list_del_init(&zhdr->buddy);
 549        spin_unlock(&pool->lock);
 550
 551        WARN_ON(z3fold_page_trylock(zhdr));
 552        __release_z3fold_page(zhdr, true);
 553}
 554
 555static void free_pages_work(struct work_struct *w)
 556{
 557        struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);
 558
 559        spin_lock(&pool->stale_lock);
 560        while (!list_empty(&pool->stale)) {
 561                struct z3fold_header *zhdr = list_first_entry(&pool->stale,
 562                                                struct z3fold_header, buddy);
 563                struct page *page = virt_to_page(zhdr);
 564
 565                list_del(&zhdr->buddy);
 566                if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
 567                        continue;
 568                spin_unlock(&pool->stale_lock);
 569                cancel_work_sync(&zhdr->work);
 570                free_z3fold_page(page, false);
 571                cond_resched();
 572                spin_lock(&pool->stale_lock);
 573        }
 574        spin_unlock(&pool->stale_lock);
 575}
 576
 577/*
 578 * Returns the number of free chunks in a z3fold page.
 579 * NB: can't be used with HEADLESS pages.
 580 */
 581static int num_free_chunks(struct z3fold_header *zhdr)
 582{
 583        int nfree;
 584        /*
 585         * If there is a middle object, pick up the bigger free space
 586         * either before or after it. Otherwise just subtract the number
 587         * of chunks occupied by the first and the last objects.
 588         */
 589        if (zhdr->middle_chunks != 0) {
 590                int nfree_before = zhdr->first_chunks ?
 591                        0 : zhdr->start_middle - ZHDR_CHUNKS;
 592                int nfree_after = zhdr->last_chunks ?
 593                        0 : TOTAL_CHUNKS -
 594                                (zhdr->start_middle + zhdr->middle_chunks);
 595                nfree = max(nfree_before, nfree_after);
 596        } else
 597                nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
 598        return nfree;
 599}
 600
 601/* Add to the appropriate unbuddied list */
 602static inline void add_to_unbuddied(struct z3fold_pool *pool,
 603                                struct z3fold_header *zhdr)
 604{
 605        if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
 606                        zhdr->middle_chunks == 0) {
 607                struct list_head *unbuddied;
 608                int freechunks = num_free_chunks(zhdr);
 609
 610                migrate_disable();
 611                unbuddied = this_cpu_ptr(pool->unbuddied);
 612                spin_lock(&pool->lock);
 613                list_add(&zhdr->buddy, &unbuddied[freechunks]);
 614                spin_unlock(&pool->lock);
 615                zhdr->cpu = smp_processor_id();
 616                migrate_enable();
 617        }
 618}
 619
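/*
 * Return the buddy slot to use for an allocation of @chunks chunks,
 * or HEADLESS if no suitable slot is free.
 */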
 620static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
 621{
 622        enum buddy bud = HEADLESS;
 623
 624        if (zhdr->middle_chunks) {
 625                if (!zhdr->first_chunks &&
 626                    chunks <= zhdr->start_middle - ZHDR_CHUNKS)
 627                        bud = FIRST;
 628                else if (!zhdr->last_chunks)
 629                        bud = LAST;
 630        } else {
 631                if (!zhdr->first_chunks)
 632                        bud = FIRST;
 633                else if (!zhdr->last_chunks)
 634                        bud = LAST;
 635                else
 636                        bud = MIDDLE;
 637        }
 638
 639        return bud;
 640}
 641
 642static inline void *mchunk_memmove(struct z3fold_header *zhdr,
 643                                unsigned short dst_chunk)
 644{
 645        void *beg = zhdr;
 646        return memmove(beg + (dst_chunk << CHUNK_SHIFT),
 647                       beg + (zhdr->start_middle << CHUNK_SHIFT),
 648                       zhdr->middle_chunks << CHUNK_SHIFT);
 649}
 650
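/* Return true if at most one buddy in the z3fold page is in use */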
 651static inline bool buddy_single(struct z3fold_header *zhdr)
 652{
 653        return !((zhdr->first_chunks && zhdr->middle_chunks) ||
 654                        (zhdr->first_chunks && zhdr->last_chunks) ||
 655                        (zhdr->middle_chunks && zhdr->last_chunks));
 656}
 657
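/*
 * Try to move the only used buddy of an almost empty page into another
 * z3fold page, rewriting its handle in place, so that this page can be
 * released. Returns the destination header on success, NULL otherwise.
 */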
 658static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
 659{
 660        struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 661        void *p = zhdr;
 662        unsigned long old_handle = 0;
 663        size_t sz = 0;
 664        struct z3fold_header *new_zhdr = NULL;
 665        int first_idx = __idx(zhdr, FIRST);
 666        int middle_idx = __idx(zhdr, MIDDLE);
 667        int last_idx = __idx(zhdr, LAST);
 668        unsigned short *moved_chunks = NULL;
 669
 670        /*
 671         * No need to protect slots here -- all the slots are "local" and
 672         * the page lock is already taken
 673         */
 674        if (zhdr->first_chunks && zhdr->slots->slot[first_idx]) {
 675                p += ZHDR_SIZE_ALIGNED;
 676                sz = zhdr->first_chunks << CHUNK_SHIFT;
 677                old_handle = (unsigned long)&zhdr->slots->slot[first_idx];
 678                moved_chunks = &zhdr->first_chunks;
 679        } else if (zhdr->middle_chunks && zhdr->slots->slot[middle_idx]) {
 680                p += zhdr->start_middle << CHUNK_SHIFT;
 681                sz = zhdr->middle_chunks << CHUNK_SHIFT;
 682                old_handle = (unsigned long)&zhdr->slots->slot[middle_idx];
 683                moved_chunks = &zhdr->middle_chunks;
 684        } else if (zhdr->last_chunks && zhdr->slots->slot[last_idx]) {
 685                p += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
 686                sz = zhdr->last_chunks << CHUNK_SHIFT;
 687                old_handle = (unsigned long)&zhdr->slots->slot[last_idx];
 688                moved_chunks = &zhdr->last_chunks;
 689        }
 690
 691        if (sz > 0) {
 692                enum buddy new_bud = HEADLESS;
 693                short chunks = size_to_chunks(sz);
 694                void *q;
 695
 696                new_zhdr = __z3fold_alloc(pool, sz, false);
 697                if (!new_zhdr)
 698                        return NULL;
 699
 700                if (WARN_ON(new_zhdr == zhdr))
 701                        goto out_fail;
 702
 703                new_bud = get_free_buddy(new_zhdr, chunks);
 704                q = new_zhdr;
 705                switch (new_bud) {
 706                case FIRST:
 707                        new_zhdr->first_chunks = chunks;
 708                        q += ZHDR_SIZE_ALIGNED;
 709                        break;
 710                case MIDDLE:
 711                        new_zhdr->middle_chunks = chunks;
 712                        new_zhdr->start_middle =
 713                                new_zhdr->first_chunks + ZHDR_CHUNKS;
 714                        q += new_zhdr->start_middle << CHUNK_SHIFT;
 715                        break;
 716                case LAST:
 717                        new_zhdr->last_chunks = chunks;
 718                        q += PAGE_SIZE - (new_zhdr->last_chunks << CHUNK_SHIFT);
 719                        break;
 720                default:
 721                        goto out_fail;
 722                }
 723                new_zhdr->foreign_handles++;
 724                memcpy(q, p, sz);
 725                write_lock(&zhdr->slots->lock);
 726                *(unsigned long *)old_handle = (unsigned long)new_zhdr +
 727                        __idx(new_zhdr, new_bud);
 728                if (new_bud == LAST)
 729                        *(unsigned long *)old_handle |=
 730                                        (new_zhdr->last_chunks << BUDDY_SHIFT);
 731                write_unlock(&zhdr->slots->lock);
 732                add_to_unbuddied(pool, new_zhdr);
 733                z3fold_page_unlock(new_zhdr);
 734
 735                *moved_chunks = 0;
 736        }
 737
 738        return new_zhdr;
 739
 740out_fail:
 741        if (new_zhdr) {
 742                if (kref_put(&new_zhdr->refcount, release_z3fold_page_locked))
 743                        atomic64_dec(&pool->pages_nr);
 744                else {
 745                        add_to_unbuddied(pool, new_zhdr);
 746                        z3fold_page_unlock(new_zhdr);
 747                }
 748        }
 749        return NULL;
 750
 751}
 752
 753#define BIG_CHUNK_GAP   3
 754/* Has to be called with lock held */
 755static int z3fold_compact_page(struct z3fold_header *zhdr)
 756{
 757        struct page *page = virt_to_page(zhdr);
 758
 759        if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
 760                return 0; /* can't move middle chunk, it's used */
 761
 762        if (unlikely(PageIsolated(page)))
 763                return 0;
 764
 765        if (zhdr->middle_chunks == 0)
 766                return 0; /* nothing to compact */
 767
 768        if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
 769                /* move to the beginning */
 770                mchunk_memmove(zhdr, ZHDR_CHUNKS);
 771                zhdr->first_chunks = zhdr->middle_chunks;
 772                zhdr->middle_chunks = 0;
 773                zhdr->start_middle = 0;
 774                zhdr->first_num++;
 775                return 1;
 776        }
 777
 778        /*
 779         * moving data is expensive, so let's only do that if
 780         * there's substantial gain (at least BIG_CHUNK_GAP chunks)
 781         */
 782        if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
 783            zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
 784                        BIG_CHUNK_GAP) {
 785                mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
 786                zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
 787                return 1;
 788        } else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
 789                   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
 790                                        + zhdr->middle_chunks) >=
 791                        BIG_CHUNK_GAP) {
 792                unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
 793                        zhdr->middle_chunks;
 794                mchunk_memmove(zhdr, new_start);
 795                zhdr->start_middle = new_start;
 796                return 1;
 797        }
 798
 799        return 0;
 800}
 801
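/*
 * Compact the page's layout and, if the page is still in use, put it back
 * on the appropriate unbuddied list.
 */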
 802static void do_compact_page(struct z3fold_header *zhdr, bool locked)
 803{
 804        struct z3fold_pool *pool = zhdr_to_pool(zhdr);
 805        struct page *page;
 806
 807        page = virt_to_page(zhdr);
 808        if (locked)
 809                WARN_ON(z3fold_page_trylock(zhdr));
 810        else
 811                z3fold_page_lock(zhdr);
 812        if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
 813                z3fold_page_unlock(zhdr);
 814                return;
 815        }
 816        spin_lock(&pool->lock);
 817        list_del_init(&zhdr->buddy);
 818        spin_unlock(&pool->lock);
 819
 820        if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
 821                atomic64_dec(&pool->pages_nr);
 822                return;
 823        }
 824
 825        if (test_bit(PAGE_STALE, &page->private) ||
 826            test_and_set_bit(PAGE_CLAIMED, &page->private)) {
 827                z3fold_page_unlock(zhdr);
 828                return;
 829        }
 830
 831        if (!zhdr->foreign_handles && buddy_single(zhdr) &&
 832            zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
 833                if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
 834                        atomic64_dec(&pool->pages_nr);
 835                else {
 836                        clear_bit(PAGE_CLAIMED, &page->private);
 837                        z3fold_page_unlock(zhdr);
 838                }
 839                return;
 840        }
 841
 842        z3fold_compact_page(zhdr);
 843        add_to_unbuddied(pool, zhdr);
 844        clear_bit(PAGE_CLAIMED, &page->private);
 845        z3fold_page_unlock(zhdr);
 846}
 847
 848static void compact_page_work(struct work_struct *w)
 849{
 850        struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
 851                                                work);
 852
 853        do_compact_page(zhdr, false);
 854}
 855
 856/* returns _locked_ z3fold page header or NULL */
 857static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
 858                                                size_t size, bool can_sleep)
 859{
 860        struct z3fold_header *zhdr = NULL;
 861        struct page *page;
 862        struct list_head *unbuddied;
 863        int chunks = size_to_chunks(size), i;
 864
 865lookup:
 866        migrate_disable();
 867        /* First, try to find an unbuddied z3fold page. */
 868        unbuddied = this_cpu_ptr(pool->unbuddied);
 869        for_each_unbuddied_list(i, chunks) {
 870                struct list_head *l = &unbuddied[i];
 871
 872                zhdr = list_first_entry_or_null(READ_ONCE(l),
 873                                        struct z3fold_header, buddy);
 874
 875                if (!zhdr)
 876                        continue;
 877
 878                /* Re-check under lock. */
 879                spin_lock(&pool->lock);
 880                l = &unbuddied[i];
 881                if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
 882                                                struct z3fold_header, buddy)) ||
 883                    !z3fold_page_trylock(zhdr)) {
 884                        spin_unlock(&pool->lock);
 885                        zhdr = NULL;
 886                        migrate_enable();
 887                        if (can_sleep)
 888                                cond_resched();
 889                        goto lookup;
 890                }
 891                list_del_init(&zhdr->buddy);
 892                zhdr->cpu = -1;
 893                spin_unlock(&pool->lock);
 894
 895                page = virt_to_page(zhdr);
 896                if (test_bit(NEEDS_COMPACTING, &page->private) ||
 897                    test_bit(PAGE_CLAIMED, &page->private)) {
 898                        z3fold_page_unlock(zhdr);
 899                        zhdr = NULL;
 900                        migrate_enable();
 901                        if (can_sleep)
 902                                cond_resched();
 903                        goto lookup;
 904                }
 905
 906                /*
 907                 * this page could not be removed from its unbuddied
 908                 * list while the pool lock was held, and we have since
 909                 * taken the page lock, so kref_put could not have been
 910                 * called before we got here; it's safe to just call kref_get()
 911                 */
 912                kref_get(&zhdr->refcount);
 913                break;
 914        }
 915        migrate_enable();
 916
 917        if (!zhdr) {
 918                int cpu;
 919
 920                /* look for _exact_ match on other cpus' lists */
 921                for_each_online_cpu(cpu) {
 922                        struct list_head *l;
 923
 924                        unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
 925                        spin_lock(&pool->lock);
 926                        l = &unbuddied[chunks];
 927
 928                        zhdr = list_first_entry_or_null(READ_ONCE(l),
 929                                                struct z3fold_header, buddy);
 930
 931                        if (!zhdr || !z3fold_page_trylock(zhdr)) {
 932                                spin_unlock(&pool->lock);
 933                                zhdr = NULL;
 934                                continue;
 935                        }
 936                        list_del_init(&zhdr->buddy);
 937                        zhdr->cpu = -1;
 938                        spin_unlock(&pool->lock);
 939
 940                        page = virt_to_page(zhdr);
 941                        if (test_bit(NEEDS_COMPACTING, &page->private) ||
 942                            test_bit(PAGE_CLAIMED, &page->private)) {
 943                                z3fold_page_unlock(zhdr);
 944                                zhdr = NULL;
 945                                if (can_sleep)
 946                                        cond_resched();
 947                                continue;
 948                        }
 949                        kref_get(&zhdr->refcount);
 950                        break;
 951                }
 952        }
 953
 954        if (zhdr && !zhdr->slots)
 955                zhdr->slots = alloc_slots(pool,
 956                                        can_sleep ? GFP_NOIO : GFP_ATOMIC);
 957        return zhdr;
 958}
 959
 960/*
 961 * API Functions
 962 */
 963
 964/**
 965 * z3fold_create_pool() - create a new z3fold pool
 966 * @name:       pool name
 967 * @gfp:        gfp flags when allocating the z3fold pool structure
 968 * @ops:        user-defined operations for the z3fold pool
 969 *
 970 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 971 * failed.
 972 */
 973static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
 974                const struct z3fold_ops *ops)
 975{
 976        struct z3fold_pool *pool = NULL;
 977        int i, cpu;
 978
 979        pool = kzalloc(sizeof(struct z3fold_pool), gfp);
 980        if (!pool)
 981                goto out;
 982        pool->c_handle = kmem_cache_create("z3fold_handle",
 983                                sizeof(struct z3fold_buddy_slots),
 984                                SLOTS_ALIGN, 0, NULL);
 985        if (!pool->c_handle)
 986                goto out_c;
 987        spin_lock_init(&pool->lock);
 988        spin_lock_init(&pool->stale_lock);
 989        pool->unbuddied = __alloc_percpu(sizeof(struct list_head) * NCHUNKS,
 990                                         __alignof__(struct list_head));
 991        if (!pool->unbuddied)
 992                goto out_pool;
 993        for_each_possible_cpu(cpu) {
 994                struct list_head *unbuddied =
 995                                per_cpu_ptr(pool->unbuddied, cpu);
 996                for_each_unbuddied_list(i, 0)
 997                        INIT_LIST_HEAD(&unbuddied[i]);
 998        }
 999        INIT_LIST_HEAD(&pool->lru);
1000        INIT_LIST_HEAD(&pool->stale);
1001        atomic64_set(&pool->pages_nr, 0);
1002        pool->name = name;
1003        pool->compact_wq = create_singlethread_workqueue(pool->name);
1004        if (!pool->compact_wq)
1005                goto out_unbuddied;
1006        pool->release_wq = create_singlethread_workqueue(pool->name);
1007        if (!pool->release_wq)
1008                goto out_wq;
1009        if (z3fold_register_migration(pool))
1010                goto out_rwq;
1011        INIT_WORK(&pool->work, free_pages_work);
1012        pool->ops = ops;
1013        return pool;
1014
1015out_rwq:
1016        destroy_workqueue(pool->release_wq);
1017out_wq:
1018        destroy_workqueue(pool->compact_wq);
1019out_unbuddied:
1020        free_percpu(pool->unbuddied);
1021out_pool:
1022        kmem_cache_destroy(pool->c_handle);
1023out_c:
1024        kfree(pool);
1025out:
1026        return NULL;
1027}
1028
1029/**
1030 * z3fold_destroy_pool() - destroys an existing z3fold pool
1031 * @pool:       the z3fold pool to be destroyed
1032 *
1033 * The pool should be emptied before this function is called.
1034 */
1035static void z3fold_destroy_pool(struct z3fold_pool *pool)
1036{
1037        kmem_cache_destroy(pool->c_handle);
1038
1039        /*
1040         * We need to destroy pool->compact_wq before pool->release_wq,
1041         * as any pending work on pool->compact_wq will call
1042         * queue_work(pool->release_wq, &pool->work).
1043         *
1044         * There are still outstanding pages until both workqueues are drained,
1045         * so we cannot unregister migration until then.
1046         */
1047
1048        destroy_workqueue(pool->compact_wq);
1049        destroy_workqueue(pool->release_wq);
1050        z3fold_unregister_migration(pool);
1051        free_percpu(pool->unbuddied);
1052        kfree(pool);
1053}
1054
1055/**
1056 * z3fold_alloc() - allocates a region of a given size
1057 * @pool:       z3fold pool from which to allocate
1058 * @size:       size in bytes of the desired allocation
1059 * @gfp:        gfp flags used if the pool needs to grow
1060 * @handle:     handle of the new allocation
1061 *
1062 * This function will attempt to find a free region in the pool large enough to
1063 * satisfy the allocation request.  A search of the unbuddied lists is
1064 * performed first. If no suitable free region is found, then a new page is
1065 * allocated and added to the pool to satisfy the request.
1066 *
1067 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
1068 * as z3fold pool pages.
1069 *
1070 * Return: 0 on success with @handle set, otherwise -EINVAL if the size or
1071 * gfp arguments are invalid, -ENOSPC if the request cannot possibly fit in
1072 * a single page, or -ENOMEM if the pool was unable to allocate a new page.
1073 */
1074static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
1075                        unsigned long *handle)
1076{
1077        int chunks = size_to_chunks(size);
1078        struct z3fold_header *zhdr = NULL;
1079        struct page *page = NULL;
1080        enum buddy bud;
1081        bool can_sleep = gfpflags_allow_blocking(gfp);
1082
1083        if (!size)
1084                return -EINVAL;
1085
1086        if (size > PAGE_SIZE)
1087                return -ENOSPC;
1088
1089        if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
1090                bud = HEADLESS;
1091        else {
1092retry:
1093                zhdr = __z3fold_alloc(pool, size, can_sleep);
1094                if (zhdr) {
1095                        bud = get_free_buddy(zhdr, chunks);
1096                        if (bud == HEADLESS) {
1097                                if (kref_put(&zhdr->refcount,
1098                                             release_z3fold_page_locked))
1099                                        atomic64_dec(&pool->pages_nr);
1100                                else
1101                                        z3fold_page_unlock(zhdr);
1102                                pr_err("No free chunks in unbuddied\n");
1103                                WARN_ON(1);
1104                                goto retry;
1105                        }
1106                        page = virt_to_page(zhdr);
1107                        goto found;
1108                }
1109                bud = FIRST;
1110        }
1111
1112        page = NULL;
1113        if (can_sleep) {
1114                spin_lock(&pool->stale_lock);
1115                zhdr = list_first_entry_or_null(&pool->stale,
1116                                                struct z3fold_header, buddy);
1117                /*
1118                 * Before allocating a page, let's see if we can take one from
1119                 * the stale pages list. cancel_work_sync() can sleep so we
1120                 * limit this case to the contexts where we can sleep
1121                 */
1122                if (zhdr) {
1123                        list_del(&zhdr->buddy);
1124                        spin_unlock(&pool->stale_lock);
1125                        cancel_work_sync(&zhdr->work);
1126                        page = virt_to_page(zhdr);
1127                } else {
1128                        spin_unlock(&pool->stale_lock);
1129                }
1130        }
1131        if (!page)
1132                page = alloc_page(gfp);
1133
1134        if (!page)
1135                return -ENOMEM;
1136
1137        zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
1138        if (!zhdr) {
1139                __free_page(page);
1140                return -ENOMEM;
1141        }
1142        atomic64_inc(&pool->pages_nr);
1143
1144        if (bud == HEADLESS) {
1145                set_bit(PAGE_HEADLESS, &page->private);
1146                goto headless;
1147        }
1148        if (can_sleep) {
1149                lock_page(page);
1150                __SetPageMovable(page, pool->inode->i_mapping);
1151                unlock_page(page);
1152        } else {
1153                if (trylock_page(page)) {
1154                        __SetPageMovable(page, pool->inode->i_mapping);
1155                        unlock_page(page);
1156                }
1157        }
1158        z3fold_page_lock(zhdr);
1159
1160found:
1161        if (bud == FIRST)
1162                zhdr->first_chunks = chunks;
1163        else if (bud == LAST)
1164                zhdr->last_chunks = chunks;
1165        else {
1166                zhdr->middle_chunks = chunks;
1167                zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
1168        }
1169        add_to_unbuddied(pool, zhdr);
1170
1171headless:
1172        spin_lock(&pool->lock);
1173        /* Add/move z3fold page to beginning of LRU */
1174        if (!list_empty(&page->lru))
1175                list_del(&page->lru);
1176
1177        list_add(&page->lru, &pool->lru);
1178
1179        *handle = encode_handle(zhdr, bud);
1180        spin_unlock(&pool->lock);
1181        if (bud != HEADLESS)
1182                z3fold_page_unlock(zhdr);
1183
1184        return 0;
1185}
1186
1187/**
1188 * z3fold_free() - frees the allocation associated with the given handle
1189 * @pool:       pool in which the allocation resided
1190 * @handle:     handle associated with the allocation returned by z3fold_alloc()
1191 *
1192 * In the case that the z3fold page in which the allocation resides is under
1193 * reclaim, as indicated by the PG_reclaim flag being set, this function
1194 * only sets the first|last_chunks to 0.  The page is actually freed
1195 * once both buddies are evicted (see z3fold_reclaim_page() below).
1196 */
1197static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
1198{
1199        struct z3fold_header *zhdr;
1200        struct page *page;
1201        enum buddy bud;
1202        bool page_claimed;
1203
1204        zhdr = get_z3fold_header(handle);
1205        page = virt_to_page(zhdr);
1206        page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
1207
1208        if (test_bit(PAGE_HEADLESS, &page->private)) {
1209                /* if a headless page is under reclaim, just leave.
1210                 * NB: we use test_and_set_bit for a reason: if the bit
1211                 * has not been set before, we release this page
1212                 * immediately so we don't care about its value any more.
1213                 */
1214                if (!page_claimed) {
1215                        spin_lock(&pool->lock);
1216                        list_del(&page->lru);
1217                        spin_unlock(&pool->lock);
1218                        put_z3fold_header(zhdr);
1219                        free_z3fold_page(page, true);
1220                        atomic64_dec(&pool->pages_nr);
1221                }
1222                return;
1223        }
1224
1225        /* Non-headless case */
1226        bud = handle_to_buddy(handle);
1227
1228        switch (bud) {
1229        case FIRST:
1230                zhdr->first_chunks = 0;
1231                break;
1232        case MIDDLE:
1233                zhdr->middle_chunks = 0;
1234                break;
1235        case LAST:
1236                zhdr->last_chunks = 0;
1237                break;
1238        default:
1239                pr_err("%s: unknown bud %d\n", __func__, bud);
1240                WARN_ON(1);
1241                put_z3fold_header(zhdr);
1242                return;
1243        }
1244
1245        if (!page_claimed)
1246                free_handle(handle, zhdr);
1247        if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
1248                atomic64_dec(&pool->pages_nr);
1249                return;
1250        }
1251        if (page_claimed) {
1252                /* the page has not been claimed by us */
1253                z3fold_page_unlock(zhdr);
1254                return;
1255        }
1256        if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
1257                put_z3fold_header(zhdr);
1258                clear_bit(PAGE_CLAIMED, &page->private);
1259                return;
1260        }
1261        if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
1262                spin_lock(&pool->lock);
1263                list_del_init(&zhdr->buddy);
1264                spin_unlock(&pool->lock);
1265                zhdr->cpu = -1;
1266                kref_get(&zhdr->refcount);
1267                clear_bit(PAGE_CLAIMED, &page->private);
1268                do_compact_page(zhdr, true);
1269                return;
1270        }
1271        kref_get(&zhdr->refcount);
1272        clear_bit(PAGE_CLAIMED, &page->private);
1273        queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
1274        put_z3fold_header(zhdr);
1275}
1276
1277/**
1278 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
1279 * @pool:       pool from which a page will attempt to be evicted
1280 * @retries:    number of pages on the LRU list for which eviction will
1281 *              be attempted before failing
1282 *
1283 * z3fold reclaim is different from normal system reclaim in that it is done
1284 * from the bottom, up. This is because only the bottom layer, z3fold, has
1285 * information on how the allocations are organized within each z3fold page.
1286 * This has the potential to create interesting locking situations between
1287 * z3fold and the user, however.
1288 *
1289 * To avoid these, this is how z3fold_reclaim_page() should be called:
1290 *
1291 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
1292 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
1293 * call the user-defined eviction handler with the pool and handle as
1294 * arguments.
1295 *
1296 * If the handle can not be evicted, the eviction handler should return
1297 * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
1298 * appropriate list and try the next z3fold page on the LRU up to
1299 * a user defined number of retries.
1300 *
1301 * If the handle is successfully evicted, the eviction handler should
1302 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
1303 * contains logic to delay freeing the page if the page is under reclaim,
1304 * as indicated by the setting of the PG_reclaim flag on the underlying page.
1305 *
1306 * If all buddies in the z3fold page are successfully evicted, then the
1307 * z3fold page can be freed.
1308 *
1309 * Returns: 0 if a page is successfully freed, otherwise -EINVAL if there are
1310 * no pages to evict or an eviction handler is not registered, or -EAGAIN if
1311 * the retry limit was hit.
1312 */
1313static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
1314{
1315        int i, ret = -1;
1316        struct z3fold_header *zhdr = NULL;
1317        struct page *page = NULL;
1318        struct list_head *pos;
1319        unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
1320        struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
1321
1322        rwlock_init(&slots.lock);
1323        slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
1324
1325        spin_lock(&pool->lock);
1326        if (!pool->ops || !pool->ops->evict || retries == 0) {
1327                spin_unlock(&pool->lock);
1328                return -EINVAL;
1329        }
1330        for (i = 0; i < retries; i++) {
1331                if (list_empty(&pool->lru)) {
1332                        spin_unlock(&pool->lock);
1333                        return -EINVAL;
1334                }
1335                list_for_each_prev(pos, &pool->lru) {
1336                        page = list_entry(pos, struct page, lru);
1337
1338                        zhdr = page_address(page);
1339                        if (test_bit(PAGE_HEADLESS, &page->private)) {
1340                                /*
1341                                 * For non-headless pages, we wait to do this
1342                                 * until we have the page lock to avoid racing
1343                                 * with __z3fold_alloc(). Headless pages don't
1344                                 * have a lock (and __z3fold_alloc() will never
1345                                 * see them), but we still need to test and set
1346                                 * PAGE_CLAIMED to avoid racing with
1347                                 * z3fold_free(), so just do it now before
1348                                 * leaving the loop.
1349                                 */
1350                                if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1351                                        continue;
1352
1353                                break;
1354                        }
1355
1356                        if (kref_get_unless_zero(&zhdr->refcount) == 0) {
1357                                zhdr = NULL;
1358                                break;
1359                        }
1360                        if (!z3fold_page_trylock(zhdr)) {
1361                                if (kref_put(&zhdr->refcount,
1362                                                release_z3fold_page))
1363                                        atomic64_dec(&pool->pages_nr);
1364                                zhdr = NULL;
1365                                continue; /* can't evict at this point */
1366                        }
1367
1368                        /* test_and_set_bit is of course atomic, but we still
1369                         * need to do it under page lock, otherwise checking
1370                         * that bit in __z3fold_alloc wouldn't make sense
1371                         */
1372                        if (zhdr->foreign_handles ||
1373                            test_and_set_bit(PAGE_CLAIMED, &page->private)) {
1374                                if (kref_put(&zhdr->refcount,
1375                                                release_z3fold_page_locked))
1376                                        atomic64_dec(&pool->pages_nr);
1377                                else
1378                                        z3fold_page_unlock(zhdr);
1379                                zhdr = NULL;
1380                                continue; /* can't evict such page */
1381                        }
1382                        list_del_init(&zhdr->buddy);
1383                        zhdr->cpu = -1;
1384                        break;
1385                }
1386
1387                if (!zhdr)
1388                        break;
1389
1390                list_del_init(&page->lru);
1391                spin_unlock(&pool->lock);
1392
1393                if (!test_bit(PAGE_HEADLESS, &page->private)) {
1394                        /*
1395                         * We need to encode the handles before unlocking, and
1396                         * use our local slots structure because z3fold_free
1397                         * can zero out zhdr->slots and we can't do much
1398                         * about that
1399                         */
1400                        first_handle = 0;
1401                        last_handle = 0;
1402                        middle_handle = 0;
1403                        memset(slots.slot, 0, sizeof(slots.slot));
1404                        if (zhdr->first_chunks)
1405                                first_handle = __encode_handle(zhdr, &slots,
1406                                                                FIRST);
1407                        if (zhdr->middle_chunks)
1408                                middle_handle = __encode_handle(zhdr, &slots,
1409                                                                MIDDLE);
1410                        if (zhdr->last_chunks)
1411                                last_handle = __encode_handle(zhdr, &slots,
1412                                                                LAST);
1413                        /*
1414                         * it's safe to unlock here because we hold a
1415                         * reference to this page
1416                         */
1417                        z3fold_page_unlock(zhdr);
1418                } else {
1419                        first_handle = encode_handle(zhdr, HEADLESS);
1420                        last_handle = middle_handle = 0;
1421                }
1422                /* Issue the eviction callback(s) */
1423                if (middle_handle) {
1424                        ret = pool->ops->evict(pool, middle_handle);
1425                        if (ret)
1426                                goto next;
1427                }
1428                if (first_handle) {
1429                        ret = pool->ops->evict(pool, first_handle);
1430                        if (ret)
1431                                goto next;
1432                }
1433                if (last_handle) {
1434                        ret = pool->ops->evict(pool, last_handle);
1435                        if (ret)
1436                                goto next;
1437                }
1438next:
1439                if (test_bit(PAGE_HEADLESS, &page->private)) {
1440                        if (ret == 0) {
1441                                free_z3fold_page(page, true);
1442                                atomic64_dec(&pool->pages_nr);
1443                                return 0;
1444                        }
1445                        spin_lock(&pool->lock);
1446                        list_add(&page->lru, &pool->lru);
1447                        spin_unlock(&pool->lock);
1448                        clear_bit(PAGE_CLAIMED, &page->private);
1449                } else {
1450                        struct z3fold_buddy_slots *slots = zhdr->slots;
1451                        z3fold_page_lock(zhdr);
1452                        if (kref_put(&zhdr->refcount,
1453                                        release_z3fold_page_locked)) {
1454                                kmem_cache_free(pool->c_handle, slots);
1455                                atomic64_dec(&pool->pages_nr);
1456                                return 0;
1457                        }
1458                        /*
1459                         * if we are here, the page is still not completely
1460                         * free, so take the global pool lock to be able to
1461                         * add it back to the LRU list
1462                         */
1463                        spin_lock(&pool->lock);
1464                        list_add(&page->lru, &pool->lru);
1465                        spin_unlock(&pool->lock);
1466                        z3fold_page_unlock(zhdr);
1467                        clear_bit(PAGE_CLAIMED, &page->private);
1468                }
1469
1470                /* We started off locked so we need to lock the pool back */
1471                spin_lock(&pool->lock);
1472        }
1473        spin_unlock(&pool->lock);
1474        return -EAGAIN;
1475}
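
/*
 * Illustrative sketch only (not part of this file): what an eviction
 * handler wired into pool->ops->evict, as invoked above, might look like.
 * Returning 0 tells z3fold_reclaim_page() the object is gone and the page
 * may be released; any non-zero value puts the page back on the LRU,
 * matching the headless and non-headless paths above. The handler name
 * and write_back_somewhere() are hypothetical; in practice z3fold is
 * driven through zpool and zswap supplies the real handler.
 *
 *	static int example_evict(struct z3fold_pool *pool, unsigned long handle)
 *	{
 *		void *obj = z3fold_map(pool, handle);
 *		int err = write_back_somewhere(obj);	// hypothetical backend I/O
 *
 *		z3fold_unmap(pool, handle);
 *		if (err)
 *			return err;		// reclaim re-adds the page to the LRU
 *		z3fold_free(pool, handle);	// object evicted, drop the handle
 *		return 0;
 *	}
 */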
1476
1477/**
1478 * z3fold_map() - maps the allocation associated with the given handle
1479 * @pool:       pool in which the allocation resides
1480 * @handle:     handle associated with the allocation to be mapped
1481 *
1482 * Extracts the buddy number from handle and constructs the pointer to the
1483 * correct starting chunk within the page.
1484 *
1485 * Returns: a pointer to the mapped allocation
1486 */
1487static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
1488{
1489        struct z3fold_header *zhdr;
1490        struct page *page;
1491        void *addr;
1492        enum buddy buddy;
1493
1494        zhdr = get_z3fold_header(handle);
1495        addr = zhdr;
1496        page = virt_to_page(zhdr);
1497
1498        if (test_bit(PAGE_HEADLESS, &page->private))
1499                goto out;
1500
1501        buddy = handle_to_buddy(handle);
1502        switch (buddy) {
1503        case FIRST:
1504                addr += ZHDR_SIZE_ALIGNED;
1505                break;
1506        case MIDDLE:
1507                addr += zhdr->start_middle << CHUNK_SHIFT;
1508                set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1509                break;
1510        case LAST:
1511                addr += PAGE_SIZE - (handle_to_chunks(handle) << CHUNK_SHIFT);
1512                break;
1513        default:
1514                pr_err("unknown buddy id %d\n", buddy);
1515                WARN_ON(1);
1516                addr = NULL;
1517                break;
1518        }
1519
1520        if (addr)
1521                zhdr->mapped_count++;
1522out:
1523        put_z3fold_header(zhdr);
1524        return addr;
1525}
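
/*
 * Worked example of the offsets computed above, assuming 4K pages and
 * NCHUNKS_ORDER == 6 (so CHUNK_SHIFT == 6 and CHUNK_SIZE == 64 bytes):
 *  - FIRST  starts right after the header, at zhdr + ZHDR_SIZE_ALIGNED;
 *  - MIDDLE starts at zhdr + (start_middle << 6);
 *  - LAST is packed against the end of the page, so a last buddy of,
 *    say, 10 chunks starts at zhdr + 4096 - (10 << 6) = zhdr + 3456.
 * The numbers are illustrative; only the formulas come from the code.
 */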
1526
1527/**
1528 * z3fold_unmap() - unmaps the allocation associated with the given handle
1529 * @pool:       pool in which the allocation resides
1530 * @handle:     handle associated with the allocation to be unmapped
1531 */
1532static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
1533{
1534        struct z3fold_header *zhdr;
1535        struct page *page;
1536        enum buddy buddy;
1537
1538        zhdr = get_z3fold_header(handle);
1539        page = virt_to_page(zhdr);
1540
1541        if (test_bit(PAGE_HEADLESS, &page->private))
1542                return;
1543
1544        buddy = handle_to_buddy(handle);
1545        if (buddy == MIDDLE)
1546                clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
1547        zhdr->mapped_count--;
1548        put_z3fold_header(zhdr);
1549}
1550
1551/**
1552 * z3fold_get_pool_size() - gets the z3fold pool size in pages
1553 * @pool:       pool whose size is being queried
1554 *
1555 * Returns: size in pages of the given pool.
1556 */
1557static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
1558{
1559        return atomic64_read(&pool->pages_nr);
1560}
1561
1562static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
1563{
1564        struct z3fold_header *zhdr;
1565        struct z3fold_pool *pool;
1566
1567        VM_BUG_ON_PAGE(!PageMovable(page), page);
1568        VM_BUG_ON_PAGE(PageIsolated(page), page);
1569
1570        if (test_bit(PAGE_HEADLESS, &page->private))
1571                return false;
1572
1573        zhdr = page_address(page);
1574        z3fold_page_lock(zhdr);
1575        if (test_bit(NEEDS_COMPACTING, &page->private) ||
1576            test_bit(PAGE_STALE, &page->private))
1577                goto out;
1578
1579        if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
1580                goto out;
1581
1582        if (test_and_set_bit(PAGE_CLAIMED, &page->private))
1583                goto out;
1584        pool = zhdr_to_pool(zhdr);
1585        spin_lock(&pool->lock);
1586        if (!list_empty(&zhdr->buddy))
1587                list_del_init(&zhdr->buddy);
1588        if (!list_empty(&page->lru))
1589                list_del_init(&page->lru);
1590        spin_unlock(&pool->lock);
1591
1592        kref_get(&zhdr->refcount);
1593        z3fold_page_unlock(zhdr);
1594        return true;
1595
1596out:
1597        z3fold_page_unlock(zhdr);
1598        return false;
1599}
1600
1601static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
1602                               struct page *page, enum migrate_mode mode)
1603{
1604        struct z3fold_header *zhdr, *new_zhdr;
1605        struct z3fold_pool *pool;
1606        struct address_space *new_mapping;
1607
1608        VM_BUG_ON_PAGE(!PageMovable(page), page);
1609        VM_BUG_ON_PAGE(!PageIsolated(page), page);
1610        VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
1611        VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
1612
1613        zhdr = page_address(page);
1614        pool = zhdr_to_pool(zhdr);
1615
1616        if (!z3fold_page_trylock(zhdr))
1617                return -EAGAIN;
1618        if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
1619                z3fold_page_unlock(zhdr);
1620                clear_bit(PAGE_CLAIMED, &page->private);
1621                return -EBUSY;
1622        }
1623        if (work_pending(&zhdr->work)) {
1624                z3fold_page_unlock(zhdr);
1625                return -EAGAIN;
1626        }
1627        new_zhdr = page_address(newpage);
1628        memcpy(new_zhdr, zhdr, PAGE_SIZE);
1629        newpage->private = page->private;
1630        page->private = 0;
1631        z3fold_page_unlock(zhdr);
1632        spin_lock_init(&new_zhdr->page_lock);
1633        INIT_WORK(&new_zhdr->work, compact_page_work);
1634        /*
1635         * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
1636         * so we only have to reinitialize it.
1637         */
1638        INIT_LIST_HEAD(&new_zhdr->buddy);
1639        new_mapping = page_mapping(page);
1640        __ClearPageMovable(page);
1641        ClearPagePrivate(page);
1642
1643        get_page(newpage);
1644        z3fold_page_lock(new_zhdr);
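        /*
         * The slot values behind the handles encode the address of the
         * z3fold header they belong to, so every occupied buddy must be
         * re-encoded against new_zhdr now that the contents live in the
         * new page.
         */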
1645        if (new_zhdr->first_chunks)
1646                encode_handle(new_zhdr, FIRST);
1647        if (new_zhdr->last_chunks)
1648                encode_handle(new_zhdr, LAST);
1649        if (new_zhdr->middle_chunks)
1650                encode_handle(new_zhdr, MIDDLE);
1651        set_bit(NEEDS_COMPACTING, &newpage->private);
1652        new_zhdr->cpu = smp_processor_id();
1653        spin_lock(&pool->lock);
1654        list_add(&newpage->lru, &pool->lru);
1655        spin_unlock(&pool->lock);
1656        __SetPageMovable(newpage, new_mapping);
1657        z3fold_page_unlock(new_zhdr);
1658
1659        queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
1660
1661        page_mapcount_reset(page);
1662        clear_bit(PAGE_CLAIMED, &page->private);
1663        put_page(page);
1664        return 0;
1665}
1666
1667static void z3fold_page_putback(struct page *page)
1668{
1669        struct z3fold_header *zhdr;
1670        struct z3fold_pool *pool;
1671
1672        zhdr = page_address(page);
1673        pool = zhdr_to_pool(zhdr);
1674
1675        z3fold_page_lock(zhdr);
1676        if (!list_empty(&zhdr->buddy))
1677                list_del_init(&zhdr->buddy);
1678        INIT_LIST_HEAD(&page->lru);
1679        if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
1680                atomic64_dec(&pool->pages_nr);
1681                return;
1682        }
1683        spin_lock(&pool->lock);
1684        list_add(&page->lru, &pool->lru);
1685        spin_unlock(&pool->lock);
1686        clear_bit(PAGE_CLAIMED, &page->private);
1687        z3fold_page_unlock(zhdr);
1688}
1689
1690static const struct address_space_operations z3fold_aops = {
1691        .isolate_page = z3fold_page_isolate,
1692        .migratepage = z3fold_page_migrate,
1693        .putback_page = z3fold_page_putback,
1694};
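
/*
 * These callbacks plug z3fold pages into the movable-page migration
 * framework: compaction first claims a candidate page through
 * z3fold_page_isolate(), then copies it with z3fold_page_migrate(), and
 * falls back to z3fold_page_putback() if the migration attempt fails.
 */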
1695
1696/*****************
1697 * zpool
1698 ****************/
1699
1700static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
1701{
1702        if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
1703                return pool->zpool_ops->evict(pool->zpool, handle);
1704        else
1705                return -ENOENT;
1706}
1707
1708static const struct z3fold_ops z3fold_zpool_ops = {
1709        .evict =        z3fold_zpool_evict
1710};
1711
1712static void *z3fold_zpool_create(const char *name, gfp_t gfp,
1713                               const struct zpool_ops *zpool_ops,
1714                               struct zpool *zpool)
1715{
1716        struct z3fold_pool *pool;
1717
1718        pool = z3fold_create_pool(name, gfp,
1719                                zpool_ops ? &z3fold_zpool_ops : NULL);
1720        if (pool) {
1721                pool->zpool = zpool;
1722                pool->zpool_ops = zpool_ops;
1723        }
1724        return pool;
1725}
1726
1727static void z3fold_zpool_destroy(void *pool)
1728{
1729        z3fold_destroy_pool(pool);
1730}
1731
1732static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
1733                        unsigned long *handle)
1734{
1735        return z3fold_alloc(pool, size, gfp, handle);
1736}
1737static void z3fold_zpool_free(void *pool, unsigned long handle)
1738{
1739        z3fold_free(pool, handle);
1740}
1741
1742static int z3fold_zpool_shrink(void *pool, unsigned int pages,
1743                        unsigned int *reclaimed)
1744{
1745        unsigned int total = 0;
1746        int ret = -EINVAL;
1747
1748        while (total < pages) {
1749                ret = z3fold_reclaim_page(pool, 8);
1750                if (ret < 0)
1751                        break;
1752                total++;
1753        }
1754
1755        if (reclaimed)
1756                *reclaimed = total;
1757
1758        return ret;
1759}
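
/*
 * Note: the hard-coded 8 above is the "retries" argument of
 * z3fold_reclaim_page(), i.e. how many pages at the tail of the LRU a
 * single reclaim attempt may examine before giving up; it is not the
 * number of pages freed per call (each successful call frees one page).
 */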
1760
1761static void *z3fold_zpool_map(void *pool, unsigned long handle,
1762                        enum zpool_mapmode mm)
1763{
1764        return z3fold_map(pool, handle);
1765}
1766static void z3fold_zpool_unmap(void *pool, unsigned long handle)
1767{
1768        z3fold_unmap(pool, handle);
1769}
1770
1771static u64 z3fold_zpool_total_size(void *pool)
1772{
1773        return z3fold_get_pool_size(pool) * PAGE_SIZE;
1774}
1775
1776static struct zpool_driver z3fold_zpool_driver = {
1777        .type =         "z3fold",
1778        .sleep_mapped = true,
1779        .owner =        THIS_MODULE,
1780        .create =       z3fold_zpool_create,
1781        .destroy =      z3fold_zpool_destroy,
1782        .malloc =       z3fold_zpool_malloc,
1783        .free =         z3fold_zpool_free,
1784        .shrink =       z3fold_zpool_shrink,
1785        .map =          z3fold_zpool_map,
1786        .unmap =        z3fold_zpool_unmap,
1787        .total_size =   z3fold_zpool_total_size,
1788};
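
/*
 * Minimal usage sketch (illustrative, not part of this driver): how a
 * zpool client such as zswap reaches z3fold once the driver above is
 * registered. Error handling is omitted; "demo", my_zpool_ops,
 * compressed_data and len are made-up names, and zpool_create_pool() is
 * shown with the four-argument form used by kernels that still pass a
 * struct zpool_ops (needed for the evict callback).
 *
 *	struct zpool *zp;
 *	unsigned long handle;
 *	void *dst;
 *
 *	zp = zpool_create_pool("z3fold", "demo", GFP_KERNEL, &my_zpool_ops);
 *	zpool_malloc(zp, len, GFP_KERNEL, &handle);	// -> z3fold_zpool_malloc()
 *	dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);
 *	memcpy(dst, compressed_data, len);
 *	zpool_unmap_handle(zp, handle);
 *	...
 *	zpool_free(zp, handle);				// -> z3fold_zpool_free()
 *	zpool_destroy_pool(zp);
 */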
1789
1790MODULE_ALIAS("zpool-z3fold");
1791
1792static int __init init_z3fold(void)
1793{
1794        int ret;
1795
1796        /*
1797         * Make sure the z3fold header is not larger than the page size and
1798         * that there is space remaining for at least one buddy.
1799         */
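        /*
         * For example (assuming 4K pages and NCHUNKS_ORDER == 6, i.e.
         * CHUNK_SIZE == 64): the check below requires ZHDR_SIZE_ALIGNED
         * to be at most 4096 - 64 = 4032 bytes, leaving at least one
         * chunk free for buddies.
         */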
1800        BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE - CHUNK_SIZE);
1801        ret = z3fold_mount();
1802        if (ret)
1803                return ret;
1804
1805        zpool_register_driver(&z3fold_zpool_driver);
1806
1807        return 0;
1808}
1809
1810static void __exit exit_z3fold(void)
1811{
1812        z3fold_unmount();
1813        zpool_unregister_driver(&z3fold_zpool_driver);
1814}
1815
1816module_init(init_z3fold);
1817module_exit(exit_z3fold);
1818
1819MODULE_LICENSE("GPL");
1820MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
1821MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");
1822