linux/drivers/md/bcache/btree.c
   1/*
   2 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
   3 *
   4 * Uses a block device as cache for other block devices; optimized for SSDs.
   5 * All allocation is done in buckets, which should match the erase block size
   6 * of the device.
   7 *
   8 * Buckets containing cached data are kept on a heap sorted by priority;
   9 * bucket priority is increased on cache hit, and periodically all the buckets
  10 * on the heap have their priority scaled down. This currently is just used as
  11 * an LRU but in the future should allow for more intelligent heuristics.
  12 *
  13 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
  14 * counter. Garbage collection is used to remove stale pointers.
  15 *
   16 * Indexing is done via a btree; nodes are not necessarily fully sorted; rather,
   17 * as keys are inserted we only sort the pages that have not yet been written.
  18 * When garbage collection is run, we resort the entire node.
  19 *
  20 * All configuration is done via sysfs; see Documentation/bcache.txt.
  21 */
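/*
 * Illustration of the generation scheme above (informal sketch, not code from
 * this file): each pointer in a bkey records the generation the bucket had
 * when the data was written, so invalidating a bucket is a single increment
 * of its generation, and a pointer is detected as stale by comparing
 * generations, roughly what ptr_stale() computes:
 *
 *        stale = gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
 *
 * Garbage collection then walks the btree and drops keys whose pointers have
 * gone stale this way.
 */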
  22
  23#include "bcache.h"
  24#include "btree.h"
  25#include "debug.h"
  26#include "extents.h"
  27
  28#include <linux/slab.h>
  29#include <linux/bitops.h>
  30#include <linux/freezer.h>
  31#include <linux/hash.h>
  32#include <linux/kthread.h>
  33#include <linux/prefetch.h>
  34#include <linux/random.h>
  35#include <linux/rcupdate.h>
  36#include <trace/events/bcache.h>
  37
  38/*
  39 * Todo:
  40 * register_bcache: Return errors out to userspace correctly
  41 *
  42 * Writeback: don't undirty key until after a cache flush
  43 *
  44 * Create an iterator for key pointers
  45 *
  46 * On btree write error, mark bucket such that it won't be freed from the cache
  47 *
  48 * Journalling:
  49 *   Check for bad keys in replay
  50 *   Propagate barriers
  51 *   Refcount journal entries in journal_replay
  52 *
  53 * Garbage collection:
  54 *   Finish incremental gc
  55 *   Gc should free old UUIDs, data for invalid UUIDs
  56 *
  57 * Provide a way to list backing device UUIDs we have data cached for, and
  58 * probably how long it's been since we've seen them, and a way to invalidate
  59 * dirty data for devices that will never be attached again
  60 *
  61 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
  62 * that based on that and how much dirty data we have we can keep writeback
  63 * from being starved
  64 *
  65 * Add a tracepoint or somesuch to watch for writeback starvation
  66 *
  67 * When btree depth > 1 and splitting an interior node, we have to make sure
  68 * alloc_bucket() cannot fail. This should be true but is not completely
  69 * obvious.
  70 *
  71 * Plugging?
  72 *
  73 * If data write is less than hard sector size of ssd, round up offset in open
  74 * bucket to the next whole sector
  75 *
  76 * Superblock needs to be fleshed out for multiple cache devices
  77 *
  78 * Add a sysfs tunable for the number of writeback IOs in flight
  79 *
  80 * Add a sysfs tunable for the number of open data buckets
  81 *
  82 * IO tracking: Can we track when one process is doing io on behalf of another?
  83 * IO tracking: Don't use just an average, weigh more recent stuff higher
  84 *
  85 * Test module load/unload
  86 */
  87
  88#define MAX_NEED_GC             64
  89#define MAX_SAVE_PRIO           72
  90
  91#define PTR_DIRTY_BIT           (((uint64_t) 1 << 36))
  92
  93#define PTR_HASH(c, k)                                                  \
  94        (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
  95
  96#define insert_lock(s, b)       ((b)->level <= (s)->lock)
  97
  98/*
  99 * These macros are for recursing down the btree - they handle the details of
 100 * locking and looking up nodes in the cache for you. They're best treated as
 101 * mere syntax when reading code that uses them.
 102 *
 103 * op->lock determines whether we take a read or a write lock at a given depth.
 104 * If you've got a read lock and find that you need a write lock (i.e. you're
 105 * going to have to split), set op->lock and return -EINTR; btree_root() will
 106 * call you again and you'll have the correct lock.
 107 */
 108
 109/**
 110 * btree - recurse down the btree on a specified key
 111 * @fn:         function to call, which will be passed the child node
 112 * @key:        key to recurse on
 113 * @b:          parent btree node
 114 * @op:         pointer to struct btree_op
 115 */
 116#define btree(fn, key, b, op, ...)                                      \
 117({                                                                      \
 118        int _r, l = (b)->level - 1;                                     \
 119        bool _w = l <= (op)->lock;                                      \
 120        struct btree *_child = bch_btree_node_get((b)->c, op, key, l,   \
 121                                                  _w, b);               \
 122        if (!IS_ERR(_child)) {                                          \
 123                _r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);       \
 124                rw_unlock(_w, _child);                                  \
 125        } else                                                          \
 126                _r = PTR_ERR(_child);                                   \
 127        _r;                                                             \
 128})
 129
 130/**
 131 * btree_root - call a function on the root of the btree
  132 * @fn:         function to call, which will be passed the root node
 133 * @c:          cache set
 134 * @op:         pointer to struct btree_op
 135 */
 136#define btree_root(fn, c, op, ...)                                      \
 137({                                                                      \
 138        int _r = -EINTR;                                                \
 139        do {                                                            \
 140                struct btree *_b = (c)->root;                           \
 141                bool _w = insert_lock(op, _b);                          \
 142                rw_lock(_w, _b, _b->level);                             \
 143                if (_b == (c)->root &&                                  \
 144                    _w == insert_lock(op, _b)) {                        \
 145                        _r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);   \
 146                }                                                       \
 147                rw_unlock(_w, _b);                                      \
 148                bch_cannibalize_unlock(c);                              \
 149                if (_r == -EINTR)                                       \
 150                        schedule();                                     \
 151        } while (_r == -EINTR);                                         \
 152                                                                        \
 153        finish_wait(&(c)->btree_cache_wait, &(op)->wait);               \
 154        _r;                                                             \
 155})
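/*
 * Usage sketch for the two macros above (hypothetical walker, following the
 * pattern of the recursion helpers later in this file; bch_btree_frob_recurse
 * is made up for illustration and leaf-level work is elided). The wrapped
 * function takes (struct btree *, struct btree_op *, ...), recurses via
 * btree(), and the top level caller starts at the root with btree_root(),
 * which keeps retrying while the function returns -EINTR:
 *
 *        static int bch_btree_frob_recurse(struct btree *b, struct btree_op *op)
 *        {
 *                struct btree_iter iter;
 *                struct bkey *k;
 *                int ret = 0;
 *
 *                bch_btree_iter_init(&b->keys, &iter, NULL);
 *
 *                while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
 *                                                       bch_ptr_bad))) {
 *                        if (b->level)
 *                                ret = btree(frob_recurse, k, b, op);
 *                        if (ret)
 *                                break;
 *                }
 *                return ret;
 *        }
 *
 *        ret = btree_root(frob_recurse, c, op);
 */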
 156
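/*
 * Node layout assumed below (informal): a btree node's bucket holds a
 * sequence of bsets, each rounded up to the block size, and b->written counts
 * the blocks already filled, so write_block() is simply "first bset plus
 * b->written blocks", which is where the currently open, not yet written bset
 * is initialized (see bch_bset_init_next() below):
 *
 *        | bset 0 (written) | bset 1 (written) | open bset (in memory only) |
 *        ^ btree_bset_first(b)                  ^ write_block(b)
 */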
 157static inline struct bset *write_block(struct btree *b)
 158{
 159        return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
 160}
 161
 162static void bch_btree_init_next(struct btree *b)
 163{
 164        /* If not a leaf node, always sort */
 165        if (b->level && b->keys.nsets)
 166                bch_btree_sort(&b->keys, &b->c->sort);
 167        else
 168                bch_btree_sort_lazy(&b->keys, &b->c->sort);
 169
 170        if (b->written < btree_blocks(b))
 171                bch_bset_init_next(&b->keys, write_block(b),
 172                                   bset_magic(&b->c->sb));
 173
 174}
 175
 176/* Btree key manipulation */
 177
 178void bkey_put(struct cache_set *c, struct bkey *k)
 179{
 180        unsigned i;
 181
 182        for (i = 0; i < KEY_PTRS(k); i++)
 183                if (ptr_available(c, k, i))
 184                        atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 185}
 186
 187/* Btree IO */
 188
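/*
 * The set checksum is seeded with the node's first pointer (b->key.ptr[0])
 * and covers the bset contents starting 8 bytes in, i.e. it skips struct
 * bset's leading 64 bit csum field so the checksum doesn't cover itself.
 */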
 189static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 190{
 191        uint64_t crc = b->key.ptr[0];
 192        void *data = (void *) i + 8, *end = bset_bkey_last(i);
 193
 194        crc = bch_crc64_update(crc, data, end - data);
 195        return crc ^ 0xffffffffffffffffULL;
 196}
 197
 198void bch_btree_node_read_done(struct btree *b)
 199{
 200        const char *err = "bad btree header";
 201        struct bset *i = btree_bset_first(b);
 202        struct btree_iter *iter;
 203
 204        iter = mempool_alloc(b->c->fill_iter, GFP_NOIO);
 205        iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 206        iter->used = 0;
 207
 208#ifdef CONFIG_BCACHE_DEBUG
 209        iter->b = &b->keys;
 210#endif
 211
 212        if (!i->seq)
 213                goto err;
 214
 215        for (;
 216             b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
 217             i = write_block(b)) {
 218                err = "unsupported bset version";
 219                if (i->version > BCACHE_BSET_VERSION)
 220                        goto err;
 221
 222                err = "bad btree header";
 223                if (b->written + set_blocks(i, block_bytes(b->c)) >
 224                    btree_blocks(b))
 225                        goto err;
 226
 227                err = "bad magic";
 228                if (i->magic != bset_magic(&b->c->sb))
 229                        goto err;
 230
 231                err = "bad checksum";
 232                switch (i->version) {
 233                case 0:
 234                        if (i->csum != csum_set(i))
 235                                goto err;
 236                        break;
 237                case BCACHE_BSET_VERSION:
 238                        if (i->csum != btree_csum_set(b, i))
 239                                goto err;
 240                        break;
 241                }
 242
 243                err = "empty set";
 244                if (i != b->keys.set[0].data && !i->keys)
 245                        goto err;
 246
 247                bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 248
 249                b->written += set_blocks(i, block_bytes(b->c));
 250        }
 251
 252        err = "corrupted btree";
 253        for (i = write_block(b);
 254             bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
 255             i = ((void *) i) + block_bytes(b->c))
 256                if (i->seq == b->keys.set[0].data->seq)
 257                        goto err;
 258
 259        bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
 260
 261        i = b->keys.set[0].data;
 262        err = "short btree key";
 263        if (b->keys.set[0].size &&
 264            bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
 265                goto err;
 266
 267        if (b->written < btree_blocks(b))
 268                bch_bset_init_next(&b->keys, write_block(b),
 269                                   bset_magic(&b->c->sb));
 270out:
 271        mempool_free(iter, b->c->fill_iter);
 272        return;
 273err:
 274        set_btree_node_io_error(b);
 275        bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
 276                            err, PTR_BUCKET_NR(b->c, &b->key, 0),
 277                            bset_block_offset(b, i), i->keys);
 278        goto out;
 279}
 280
 281static void btree_node_read_endio(struct bio *bio, int error)
 282{
 283        struct closure *cl = bio->bi_private;
 284        closure_put(cl);
 285}
 286
 287static void bch_btree_node_read(struct btree *b)
 288{
 289        uint64_t start_time = local_clock();
 290        struct closure cl;
 291        struct bio *bio;
 292
 293        trace_bcache_btree_read(b);
 294
 295        closure_init_stack(&cl);
 296
 297        bio = bch_bbio_alloc(b->c);
 298        bio->bi_rw      = REQ_META|READ_SYNC;
 299        bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
 300        bio->bi_end_io  = btree_node_read_endio;
 301        bio->bi_private = &cl;
 302
 303        bch_bio_map(bio, b->keys.set[0].data);
 304
 305        bch_submit_bbio(bio, b->c, &b->key, 0);
 306        closure_sync(&cl);
 307
 308        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
 309                set_btree_node_io_error(b);
 310
 311        bch_bbio_free(bio, b->c);
 312
 313        if (btree_node_io_error(b))
 314                goto err;
 315
 316        bch_btree_node_read_done(b);
 317        bch_time_stats_update(&b->c->btree_read_time, start_time);
 318
 319        return;
 320err:
 321        bch_cache_set_error(b->c, "io error reading bucket %zu",
 322                            PTR_BUCKET_NR(b->c, &b->key, 0));
 323}
 324
 325static void btree_complete_write(struct btree *b, struct btree_write *w)
 326{
 327        if (w->prio_blocked &&
 328            !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
 329                wake_up_allocators(b->c);
 330
 331        if (w->journal) {
 332                atomic_dec_bug(w->journal);
 333                __closure_wake_up(&b->c->journal.wait);
 334        }
 335
 336        w->prio_blocked = 0;
 337        w->journal      = NULL;
 338}
 339
 340static void btree_node_write_unlock(struct closure *cl)
 341{
 342        struct btree *b = container_of(cl, struct btree, io);
 343
 344        up(&b->io_mutex);
 345}
 346
 347static void __btree_node_write_done(struct closure *cl)
 348{
 349        struct btree *b = container_of(cl, struct btree, io);
 350        struct btree_write *w = btree_prev_write(b);
 351
 352        bch_bbio_free(b->bio, b->c);
 353        b->bio = NULL;
 354        btree_complete_write(b, w);
 355
 356        if (btree_node_dirty(b))
 357                schedule_delayed_work(&b->work, 30 * HZ);
 358
 359        closure_return_with_destructor(cl, btree_node_write_unlock);
 360}
 361
 362static void btree_node_write_done(struct closure *cl)
 363{
 364        struct btree *b = container_of(cl, struct btree, io);
 365        struct bio_vec *bv;
 366        int n;
 367
 368        bio_for_each_segment_all(bv, b->bio, n)
 369                __free_page(bv->bv_page);
 370
 371        __btree_node_write_done(cl);
 372}
 373
 374static void btree_node_write_endio(struct bio *bio, int error)
 375{
 376        struct closure *cl = bio->bi_private;
 377        struct btree *b = container_of(cl, struct btree, io);
 378
 379        if (error)
 380                set_btree_node_io_error(b);
 381
 382        bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
 383        closure_put(cl);
 384}
 385
 386static void do_btree_node_write(struct btree *b)
 387{
 388        struct closure *cl = &b->io;
 389        struct bset *i = btree_bset_last(b);
 390        BKEY_PADDED(key) k;
 391
 392        i->version      = BCACHE_BSET_VERSION;
 393        i->csum         = btree_csum_set(b, i);
 394
 395        BUG_ON(b->bio);
 396        b->bio = bch_bbio_alloc(b->c);
 397
 398        b->bio->bi_end_io       = btree_node_write_endio;
 399        b->bio->bi_private      = cl;
 400        b->bio->bi_rw           = REQ_META|WRITE_SYNC|REQ_FUA;
 401        b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
 402        bch_bio_map(b->bio, i);
 403
 404        /*
 405         * If we're appending to a leaf node, we don't technically need FUA -
 406         * this write just needs to be persisted before the next journal write,
 407         * which will be marked FLUSH|FUA.
 408         *
 409         * Similarly if we're writing a new btree root - the pointer is going to
 410         * be in the next journal entry.
 411         *
 412         * But if we're writing a new btree node (that isn't a root) or
 413         * appending to a non leaf btree node, we need either FUA or a flush
 414         * when we write the parent with the new pointer. FUA is cheaper than a
 415         * flush, and writes appending to leaf nodes aren't blocking anything so
 416         * just make all btree node writes FUA to keep things sane.
 417         */
 418
 419        bkey_copy(&k.key, &b->key);
 420        SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
 421                       bset_sector_offset(&b->keys, i));
 422
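        /*
         * Two submission paths below: if we can allocate bounce pages, copy
         * the bset into them and let the write complete asynchronously via
         * btree_node_write_done(); otherwise map the bset memory itself and
         * wait for the IO here (closure_sync()) before finishing, since the
         * in-place data must not change while the write is in flight.
         */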
 423        if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
 424                int j;
 425                struct bio_vec *bv;
 426                void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
 427
 428                bio_for_each_segment_all(bv, b->bio, j)
 429                        memcpy(page_address(bv->bv_page),
 430                               base + j * PAGE_SIZE, PAGE_SIZE);
 431
 432                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 433
 434                continue_at(cl, btree_node_write_done, NULL);
 435        } else {
 436                b->bio->bi_vcnt = 0;
 437                bch_bio_map(b->bio, i);
 438
 439                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 440
 441                closure_sync(cl);
 442                continue_at_nobarrier(cl, __btree_node_write_done, NULL);
 443        }
 444}
 445
 446void __bch_btree_node_write(struct btree *b, struct closure *parent)
 447{
 448        struct bset *i = btree_bset_last(b);
 449
 450        lockdep_assert_held(&b->write_lock);
 451
 452        trace_bcache_btree_write(b);
 453
 454        BUG_ON(current->bio_list);
 455        BUG_ON(b->written >= btree_blocks(b));
 456        BUG_ON(b->written && !i->keys);
 457        BUG_ON(btree_bset_first(b)->seq != i->seq);
 458        bch_check_keys(&b->keys, "writing");
 459
 460        cancel_delayed_work(&b->work);
 461
 462        /* If caller isn't waiting for write, parent refcount is cache set */
 463        down(&b->io_mutex);
 464        closure_init(&b->io, parent ?: &b->c->cl);
 465
 466        clear_bit(BTREE_NODE_dirty,      &b->flags);
 467        change_bit(BTREE_NODE_write_idx, &b->flags);
 468
 469        do_btree_node_write(b);
 470
 471        atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
 472                        &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 473
 474        b->written += set_blocks(i, block_bytes(b->c));
 475}
 476
 477void bch_btree_node_write(struct btree *b, struct closure *parent)
 478{
 479        unsigned nsets = b->keys.nsets;
 480
 481        lockdep_assert_held(&b->lock);
 482
 483        __bch_btree_node_write(b, parent);
 484
 485        /*
 486         * do verify if there was more than one set initially (i.e. we did a
 487         * sort) and we sorted down to a single set:
 488         */
 489        if (nsets && !b->keys.nsets)
 490                bch_btree_verify(b);
 491
 492        bch_btree_init_next(b);
 493}
 494
 495static void bch_btree_node_write_sync(struct btree *b)
 496{
 497        struct closure cl;
 498
 499        closure_init_stack(&cl);
 500
 501        mutex_lock(&b->write_lock);
 502        bch_btree_node_write(b, &cl);
 503        mutex_unlock(&b->write_lock);
 504
 505        closure_sync(&cl);
 506}
 507
 508static void btree_node_write_work(struct work_struct *w)
 509{
 510        struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 511
 512        mutex_lock(&b->write_lock);
 513        if (btree_node_dirty(b))
 514                __bch_btree_node_write(b, NULL);
 515        mutex_unlock(&b->write_lock);
 516}
 517
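/*
 * Called after a journalled insert has added keys to a leaf: marks the node
 * dirty, schedules the delayed write above, and pins the journal entry that
 * covers those keys (w->journal) so journal space isn't reclaimed before the
 * node reaches disk; btree_complete_write() drops that pin. A set that has
 * grown too large is written out immediately instead of waiting.
 */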
 518static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 519{
 520        struct bset *i = btree_bset_last(b);
 521        struct btree_write *w = btree_current_write(b);
 522
 523        lockdep_assert_held(&b->write_lock);
 524
 525        BUG_ON(!b->written);
 526        BUG_ON(!i->keys);
 527
 528        if (!btree_node_dirty(b))
 529                schedule_delayed_work(&b->work, 30 * HZ);
 530
 531        set_btree_node_dirty(b);
 532
 533        if (journal_ref) {
 534                if (w->journal &&
 535                    journal_pin_cmp(b->c, w->journal, journal_ref)) {
 536                        atomic_dec_bug(w->journal);
 537                        w->journal = NULL;
 538                }
 539
 540                if (!w->journal) {
 541                        w->journal = journal_ref;
 542                        atomic_inc(w->journal);
 543                }
 544        }
 545
 546        /* Force write if set is too big */
 547        if (set_bytes(i) > PAGE_SIZE - 48 &&
 548            !current->bio_list)
 549                bch_btree_node_write(b, NULL);
 550}
 551
 552/*
 553 * Btree in memory cache - allocation/freeing
 554 * mca -> memory cache
 555 */
 556
 557#define mca_reserve(c)  (((c->root && c->root->level)           \
 558                          ? c->root->level : 1) * 8 + 16)
 559#define mca_can_free(c)                                         \
 560        max_t(int, 0, c->btree_cache_used - mca_reserve(c))
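/*
 * Worked example: with a three level btree (c->root->level == 2) the reserve
 * is 2 * 8 + 16 = 32 cached nodes, and mca_can_free() only counts nodes
 * beyond that reserve as freeable by the shrinker.
 */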
 561
 562static void mca_data_free(struct btree *b)
 563{
 564        BUG_ON(b->io_mutex.count != 1);
 565
 566        bch_btree_keys_free(&b->keys);
 567
 568        b->c->btree_cache_used--;
 569        list_move(&b->list, &b->c->btree_cache_freed);
 570}
 571
 572static void mca_bucket_free(struct btree *b)
 573{
 574        BUG_ON(btree_node_dirty(b));
 575
 576        b->key.ptr[0] = 0;
 577        hlist_del_init_rcu(&b->hash);
 578        list_move(&b->list, &b->c->btree_cache_freeable);
 579}
 580
 581static unsigned btree_order(struct bkey *k)
 582{
 583        return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
 584}
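/*
 * e.g. with 4k pages (PAGE_SECTORS == 8), a 256k btree node has
 * KEY_SIZE(k) == 512 sectors, so btree_order() returns ilog2(512 / 8) = 6:
 * an order-6 (64 page) allocation backs the node's in-memory copy.
 */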
 585
 586static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 587{
 588        if (!bch_btree_keys_alloc(&b->keys,
 589                                  max_t(unsigned,
 590                                        ilog2(b->c->btree_pages),
 591                                        btree_order(k)),
 592                                  gfp)) {
 593                b->c->btree_cache_used++;
 594                list_move(&b->list, &b->c->btree_cache);
 595        } else {
 596                list_move(&b->list, &b->c->btree_cache_freed);
 597        }
 598}
 599
 600static struct btree *mca_bucket_alloc(struct cache_set *c,
 601                                      struct bkey *k, gfp_t gfp)
 602{
 603        struct btree *b = kzalloc(sizeof(struct btree), gfp);
 604        if (!b)
 605                return NULL;
 606
 607        init_rwsem(&b->lock);
 608        lockdep_set_novalidate_class(&b->lock);
 609        mutex_init(&b->write_lock);
 610        lockdep_set_novalidate_class(&b->write_lock);
 611        INIT_LIST_HEAD(&b->list);
 612        INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 613        b->c = c;
 614        sema_init(&b->io_mutex, 1);
 615
 616        mca_data_alloc(b, k, gfp);
 617        return b;
 618}
 619
 620static int mca_reap(struct btree *b, unsigned min_order, bool flush)
 621{
 622        struct closure cl;
 623
 624        closure_init_stack(&cl);
 625        lockdep_assert_held(&b->c->bucket_lock);
 626
 627        if (!down_write_trylock(&b->lock))
 628                return -ENOMEM;
 629
 630        BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
 631
 632        if (b->keys.page_order < min_order)
 633                goto out_unlock;
 634
 635        if (!flush) {
 636                if (btree_node_dirty(b))
 637                        goto out_unlock;
 638
 639                if (down_trylock(&b->io_mutex))
 640                        goto out_unlock;
 641                up(&b->io_mutex);
 642        }
 643
 644        mutex_lock(&b->write_lock);
 645        if (btree_node_dirty(b))
 646                __bch_btree_node_write(b, &cl);
 647        mutex_unlock(&b->write_lock);
 648
 649        closure_sync(&cl);
 650
 651        /* wait for any in flight btree write */
 652        down(&b->io_mutex);
 653        up(&b->io_mutex);
 654
 655        return 0;
 656out_unlock:
 657        rw_unlock(true, b);
 658        return -ENOMEM;
 659}
 660
 661static unsigned long bch_mca_scan(struct shrinker *shrink,
 662                                  struct shrink_control *sc)
 663{
 664        struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 665        struct btree *b, *t;
 666        unsigned long i, nr = sc->nr_to_scan;
 667        unsigned long freed = 0;
 668
 669        if (c->shrinker_disabled)
 670                return SHRINK_STOP;
 671
 672        if (c->btree_cache_alloc_lock)
 673                return SHRINK_STOP;
 674
 675        /* Return -1 if we can't do anything right now */
 676        if (sc->gfp_mask & __GFP_IO)
 677                mutex_lock(&c->bucket_lock);
 678        else if (!mutex_trylock(&c->bucket_lock))
 679                return -1;
 680
 681        /*
 682         * It's _really_ critical that we don't free too many btree nodes - we
 683         * have to always leave ourselves a reserve. The reserve is how we
 684         * guarantee that allocating memory for a new btree node can always
 685         * succeed, so that inserting keys into the btree can always succeed and
 686         * IO can always make forward progress:
 687         */
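        /*
         * sc->nr_to_scan is in the units bch_mca_count() reports, i.e. pages
         * (mca_can_free() scaled by c->btree_pages), so convert it back to a
         * number of btree nodes before scanning.
         */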
 688        nr /= c->btree_pages;
 689        nr = min_t(unsigned long, nr, mca_can_free(c));
 690
 691        i = 0;
 692        list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
 693                if (freed >= nr)
 694                        break;
 695
 696                if (++i > 3 &&
 697                    !mca_reap(b, 0, false)) {
 698                        mca_data_free(b);
 699                        rw_unlock(true, b);
 700                        freed++;
 701                }
 702        }
 703
 704        for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
 705                if (list_empty(&c->btree_cache))
 706                        goto out;
 707
 708                b = list_first_entry(&c->btree_cache, struct btree, list);
 709                list_rotate_left(&c->btree_cache);
 710
 711                if (!b->accessed &&
 712                    !mca_reap(b, 0, false)) {
 713                        mca_bucket_free(b);
 714                        mca_data_free(b);
 715                        rw_unlock(true, b);
 716                        freed++;
 717                } else
 718                        b->accessed = 0;
 719        }
 720out:
 721        mutex_unlock(&c->bucket_lock);
 722        return freed;
 723}
 724
 725static unsigned long bch_mca_count(struct shrinker *shrink,
 726                                   struct shrink_control *sc)
 727{
 728        struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 729
 730        if (c->shrinker_disabled)
 731                return 0;
 732
 733        if (c->btree_cache_alloc_lock)
 734                return 0;
 735
 736        return mca_can_free(c) * c->btree_pages;
 737}
 738
 739void bch_btree_cache_free(struct cache_set *c)
 740{
 741        struct btree *b;
 742        struct closure cl;
 743        closure_init_stack(&cl);
 744
 745        if (c->shrink.list.next)
 746                unregister_shrinker(&c->shrink);
 747
 748        mutex_lock(&c->bucket_lock);
 749
 750#ifdef CONFIG_BCACHE_DEBUG
 751        if (c->verify_data)
 752                list_move(&c->verify_data->list, &c->btree_cache);
 753
 754        free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
 755#endif
 756
 757        list_splice(&c->btree_cache_freeable,
 758                    &c->btree_cache);
 759
 760        while (!list_empty(&c->btree_cache)) {
 761                b = list_first_entry(&c->btree_cache, struct btree, list);
 762
 763                if (btree_node_dirty(b))
 764                        btree_complete_write(b, btree_current_write(b));
 765                clear_bit(BTREE_NODE_dirty, &b->flags);
 766
 767                mca_data_free(b);
 768        }
 769
 770        while (!list_empty(&c->btree_cache_freed)) {
 771                b = list_first_entry(&c->btree_cache_freed,
 772                                     struct btree, list);
 773                list_del(&b->list);
 774                cancel_delayed_work_sync(&b->work);
 775                kfree(b);
 776        }
 777
 778        mutex_unlock(&c->bucket_lock);
 779}
 780
 781int bch_btree_cache_alloc(struct cache_set *c)
 782{
 783        unsigned i;
 784
 785        for (i = 0; i < mca_reserve(c); i++)
 786                if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
 787                        return -ENOMEM;
 788
 789        list_splice_init(&c->btree_cache,
 790                         &c->btree_cache_freeable);
 791
 792#ifdef CONFIG_BCACHE_DEBUG
 793        mutex_init(&c->verify_lock);
 794
 795        c->verify_ondisk = (void *)
 796                __get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));
 797
 798        c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
 799
 800        if (c->verify_data &&
 801            c->verify_data->keys.set->data)
 802                list_del_init(&c->verify_data->list);
 803        else
 804                c->verify_data = NULL;
 805#endif
 806
 807        c->shrink.count_objects = bch_mca_count;
 808        c->shrink.scan_objects = bch_mca_scan;
 809        c->shrink.seeks = 4;
 810        c->shrink.batch = c->btree_pages * 2;
 811        register_shrinker(&c->shrink);
 812
 813        return 0;
 814}
 815
 816/* Btree in memory cache - hash table */
 817
 818static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
 819{
 820        return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
 821}
 822
 823static struct btree *mca_find(struct cache_set *c, struct bkey *k)
 824{
 825        struct btree *b;
 826
 827        rcu_read_lock();
 828        hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
 829                if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
 830                        goto out;
 831        b = NULL;
 832out:
 833        rcu_read_unlock();
 834        return b;
 835}
 836
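/*
 * The cannibalize lock protocol in short: cmpxchg() ourselves into
 * c->btree_cache_alloc_lock; on contention, queue on btree_cache_wait and
 * return -EINTR, which btree_root() turns into a retry after schedule(), and
 * drop the lock again via bch_cannibalize_unlock() once the root is unlocked.
 */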
 837static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
 838{
 839        struct task_struct *old;
 840
 841        old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
 842        if (old && old != current) {
 843                if (op)
 844                        prepare_to_wait(&c->btree_cache_wait, &op->wait,
 845                                        TASK_UNINTERRUPTIBLE);
 846                return -EINTR;
 847        }
 848
 849        return 0;
 850}
 851
 852static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
 853                                     struct bkey *k)
 854{
 855        struct btree *b;
 856
 857        trace_bcache_btree_cache_cannibalize(c);
 858
 859        if (mca_cannibalize_lock(c, op))
 860                return ERR_PTR(-EINTR);
 861
 862        list_for_each_entry_reverse(b, &c->btree_cache, list)
 863                if (!mca_reap(b, btree_order(k), false))
 864                        return b;
 865
 866        list_for_each_entry_reverse(b, &c->btree_cache, list)
 867                if (!mca_reap(b, btree_order(k), true))
 868                        return b;
 869
 870        WARN(1, "btree cache cannibalize failed\n");
 871        return ERR_PTR(-ENOMEM);
 872}
 873
 874/*
 875 * We can only have one thread cannibalizing other cached btree nodes at a time,
  876 * or we'll deadlock. We use an open coded mutex to ensure that, which
  877 * mca_cannibalize_lock() will take. This means every time we unlock the root of
 878 * the btree, we need to release this lock if we have it held.
 879 */
 880static void bch_cannibalize_unlock(struct cache_set *c)
 881{
 882        if (c->btree_cache_alloc_lock == current) {
 883                c->btree_cache_alloc_lock = NULL;
 884                wake_up(&c->btree_cache_wait);
 885        }
 886}
 887
 888static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
 889                               struct bkey *k, int level)
 890{
 891        struct btree *b;
 892
 893        BUG_ON(current->bio_list);
 894
 895        lockdep_assert_held(&c->bucket_lock);
 896
 897        if (mca_find(c, k))
 898                return NULL;
 899
 900        /* btree_free() doesn't free memory; it sticks the node on the end of
 901         * the list. Check if there's any freed nodes there:
 902         */
 903        list_for_each_entry(b, &c->btree_cache_freeable, list)
 904                if (!mca_reap(b, btree_order(k), false))
 905                        goto out;
 906
 907        /* We never free struct btree itself, just the memory that holds the on
 908         * disk node. Check the freed list before allocating a new one:
 909         */
 910        list_for_each_entry(b, &c->btree_cache_freed, list)
 911                if (!mca_reap(b, 0, false)) {
 912                        mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
 913                        if (!b->keys.set[0].data)
 914                                goto err;
 915                        else
 916                                goto out;
 917                }
 918
 919        b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
 920        if (!b)
 921                goto err;
 922
 923        BUG_ON(!down_write_trylock(&b->lock));
 924        if (!b->keys.set->data)
 925                goto err;
 926out:
 927        BUG_ON(b->io_mutex.count != 1);
 928
 929        bkey_copy(&b->key, k);
 930        list_move(&b->list, &c->btree_cache);
 931        hlist_del_init_rcu(&b->hash);
 932        hlist_add_head_rcu(&b->hash, mca_hash(c, k));
 933
 934        lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
 935        b->parent       = (void *) ~0UL;
 936        b->flags        = 0;
 937        b->written      = 0;
 938        b->level        = level;
 939
 940        if (!b->level)
 941                bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
 942                                    &b->c->expensive_debug_checks);
 943        else
 944                bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
 945                                    &b->c->expensive_debug_checks);
 946
 947        return b;
 948err:
 949        if (b)
 950                rw_unlock(true, b);
 951
 952        b = mca_cannibalize(c, op, k);
 953        if (!IS_ERR(b))
 954                goto out;
 955
 956        return b;
 957}
 958
 959/**
 960 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 961 * in from disk if necessary.
 962 *
 963 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 964 *
 965 * The btree node will have either a read or a write lock held, depending on
 966 * level and op->lock.
 967 */
 968struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
 969                                 struct bkey *k, int level, bool write,
 970                                 struct btree *parent)
 971{
 972        int i = 0;
 973        struct btree *b;
 974
 975        BUG_ON(level < 0);
 976retry:
 977        b = mca_find(c, k);
 978
 979        if (!b) {
 980                if (current->bio_list)
 981                        return ERR_PTR(-EAGAIN);
 982
 983                mutex_lock(&c->bucket_lock);
 984                b = mca_alloc(c, op, k, level);
 985                mutex_unlock(&c->bucket_lock);
 986
 987                if (!b)
 988                        goto retry;
 989                if (IS_ERR(b))
 990                        return b;
 991
 992                bch_btree_node_read(b);
 993
 994                if (!write)
 995                        downgrade_write(&b->lock);
 996        } else {
 997                rw_lock(write, b, level);
 998                if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
 999                        rw_unlock(write, b);
1000                        goto retry;
1001                }
1002                BUG_ON(b->level != level);
1003        }
1004
1005        b->parent = parent;
1006        b->accessed = 1;
1007
1008        for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1009                prefetch(b->keys.set[i].tree);
1010                prefetch(b->keys.set[i].data);
1011        }
1012
1013        for (; i <= b->keys.nsets; i++)
1014                prefetch(b->keys.set[i].data);
1015
1016        if (btree_node_io_error(b)) {
1017                rw_unlock(write, b);
1018                return ERR_PTR(-EIO);
1019        }
1020
1021        BUG_ON(!b->written);
1022
1023        return b;
1024}
1025
1026static void btree_node_prefetch(struct btree *parent, struct bkey *k)
1027{
1028        struct btree *b;
1029
1030        mutex_lock(&parent->c->bucket_lock);
1031        b = mca_alloc(parent->c, NULL, k, parent->level - 1);
1032        mutex_unlock(&parent->c->bucket_lock);
1033
1034        if (!IS_ERR_OR_NULL(b)) {
1035                b->parent = parent;
1036                bch_btree_node_read(b);
1037                rw_unlock(true, b);
1038        }
1039}
1040
1041/* Btree alloc */
1042
1043static void btree_node_free(struct btree *b)
1044{
1045        trace_bcache_btree_node_free(b);
1046
1047        BUG_ON(b == b->c->root);
1048
1049        mutex_lock(&b->write_lock);
1050
1051        if (btree_node_dirty(b))
1052                btree_complete_write(b, btree_current_write(b));
1053        clear_bit(BTREE_NODE_dirty, &b->flags);
1054
1055        mutex_unlock(&b->write_lock);
1056
1057        cancel_delayed_work(&b->work);
1058
1059        mutex_lock(&b->c->bucket_lock);
1060        bch_bucket_free(b->c, &b->key);
1061        mca_bucket_free(b);
1062        mutex_unlock(&b->c->bucket_lock);
1063}
1064
1065struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1066                                     int level, bool wait,
1067                                     struct btree *parent)
1068{
1069        BKEY_PADDED(key) k;
1070        struct btree *b = ERR_PTR(-EAGAIN);
1071
1072        mutex_lock(&c->bucket_lock);
1073retry:
1074        if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1075                goto err;
1076
1077        bkey_put(c, &k.key);
1078        SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1079
1080        b = mca_alloc(c, op, &k.key, level);
1081        if (IS_ERR(b))
1082                goto err_free;
1083
1084        if (!b) {
1085                cache_bug(c,
1086                        "Tried to allocate bucket that was in btree cache");
1087                goto retry;
1088        }
1089
1090        b->accessed = 1;
1091        b->parent = parent;
1092        bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1093
1094        mutex_unlock(&c->bucket_lock);
1095
1096        trace_bcache_btree_node_alloc(b);
1097        return b;
1098err_free:
1099        bch_bucket_free(c, &k.key);
1100err:
1101        mutex_unlock(&c->bucket_lock);
1102
1103        trace_bcache_btree_node_alloc_fail(c);
1104        return b;
1105}
1106
1107static struct btree *bch_btree_node_alloc(struct cache_set *c,
1108                                          struct btree_op *op, int level,
1109                                          struct btree *parent)
1110{
1111        return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
1112}
1113
1114static struct btree *btree_node_alloc_replacement(struct btree *b,
1115                                                  struct btree_op *op)
1116{
1117        struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1118        if (!IS_ERR_OR_NULL(n)) {
1119                mutex_lock(&n->write_lock);
1120                bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1121                bkey_copy_key(&n->key, &b->key);
1122                mutex_unlock(&n->write_lock);
1123        }
1124
1125        return n;
1126}
1127
1128static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1129{
1130        unsigned i;
1131
1132        mutex_lock(&b->c->bucket_lock);
1133
1134        atomic_inc(&b->c->prio_blocked);
1135
1136        bkey_copy(k, &b->key);
1137        bkey_copy_key(k, &ZERO_KEY);
1138
1139        for (i = 0; i < KEY_PTRS(k); i++)
1140                SET_PTR_GEN(k, i,
1141                            bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1142                                        PTR_BUCKET(b->c, &b->key, i)));
1143
1144        mutex_unlock(&b->c->bucket_lock);
1145}
1146
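/*
 * Worked example for the reserve computed below: inserting into a leaf
 * (b->level == 0) under a root at level 2 can split a node at each level on
 * the way up and grow a new root, costing at most two new nodes per level
 * plus one for the new root, hence (2 - 0) * 2 + 1 = 5 free btree buckets
 * required per cache.
 */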
1147static int btree_check_reserve(struct btree *b, struct btree_op *op)
1148{
1149        struct cache_set *c = b->c;
1150        struct cache *ca;
1151        unsigned i, reserve = (c->root->level - b->level) * 2 + 1;
1152
1153        mutex_lock(&c->bucket_lock);
1154
1155        for_each_cache(ca, c, i)
1156                if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1157                        if (op)
1158                                prepare_to_wait(&c->btree_cache_wait, &op->wait,
1159                                                TASK_UNINTERRUPTIBLE);
1160                        mutex_unlock(&c->bucket_lock);
1161                        return -EINTR;
1162                }
1163
1164        mutex_unlock(&c->bucket_lock);
1165
1166        return mca_cannibalize_lock(b->c, op);
1167}
1168
1169/* Garbage collection */
1170
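/*
 * Bucket marks assigned below: GC_MARK_METADATA for buckets holding btree
 * nodes (and other metadata), GC_MARK_DIRTY for buckets referenced by dirty
 * keys, and GC_MARK_RECLAIMABLE for buckets holding only clean cached data,
 * which the allocator is then free to invalidate and reuse.
 */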
1171static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
1172                                    struct bkey *k)
1173{
1174        uint8_t stale = 0;
1175        unsigned i;
1176        struct bucket *g;
1177
1178        /*
1179         * ptr_invalid() can't return true for the keys that mark btree nodes as
1180         * freed, but since ptr_bad() returns true we'll never actually use them
 1181         * for anything and thus we don't want to mark their pointers here
1182         */
1183        if (!bkey_cmp(k, &ZERO_KEY))
1184                return stale;
1185
1186        for (i = 0; i < KEY_PTRS(k); i++) {
1187                if (!ptr_available(c, k, i))
1188                        continue;
1189
1190                g = PTR_BUCKET(c, k, i);
1191
1192                if (gen_after(g->last_gc, PTR_GEN(k, i)))
1193                        g->last_gc = PTR_GEN(k, i);
1194
1195                if (ptr_stale(c, k, i)) {
1196                        stale = max(stale, ptr_stale(c, k, i));
1197                        continue;
1198                }
1199
1200                cache_bug_on(GC_MARK(g) &&
1201                             (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1202                             c, "inconsistent ptrs: mark = %llu, level = %i",
1203                             GC_MARK(g), level);
1204
1205                if (level)
1206                        SET_GC_MARK(g, GC_MARK_METADATA);
1207                else if (KEY_DIRTY(k))
1208                        SET_GC_MARK(g, GC_MARK_DIRTY);
1209                else if (!GC_MARK(g))
1210                        SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
1211
1212                /* guard against overflow */
1213                SET_GC_SECTORS_USED(g, min_t(unsigned,
1214                                             GC_SECTORS_USED(g) + KEY_SIZE(k),
1215                                             MAX_GC_SECTORS_USED));
1216
1217                BUG_ON(!GC_SECTORS_USED(g));
1218        }
1219
1220        return stale;
1221}
1222
1223#define btree_mark_key(b, k)    __bch_btree_mark_key(b->c, b->level, k)
1224
1225void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1226{
1227        unsigned i;
1228
1229        for (i = 0; i < KEY_PTRS(k); i++)
1230                if (ptr_available(c, k, i) &&
1231                    !ptr_stale(c, k, i)) {
1232                        struct bucket *b = PTR_BUCKET(c, k, i);
1233
1234                        b->gen = PTR_GEN(k, i);
1235
1236                        if (level && bkey_cmp(k, &ZERO_KEY))
1237                                b->prio = BTREE_PRIO;
1238                        else if (!level && b->prio == BTREE_PRIO)
1239                                b->prio = INITIAL_PRIO;
1240                }
1241
1242        __bch_btree_mark_key(c, level, k);
1243}
1244
1245static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1246{
1247        uint8_t stale = 0;
1248        unsigned keys = 0, good_keys = 0;
1249        struct bkey *k;
1250        struct btree_iter iter;
1251        struct bset_tree *t;
1252
1253        gc->nodes++;
1254
1255        for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1256                stale = max(stale, btree_mark_key(b, k));
1257                keys++;
1258
1259                if (bch_ptr_bad(&b->keys, k))
1260                        continue;
1261
1262                gc->key_bytes += bkey_u64s(k);
1263                gc->nkeys++;
1264                good_keys++;
1265
1266                gc->data += KEY_SIZE(k);
1267        }
1268
1269        for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1270                btree_bug_on(t->size &&
1271                             bset_written(&b->keys, t) &&
1272                             bkey_cmp(&b->key, &t->end) < 0,
1273                             b, "found short btree key in gc");
1274
1275        if (b->c->gc_always_rewrite)
1276                return true;
1277
1278        if (stale > 10)
1279                return true;
1280
1281        if ((keys - good_keys) * 2 > keys)
1282                return true;
1283
1284        return false;
1285}
1286
1287#define GC_MERGE_NODES  4U
1288
1289struct gc_merge_info {
1290        struct btree    *b;
1291        unsigned        keys;
1292};
1293
1294static int bch_btree_insert_node(struct btree *, struct btree_op *,
1295                                 struct keylist *, atomic_t *, struct bkey *);
1296
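/*
 * Coalescing criterion, informally: look at up to GC_MERGE_NODES adjacent
 * nodes; if all of their keys would fit into nodes - 1 replacement nodes
 * each filled to roughly two thirds (btree_default_blocks() * 2 / 3 blocks),
 * repack the keys into the replacements, free the node left empty, and
 * update the parent, returning -EINTR since the gc iterator is invalidated.
 */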
1297static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1298                             struct gc_stat *gc, struct gc_merge_info *r)
1299{
1300        unsigned i, nodes = 0, keys = 0, blocks;
1301        struct btree *new_nodes[GC_MERGE_NODES];
1302        struct keylist keylist;
1303        struct closure cl;
1304        struct bkey *k;
1305
1306        bch_keylist_init(&keylist);
1307
1308        if (btree_check_reserve(b, NULL))
1309                return 0;
1310
1311        memset(new_nodes, 0, sizeof(new_nodes));
1312        closure_init_stack(&cl);
1313
1314        while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1315                keys += r[nodes++].keys;
1316
1317        blocks = btree_default_blocks(b->c) * 2 / 3;
1318
1319        if (nodes < 2 ||
1320            __set_blocks(b->keys.set[0].data, keys,
1321                         block_bytes(b->c)) > blocks * (nodes - 1))
1322                return 0;
1323
1324        for (i = 0; i < nodes; i++) {
1325                new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
1326                if (IS_ERR_OR_NULL(new_nodes[i]))
1327                        goto out_nocoalesce;
1328        }
1329
1330        /*
1331         * We have to check the reserve here, after we've allocated our new
1332         * nodes, to make sure the insert below will succeed - we also check
1333         * before as an optimization to potentially avoid a bunch of expensive
1334         * allocs/sorts
1335         */
1336        if (btree_check_reserve(b, NULL))
1337                goto out_nocoalesce;
1338
1339        for (i = 0; i < nodes; i++)
1340                mutex_lock(&new_nodes[i]->write_lock);
1341
1342        for (i = nodes - 1; i > 0; --i) {
1343                struct bset *n1 = btree_bset_first(new_nodes[i]);
1344                struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1345                struct bkey *k, *last = NULL;
1346
1347                keys = 0;
1348
1349                if (i > 1) {
1350                        for (k = n2->start;
1351                             k < bset_bkey_last(n2);
1352                             k = bkey_next(k)) {
1353                                if (__set_blocks(n1, n1->keys + keys +
1354                                                 bkey_u64s(k),
1355                                                 block_bytes(b->c)) > blocks)
1356                                        break;
1357
1358                                last = k;
1359                                keys += bkey_u64s(k);
1360                        }
1361                } else {
1362                        /*
1363                         * Last node we're not getting rid of - we're getting
1364                         * rid of the node at r[0]. Have to try and fit all of
 1365                         * the remaining keys into this node; we can't ensure
 1366                         * they will always fit, due to rounding and variable
 1367                         * length keys (though in practice that shouldn't
 1368                         * happen)
1369                         */
1370                        if (__set_blocks(n1, n1->keys + n2->keys,
1371                                         block_bytes(b->c)) >
1372                            btree_blocks(new_nodes[i]))
1373                                goto out_nocoalesce;
1374
1375                        keys = n2->keys;
1376                        /* Take the key of the node we're getting rid of */
1377                        last = &r->b->key;
1378                }
1379
1380                BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1381                       btree_blocks(new_nodes[i]));
1382
1383                if (last)
1384                        bkey_copy_key(&new_nodes[i]->key, last);
1385
1386                memcpy(bset_bkey_last(n1),
1387                       n2->start,
1388                       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1389
1390                n1->keys += keys;
1391                r[i].keys = n1->keys;
1392
1393                memmove(n2->start,
1394                        bset_bkey_idx(n2, keys),
1395                        (void *) bset_bkey_last(n2) -
1396                        (void *) bset_bkey_idx(n2, keys));
1397
1398                n2->keys -= keys;
1399
1400                if (__bch_keylist_realloc(&keylist,
1401                                          bkey_u64s(&new_nodes[i]->key)))
1402                        goto out_nocoalesce;
1403
1404                bch_btree_node_write(new_nodes[i], &cl);
1405                bch_keylist_add(&keylist, &new_nodes[i]->key);
1406        }
1407
1408        for (i = 0; i < nodes; i++)
1409                mutex_unlock(&new_nodes[i]->write_lock);
1410
1411        closure_sync(&cl);
1412
1413        /* We emptied out this node */
1414        BUG_ON(btree_bset_first(new_nodes[0])->keys);
1415        btree_node_free(new_nodes[0]);
1416        rw_unlock(true, new_nodes[0]);
1417        new_nodes[0] = NULL;
1418
1419        for (i = 0; i < nodes; i++) {
1420                if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
1421                        goto out_nocoalesce;
1422
1423                make_btree_freeing_key(r[i].b, keylist.top);
1424                bch_keylist_push(&keylist);
1425        }
1426
1427        bch_btree_insert_node(b, op, &keylist, NULL, NULL);
1428        BUG_ON(!bch_keylist_empty(&keylist));
1429
1430        for (i = 0; i < nodes; i++) {
1431                btree_node_free(r[i].b);
1432                rw_unlock(true, r[i].b);
1433
1434                r[i].b = new_nodes[i];
1435        }
1436
1437        memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1438        r[nodes - 1].b = ERR_PTR(-EINTR);
1439
1440        trace_bcache_btree_gc_coalesce(nodes);
1441        gc->nodes--;
1442
1443        bch_keylist_free(&keylist);
1444
1445        /* Invalidated our iterator */
1446        return -EINTR;
1447
1448out_nocoalesce:
1449        closure_sync(&cl);
1450        bch_keylist_free(&keylist);
1451
1452        while ((k = bch_keylist_pop(&keylist)))
1453                if (!bkey_cmp(k, &ZERO_KEY))
1454                        atomic_dec(&b->c->prio_blocked);
1455
1456        for (i = 0; i < nodes; i++)
1457                if (!IS_ERR_OR_NULL(new_nodes[i])) {
1458                        btree_node_free(new_nodes[i]);
1459                        rw_unlock(true, new_nodes[i]);
1460                }
1461        return 0;
1462}
1463
1464static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
1465                                 struct btree *replace)
1466{
1467        struct keylist keys;
1468        struct btree *n;
1469
1470        if (btree_check_reserve(b, NULL))
1471                return 0;
1472
1473        n = btree_node_alloc_replacement(replace, NULL);
1474
1475        /* recheck reserve after allocating replacement node */
1476        if (btree_check_reserve(b, NULL)) {
1477                btree_node_free(n);
1478                rw_unlock(true, n);
1479                return 0;
1480        }
1481
1482        bch_btree_node_write_sync(n);
1483
1484        bch_keylist_init(&keys);
1485        bch_keylist_add(&keys, &n->key);
1486
1487        make_btree_freeing_key(replace, keys.top);
1488        bch_keylist_push(&keys);
1489
1490        bch_btree_insert_node(b, op, &keys, NULL, NULL);
1491        BUG_ON(!bch_keylist_empty(&keys));
1492
1493        btree_node_free(replace);
1494        rw_unlock(true, n);
1495
1496        /* Invalidated our iterator */
1497        return -EINTR;
1498}
1499
1500static unsigned btree_gc_count_keys(struct btree *b)
1501{
1502        struct bkey *k;
1503        struct btree_iter iter;
1504        unsigned ret = 0;
1505
1506        for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1507                ret += bkey_u64s(k);
1508
1509        return ret;
1510}
1511
1512static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1513                            struct closure *writes, struct gc_stat *gc)
1514{
1515        int ret = 0;
1516        bool should_rewrite;
1517        struct bkey *k;
1518        struct btree_iter iter;
1519        struct gc_merge_info r[GC_MERGE_NODES];
1520        struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
1521
1522        bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1523
1524        for (i = r; i < r + ARRAY_SIZE(r); i++)
1525                i->b = ERR_PTR(-EINTR);
1526
1527        while (1) {
1528                k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1529                if (k) {
1530                        r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1531                                                  true, b);
1532                        if (IS_ERR(r->b)) {
1533                                ret = PTR_ERR(r->b);
1534                                break;
1535                        }
1536
1537                        r->keys = btree_gc_count_keys(r->b);
1538
1539                        ret = btree_gc_coalesce(b, op, gc, r);
1540                        if (ret)
1541                                break;
1542                }
1543
1544                if (!last->b)
1545                        break;
1546
1547                if (!IS_ERR(last->b)) {
1548                        should_rewrite = btree_gc_mark_node(last->b, gc);
1549                        if (should_rewrite) {
1550                                ret = btree_gc_rewrite_node(b, op, last->b);
1551                                if (ret)
1552                                        break;
1553                        }
1554
1555                        if (last->b->level) {
1556                                ret = btree_gc_recurse(last->b, op, writes, gc);
1557                                if (ret)
1558                                        break;
1559                        }
1560
1561                        bkey_copy_key(&b->c->gc_done, &last->b->key);
1562
1563                        /*
1564                         * Must flush leaf nodes before gc ends, since replace
1565                         * operations aren't journalled
1566                         */
1567                        mutex_lock(&last->b->write_lock);
1568                        if (btree_node_dirty(last->b))
1569                                bch_btree_node_write(last->b, writes);
1570                        mutex_unlock(&last->b->write_lock);
1571                        rw_unlock(true, last->b);
1572                }
1573
1574                memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1575                r->b = NULL;
1576
1577                if (need_resched()) {
1578                        ret = -EAGAIN;
1579                        break;
1580                }
1581        }
1582
1583        for (i = r; i < r + ARRAY_SIZE(r); i++)
1584                if (!IS_ERR_OR_NULL(i->b)) {
1585                        mutex_lock(&i->b->write_lock);
1586                        if (btree_node_dirty(i->b))
1587                                bch_btree_node_write(i->b, writes);
1588                        mutex_unlock(&i->b->write_lock);
1589                        rw_unlock(true, i->b);
1590                }
1591
1592        return ret;
1593}
1594
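/*
 * Garbage collect starting from the root: if marking says the root itself
 * should be rewritten, allocate and write a replacement, make it the new
 * root and return -EINTR so gc restarts against it; otherwise mark the
 * root's key and recurse into its children.
 */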
1595static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1596                             struct closure *writes, struct gc_stat *gc)
1597{
1598        struct btree *n = NULL;
1599        int ret = 0;
1600        bool should_rewrite;
1601
1602        should_rewrite = btree_gc_mark_node(b, gc);
1603        if (should_rewrite) {
1604                n = btree_node_alloc_replacement(b, NULL);
1605
1606                if (!IS_ERR_OR_NULL(n)) {
1607                        bch_btree_node_write_sync(n);
1608
1609                        bch_btree_set_root(n);
1610                        btree_node_free(b);
1611                        rw_unlock(true, n);
1612
1613                        return -EINTR;
1614                }
1615        }
1616
1617        __bch_btree_mark_key(b->c, b->level + 1, &b->key);
1618
1619        if (b->level) {
1620                ret = btree_gc_recurse(b, op, writes, gc);
1621                if (ret)
1622                        return ret;
1623        }
1624
1625        bkey_copy_key(&b->c->gc_done, &b->key);
1626
1627        return ret;
1628}
1629
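/*
 * Begin a gc pass: invalidate the existing gc marks, reset gc_done, snapshot
 * each bucket's generation in last_gc, and clear the mark and sector count
 * of every unpinned bucket so they can be recomputed.
 */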
1630static void btree_gc_start(struct cache_set *c)
1631{
1632        struct cache *ca;
1633        struct bucket *b;
1634        unsigned i;
1635
1636        if (!c->gc_mark_valid)
1637                return;
1638
1639        mutex_lock(&c->bucket_lock);
1640
1641        c->gc_mark_valid = 0;
1642        c->gc_done = ZERO_KEY;
1643
1644        for_each_cache(ca, c, i)
1645                for_each_bucket(b, ca) {
1646                        b->last_gc = b->gen;
1647                        if (!atomic_read(&b->pin)) {
1648                                SET_GC_MARK(b, 0);
1649                                SET_GC_SECTORS_USED(b, 0);
1650                        }
1651                }
1652
1653        mutex_unlock(&c->bucket_lock);
1654}
1655
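/*
 * Finish a gc pass: re-mark the metadata buckets (uuids, journal, prios),
 * mark buckets that in-flight writeback keys point to as dirty so they
 * aren't reclaimed, recompute need_gc, and return how many buckets are now
 * available (unmarked or reclaimable).
 */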
1656static size_t bch_btree_gc_finish(struct cache_set *c)
1657{
1658        size_t available = 0;
1659        struct bucket *b;
1660        struct cache *ca;
1661        unsigned i;
1662
1663        mutex_lock(&c->bucket_lock);
1664
1665        set_gc_sectors(c);
1666        c->gc_mark_valid = 1;
1667        c->need_gc      = 0;
1668
1669        for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1670                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1671                            GC_MARK_METADATA);
1672
1673        /* don't reclaim buckets to which writeback keys point */
1674        rcu_read_lock();
1675        for (i = 0; i < c->nr_uuids; i++) {
1676                struct bcache_device *d = c->devices[i];
1677                struct cached_dev *dc;
1678                struct keybuf_key *w, *n;
1679                unsigned j;
1680
1681                if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1682                        continue;
1683                dc = container_of(d, struct cached_dev, disk);
1684
1685                spin_lock(&dc->writeback_keys.lock);
1686                rbtree_postorder_for_each_entry_safe(w, n,
1687                                        &dc->writeback_keys.keys, node)
1688                        for (j = 0; j < KEY_PTRS(&w->key); j++)
1689                                SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1690                                            GC_MARK_DIRTY);
1691                spin_unlock(&dc->writeback_keys.lock);
1692        }
1693        rcu_read_unlock();
1694
1695        for_each_cache(ca, c, i) {
1696                uint64_t *i;
1697
1698                ca->invalidate_needs_gc = 0;
1699
1700                for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1701                        SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1702
1703                for (i = ca->prio_buckets;
1704                     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1705                        SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1706
1707                for_each_bucket(b, ca) {
1708                        c->need_gc      = max(c->need_gc, bucket_gc_gen(b));
1709
1710                        if (atomic_read(&b->pin))
1711                                continue;
1712
1713                        BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1714
1715                        if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1716                                available++;
1717                }
1718        }
1719
1720        mutex_unlock(&c->bucket_lock);
1721        return available;
1722}
1723
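/*
 * Run one full gc pass: walk the btree from the root, repeating until the
 * walk completes cleanly (warning on anything other than -EAGAIN), then
 * account the results, wake the allocator threads and kick off moving gc.
 */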
1724static void bch_btree_gc(struct cache_set *c)
1725{
1726        int ret;
1727        unsigned long available;
1728        struct gc_stat stats;
1729        struct closure writes;
1730        struct btree_op op;
1731        uint64_t start_time = local_clock();
1732
1733        trace_bcache_gc_start(c);
1734
1735        memset(&stats, 0, sizeof(struct gc_stat));
1736        closure_init_stack(&writes);
1737        bch_btree_op_init(&op, SHRT_MAX);
1738
1739        btree_gc_start(c);
1740
1741        do {
1742                ret = btree_root(gc_root, c, &op, &writes, &stats);
1743                closure_sync(&writes);
1744
1745                if (ret && ret != -EAGAIN)
1746                        pr_warn("gc failed!");
1747        } while (ret);
1748
1749        available = bch_btree_gc_finish(c);
1750        wake_up_allocators(c);
1751
1752        bch_time_stats_update(&c->btree_gc_time, start_time);
1753
1754        stats.key_bytes *= sizeof(uint64_t);
1755        stats.data      <<= 9;
1756        stats.in_use    = (c->nbuckets - available) * 100 / c->nbuckets;
1757        memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1758
1759        trace_bcache_gc_end(c);
1760
1761        bch_moving_gc(c);
1762}
1763
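/*
 * The gc kthread: runs a gc pass, then sleeps (freezably) until it is woken
 * again - unless some cache has set invalidate_needs_gc, in which case it
 * loops straight back around.
 */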
1764static int bch_gc_thread(void *arg)
1765{
1766        struct cache_set *c = arg;
1767        struct cache *ca;
1768        unsigned i;
1769
1770        while (1) {
1771again:
1772                bch_btree_gc(c);
1773
1774                set_current_state(TASK_INTERRUPTIBLE);
1775                if (kthread_should_stop())
1776                        break;
1777
1778                mutex_lock(&c->bucket_lock);
1779
1780                for_each_cache(ca, c, i)
1781                        if (ca->invalidate_needs_gc) {
1782                                mutex_unlock(&c->bucket_lock);
1783                                set_current_state(TASK_RUNNING);
1784                                goto again;
1785                        }
1786
1787                mutex_unlock(&c->bucket_lock);
1788
1789                try_to_freeze();
1790                schedule();
1791        }
1792
1793        return 0;
1794}
1795
1796int bch_gc_thread_start(struct cache_set *c)
1797{
1798        c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1799        if (IS_ERR(c->gc_thread))
1800                return PTR_ERR(c->gc_thread);
1801
1802        set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1803        return 0;
1804}
1805
1806/* Initial partial gc - mark everything the btree references at startup */
1807
1808static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1809{
1810        int ret = 0;
1811        struct bkey *k, *p = NULL;
1812        struct btree_iter iter;
1813
1814        for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1815                bch_initial_mark_key(b->c, b->level, k);
1816
1817        bch_initial_mark_key(b->c, b->level + 1, &b->key);
1818
1819        if (b->level) {
1820                bch_btree_iter_init(&b->keys, &iter, NULL);
1821
1822                do {
1823                        k = bch_btree_iter_next_filter(&iter, &b->keys,
1824                                                       bch_ptr_bad);
1825                        if (k)
1826                                btree_node_prefetch(b, k);
1827
1828                        if (p)
1829                                ret = btree(check_recurse, p, b, op);
1830
1831                        p = k;
1832                } while (p && !ret);
1833        }
1834
1835        return ret;
1836}
1837
1838int bch_btree_check(struct cache_set *c)
1839{
1840        struct btree_op op;
1841
1842        bch_btree_op_init(&op, SHRT_MAX);
1843
1844        return btree_root(check_recurse, c, &op);
1845}
1846
1847void bch_initial_gc_finish(struct cache_set *c)
1848{
1849        struct cache *ca;
1850        struct bucket *b;
1851        unsigned i;
1852
1853        bch_btree_gc_finish(c);
1854
1855        mutex_lock(&c->bucket_lock);
1856
1857        /*
1858         * We need to put some unused buckets directly on the prio freelist in
1859         * order to get the allocator thread started - it needs freed buckets in
1860         * order to rewrite the prios and gens, and it needs to rewrite prios
1861         * and gens in order to free buckets.
1862         *
1863         * This is only safe for buckets that have no live data in them -
1864         * there should always be some of those.
1865         */
1866        for_each_cache(ca, c, i) {
1867                for_each_bucket(b, ca) {
1868                        if (fifo_full(&ca->free[RESERVE_PRIO]))
1869                                break;
1870
1871                        if (bch_can_invalidate_bucket(ca, b) &&
1872                            !GC_MARK(b)) {
1873                                __bch_invalidate_one_bucket(ca, b);
1874                                fifo_push(&ca->free[RESERVE_PRIO],
1875                                          b - ca->buckets);
1876                        }
1877                }
1878        }
1879
1880        mutex_unlock(&c->bucket_lock);
1881}
1882
1883/* Btree insertion */
1884
1885static bool btree_insert_key(struct btree *b, struct bkey *k,
1886                             struct bkey *replace_key)
1887{
1888        unsigned status;
1889
1890        BUG_ON(bkey_cmp(k, &b->key) > 0);
1891
1892        status = bch_btree_insert_key(&b->keys, k, replace_key);
1893        if (status != BTREE_INSERT_STATUS_NO_INSERT) {
1894                bch_check_keys(&b->keys, "%u for %s", status,
1895                               replace_key ? "replace" : "insert");
1896
1897                trace_bcache_btree_insert_key(b, k, replace_key != NULL,
1898                                              status);
1899                return true;
1900        } else
1901                return false;
1902}
1903
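/*
 * How many u64s worth of keys can still be inserted into this node's
 * unwritten bset.
 */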
1904static size_t insert_u64s_remaining(struct btree *b)
1905{
1906        long ret = bch_btree_keys_u64s_remaining(&b->keys);
1907
1908        /*
1909         * Might land in the middle of an existing extent and have to split it
1910         */
1911        if (b->keys.ops->is_extents)
1912                ret -= KEY_MAX_U64S;
1913
1914        return max(ret, 0L);
1915}
1916
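/*
 * Insert as many keys from insert_keys as fit in this node. Keys entirely
 * past b->key are left for the next node; a key straddling b->key is split,
 * with only the front half inserted here. Sets op->insert_collision if
 * nothing at all was inserted, and returns true if anything was.
 */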
1917static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1918                                  struct keylist *insert_keys,
1919                                  struct bkey *replace_key)
1920{
1921        bool ret = false;
1922        int oldsize = bch_count_data(&b->keys);
1923
1924        while (!bch_keylist_empty(insert_keys)) {
1925                struct bkey *k = insert_keys->keys;
1926
1927                if (bkey_u64s(k) > insert_u64s_remaining(b))
1928                        break;
1929
1930                if (bkey_cmp(k, &b->key) <= 0) {
1931                        if (!b->level)
1932                                bkey_put(b->c, k);
1933
1934                        ret |= btree_insert_key(b, k, replace_key);
1935                        bch_keylist_pop_front(insert_keys);
1936                } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
1937                        BKEY_PADDED(key) temp;
1938                        bkey_copy(&temp.key, insert_keys->keys);
1939
1940                        bch_cut_back(&b->key, &temp.key);
1941                        bch_cut_front(&b->key, insert_keys->keys);
1942
1943                        ret |= btree_insert_key(b, &temp.key, replace_key);
1944                        break;
1945                } else {
1946                        break;
1947                }
1948        }
1949
1950        if (!ret)
1951                op->insert_collision = true;
1952
1953        BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
1954
1955        BUG_ON(bch_count_data(&b->keys) < oldsize);
1956        return ret;
1957}
1958
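/*
 * Replace b with either one compacted node or, if the keys won't comfortably
 * fit, two nodes split roughly 3/5 - 2/5 by key count; if b was the root, a
 * new root is allocated one level up. The pending insert_keys go into the new
 * node(s), the new nodes' keys are inserted into the parent (or new root),
 * and the old node is freed.
 */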
1959static int btree_split(struct btree *b, struct btree_op *op,
1960                       struct keylist *insert_keys,
1961                       struct bkey *replace_key)
1962{
1963        bool split;
1964        struct btree *n1, *n2 = NULL, *n3 = NULL;
1965        uint64_t start_time = local_clock();
1966        struct closure cl;
1967        struct keylist parent_keys;
1968
1969        closure_init_stack(&cl);
1970        bch_keylist_init(&parent_keys);
1971
1972        if (btree_check_reserve(b, op)) {
1973                if (!b->level)
1974                        return -EINTR;
1975                else
1976                        WARN(1, "insufficient reserve for split\n");
1977        }
1978
1979        n1 = btree_node_alloc_replacement(b, op);
1980        if (IS_ERR(n1))
1981                goto err;
1982
1983        split = set_blocks(btree_bset_first(n1),
1984                           block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
1985
1986        if (split) {
1987                unsigned keys = 0;
1988
1989                trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
1990
1991                n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1992                if (IS_ERR(n2))
1993                        goto err_free1;
1994
1995                if (!b->parent) {
1996                        n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
1997                        if (IS_ERR(n3))
1998                                goto err_free2;
1999                }
2000
2001                mutex_lock(&n1->write_lock);
2002                mutex_lock(&n2->write_lock);
2003
2004                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2005
2006                /*
2007                 * Has to be a linear search because we don't have an auxiliary
2008                 * search tree yet
2009                 */
2010
2011                while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2012                        keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
2013                                                        keys));
2014
2015                bkey_copy_key(&n1->key,
2016                              bset_bkey_idx(btree_bset_first(n1), keys));
2017                keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2018
2019                btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2020                btree_bset_first(n1)->keys = keys;
2021
2022                memcpy(btree_bset_first(n2)->start,
2023                       bset_bkey_last(btree_bset_first(n1)),
2024                       btree_bset_first(n2)->keys * sizeof(uint64_t));
2025
2026                bkey_copy_key(&n2->key, &b->key);
2027
2028                bch_keylist_add(&parent_keys, &n2->key);
2029                bch_btree_node_write(n2, &cl);
2030                mutex_unlock(&n2->write_lock);
2031                rw_unlock(true, n2);
2032        } else {
2033                trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2034
2035                mutex_lock(&n1->write_lock);
2036                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2037        }
2038
2039        bch_keylist_add(&parent_keys, &n1->key);
2040        bch_btree_node_write(n1, &cl);
2041        mutex_unlock(&n1->write_lock);
2042
2043        if (n3) {
2044                /* Depth increases, make a new root */
2045                mutex_lock(&n3->write_lock);
2046                bkey_copy_key(&n3->key, &MAX_KEY);
2047                bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2048                bch_btree_node_write(n3, &cl);
2049                mutex_unlock(&n3->write_lock);
2050
2051                closure_sync(&cl);
2052                bch_btree_set_root(n3);
2053                rw_unlock(true, n3);
2054        } else if (!b->parent) {
2055                /* Root filled up but didn't need to be split */
2056                closure_sync(&cl);
2057                bch_btree_set_root(n1);
2058        } else {
2059                /* Split a non-root node */
2060                closure_sync(&cl);
2061                make_btree_freeing_key(b, parent_keys.top);
2062                bch_keylist_push(&parent_keys);
2063
2064                bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2065                BUG_ON(!bch_keylist_empty(&parent_keys));
2066        }
2067
2068        btree_node_free(b);
2069        rw_unlock(true, n1);
2070
2071        bch_time_stats_update(&b->c->btree_split_time, start_time);
2072
2073        return 0;
2074err_free2:
2075        bkey_put(b->c, &n2->key);
2076        btree_node_free(n2);
2077        rw_unlock(true, n2);
2078err_free1:
2079        bkey_put(b->c, &n1->key);
2080        btree_node_free(n1);
2081        rw_unlock(true, n1);
2082err:
2083        WARN(1, "bcache: btree split failed (level %u)", b->level);
2084
2085        if (n3 == ERR_PTR(-EAGAIN) ||
2086            n2 == ERR_PTR(-EAGAIN) ||
2087            n1 == ERR_PTR(-EAGAIN))
2088                return -EAGAIN;
2089
2090        return -ENOMEM;
2091}
2092
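/*
 * Insert keys into a single node, under its write lock. If they don't all
 * fit we need to split: when current->bio_list is set we can't safely block
 * here, so return -EAGAIN; when we don't hold a big enough lock, return
 * -EINTR so the caller retries with op->lock raised; otherwise call
 * btree_split().
 */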
2093static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2094                                 struct keylist *insert_keys,
2095                                 atomic_t *journal_ref,
2096                                 struct bkey *replace_key)
2097{
2098        struct closure cl;
2099
2100        BUG_ON(b->level && replace_key);
2101
2102        closure_init_stack(&cl);
2103
2104        mutex_lock(&b->write_lock);
2105
2106        if (write_block(b) != btree_bset_last(b) &&
2107            b->keys.last_set_unwritten)
2108                bch_btree_init_next(b); /* just wrote a set */
2109
2110        if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2111                mutex_unlock(&b->write_lock);
2112                goto split;
2113        }
2114
2115        BUG_ON(write_block(b) != btree_bset_last(b));
2116
2117        if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2118                if (!b->level)
2119                        bch_btree_leaf_dirty(b, journal_ref);
2120                else
2121                        bch_btree_node_write(b, &cl);
2122        }
2123
2124        mutex_unlock(&b->write_lock);
2125
2126        /* wait for btree node write if necessary, after unlock */
2127        closure_sync(&cl);
2128
2129        return 0;
2130split:
2131        if (current->bio_list) {
2132                op->lock = b->c->root->level + 1;
2133                return -EAGAIN;
2134        } else if (op->lock <= b->c->root->level) {
2135                op->lock = b->c->root->level + 1;
2136                return -EINTR;
2137        } else {
2138                /* Invalidated all iterators */
2139                int ret = btree_split(b, op, insert_keys, replace_key);
2140
2141                if (bch_keylist_empty(insert_keys))
2142                        return 0;
2143                else if (!ret)
2144                        return -EINTR;
2145                return ret;
2146        }
2147}
2148
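/*
 * Insert a "check key" - a key with a single random pointer to
 * PTR_CHECK_DEV - presumably so that a later replace operation can detect
 * whether anything raced with us. If we only hold a read lock it is upgraded
 * to a write lock first, and we bail with -EINTR if the node changed
 * underneath us.
 */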
2149int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2150                               struct bkey *check_key)
2151{
2152        int ret = -EINTR;
2153        uint64_t btree_ptr = b->key.ptr[0];
2154        unsigned long seq = b->seq;
2155        struct keylist insert;
2156        bool upgrade = op->lock == -1;
2157
2158        bch_keylist_init(&insert);
2159
2160        if (upgrade) {
2161                rw_unlock(false, b);
2162                rw_lock(true, b, b->level);
2163
2164                if (b->key.ptr[0] != btree_ptr ||
2165                    b->seq != seq + 1)
2166                        goto out;
2167        }
2168
2169        SET_KEY_PTRS(check_key, 1);
2170        get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2171
2172        SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2173
2174        bch_keylist_add(&insert, check_key);
2175
2176        ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2177
2178        BUG_ON(!ret && !bch_keylist_empty(&insert));
2179out:
2180        if (upgrade)
2181                downgrade_write(&b->lock);
2182        return ret;
2183}
2184
2185struct btree_insert_op {
2186        struct btree_op op;
2187        struct keylist  *keys;
2188        atomic_t        *journal_ref;
2189        struct bkey     *replace_key;
2190};
2191
2192static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2193{
2194        struct btree_insert_op *op = container_of(b_op,
2195                                        struct btree_insert_op, op);
2196
2197        int ret = bch_btree_insert_node(b, &op->op, op->keys,
2198                                        op->journal_ref, op->replace_key);
2199        if (ret && !bch_keylist_empty(op->keys))
2200                return ret;
2201        else
2202                return MAP_DONE;
2203}
2204
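/*
 * Top level insert: map over the leaf nodes covering @keys and insert into
 * each until the keylist is drained. On error any remaining keys are
 * dropped; -ESRCH indicates an insert collision (a replace that didn't find
 * what it expected).
 */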
2205int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2206                     atomic_t *journal_ref, struct bkey *replace_key)
2207{
2208        struct btree_insert_op op;
2209        int ret = 0;
2210
2211        BUG_ON(current->bio_list);
2212        BUG_ON(bch_keylist_empty(keys));
2213
2214        bch_btree_op_init(&op.op, 0);
2215        op.keys         = keys;
2216        op.journal_ref  = journal_ref;
2217        op.replace_key  = replace_key;
2218
2219        while (!ret && !bch_keylist_empty(keys)) {
2220                op.op.lock = 0;
2221                ret = bch_btree_map_leaf_nodes(&op.op, c,
2222                                               &START_KEY(keys->keys),
2223                                               btree_insert_fn);
2224        }
2225
2226        if (ret) {
2227                struct bkey *k;
2228
2229                pr_err("error %i", ret);
2230
2231                while ((k = bch_keylist_pop(keys)))
2232                        bkey_put(c, k);
2233        } else if (op.op.insert_collision)
2234                ret = -ESRCH;
2235
2236        return ret;
2237}
2238
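/*
 * Make b the new root of the btree and journal the change, waiting for the
 * journal write to complete before returning.
 */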
2239void bch_btree_set_root(struct btree *b)
2240{
2241        unsigned i;
2242        struct closure cl;
2243
2244        closure_init_stack(&cl);
2245
2246        trace_bcache_btree_set_root(b);
2247
2248        BUG_ON(!b->written);
2249
2250        for (i = 0; i < KEY_PTRS(&b->key); i++)
2251                BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2252
2253        mutex_lock(&b->c->bucket_lock);
2254        list_del_init(&b->list);
2255        mutex_unlock(&b->c->bucket_lock);
2256
2257        b->c->root = b;
2258
2259        bch_journal_meta(b->c, &cl);
2260        closure_sync(&cl);
2261}
2262
2263/* Map across nodes or keys */
2264
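/*
 * Map fn across btree nodes in sorted order, starting from @from. Leaf nodes
 * are always visited; interior nodes are visited too (after their children)
 * when MAP_ALL_NODES is passed.
 */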
2265static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2266                                       struct bkey *from,
2267                                       btree_map_nodes_fn *fn, int flags)
2268{
2269        int ret = MAP_CONTINUE;
2270
2271        if (b->level) {
2272                struct bkey *k;
2273                struct btree_iter iter;
2274
2275                bch_btree_iter_init(&b->keys, &iter, from);
2276
2277                while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2278                                                       bch_ptr_bad))) {
2279                        ret = btree(map_nodes_recurse, k, b,
2280                                    op, from, fn, flags);
2281                        from = NULL;
2282
2283                        if (ret != MAP_CONTINUE)
2284                                return ret;
2285                }
2286        }
2287
2288        if (!b->level || flags == MAP_ALL_NODES)
2289                ret = fn(op, b);
2290
2291        return ret;
2292}
2293
2294int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2295                          struct bkey *from, btree_map_nodes_fn *fn, int flags)
2296{
2297        return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2298}
2299
2300static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2301                                      struct bkey *from, btree_map_keys_fn *fn,
2302                                      int flags)
2303{
2304        int ret = MAP_CONTINUE;
2305        struct bkey *k;
2306        struct btree_iter iter;
2307
2308        bch_btree_iter_init(&b->keys, &iter, from);
2309
2310        while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2311                ret = !b->level
2312                        ? fn(op, b, k)
2313                        : btree(map_keys_recurse, k, b, op, from, fn, flags);
2314                from = NULL;
2315
2316                if (ret != MAP_CONTINUE)
2317                        return ret;
2318        }
2319
2320        if (!b->level && (flags & MAP_END_KEY))
2321                ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2322                                     KEY_OFFSET(&b->key), 0));
2323
2324        return ret;
2325}
2326
2327int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2328                       struct bkey *from, btree_map_keys_fn *fn, int flags)
2329{
2330        return btree_root(map_keys_recurse, c, op, from, fn, flags);
2331}
2332
2333/* Keybuf code */
2334
2335static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2336{
2337        /* Overlapping keys compare equal */
2338        if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2339                return -1;
2340        if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2341                return 1;
2342        return 0;
2343}
2344
2345static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2346                                            struct keybuf_key *r)
2347{
2348        return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2349}
2350
2351struct refill {
2352        struct btree_op op;
2353        unsigned        nr_found;
2354        struct keybuf   *buf;
2355        struct bkey     *end;
2356        keybuf_pred_fn  *pred;
2357};
2358
2359static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2360                            struct bkey *k)
2361{
2362        struct refill *refill = container_of(op, struct refill, op);
2363        struct keybuf *buf = refill->buf;
2364        int ret = MAP_CONTINUE;
2365
2366        if (bkey_cmp(k, refill->end) >= 0) {
2367                ret = MAP_DONE;
2368                goto out;
2369        }
2370
2371        if (!KEY_SIZE(k)) /* end key */
2372                goto out;
2373
2374        if (refill->pred(buf, k)) {
2375                struct keybuf_key *w;
2376
2377                spin_lock(&buf->lock);
2378
2379                w = array_alloc(&buf->freelist);
2380                if (!w) {
2381                        spin_unlock(&buf->lock);
2382                        return MAP_DONE;
2383                }
2384
2385                w->private = NULL;
2386                bkey_copy(&w->key, k);
2387
2388                if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2389                        array_free(&buf->freelist, w);
2390                else
2391                        refill->nr_found++;
2392
2393                if (array_freelist_empty(&buf->freelist))
2394                        ret = MAP_DONE;
2395
2396                spin_unlock(&buf->lock);
2397        }
2398out:
2399        buf->last_scanned = *k;
2400        return ret;
2401}
2402
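/*
 * Scan the btree from buf->last_scanned towards @end, adding keys that @pred
 * accepts to the keybuf until it fills up or @end is reached, then record
 * the range of keys currently held in buf->start and buf->end.
 */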
2403void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2404                       struct bkey *end, keybuf_pred_fn *pred)
2405{
2406        struct bkey start = buf->last_scanned;
2407        struct refill refill;
2408
2409        cond_resched();
2410
2411        bch_btree_op_init(&refill.op, -1);
2412        refill.nr_found = 0;
2413        refill.buf      = buf;
2414        refill.end      = end;
2415        refill.pred     = pred;
2416
2417        bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2418                           refill_keybuf_fn, MAP_END_KEY);
2419
2420        trace_bcache_keyscan(refill.nr_found,
2421                             KEY_INODE(&start), KEY_OFFSET(&start),
2422                             KEY_INODE(&buf->last_scanned),
2423                             KEY_OFFSET(&buf->last_scanned));
2424
2425        spin_lock(&buf->lock);
2426
2427        if (!RB_EMPTY_ROOT(&buf->keys)) {
2428                struct keybuf_key *w;
2429                w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2430                buf->start      = START_KEY(&w->key);
2431
2432                w = RB_LAST(&buf->keys, struct keybuf_key, node);
2433                buf->end        = w->key;
2434        } else {
2435                buf->start      = MAX_KEY;
2436                buf->end        = MAX_KEY;
2437        }
2438
2439        spin_unlock(&buf->lock);
2440}
2441
2442static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2443{
2444        rb_erase(&w->node, &buf->keys);
2445        array_free(&buf->freelist, w);
2446}
2447
2448void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2449{
2450        spin_lock(&buf->lock);
2451        __bch_keybuf_del(buf, w);
2452        spin_unlock(&buf->lock);
2453}
2454
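/*
 * Drop any keybuf entries overlapping [start, end) that aren't currently in
 * use; returns true if an overlapping entry was in use (w->private set) and
 * therefore couldn't be removed.
 */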
2455bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2456                                  struct bkey *end)
2457{
2458        bool ret = false;
2459        struct keybuf_key *p, *w, s;
2460        s.key = *start;
2461
2462        if (bkey_cmp(end, &buf->start) <= 0 ||
2463            bkey_cmp(start, &buf->end) >= 0)
2464                return false;
2465
2466        spin_lock(&buf->lock);
2467        w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2468
2469        while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2470                p = w;
2471                w = RB_NEXT(w, node);
2472
2473                if (p->private)
2474                        ret = true;
2475                else
2476                        __bch_keybuf_del(buf, p);
2477        }
2478
2479        spin_unlock(&buf->lock);
2480        return ret;
2481}
2482
2483struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2484{
2485        struct keybuf_key *w;
2486        spin_lock(&buf->lock);
2487
2488        w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2489
2490        while (w && w->private)
2491                w = RB_NEXT(w, node);
2492
2493        if (w)
2494                w->private = ERR_PTR(-EINTR);
2495
2496        spin_unlock(&buf->lock);
2497        return w;
2498}
2499
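/*
 * Return the next unused key in the keybuf, refilling it from the btree as
 * needed, until last_scanned passes @end.
 */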
2500struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2501                                          struct keybuf *buf,
2502                                          struct bkey *end,
2503                                          keybuf_pred_fn *pred)
2504{
2505        struct keybuf_key *ret;
2506
2507        while (1) {
2508                ret = bch_keybuf_next(buf);
2509                if (ret)
2510                        break;
2511
2512                if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2513                        pr_debug("scan finished");
2514                        break;
2515                }
2516
2517                bch_refill_keybuf(c, buf, end, pred);
2518        }
2519
2520        return ret;
2521}
2522
2523void bch_keybuf_init(struct keybuf *buf)
2524{
2525        buf->last_scanned       = MAX_KEY;
2526        buf->keys               = RB_ROOT;
2527
2528        spin_lock_init(&buf->lock);
2529        array_allocator_init(&buf->freelist);
2530}
2531