linux/drivers/md/bcache/btree.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
   4 *
   5 * Uses a block device as cache for other block devices; optimized for SSDs.
   6 * All allocation is done in buckets, which should match the erase block size
   7 * of the device.
   8 *
   9 * Buckets containing cached data are kept on a heap sorted by priority;
  10 * bucket priority is increased on cache hit, and periodically all the buckets
  11 * on the heap have their priority scaled down. This currently is just used as
  12 * an LRU but in the future should allow for more intelligent heuristics.
  13 *
  14 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
  15 * counter. Garbage collection is used to remove stale pointers.
  16 *
  17 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
  18 * as keys are inserted we only sort the pages that have not yet been written.
  19 * When garbage collection is run, we resort the entire node.
  20 *
  21 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
  22 */
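/*
 * Illustrative sketch of the generation scheme described above (toy
 * code, kept out of the build; assumes <stdint.h>/<stdbool.h>). A
 * pointer snapshots the bucket's 8 bit generation when it is created;
 * freeing the bucket bumps the generation, which implicitly invalidates
 * every pointer still carrying an older value, and garbage collection
 * later drops those stale pointers. The real driver compares
 * generations with wraparound; see gen_after()/ptr_stale() used in the
 * GC code below.
 */
#if 0
struct toy_bucket { uint8_t gen; };
struct toy_ptr    { size_t bucket; uint8_t gen; };

static bool toy_ptr_stale(const struct toy_bucket *buckets,
			  const struct toy_ptr *p)
{
	/* stale once the bucket's generation has moved on from the pointer's */
	return buckets[p->bucket].gen != p->gen;
}
#endif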
  23
  24#include "bcache.h"
  25#include "btree.h"
  26#include "debug.h"
  27#include "extents.h"
  28
  29#include <linux/slab.h>
  30#include <linux/bitops.h>
  31#include <linux/hash.h>
  32#include <linux/kthread.h>
  33#include <linux/prefetch.h>
  34#include <linux/random.h>
  35#include <linux/rcupdate.h>
  36#include <linux/sched/clock.h>
  37#include <linux/rculist.h>
  38#include <linux/delay.h>
  39#include <trace/events/bcache.h>
  40
  41/*
  42 * Todo:
  43 * register_bcache: Return errors out to userspace correctly
  44 *
  45 * Writeback: don't undirty key until after a cache flush
  46 *
  47 * Create an iterator for key pointers
  48 *
  49 * On btree write error, mark bucket such that it won't be freed from the cache
  50 *
  51 * Journalling:
  52 *   Check for bad keys in replay
  53 *   Propagate barriers
  54 *   Refcount journal entries in journal_replay
  55 *
  56 * Garbage collection:
  57 *   Finish incremental gc
  58 *   Gc should free old UUIDs, data for invalid UUIDs
  59 *
  60 * Provide a way to list backing device UUIDs we have data cached for, and
  61 * probably how long it's been since we've seen them, and a way to invalidate
  62 * dirty data for devices that will never be attached again
  63 *
  64 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
  65 * that based on that and how much dirty data we have we can keep writeback
  66 * from being starved
  67 *
  68 * Add a tracepoint or somesuch to watch for writeback starvation
  69 *
  70 * When btree depth > 1 and splitting an interior node, we have to make sure
  71 * alloc_bucket() cannot fail. This should be true but is not completely
  72 * obvious.
  73 *
  74 * Plugging?
  75 *
  76 * If data write is less than hard sector size of ssd, round up offset in open
  77 * bucket to the next whole sector
  78 *
  79 * Superblock needs to be fleshed out for multiple cache devices
  80 *
  81 * Add a sysfs tunable for the number of writeback IOs in flight
  82 *
  83 * Add a sysfs tunable for the number of open data buckets
  84 *
  85 * IO tracking: Can we track when one process is doing io on behalf of another?
  86 * IO tracking: Don't use just an average, weigh more recent stuff higher
  87 *
  88 * Test module load/unload
  89 */
  90
  91#define MAX_NEED_GC             64
  92#define MAX_SAVE_PRIO           72
  93#define MAX_GC_TIMES            100
  94#define MIN_GC_NODES            100
  95#define GC_SLEEP_MS             100
  96
  97#define PTR_DIRTY_BIT           (((uint64_t) 1 << 36))
  98
  99#define PTR_HASH(c, k)                                                  \
 100        (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
 101
 102#define insert_lock(s, b)       ((b)->level <= (s)->lock)
 103
 104
 105static inline struct bset *write_block(struct btree *b)
 106{
 107        return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
 108}
 109
 110static void bch_btree_init_next(struct btree *b)
 111{
 112        /* If not a leaf node, always sort */
 113        if (b->level && b->keys.nsets)
 114                bch_btree_sort(&b->keys, &b->c->sort);
 115        else
 116                bch_btree_sort_lazy(&b->keys, &b->c->sort);
 117
 118        if (b->written < btree_blocks(b))
 119                bch_bset_init_next(&b->keys, write_block(b),
 120                                   bset_magic(&b->c->sb));
 121
 122}
 123
 124/* Btree key manipulation */
 125
 126void bkey_put(struct cache_set *c, struct bkey *k)
 127{
 128        unsigned int i;
 129
 130        for (i = 0; i < KEY_PTRS(k); i++)
 131                if (ptr_available(c, k, i))
 132                        atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
 133}
 134
 135/* Btree IO */
 136
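/*
 * Checksum a bset: the crc64 is seeded with the node's bucket pointer
 * and computed over everything after the first 8 bytes (the csum field
 * itself), up to the last key in the set.
 */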
 137static uint64_t btree_csum_set(struct btree *b, struct bset *i)
 138{
 139        uint64_t crc = b->key.ptr[0];
 140        void *data = (void *) i + 8, *end = bset_bkey_last(i);
 141
 142        crc = bch_crc64_update(crc, data, end - data);
 143        return crc ^ 0xffffffffffffffffULL;
 144}
 145
 146void bch_btree_node_read_done(struct btree *b)
 147{
 148        const char *err = "bad btree header";
 149        struct bset *i = btree_bset_first(b);
 150        struct btree_iter *iter;
 151
 152        /*
 153         * c->fill_iter can allocate an iterator with more memory space
 154         * than static MAX_BSETS.
  155         * See the comment around cache_set->fill_iter.
 156         */
 157        iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO);
 158        iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
 159        iter->used = 0;
 160
 161#ifdef CONFIG_BCACHE_DEBUG
 162        iter->b = &b->keys;
 163#endif
 164
 165        if (!i->seq)
 166                goto err;
 167
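        /*
         * Walk the bsets that were written back to back into this node's
         * bucket: they all carry the seq of the first bset, so a block
         * with a different seq (or running past the end of the node)
         * ends the walk. Each valid bset is pushed onto the sort
         * iterator and merged into a single in-memory set further down.
         */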
 168        for (;
 169             b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
 170             i = write_block(b)) {
 171                err = "unsupported bset version";
 172                if (i->version > BCACHE_BSET_VERSION)
 173                        goto err;
 174
 175                err = "bad btree header";
 176                if (b->written + set_blocks(i, block_bytes(b->c)) >
 177                    btree_blocks(b))
 178                        goto err;
 179
 180                err = "bad magic";
 181                if (i->magic != bset_magic(&b->c->sb))
 182                        goto err;
 183
 184                err = "bad checksum";
 185                switch (i->version) {
 186                case 0:
 187                        if (i->csum != csum_set(i))
 188                                goto err;
 189                        break;
 190                case BCACHE_BSET_VERSION:
 191                        if (i->csum != btree_csum_set(b, i))
 192                                goto err;
 193                        break;
 194                }
 195
 196                err = "empty set";
 197                if (i != b->keys.set[0].data && !i->keys)
 198                        goto err;
 199
 200                bch_btree_iter_push(iter, i->start, bset_bkey_last(i));
 201
 202                b->written += set_blocks(i, block_bytes(b->c));
 203        }
 204
 205        err = "corrupted btree";
 206        for (i = write_block(b);
 207             bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
 208             i = ((void *) i) + block_bytes(b->c))
 209                if (i->seq == b->keys.set[0].data->seq)
 210                        goto err;
 211
 212        bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);
 213
 214        i = b->keys.set[0].data;
 215        err = "short btree key";
 216        if (b->keys.set[0].size &&
 217            bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
 218                goto err;
 219
 220        if (b->written < btree_blocks(b))
 221                bch_bset_init_next(&b->keys, write_block(b),
 222                                   bset_magic(&b->c->sb));
 223out:
 224        mempool_free(iter, &b->c->fill_iter);
 225        return;
 226err:
 227        set_btree_node_io_error(b);
 228        bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
 229                            err, PTR_BUCKET_NR(b->c, &b->key, 0),
 230                            bset_block_offset(b, i), i->keys);
 231        goto out;
 232}
 233
 234static void btree_node_read_endio(struct bio *bio)
 235{
 236        struct closure *cl = bio->bi_private;
 237
 238        closure_put(cl);
 239}
 240
 241static void bch_btree_node_read(struct btree *b)
 242{
 243        uint64_t start_time = local_clock();
 244        struct closure cl;
 245        struct bio *bio;
 246
 247        trace_bcache_btree_read(b);
 248
 249        closure_init_stack(&cl);
 250
 251        bio = bch_bbio_alloc(b->c);
 252        bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
 253        bio->bi_end_io  = btree_node_read_endio;
 254        bio->bi_private = &cl;
 255        bio->bi_opf = REQ_OP_READ | REQ_META;
 256
 257        bch_bio_map(bio, b->keys.set[0].data);
 258
 259        bch_submit_bbio(bio, b->c, &b->key, 0);
 260        closure_sync(&cl);
 261
 262        if (bio->bi_status)
 263                set_btree_node_io_error(b);
 264
 265        bch_bbio_free(bio, b->c);
 266
 267        if (btree_node_io_error(b))
 268                goto err;
 269
 270        bch_btree_node_read_done(b);
 271        bch_time_stats_update(&b->c->btree_read_time, start_time);
 272
 273        return;
 274err:
 275        bch_cache_set_error(b->c, "io error reading bucket %zu",
 276                            PTR_BUCKET_NR(b->c, &b->key, 0));
 277}
 278
 279static void btree_complete_write(struct btree *b, struct btree_write *w)
 280{
 281        if (w->prio_blocked &&
 282            !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
 283                wake_up_allocators(b->c);
 284
 285        if (w->journal) {
 286                atomic_dec_bug(w->journal);
 287                __closure_wake_up(&b->c->journal.wait);
 288        }
 289
 290        w->prio_blocked = 0;
 291        w->journal      = NULL;
 292}
 293
 294static void btree_node_write_unlock(struct closure *cl)
 295{
 296        struct btree *b = container_of(cl, struct btree, io);
 297
 298        up(&b->io_mutex);
 299}
 300
 301static void __btree_node_write_done(struct closure *cl)
 302{
 303        struct btree *b = container_of(cl, struct btree, io);
 304        struct btree_write *w = btree_prev_write(b);
 305
 306        bch_bbio_free(b->bio, b->c);
 307        b->bio = NULL;
 308        btree_complete_write(b, w);
 309
 310        if (btree_node_dirty(b))
 311                schedule_delayed_work(&b->work, 30 * HZ);
 312
 313        closure_return_with_destructor(cl, btree_node_write_unlock);
 314}
 315
 316static void btree_node_write_done(struct closure *cl)
 317{
 318        struct btree *b = container_of(cl, struct btree, io);
 319
 320        bio_free_pages(b->bio);
 321        __btree_node_write_done(cl);
 322}
 323
 324static void btree_node_write_endio(struct bio *bio)
 325{
 326        struct closure *cl = bio->bi_private;
 327        struct btree *b = container_of(cl, struct btree, io);
 328
 329        if (bio->bi_status)
 330                set_btree_node_io_error(b);
 331
 332        bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree");
 333        closure_put(cl);
 334}
 335
 336static void do_btree_node_write(struct btree *b)
 337{
 338        struct closure *cl = &b->io;
 339        struct bset *i = btree_bset_last(b);
 340        BKEY_PADDED(key) k;
 341
 342        i->version      = BCACHE_BSET_VERSION;
 343        i->csum         = btree_csum_set(b, i);
 344
 345        BUG_ON(b->bio);
 346        b->bio = bch_bbio_alloc(b->c);
 347
 348        b->bio->bi_end_io       = btree_node_write_endio;
 349        b->bio->bi_private      = cl;
 350        b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c));
 351        b->bio->bi_opf          = REQ_OP_WRITE | REQ_META | REQ_FUA;
 352        bch_bio_map(b->bio, i);
 353
 354        /*
 355         * If we're appending to a leaf node, we don't technically need FUA -
 356         * this write just needs to be persisted before the next journal write,
 357         * which will be marked FLUSH|FUA.
 358         *
 359         * Similarly if we're writing a new btree root - the pointer is going to
 360         * be in the next journal entry.
 361         *
 362         * But if we're writing a new btree node (that isn't a root) or
 363         * appending to a non leaf btree node, we need either FUA or a flush
 364         * when we write the parent with the new pointer. FUA is cheaper than a
 365         * flush, and writes appending to leaf nodes aren't blocking anything so
 366         * just make all btree node writes FUA to keep things sane.
 367         */
 368
 369        bkey_copy(&k.key, &b->key);
 370        SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
 371                       bset_sector_offset(&b->keys, i));
 372
 373        if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
 374                struct bio_vec *bv;
 375                void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
 376                struct bvec_iter_all iter_all;
 377
 378                bio_for_each_segment_all(bv, b->bio, iter_all) {
 379                        memcpy(page_address(bv->bv_page), addr, PAGE_SIZE);
 380                        addr += PAGE_SIZE;
 381                }
 382
 383                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 384
 385                continue_at(cl, btree_node_write_done, NULL);
 386        } else {
 387                /*
 388                 * No problem for multipage bvec since the bio is
 389                 * just allocated
 390                 */
 391                b->bio->bi_vcnt = 0;
 392                bch_bio_map(b->bio, i);
 393
 394                bch_submit_bbio(b->bio, b->c, &k.key, 0);
 395
 396                closure_sync(cl);
 397                continue_at_nobarrier(cl, __btree_node_write_done, NULL);
 398        }
 399}
 400
 401void __bch_btree_node_write(struct btree *b, struct closure *parent)
 402{
 403        struct bset *i = btree_bset_last(b);
 404
 405        lockdep_assert_held(&b->write_lock);
 406
 407        trace_bcache_btree_write(b);
 408
 409        BUG_ON(current->bio_list);
 410        BUG_ON(b->written >= btree_blocks(b));
 411        BUG_ON(b->written && !i->keys);
 412        BUG_ON(btree_bset_first(b)->seq != i->seq);
 413        bch_check_keys(&b->keys, "writing");
 414
 415        cancel_delayed_work(&b->work);
 416
 417        /* If caller isn't waiting for write, parent refcount is cache set */
 418        down(&b->io_mutex);
 419        closure_init(&b->io, parent ?: &b->c->cl);
 420
 421        clear_bit(BTREE_NODE_dirty,      &b->flags);
 422        change_bit(BTREE_NODE_write_idx, &b->flags);
 423
 424        do_btree_node_write(b);
 425
 426        atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
 427                        &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
 428
 429        b->written += set_blocks(i, block_bytes(b->c));
 430}
 431
 432void bch_btree_node_write(struct btree *b, struct closure *parent)
 433{
 434        unsigned int nsets = b->keys.nsets;
 435
 436        lockdep_assert_held(&b->lock);
 437
 438        __bch_btree_node_write(b, parent);
 439
 440        /*
 441         * do verify if there was more than one set initially (i.e. we did a
 442         * sort) and we sorted down to a single set:
 443         */
 444        if (nsets && !b->keys.nsets)
 445                bch_btree_verify(b);
 446
 447        bch_btree_init_next(b);
 448}
 449
 450static void bch_btree_node_write_sync(struct btree *b)
 451{
 452        struct closure cl;
 453
 454        closure_init_stack(&cl);
 455
 456        mutex_lock(&b->write_lock);
 457        bch_btree_node_write(b, &cl);
 458        mutex_unlock(&b->write_lock);
 459
 460        closure_sync(&cl);
 461}
 462
 463static void btree_node_write_work(struct work_struct *w)
 464{
 465        struct btree *b = container_of(to_delayed_work(w), struct btree, work);
 466
 467        mutex_lock(&b->write_lock);
 468        if (btree_node_dirty(b))
 469                __bch_btree_node_write(b, NULL);
 470        mutex_unlock(&b->write_lock);
 471}
 472
 473static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
 474{
 475        struct bset *i = btree_bset_last(b);
 476        struct btree_write *w = btree_current_write(b);
 477
 478        lockdep_assert_held(&b->write_lock);
 479
 480        BUG_ON(!b->written);
 481        BUG_ON(!i->keys);
 482
 483        if (!btree_node_dirty(b))
 484                schedule_delayed_work(&b->work, 30 * HZ);
 485
 486        set_btree_node_dirty(b);
 487
 488        /*
 489         * w->journal is always the oldest journal pin of all bkeys
 490         * in the leaf node, to make sure the oldest jset seq won't
 491         * be increased before this btree node is flushed.
 492         */
 493        if (journal_ref) {
 494                if (w->journal &&
 495                    journal_pin_cmp(b->c, w->journal, journal_ref)) {
 496                        atomic_dec_bug(w->journal);
 497                        w->journal = NULL;
 498                }
 499
 500                if (!w->journal) {
 501                        w->journal = journal_ref;
 502                        atomic_inc(w->journal);
 503                }
 504        }
 505
 506        /* Force write if set is too big */
 507        if (set_bytes(i) > PAGE_SIZE - 48 &&
 508            !current->bio_list)
 509                bch_btree_node_write(b, NULL);
 510}
 511
 512/*
 513 * Btree in memory cache - allocation/freeing
 514 * mca -> memory cache
 515 */
 516
 517#define mca_reserve(c)  (((c->root && c->root->level)           \
 518                          ? c->root->level : 1) * 8 + 16)
 519#define mca_can_free(c)                                         \
 520        max_t(int, 0, c->btree_cache_used - mca_reserve(c))
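/*
 * Worked example: with a root at level 2 the reserve is 2 * 8 + 16 = 32
 * nodes; if 100 nodes are currently cached, mca_can_free() reports 68.
 * With no root yet (or a single-node tree) the reserve is 1 * 8 + 16 = 24.
 */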
 521
 522static void mca_data_free(struct btree *b)
 523{
 524        BUG_ON(b->io_mutex.count != 1);
 525
 526        bch_btree_keys_free(&b->keys);
 527
 528        b->c->btree_cache_used--;
 529        list_move(&b->list, &b->c->btree_cache_freed);
 530}
 531
 532static void mca_bucket_free(struct btree *b)
 533{
 534        BUG_ON(btree_node_dirty(b));
 535
 536        b->key.ptr[0] = 0;
 537        hlist_del_init_rcu(&b->hash);
 538        list_move(&b->list, &b->c->btree_cache_freeable);
 539}
 540
 541static unsigned int btree_order(struct bkey *k)
 542{
 543        return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
 544}
 545
 546static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
 547{
 548        if (!bch_btree_keys_alloc(&b->keys,
 549                                  max_t(unsigned int,
 550                                        ilog2(b->c->btree_pages),
 551                                        btree_order(k)),
 552                                  gfp)) {
 553                b->c->btree_cache_used++;
 554                list_move(&b->list, &b->c->btree_cache);
 555        } else {
 556                list_move(&b->list, &b->c->btree_cache_freed);
 557        }
 558}
 559
 560static struct btree *mca_bucket_alloc(struct cache_set *c,
 561                                      struct bkey *k, gfp_t gfp)
 562{
 563        /*
 564         * kzalloc() is necessary here for initialization,
 565         * see code comments in bch_btree_keys_init().
 566         */
 567        struct btree *b = kzalloc(sizeof(struct btree), gfp);
 568
 569        if (!b)
 570                return NULL;
 571
 572        init_rwsem(&b->lock);
 573        lockdep_set_novalidate_class(&b->lock);
 574        mutex_init(&b->write_lock);
 575        lockdep_set_novalidate_class(&b->write_lock);
 576        INIT_LIST_HEAD(&b->list);
 577        INIT_DELAYED_WORK(&b->work, btree_node_write_work);
 578        b->c = c;
 579        sema_init(&b->io_mutex, 1);
 580
 581        mca_data_alloc(b, k, gfp);
 582        return b;
 583}
 584
 585static int mca_reap(struct btree *b, unsigned int min_order, bool flush)
 586{
 587        struct closure cl;
 588
 589        closure_init_stack(&cl);
 590        lockdep_assert_held(&b->c->bucket_lock);
 591
 592        if (!down_write_trylock(&b->lock))
 593                return -ENOMEM;
 594
 595        BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);
 596
 597        if (b->keys.page_order < min_order)
 598                goto out_unlock;
 599
 600        if (!flush) {
 601                if (btree_node_dirty(b))
 602                        goto out_unlock;
 603
 604                if (down_trylock(&b->io_mutex))
 605                        goto out_unlock;
 606                up(&b->io_mutex);
 607        }
 608
 609retry:
 610        /*
  611         * BTREE_NODE_dirty might be cleared in btree_flush_write() by
 612         * __bch_btree_node_write(). To avoid an extra flush, acquire
 613         * b->write_lock before checking BTREE_NODE_dirty bit.
 614         */
 615        mutex_lock(&b->write_lock);
 616        /*
 617         * If this btree node is selected in btree_flush_write() by journal
 618         * code, delay and retry until the node is flushed by journal code
 619         * and BTREE_NODE_journal_flush bit cleared by btree_flush_write().
 620         */
 621        if (btree_node_journal_flush(b)) {
 622                pr_debug("bnode %p is flushing by journal, retry\n", b);
 623                mutex_unlock(&b->write_lock);
 624                udelay(1);
 625                goto retry;
 626        }
 627
 628        if (btree_node_dirty(b))
 629                __bch_btree_node_write(b, &cl);
 630        mutex_unlock(&b->write_lock);
 631
 632        closure_sync(&cl);
 633
 634        /* wait for any in flight btree write */
 635        down(&b->io_mutex);
 636        up(&b->io_mutex);
 637
 638        return 0;
 639out_unlock:
 640        rw_unlock(true, b);
 641        return -ENOMEM;
 642}
 643
 644static unsigned long bch_mca_scan(struct shrinker *shrink,
 645                                  struct shrink_control *sc)
 646{
 647        struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 648        struct btree *b, *t;
 649        unsigned long i, nr = sc->nr_to_scan;
 650        unsigned long freed = 0;
 651        unsigned int btree_cache_used;
 652
 653        if (c->shrinker_disabled)
 654                return SHRINK_STOP;
 655
 656        if (c->btree_cache_alloc_lock)
 657                return SHRINK_STOP;
 658
 659        /* Return -1 if we can't do anything right now */
 660        if (sc->gfp_mask & __GFP_IO)
 661                mutex_lock(&c->bucket_lock);
 662        else if (!mutex_trylock(&c->bucket_lock))
 663                return -1;
 664
 665        /*
 666         * It's _really_ critical that we don't free too many btree nodes - we
 667         * have to always leave ourselves a reserve. The reserve is how we
 668         * guarantee that allocating memory for a new btree node can always
 669         * succeed, so that inserting keys into the btree can always succeed and
 670         * IO can always make forward progress:
 671         */
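        /*
         * The shrinker counts in pages while we free whole btree nodes,
         * so convert: e.g. if btree_pages were 64, a request to scan
         * 1000 objects becomes 15 nodes, clamped below by 1 and above by
         * mca_can_free(); the return value scales back up by btree_pages.
         */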
 672        nr /= c->btree_pages;
 673        if (nr == 0)
 674                nr = 1;
 675        nr = min_t(unsigned long, nr, mca_can_free(c));
 676
 677        i = 0;
 678        btree_cache_used = c->btree_cache_used;
 679        list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) {
 680                if (nr <= 0)
 681                        goto out;
 682
 683                if (!mca_reap(b, 0, false)) {
 684                        mca_data_free(b);
 685                        rw_unlock(true, b);
 686                        freed++;
 687                }
 688                nr--;
 689                i++;
 690        }
 691
 692        list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) {
 693                if (nr <= 0 || i >= btree_cache_used)
 694                        goto out;
 695
 696                if (!mca_reap(b, 0, false)) {
 697                        mca_bucket_free(b);
 698                        mca_data_free(b);
 699                        rw_unlock(true, b);
 700                        freed++;
 701                }
 702
 703                nr--;
 704                i++;
 705        }
 706out:
 707        mutex_unlock(&c->bucket_lock);
 708        return freed * c->btree_pages;
 709}
 710
 711static unsigned long bch_mca_count(struct shrinker *shrink,
 712                                   struct shrink_control *sc)
 713{
 714        struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 715
 716        if (c->shrinker_disabled)
 717                return 0;
 718
 719        if (c->btree_cache_alloc_lock)
 720                return 0;
 721
 722        return mca_can_free(c) * c->btree_pages;
 723}
 724
 725void bch_btree_cache_free(struct cache_set *c)
 726{
 727        struct btree *b;
 728        struct closure cl;
 729
 730        closure_init_stack(&cl);
 731
 732        if (c->shrink.list.next)
 733                unregister_shrinker(&c->shrink);
 734
 735        mutex_lock(&c->bucket_lock);
 736
 737#ifdef CONFIG_BCACHE_DEBUG
 738        if (c->verify_data)
 739                list_move(&c->verify_data->list, &c->btree_cache);
 740
 741        free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->sb)));
 742#endif
 743
 744        list_splice(&c->btree_cache_freeable,
 745                    &c->btree_cache);
 746
 747        while (!list_empty(&c->btree_cache)) {
 748                b = list_first_entry(&c->btree_cache, struct btree, list);
 749
 750                /*
  751                 * This function is called by cache_set_free(); there is no
  752                 * I/O on the cache at this point, so it is not necessary to
  753                 * acquire b->write_lock before clearing BTREE_NODE_dirty.
 754                 */
 755                if (btree_node_dirty(b)) {
 756                        btree_complete_write(b, btree_current_write(b));
 757                        clear_bit(BTREE_NODE_dirty, &b->flags);
 758                }
 759                mca_data_free(b);
 760        }
 761
 762        while (!list_empty(&c->btree_cache_freed)) {
 763                b = list_first_entry(&c->btree_cache_freed,
 764                                     struct btree, list);
 765                list_del(&b->list);
 766                cancel_delayed_work_sync(&b->work);
 767                kfree(b);
 768        }
 769
 770        mutex_unlock(&c->bucket_lock);
 771}
 772
 773int bch_btree_cache_alloc(struct cache_set *c)
 774{
 775        unsigned int i;
 776
 777        for (i = 0; i < mca_reserve(c); i++)
 778                if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
 779                        return -ENOMEM;
 780
 781        list_splice_init(&c->btree_cache,
 782                         &c->btree_cache_freeable);
 783
 784#ifdef CONFIG_BCACHE_DEBUG
 785        mutex_init(&c->verify_lock);
 786
 787        c->verify_ondisk = (void *)
 788                __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->sb)));
 789        if (!c->verify_ondisk) {
 790                /*
  791                 * Don't worry about the mca_reserve() buckets
  792                 * allocated in the previous for-loop; they will be
 793                 * handled properly in bch_cache_set_unregister().
 794                 */
 795                return -ENOMEM;
 796        }
 797
 798        c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
 799
 800        if (c->verify_data &&
 801            c->verify_data->keys.set->data)
 802                list_del_init(&c->verify_data->list);
 803        else
 804                c->verify_data = NULL;
 805#endif
 806
 807        c->shrink.count_objects = bch_mca_count;
 808        c->shrink.scan_objects = bch_mca_scan;
 809        c->shrink.seeks = 4;
 810        c->shrink.batch = c->btree_pages * 2;
 811
 812        if (register_shrinker(&c->shrink))
 813                pr_warn("bcache: %s: could not register shrinker\n",
 814                                __func__);
 815
 816        return 0;
 817}
 818
 819/* Btree in memory cache - hash table */
 820
 821static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
 822{
 823        return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
 824}
 825
 826static struct btree *mca_find(struct cache_set *c, struct bkey *k)
 827{
 828        struct btree *b;
 829
 830        rcu_read_lock();
 831        hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
 832                if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
 833                        goto out;
 834        b = NULL;
 835out:
 836        rcu_read_unlock();
 837        return b;
 838}
 839
 840static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
 841{
 842        spin_lock(&c->btree_cannibalize_lock);
 843        if (likely(c->btree_cache_alloc_lock == NULL)) {
 844                c->btree_cache_alloc_lock = current;
 845        } else if (c->btree_cache_alloc_lock != current) {
 846                if (op)
 847                        prepare_to_wait(&c->btree_cache_wait, &op->wait,
 848                                        TASK_UNINTERRUPTIBLE);
 849                spin_unlock(&c->btree_cannibalize_lock);
 850                return -EINTR;
 851        }
 852        spin_unlock(&c->btree_cannibalize_lock);
 853
 854        return 0;
 855}
 856
 857static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
 858                                     struct bkey *k)
 859{
 860        struct btree *b;
 861
 862        trace_bcache_btree_cache_cannibalize(c);
 863
 864        if (mca_cannibalize_lock(c, op))
 865                return ERR_PTR(-EINTR);
 866
 867        list_for_each_entry_reverse(b, &c->btree_cache, list)
 868                if (!mca_reap(b, btree_order(k), false))
 869                        return b;
 870
 871        list_for_each_entry_reverse(b, &c->btree_cache, list)
 872                if (!mca_reap(b, btree_order(k), true))
 873                        return b;
 874
 875        WARN(1, "btree cache cannibalize failed\n");
 876        return ERR_PTR(-ENOMEM);
 877}
 878
 879/*
 880 * We can only have one thread cannibalizing other cached btree nodes at a time,
  881 * or we'll deadlock. We use an open-coded mutex to ensure that, which
  882 * mca_cannibalize_lock() takes. This means every time we unlock the root of
 883 * the btree, we need to release this lock if we have it held.
 884 */
 885static void bch_cannibalize_unlock(struct cache_set *c)
 886{
 887        spin_lock(&c->btree_cannibalize_lock);
 888        if (c->btree_cache_alloc_lock == current) {
 889                c->btree_cache_alloc_lock = NULL;
 890                wake_up(&c->btree_cache_wait);
 891        }
 892        spin_unlock(&c->btree_cannibalize_lock);
 893}
 894
 895static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
 896                               struct bkey *k, int level)
 897{
 898        struct btree *b;
 899
 900        BUG_ON(current->bio_list);
 901
 902        lockdep_assert_held(&c->bucket_lock);
 903
 904        if (mca_find(c, k))
 905                return NULL;
 906
 907        /* btree_free() doesn't free memory; it sticks the node on the end of
 908         * the list. Check if there's any freed nodes there:
 909         */
 910        list_for_each_entry(b, &c->btree_cache_freeable, list)
 911                if (!mca_reap(b, btree_order(k), false))
 912                        goto out;
 913
 914        /* We never free struct btree itself, just the memory that holds the on
 915         * disk node. Check the freed list before allocating a new one:
 916         */
 917        list_for_each_entry(b, &c->btree_cache_freed, list)
 918                if (!mca_reap(b, 0, false)) {
 919                        mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
 920                        if (!b->keys.set[0].data)
 921                                goto err;
 922                        else
 923                                goto out;
 924                }
 925
 926        b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
 927        if (!b)
 928                goto err;
 929
 930        BUG_ON(!down_write_trylock(&b->lock));
 931        if (!b->keys.set->data)
 932                goto err;
 933out:
 934        BUG_ON(b->io_mutex.count != 1);
 935
 936        bkey_copy(&b->key, k);
 937        list_move(&b->list, &c->btree_cache);
 938        hlist_del_init_rcu(&b->hash);
 939        hlist_add_head_rcu(&b->hash, mca_hash(c, k));
 940
 941        lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
 942        b->parent       = (void *) ~0UL;
 943        b->flags        = 0;
 944        b->written      = 0;
 945        b->level        = level;
 946
 947        if (!b->level)
 948                bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
 949                                    &b->c->expensive_debug_checks);
 950        else
 951                bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
 952                                    &b->c->expensive_debug_checks);
 953
 954        return b;
 955err:
 956        if (b)
 957                rw_unlock(true, b);
 958
 959        b = mca_cannibalize(c, op, k);
 960        if (!IS_ERR(b))
 961                goto out;
 962
 963        return b;
 964}
 965
 966/*
 967 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 968 * in from disk if necessary.
 969 *
 970 * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN.
 971 *
 972 * The btree node will have either a read or a write lock held, depending on
 973 * level and op->lock.
 974 */
 975struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
 976                                 struct bkey *k, int level, bool write,
 977                                 struct btree *parent)
 978{
 979        int i = 0;
 980        struct btree *b;
 981
 982        BUG_ON(level < 0);
 983retry:
 984        b = mca_find(c, k);
 985
 986        if (!b) {
 987                if (current->bio_list)
 988                        return ERR_PTR(-EAGAIN);
 989
 990                mutex_lock(&c->bucket_lock);
 991                b = mca_alloc(c, op, k, level);
 992                mutex_unlock(&c->bucket_lock);
 993
 994                if (!b)
 995                        goto retry;
 996                if (IS_ERR(b))
 997                        return b;
 998
 999                bch_btree_node_read(b);
1000
1001                if (!write)
1002                        downgrade_write(&b->lock);
1003        } else {
1004                rw_lock(write, b, level);
1005                if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
1006                        rw_unlock(write, b);
1007                        goto retry;
1008                }
1009                BUG_ON(b->level != level);
1010        }
1011
1012        if (btree_node_io_error(b)) {
1013                rw_unlock(write, b);
1014                return ERR_PTR(-EIO);
1015        }
1016
1017        BUG_ON(!b->written);
1018
1019        b->parent = parent;
1020
1021        for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
1022                prefetch(b->keys.set[i].tree);
1023                prefetch(b->keys.set[i].data);
1024        }
1025
1026        for (; i <= b->keys.nsets; i++)
1027                prefetch(b->keys.set[i].data);
1028
1029        return b;
1030}
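/*
 * Typical caller pattern, modelled on btree_gc_recurse() below (sketch
 * only, kept out of the build; error handling abbreviated):
 */
#if 0
	struct btree *child;

	child = bch_btree_node_get(b->c, op, k, b->level - 1, true, b);
	if (IS_ERR(child))
		return PTR_ERR(child);

	/* ... use child under the lock taken by bch_btree_node_get() ... */

	rw_unlock(true, child);
#endif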
1031
1032static void btree_node_prefetch(struct btree *parent, struct bkey *k)
1033{
1034        struct btree *b;
1035
1036        mutex_lock(&parent->c->bucket_lock);
1037        b = mca_alloc(parent->c, NULL, k, parent->level - 1);
1038        mutex_unlock(&parent->c->bucket_lock);
1039
1040        if (!IS_ERR_OR_NULL(b)) {
1041                b->parent = parent;
1042                bch_btree_node_read(b);
1043                rw_unlock(true, b);
1044        }
1045}
1046
1047/* Btree alloc */
1048
1049static void btree_node_free(struct btree *b)
1050{
1051        trace_bcache_btree_node_free(b);
1052
1053        BUG_ON(b == b->c->root);
1054
1055retry:
1056        mutex_lock(&b->write_lock);
1057        /*
1058         * If the btree node is selected and flushing in btree_flush_write(),
1059         * delay and retry until the BTREE_NODE_journal_flush bit cleared,
 1060         * then it is safe to free the btree node here. Otherwise freeing
 1061         * it now would race with that journal flush.
1062         */
1063        if (btree_node_journal_flush(b)) {
1064                mutex_unlock(&b->write_lock);
1065                pr_debug("bnode %p journal_flush set, retry\n", b);
1066                udelay(1);
1067                goto retry;
1068        }
1069
1070        if (btree_node_dirty(b)) {
1071                btree_complete_write(b, btree_current_write(b));
1072                clear_bit(BTREE_NODE_dirty, &b->flags);
1073        }
1074
1075        mutex_unlock(&b->write_lock);
1076
1077        cancel_delayed_work(&b->work);
1078
1079        mutex_lock(&b->c->bucket_lock);
1080        bch_bucket_free(b->c, &b->key);
1081        mca_bucket_free(b);
1082        mutex_unlock(&b->c->bucket_lock);
1083}
1084
1085struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
1086                                     int level, bool wait,
1087                                     struct btree *parent)
1088{
1089        BKEY_PADDED(key) k;
1090        struct btree *b = ERR_PTR(-EAGAIN);
1091
1092        mutex_lock(&c->bucket_lock);
1093retry:
1094        if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, wait))
1095                goto err;
1096
1097        bkey_put(c, &k.key);
1098        SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1099
1100        b = mca_alloc(c, op, &k.key, level);
1101        if (IS_ERR(b))
1102                goto err_free;
1103
1104        if (!b) {
1105                cache_bug(c,
1106                        "Tried to allocate bucket that was in btree cache");
1107                goto retry;
1108        }
1109
1110        b->parent = parent;
1111        bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));
1112
1113        mutex_unlock(&c->bucket_lock);
1114
1115        trace_bcache_btree_node_alloc(b);
1116        return b;
1117err_free:
1118        bch_bucket_free(c, &k.key);
1119err:
1120        mutex_unlock(&c->bucket_lock);
1121
1122        trace_bcache_btree_node_alloc_fail(c);
1123        return b;
1124}
1125
1126static struct btree *bch_btree_node_alloc(struct cache_set *c,
1127                                          struct btree_op *op, int level,
1128                                          struct btree *parent)
1129{
1130        return __bch_btree_node_alloc(c, op, level, op != NULL, parent);
1131}
1132
1133static struct btree *btree_node_alloc_replacement(struct btree *b,
1134                                                  struct btree_op *op)
1135{
1136        struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent);
1137
1138        if (!IS_ERR_OR_NULL(n)) {
1139                mutex_lock(&n->write_lock);
1140                bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
1141                bkey_copy_key(&n->key, &b->key);
1142                mutex_unlock(&n->write_lock);
1143        }
1144
1145        return n;
1146}
1147
1148static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1149{
1150        unsigned int i;
1151
1152        mutex_lock(&b->c->bucket_lock);
1153
1154        atomic_inc(&b->c->prio_blocked);
1155
1156        bkey_copy(k, &b->key);
1157        bkey_copy_key(k, &ZERO_KEY);
1158
1159        for (i = 0; i < KEY_PTRS(k); i++)
1160                SET_PTR_GEN(k, i,
1161                            bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1162                                        PTR_BUCKET(b->c, &b->key, i)));
1163
1164        mutex_unlock(&b->c->bucket_lock);
1165}
1166
1167static int btree_check_reserve(struct btree *b, struct btree_op *op)
1168{
1169        struct cache_set *c = b->c;
1170        struct cache *ca;
1171        unsigned int i, reserve = (c->root->level - b->level) * 2 + 1;
1172
1173        mutex_lock(&c->bucket_lock);
1174
1175        for_each_cache(ca, c, i)
1176                if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
1177                        if (op)
1178                                prepare_to_wait(&c->btree_cache_wait, &op->wait,
1179                                                TASK_UNINTERRUPTIBLE);
1180                        mutex_unlock(&c->bucket_lock);
1181                        return -EINTR;
1182                }
1183
1184        mutex_unlock(&c->bucket_lock);
1185
1186        return mca_cannibalize_lock(b->c, op);
1187}
1188
1189/* Garbage collection */
1190
1191static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
1192                                    struct bkey *k)
1193{
1194        uint8_t stale = 0;
1195        unsigned int i;
1196        struct bucket *g;
1197
1198        /*
1199         * ptr_invalid() can't return true for the keys that mark btree nodes as
1200         * freed, but since ptr_bad() returns true we'll never actually use them
 1201         * for anything and thus we don't want to mark their pointers here
1202         */
1203        if (!bkey_cmp(k, &ZERO_KEY))
1204                return stale;
1205
1206        for (i = 0; i < KEY_PTRS(k); i++) {
1207                if (!ptr_available(c, k, i))
1208                        continue;
1209
1210                g = PTR_BUCKET(c, k, i);
1211
1212                if (gen_after(g->last_gc, PTR_GEN(k, i)))
1213                        g->last_gc = PTR_GEN(k, i);
1214
1215                if (ptr_stale(c, k, i)) {
1216                        stale = max(stale, ptr_stale(c, k, i));
1217                        continue;
1218                }
1219
1220                cache_bug_on(GC_MARK(g) &&
1221                             (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1222                             c, "inconsistent ptrs: mark = %llu, level = %i",
1223                             GC_MARK(g), level);
1224
1225                if (level)
1226                        SET_GC_MARK(g, GC_MARK_METADATA);
1227                else if (KEY_DIRTY(k))
1228                        SET_GC_MARK(g, GC_MARK_DIRTY);
1229                else if (!GC_MARK(g))
1230                        SET_GC_MARK(g, GC_MARK_RECLAIMABLE);
1231
1232                /* guard against overflow */
1233                SET_GC_SECTORS_USED(g, min_t(unsigned int,
1234                                             GC_SECTORS_USED(g) + KEY_SIZE(k),
1235                                             MAX_GC_SECTORS_USED));
1236
1237                BUG_ON(!GC_SECTORS_USED(g));
1238        }
1239
1240        return stale;
1241}
1242
1243#define btree_mark_key(b, k)    __bch_btree_mark_key(b->c, b->level, k)
1244
1245void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
1246{
1247        unsigned int i;
1248
1249        for (i = 0; i < KEY_PTRS(k); i++)
1250                if (ptr_available(c, k, i) &&
1251                    !ptr_stale(c, k, i)) {
1252                        struct bucket *b = PTR_BUCKET(c, k, i);
1253
1254                        b->gen = PTR_GEN(k, i);
1255
1256                        if (level && bkey_cmp(k, &ZERO_KEY))
1257                                b->prio = BTREE_PRIO;
1258                        else if (!level && b->prio == BTREE_PRIO)
1259                                b->prio = INITIAL_PRIO;
1260                }
1261
1262        __bch_btree_mark_key(c, level, k);
1263}
1264
1265void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats)
1266{
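        /* e.g. 1000 buckets total with 250 still available -> 75% in use */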
1267        stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets;
1268}
1269
1270static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1271{
1272        uint8_t stale = 0;
1273        unsigned int keys = 0, good_keys = 0;
1274        struct bkey *k;
1275        struct btree_iter iter;
1276        struct bset_tree *t;
1277
1278        gc->nodes++;
1279
1280        for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
1281                stale = max(stale, btree_mark_key(b, k));
1282                keys++;
1283
1284                if (bch_ptr_bad(&b->keys, k))
1285                        continue;
1286
1287                gc->key_bytes += bkey_u64s(k);
1288                gc->nkeys++;
1289                good_keys++;
1290
1291                gc->data += KEY_SIZE(k);
1292        }
1293
1294        for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
1295                btree_bug_on(t->size &&
1296                             bset_written(&b->keys, t) &&
1297                             bkey_cmp(&b->key, &t->end) < 0,
1298                             b, "found short btree key in gc");
1299
1300        if (b->c->gc_always_rewrite)
1301                return true;
1302
1303        if (stale > 10)
1304                return true;
1305
1306        if ((keys - good_keys) * 2 > keys)
1307                return true;
1308
1309        return false;
1310}
1311
1312#define GC_MERGE_NODES  4U
1313
1314struct gc_merge_info {
1315        struct btree    *b;
1316        unsigned int    keys;
1317};
1318
1319static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
1320                                 struct keylist *insert_keys,
1321                                 atomic_t *journal_ref,
1322                                 struct bkey *replace_key);
1323
1324static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1325                             struct gc_stat *gc, struct gc_merge_info *r)
1326{
1327        unsigned int i, nodes = 0, keys = 0, blocks;
1328        struct btree *new_nodes[GC_MERGE_NODES];
1329        struct keylist keylist;
1330        struct closure cl;
1331        struct bkey *k;
1332
1333        bch_keylist_init(&keylist);
1334
1335        if (btree_check_reserve(b, NULL))
1336                return 0;
1337
1338        memset(new_nodes, 0, sizeof(new_nodes));
1339        closure_init_stack(&cl);
1340
1341        while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1342                keys += r[nodes++].keys;
1343
1344        blocks = btree_default_blocks(b->c) * 2 / 3;
1345
1346        if (nodes < 2 ||
1347            __set_blocks(b->keys.set[0].data, keys,
1348                         block_bytes(b->c)) > blocks * (nodes - 1))
1349                return 0;
1350
1351        for (i = 0; i < nodes; i++) {
1352                new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
1353                if (IS_ERR_OR_NULL(new_nodes[i]))
1354                        goto out_nocoalesce;
1355        }
1356
1357        /*
1358         * We have to check the reserve here, after we've allocated our new
1359         * nodes, to make sure the insert below will succeed - we also check
1360         * before as an optimization to potentially avoid a bunch of expensive
1361         * allocs/sorts
1362         */
1363        if (btree_check_reserve(b, NULL))
1364                goto out_nocoalesce;
1365
1366        for (i = 0; i < nodes; i++)
1367                mutex_lock(&new_nodes[i]->write_lock);
1368
1369        for (i = nodes - 1; i > 0; --i) {
1370                struct bset *n1 = btree_bset_first(new_nodes[i]);
1371                struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
1372                struct bkey *k, *last = NULL;
1373
1374                keys = 0;
1375
1376                if (i > 1) {
1377                        for (k = n2->start;
1378                             k < bset_bkey_last(n2);
1379                             k = bkey_next(k)) {
1380                                if (__set_blocks(n1, n1->keys + keys +
1381                                                 bkey_u64s(k),
1382                                                 block_bytes(b->c)) > blocks)
1383                                        break;
1384
1385                                last = k;
1386                                keys += bkey_u64s(k);
1387                        }
1388                } else {
1389                        /*
1390                         * Last node we're not getting rid of - we're getting
1391                         * rid of the node at r[0]. Have to try and fit all of
1392                         * the remaining keys into this node; we can't ensure
1393                         * they will always fit due to rounding and variable
1394                         * length keys (shouldn't be possible in practice,
1395                         * though)
1396                         */
1397                        if (__set_blocks(n1, n1->keys + n2->keys,
1398                                         block_bytes(b->c)) >
1399                            btree_blocks(new_nodes[i]))
1400                                goto out_unlock_nocoalesce;
1401
1402                        keys = n2->keys;
1403                        /* Take the key of the node we're getting rid of */
1404                        last = &r->b->key;
1405                }
1406
1407                BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
1408                       btree_blocks(new_nodes[i]));
1409
1410                if (last)
1411                        bkey_copy_key(&new_nodes[i]->key, last);
1412
1413                memcpy(bset_bkey_last(n1),
1414                       n2->start,
1415                       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);
1416
1417                n1->keys += keys;
1418                r[i].keys = n1->keys;
1419
1420                memmove(n2->start,
1421                        bset_bkey_idx(n2, keys),
1422                        (void *) bset_bkey_last(n2) -
1423                        (void *) bset_bkey_idx(n2, keys));
1424
1425                n2->keys -= keys;
1426
1427                if (__bch_keylist_realloc(&keylist,
1428                                          bkey_u64s(&new_nodes[i]->key)))
1429                        goto out_unlock_nocoalesce;
1430
1431                bch_btree_node_write(new_nodes[i], &cl);
1432                bch_keylist_add(&keylist, &new_nodes[i]->key);
1433        }
1434
1435        for (i = 0; i < nodes; i++)
1436                mutex_unlock(&new_nodes[i]->write_lock);
1437
1438        closure_sync(&cl);
1439
1440        /* We emptied out this node */
1441        BUG_ON(btree_bset_first(new_nodes[0])->keys);
1442        btree_node_free(new_nodes[0]);
1443        rw_unlock(true, new_nodes[0]);
1444        new_nodes[0] = NULL;
1445
1446        for (i = 0; i < nodes; i++) {
1447                if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
1448                        goto out_nocoalesce;
1449
1450                make_btree_freeing_key(r[i].b, keylist.top);
1451                bch_keylist_push(&keylist);
1452        }
1453
1454        bch_btree_insert_node(b, op, &keylist, NULL, NULL);
1455        BUG_ON(!bch_keylist_empty(&keylist));
1456
1457        for (i = 0; i < nodes; i++) {
1458                btree_node_free(r[i].b);
1459                rw_unlock(true, r[i].b);
1460
1461                r[i].b = new_nodes[i];
1462        }
1463
1464        memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1465        r[nodes - 1].b = ERR_PTR(-EINTR);
1466
1467        trace_bcache_btree_gc_coalesce(nodes);
1468        gc->nodes--;
1469
1470        bch_keylist_free(&keylist);
1471
1472        /* Invalidated our iterator */
1473        return -EINTR;
1474
1475out_unlock_nocoalesce:
1476        for (i = 0; i < nodes; i++)
1477                mutex_unlock(&new_nodes[i]->write_lock);
1478
1479out_nocoalesce:
1480        closure_sync(&cl);
1481
1482        while ((k = bch_keylist_pop(&keylist)))
1483                if (!bkey_cmp(k, &ZERO_KEY))
1484                        atomic_dec(&b->c->prio_blocked);
1485        bch_keylist_free(&keylist);
1486
1487        for (i = 0; i < nodes; i++)
1488                if (!IS_ERR_OR_NULL(new_nodes[i])) {
1489                        btree_node_free(new_nodes[i]);
1490                        rw_unlock(true, new_nodes[i]);
1491                }
1492        return 0;
1493}
1494
1495static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
1496                                 struct btree *replace)
1497{
1498        struct keylist keys;
1499        struct btree *n;
1500
1501        if (btree_check_reserve(b, NULL))
1502                return 0;
1503
1504        n = btree_node_alloc_replacement(replace, NULL);
1505
1506        /* recheck reserve after allocating replacement node */
1507        if (btree_check_reserve(b, NULL)) {
1508                btree_node_free(n);
1509                rw_unlock(true, n);
1510                return 0;
1511        }
1512
1513        bch_btree_node_write_sync(n);
1514
1515        bch_keylist_init(&keys);
1516        bch_keylist_add(&keys, &n->key);
1517
1518        make_btree_freeing_key(replace, keys.top);
1519        bch_keylist_push(&keys);
1520
1521        bch_btree_insert_node(b, op, &keys, NULL, NULL);
1522        BUG_ON(!bch_keylist_empty(&keys));
1523
1524        btree_node_free(replace);
1525        rw_unlock(true, n);
1526
1527        /* Invalidated our iterator */
1528        return -EINTR;
1529}
1530
1531static unsigned int btree_gc_count_keys(struct btree *b)
1532{
1533        struct bkey *k;
1534        struct btree_iter iter;
1535        unsigned int ret = 0;
1536
1537        for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
1538                ret += bkey_u64s(k);
1539
1540        return ret;
1541}
1542
1543static size_t btree_gc_min_nodes(struct cache_set *c)
1544{
1545        size_t min_nodes;
1546
1547        /*
 1548         * Incremental GC sleeps for GC_SLEEP_MS whenever front side
 1549         * I/O comes in. If every pass only processed a constant
 1550         * number (100) of nodes, GC of a large btree would run for
 1551         * a very long time; the front side I/Os would then exhaust
 1552         * the free buckets (no new bucket can be allocated while GC
 1553         * is running) and be blocked again. So the per-pass batch is
 1554         * not a constant but scales with the number of btree nodes:
 1555         * the whole GC is divided into at most MAX_GC_TIMES (100)
 1556         * passes, so that when there are many btree nodes GC
 1557         * processes more of them in each pass, while smaller trees
 1558         * use smaller batches, never going below MIN_GC_NODES nodes
 1559         * per pass.
1560         */
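        /*
         * Worked example: with 40000 btree nodes and MAX_GC_TIMES == 100,
         * each pass handles at least 400 nodes; with only 2000 nodes the
         * result (20) is raised to the MIN_GC_NODES floor of 100.
         */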
1561        min_nodes = c->gc_stats.nodes / MAX_GC_TIMES;
1562        if (min_nodes < MIN_GC_NODES)
1563                min_nodes = MIN_GC_NODES;
1564
1565        return min_nodes;
1566}
1567
1568
1569static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1570                            struct closure *writes, struct gc_stat *gc)
1571{
1572        int ret = 0;
1573        bool should_rewrite;
1574        struct bkey *k;
1575        struct btree_iter iter;
1576        struct gc_merge_info r[GC_MERGE_NODES];
1577        struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;
1578
1579        bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);
1580
1581        for (i = r; i < r + ARRAY_SIZE(r); i++)
1582                i->b = ERR_PTR(-EINTR);
1583
1584        while (1) {
1585                k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
1586                if (k) {
1587                        r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
1588                                                  true, b);
1589                        if (IS_ERR(r->b)) {
1590                                ret = PTR_ERR(r->b);
1591                                break;
1592                        }
1593
1594                        r->keys = btree_gc_count_keys(r->b);
1595
1596                        ret = btree_gc_coalesce(b, op, gc, r);
1597                        if (ret)
1598                                break;
1599                }
1600
1601                if (!last->b)
1602                        break;
1603
1604                if (!IS_ERR(last->b)) {
1605                        should_rewrite = btree_gc_mark_node(last->b, gc);
1606                        if (should_rewrite) {
1607                                ret = btree_gc_rewrite_node(b, op, last->b);
1608                                if (ret)
1609                                        break;
1610                        }
1611
1612                        if (last->b->level) {
1613                                ret = btree_gc_recurse(last->b, op, writes, gc);
1614                                if (ret)
1615                                        break;
1616                        }
1617
1618                        bkey_copy_key(&b->c->gc_done, &last->b->key);
1619
1620                        /*
1621                         * Must flush leaf nodes before gc ends, since replace
1622                         * operations aren't journalled
1623                         */
1624                        mutex_lock(&last->b->write_lock);
1625                        if (btree_node_dirty(last->b))
1626                                bch_btree_node_write(last->b, writes);
1627                        mutex_unlock(&last->b->write_lock);
1628                        rw_unlock(true, last->b);
1629                }
1630
1631                memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1632                r->b = NULL;
1633
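                    /*
                     * Incremental GC: if front-end requests are in flight
                     * and this pass has already covered btree_gc_min_nodes()
                     * nodes since the last pause, return -EAGAIN so that
                     * bch_btree_gc() sleeps for GC_SLEEP_MS and then resumes
                     * from c->gc_done.
                     */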
1634                if (atomic_read(&b->c->search_inflight) &&
1635                    gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) {
1636                        gc->nodes_pre = gc->nodes;
1637                        ret = -EAGAIN;
1638                        break;
1639                }
1640
1641                if (need_resched()) {
1642                        ret = -EAGAIN;
1643                        break;
1644                }
1645        }
1646
1647        for (i = r; i < r + ARRAY_SIZE(r); i++)
1648                if (!IS_ERR_OR_NULL(i->b)) {
1649                        mutex_lock(&i->b->write_lock);
1650                        if (btree_node_dirty(i->b))
1651                                bch_btree_node_write(i->b, writes);
1652                        mutex_unlock(&i->b->write_lock);
1653                        rw_unlock(true, i->b);
1654                }
1655
1656        return ret;
1657}
1658
1659static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1660                             struct closure *writes, struct gc_stat *gc)
1661{
1662        struct btree *n = NULL;
1663        int ret = 0;
1664        bool should_rewrite;
1665
1666        should_rewrite = btree_gc_mark_node(b, gc);
1667        if (should_rewrite) {
1668                n = btree_node_alloc_replacement(b, NULL);
1669
1670                if (!IS_ERR_OR_NULL(n)) {
1671                        bch_btree_node_write_sync(n);
1672
1673                        bch_btree_set_root(n);
1674                        btree_node_free(b);
1675                        rw_unlock(true, n);
1676
1677                        return -EINTR;
1678                }
1679        }
1680
1681        __bch_btree_mark_key(b->c, b->level + 1, &b->key);
1682
1683        if (b->level) {
1684                ret = btree_gc_recurse(b, op, writes, gc);
1685                if (ret)
1686                        return ret;
1687        }
1688
1689        bkey_copy_key(&b->c->gc_done, &b->key);
1690
1691        return ret;
1692}
1693
1694static void btree_gc_start(struct cache_set *c)
1695{
1696        struct cache *ca;
1697        struct bucket *b;
1698        unsigned int i;
1699
1700        if (!c->gc_mark_valid)
1701                return;
1702
1703        mutex_lock(&c->bucket_lock);
1704
1705        c->gc_mark_valid = 0;
1706        c->gc_done = ZERO_KEY;
1707
1708        for_each_cache(ca, c, i)
1709                for_each_bucket(b, ca) {
1710                        b->last_gc = b->gen;
1711                        if (!atomic_read(&b->pin)) {
1712                                SET_GC_MARK(b, 0);
1713                                SET_GC_SECTORS_USED(b, 0);
1714                        }
1715                }
1716
1717        mutex_unlock(&c->bucket_lock);
1718}
1719
1720static void bch_btree_gc_finish(struct cache_set *c)
1721{
1722        struct bucket *b;
1723        struct cache *ca;
1724        unsigned int i;
1725
1726        mutex_lock(&c->bucket_lock);
1727
1728        set_gc_sectors(c);
1729        c->gc_mark_valid = 1;
1730        c->need_gc      = 0;
1731
1732        for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1733                SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1734                            GC_MARK_METADATA);
1735
1736        /* don't reclaim buckets to which writeback keys point */
1737        rcu_read_lock();
1738        for (i = 0; i < c->devices_max_used; i++) {
1739                struct bcache_device *d = c->devices[i];
1740                struct cached_dev *dc;
1741                struct keybuf_key *w, *n;
1742                unsigned int j;
1743
1744                if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
1745                        continue;
1746                dc = container_of(d, struct cached_dev, disk);
1747
1748                spin_lock(&dc->writeback_keys.lock);
1749                rbtree_postorder_for_each_entry_safe(w, n,
1750                                        &dc->writeback_keys.keys, node)
1751                        for (j = 0; j < KEY_PTRS(&w->key); j++)
1752                                SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
1753                                            GC_MARK_DIRTY);
1754                spin_unlock(&dc->writeback_keys.lock);
1755        }
1756        rcu_read_unlock();
1757
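            /*
             * The superblock's journal buckets (sb.d[]) and the prio
             * buckets hold metadata and must never be reclaimed; while
             * walking every bucket, also recount how many buckets are
             * available to the allocator.
             */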
1758        c->avail_nbuckets = 0;
1759        for_each_cache(ca, c, i) {
1760                uint64_t *i;
1761
1762                ca->invalidate_needs_gc = 0;
1763
1764                for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1765                        SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1766
1767                for (i = ca->prio_buckets;
1768                     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1769                        SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1770
1771                for_each_bucket(b, ca) {
1772                        c->need_gc      = max(c->need_gc, bucket_gc_gen(b));
1773
1774                        if (atomic_read(&b->pin))
1775                                continue;
1776
1777                        BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));
1778
1779                        if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
1780                                c->avail_nbuckets++;
1781                }
1782        }
1783
1784        mutex_unlock(&c->bucket_lock);
1785}
1786
1787static void bch_btree_gc(struct cache_set *c)
1788{
1789        int ret;
1790        struct gc_stat stats;
1791        struct closure writes;
1792        struct btree_op op;
1793        uint64_t start_time = local_clock();
1794
1795        trace_bcache_gc_start(c);
1796
1797        memset(&stats, 0, sizeof(struct gc_stat));
1798        closure_init_stack(&writes);
1799        bch_btree_op_init(&op, SHRT_MAX);
1800
1801        btree_gc_start(c);
1802
1803        /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */
1804        do {
1805                ret = bcache_btree_root(gc_root, c, &op, &writes, &stats);
1806                closure_sync(&writes);
1807                cond_resched();
1808
1809                if (ret == -EAGAIN)
1810                        schedule_timeout_interruptible(msecs_to_jiffies
1811                                                       (GC_SLEEP_MS));
1812                else if (ret)
1813                        pr_warn("gc failed!\n");
1814        } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags));
1815
1816        bch_btree_gc_finish(c);
1817        wake_up_allocators(c);
1818
1819        bch_time_stats_update(&c->btree_gc_time, start_time);
1820
1821        stats.key_bytes *= sizeof(uint64_t);
1822        stats.data      <<= 9;
1823        bch_update_bucket_in_use(c, &stats);
1824        memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1825
1826        trace_bcache_gc_end(c);
1827
1828        bch_moving_gc(c);
1829}
1830
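    /*
     * GC is kicked off either when a cache can't invalidate buckets
     * without it (invalidate_needs_gc), or when the sectors_to_gc budget,
     * reset by set_gc_sectors() and decremented as data is written, has
     * gone negative.
     */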
1831static bool gc_should_run(struct cache_set *c)
1832{
1833        struct cache *ca;
1834        unsigned int i;
1835
1836        for_each_cache(ca, c, i)
1837                if (ca->invalidate_needs_gc)
1838                        return true;
1839
1840        if (atomic_read(&c->sectors_to_gc) < 0)
1841                return true;
1842
1843        return false;
1844}
1845
1846static int bch_gc_thread(void *arg)
1847{
1848        struct cache_set *c = arg;
1849
1850        while (1) {
1851                wait_event_interruptible(c->gc_wait,
1852                           kthread_should_stop() ||
1853                           test_bit(CACHE_SET_IO_DISABLE, &c->flags) ||
1854                           gc_should_run(c));
1855
1856                if (kthread_should_stop() ||
1857                    test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1858                        break;
1859
1860                set_gc_sectors(c);
1861                bch_btree_gc(c);
1862        }
1863
1864        wait_for_kthread_stop();
1865        return 0;
1866}
1867
1868int bch_gc_thread_start(struct cache_set *c)
1869{
1870        c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
1871        return PTR_ERR_OR_ZERO(c->gc_thread);
1872}
1873
1874/* Initial partial gc */
1875
1876static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
1877{
1878        int ret = 0;
1879        struct bkey *k, *p = NULL;
1880        struct btree_iter iter;
1881
1882        for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
1883                bch_initial_mark_key(b->c, b->level, k);
1884
1885        bch_initial_mark_key(b->c, b->level + 1, &b->key);
1886
1887        if (b->level) {
1888                bch_btree_iter_init(&b->keys, &iter, NULL);
1889
1890                do {
1891                        k = bch_btree_iter_next_filter(&iter, &b->keys,
1892                                                       bch_ptr_bad);
1893                        if (k) {
1894                                btree_node_prefetch(b, k);
1895                                /*
1896                                 * initialize c->gc_stats.nodes
1897                                 * for incremental GC
1898                                 */
1899                                b->c->gc_stats.nodes++;
1900                        }
1901
1902                        if (p)
1903                                ret = bcache_btree(check_recurse, p, b, op);
1904
1905                        p = k;
1906                } while (p && !ret);
1907        }
1908
1909        return ret;
1910}
1911
1912
1913static int bch_btree_check_thread(void *arg)
1914{
1915        int ret;
1916        struct btree_check_info *info = arg;
1917        struct btree_check_state *check_state = info->state;
1918        struct cache_set *c = check_state->c;
1919        struct btree_iter iter;
1920        struct bkey *k, *p;
1921        int cur_idx, prev_idx, skip_nr;
1922
1923        k = p = NULL;
1924        cur_idx = prev_idx = 0;
1925        ret = 0;
1926
1927        /* root node keys are checked before check threads are created */
1928        bch_btree_iter_init(&c->root->keys, &iter, NULL);
1929        k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad);
1930        BUG_ON(!k);
1931
1932        p = k;
1933        while (k) {
1934                /*
1935                 * Fetch a root node key index, skip the keys which
1936                 * should be fetched by other threads, then check the
1937                 * sub-tree indexed by the fetched key.
1938                 */
1939                spin_lock(&check_state->idx_lock);
1940                cur_idx = check_state->key_idx;
1941                check_state->key_idx++;
1942                spin_unlock(&check_state->idx_lock);
1943
1944                skip_nr = cur_idx - prev_idx;
1945
1946                while (skip_nr) {
1947                        k = bch_btree_iter_next_filter(&iter,
1948                                                       &c->root->keys,
1949                                                       bch_ptr_bad);
1950                        if (k)
1951                                p = k;
1952                        else {
1953                                /*
1954                                 * No more keys to check in the root
1955                                 * node; the check threads already
1956                                 * running are enough, stop creating more.
1957                                 */
1958                                atomic_set(&check_state->enough, 1);
1959                                /* Update check_state->enough earlier */
1960                                smp_mb__after_atomic();
1961                                goto out;
1962                        }
1963                        skip_nr--;
1964                        cond_resched();
1965                }
1966
1967                if (p) {
1968                        struct btree_op op;
1969
1970                        btree_node_prefetch(c->root, p);
1971                        c->gc_stats.nodes++;
1972                        bch_btree_op_init(&op, 0);
1973                        ret = bcache_btree(check_recurse, p, c->root, &op);
1974                        if (ret)
1975                                goto out;
1976                }
1977                p = NULL;
1978                prev_idx = cur_idx;
1979                cond_resched();
1980        }
1981
1982out:
1983        info->result = ret;
1984        /* update check_state->started among all CPUs */
1985        smp_mb__before_atomic();
1986        if (atomic_dec_and_test(&check_state->started))
1987                wake_up(&check_state->wait);
1988
1989        return ret;
1990}
1991
1992
1993
1994static int bch_btree_chkthread_nr(void)
1995{
1996        int n = num_online_cpus()/2;
1997
1998        if (n == 0)
1999                n = 1;
2000        else if (n > BCH_BTR_CHKTHREAD_MAX)
2001                n = BCH_BTR_CHKTHREAD_MAX;
2002
2003        return n;
2004}
2005
2006int bch_btree_check(struct cache_set *c)
2007{
2008        int ret = 0;
2009        int i;
2010        struct bkey *k = NULL;
2011        struct btree_iter iter;
2012        struct btree_check_state *check_state;
2013        char name[32];
2014
2015        /* check and mark root node keys */
2016        for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid)
2017                bch_initial_mark_key(c, c->root->level, k);
2018
2019        bch_initial_mark_key(c, c->root->level + 1, &c->root->key);
2020
2021        if (c->root->level == 0)
2022                return 0;
2023
2024        check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL);
2025        if (!check_state)
2026                return -ENOMEM;
2027
2028        check_state->c = c;
2029        check_state->total_threads = bch_btree_chkthread_nr();
2030        check_state->key_idx = 0;
2031        spin_lock_init(&check_state->idx_lock);
2032        atomic_set(&check_state->started, 0);
2033        atomic_set(&check_state->enough, 0);
2034        init_waitqueue_head(&check_state->wait);
2035
2036        /*
2037         * Run multiple threads to check btree nodes in parallel.
2038         * If check_state->enough is non-zero, the check threads
2039         * already running are enough and it is unnecessary to
2040         * create more.
2041         */
2042        for (i = 0; i < check_state->total_threads; i++) {
2043                /* fetch latest check_state->enough earlier */
2044                smp_mb__before_atomic();
2045                if (atomic_read(&check_state->enough))
2046                        break;
2047
2048                check_state->infos[i].result = 0;
2049                check_state->infos[i].state = check_state;
2050                snprintf(name, sizeof(name), "bch_btrchk[%u]", i);
2051                atomic_inc(&check_state->started);
2052
2053                check_state->infos[i].thread =
2054                        kthread_run(bch_btree_check_thread,
2055                                    &check_state->infos[i],
2056                                    name);
2057                if (IS_ERR(check_state->infos[i].thread)) {
2058                        pr_err("failed to run thread bch_btrchk[%d]\n", i);
2059                        for (--i; i >= 0; i--)
2060                                kthread_stop(check_state->infos[i].thread);
2061                        ret = -ENOMEM;
2062                        goto out;
2063                }
2064        }
2065
2066        wait_event_interruptible(check_state->wait,
2067                                 atomic_read(&check_state->started) == 0 ||
2068                                  test_bit(CACHE_SET_IO_DISABLE, &c->flags));
2069
2070        for (i = 0; i < check_state->total_threads; i++) {
2071                if (check_state->infos[i].result) {
2072                        ret = check_state->infos[i].result;
2073                        goto out;
2074                }
2075        }
2076
2077out:
2078        kfree(check_state);
2079        return ret;
2080}
2081
2082void bch_initial_gc_finish(struct cache_set *c)
2083{
2084        struct cache *ca;
2085        struct bucket *b;
2086        unsigned int i;
2087
2088        bch_btree_gc_finish(c);
2089
2090        mutex_lock(&c->bucket_lock);
2091
2092        /*
2093         * We need to put some unused buckets directly on the prio freelist in
2094         * order to get the allocator thread started - it needs freed buckets in
2095         * order to rewrite the prios and gens, and it needs to rewrite prios
2096         * and gens in order to free buckets.
2097         *
2098         * This is only safe for buckets that have no live data in them, which
2099         * there should always be some of.
2100         */
2101        for_each_cache(ca, c, i) {
2102                for_each_bucket(b, ca) {
2103                        if (fifo_full(&ca->free[RESERVE_PRIO]) &&
2104                            fifo_full(&ca->free[RESERVE_BTREE]))
2105                                break;
2106
2107                        if (bch_can_invalidate_bucket(ca, b) &&
2108                            !GC_MARK(b)) {
2109                                __bch_invalidate_one_bucket(ca, b);
2110                                if (!fifo_push(&ca->free[RESERVE_PRIO],
2111                                   b - ca->buckets))
2112                                        fifo_push(&ca->free[RESERVE_BTREE],
2113                                                  b - ca->buckets);
2114                        }
2115                }
2116        }
2117
2118        mutex_unlock(&c->bucket_lock);
2119}
2120
2121/* Btree insertion */
2122
2123static bool btree_insert_key(struct btree *b, struct bkey *k,
2124                             struct bkey *replace_key)
2125{
2126        unsigned int status;
2127
2128        BUG_ON(bkey_cmp(k, &b->key) > 0);
2129
2130        status = bch_btree_insert_key(&b->keys, k, replace_key);
2131        if (status != BTREE_INSERT_STATUS_NO_INSERT) {
2132                bch_check_keys(&b->keys, "%u for %s", status,
2133                               replace_key ? "replace" : "insert");
2134
2135                trace_bcache_btree_insert_key(b, k, replace_key != NULL,
2136                                              status);
2137                return true;
2138        } else
2139                return false;
2140}
2141
2142static size_t insert_u64s_remaining(struct btree *b)
2143{
2144        long ret = bch_btree_keys_u64s_remaining(&b->keys);
2145
2146        /*
2147         * Might land in the middle of an existing extent and have to split it
2148         */
2149        if (b->keys.ops->is_extents)
2150                ret -= KEY_MAX_U64S;
2151
2152        return max(ret, 0L);
2153}
2154
2155static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
2156                                  struct keylist *insert_keys,
2157                                  struct bkey *replace_key)
2158{
2159        bool ret = false;
2160        int oldsize = bch_count_data(&b->keys);
2161
2162        while (!bch_keylist_empty(insert_keys)) {
2163                struct bkey *k = insert_keys->keys;
2164
2165                if (bkey_u64s(k) > insert_u64s_remaining(b))
2166                        break;
2167
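                    /*
                     * Three cases: the key ends within this node and is
                     * inserted whole; it straddles the end of the node, so
                     * only the part up to b->key is inserted here and the
                     * remainder stays on the keylist for the next node; or
                     * it lies entirely past this node and we stop.
                     */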
2168                if (bkey_cmp(k, &b->key) <= 0) {
2169                        if (!b->level)
2170                                bkey_put(b->c, k);
2171
2172                        ret |= btree_insert_key(b, k, replace_key);
2173                        bch_keylist_pop_front(insert_keys);
2174                } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2175                        BKEY_PADDED(key) temp;
2176                        bkey_copy(&temp.key, insert_keys->keys);
2177
2178                        bch_cut_back(&b->key, &temp.key);
2179                        bch_cut_front(&b->key, insert_keys->keys);
2180
2181                        ret |= btree_insert_key(b, &temp.key, replace_key);
2182                        break;
2183                } else {
2184                        break;
2185                }
2186        }
2187
2188        if (!ret)
2189                op->insert_collision = true;
2190
2191        BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2192
2193        BUG_ON(bch_count_data(&b->keys) < oldsize);
2194        return ret;
2195}
2196
2197static int btree_split(struct btree *b, struct btree_op *op,
2198                       struct keylist *insert_keys,
2199                       struct bkey *replace_key)
2200{
2201        bool split;
2202        struct btree *n1, *n2 = NULL, *n3 = NULL;
2203        uint64_t start_time = local_clock();
2204        struct closure cl;
2205        struct keylist parent_keys;
2206
2207        closure_init_stack(&cl);
2208        bch_keylist_init(&parent_keys);
2209
2210        if (btree_check_reserve(b, op)) {
2211                if (!b->level)
2212                        return -EINTR;
2213                else
2214                        WARN(1, "insufficient reserve for split\n");
2215        }
2216
2217        n1 = btree_node_alloc_replacement(b, op);
2218        if (IS_ERR(n1))
2219                goto err;
2220
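            /*
             * Only split in two if the compacted contents would still fill
             * more than ~80% of a btree node; otherwise rewriting everything
             * into the single replacement node n1 is enough.
             */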
2221        split = set_blocks(btree_bset_first(n1),
2222                           block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;
2223
2224        if (split) {
2225                unsigned int keys = 0;
2226
2227                trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);
2228
2229                n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent);
2230                if (IS_ERR(n2))
2231                        goto err_free1;
2232
2233                if (!b->parent) {
2234                        n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL);
2235                        if (IS_ERR(n3))
2236                                goto err_free2;
2237                }
2238
2239                mutex_lock(&n1->write_lock);
2240                mutex_lock(&n2->write_lock);
2241
2242                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2243
2244                /*
2245                 * Has to be a linear search because we don't have an auxiliary
2246                 * search tree yet
2247                 */
2248
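                    /*
                     * Walk forward until roughly 3/5 of the keys (counted
                     * in u64s) end up in n1; everything past the split
                     * point is copied into n2 below.
                     */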
2249                while (keys < (btree_bset_first(n1)->keys * 3) / 5)
2250                        keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
2251                                                        keys));
2252
2253                bkey_copy_key(&n1->key,
2254                              bset_bkey_idx(btree_bset_first(n1), keys));
2255                keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));
2256
2257                btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
2258                btree_bset_first(n1)->keys = keys;
2259
2260                memcpy(btree_bset_first(n2)->start,
2261                       bset_bkey_last(btree_bset_first(n1)),
2262                       btree_bset_first(n2)->keys * sizeof(uint64_t));
2263
2264                bkey_copy_key(&n2->key, &b->key);
2265
2266                bch_keylist_add(&parent_keys, &n2->key);
2267                bch_btree_node_write(n2, &cl);
2268                mutex_unlock(&n2->write_lock);
2269                rw_unlock(true, n2);
2270        } else {
2271                trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);
2272
2273                mutex_lock(&n1->write_lock);
2274                bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2275        }
2276
2277        bch_keylist_add(&parent_keys, &n1->key);
2278        bch_btree_node_write(n1, &cl);
2279        mutex_unlock(&n1->write_lock);
2280
2281        if (n3) {
2282                /* Depth increases, make a new root */
2283                mutex_lock(&n3->write_lock);
2284                bkey_copy_key(&n3->key, &MAX_KEY);
2285                bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2286                bch_btree_node_write(n3, &cl);
2287                mutex_unlock(&n3->write_lock);
2288
2289                closure_sync(&cl);
2290                bch_btree_set_root(n3);
2291                rw_unlock(true, n3);
2292        } else if (!b->parent) {
2293                /* Root filled up but didn't need to be split */
2294                closure_sync(&cl);
2295                bch_btree_set_root(n1);
2296        } else {
2297                /* Split a non-root node */
2298                closure_sync(&cl);
2299                make_btree_freeing_key(b, parent_keys.top);
2300                bch_keylist_push(&parent_keys);
2301
2302                bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2303                BUG_ON(!bch_keylist_empty(&parent_keys));
2304        }
2305
2306        btree_node_free(b);
2307        rw_unlock(true, n1);
2308
2309        bch_time_stats_update(&b->c->btree_split_time, start_time);
2310
2311        return 0;
2312err_free2:
2313        bkey_put(b->c, &n2->key);
2314        btree_node_free(n2);
2315        rw_unlock(true, n2);
2316err_free1:
2317        bkey_put(b->c, &n1->key);
2318        btree_node_free(n1);
2319        rw_unlock(true, n1);
2320err:
2321        WARN(1, "bcache: btree split failed (level %u)\n", b->level);
2322
2323        if (n3 == ERR_PTR(-EAGAIN) ||
2324            n2 == ERR_PTR(-EAGAIN) ||
2325            n1 == ERR_PTR(-EAGAIN))
2326                return -EAGAIN;
2327
2328        return -ENOMEM;
2329}
2330
2331static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2332                                 struct keylist *insert_keys,
2333                                 atomic_t *journal_ref,
2334                                 struct bkey *replace_key)
2335{
2336        struct closure cl;
2337
2338        BUG_ON(b->level && replace_key);
2339
2340        closure_init_stack(&cl);
2341
2342        mutex_lock(&b->write_lock);
2343
2344        if (write_block(b) != btree_bset_last(b) &&
2345            b->keys.last_set_unwritten)
2346                bch_btree_init_next(b); /* just wrote a set */
2347
2348        if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
2349                mutex_unlock(&b->write_lock);
2350                goto split;
2351        }
2352
2353        BUG_ON(write_block(b) != btree_bset_last(b));
2354
2355        if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2356                if (!b->level)
2357                        bch_btree_leaf_dirty(b, journal_ref);
2358                else
2359                        bch_btree_node_write(b, &cl);
2360        }
2361
2362        mutex_unlock(&b->write_lock);
2363
2364        /* wait for btree node write if necessary, after unlock */
2365        closure_sync(&cl);
2366
2367        return 0;
2368split:
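            /*
             * The keys don't fit, so the node has to be split, and a split
             * inserts into the parent. If op->lock doesn't cover the root
             * level, bump it and return -EINTR so the caller retries the
             * traversal with write locks taken high enough; when running
             * under a bio_list (i.e. in the request submission path),
             * return -EAGAIN instead of splitting here.
             */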
2369        if (current->bio_list) {
2370                op->lock = b->c->root->level + 1;
2371                return -EAGAIN;
2372        } else if (op->lock <= b->c->root->level) {
2373                op->lock = b->c->root->level + 1;
2374                return -EINTR;
2375        } else {
2376                /* Invalidated all iterators */
2377                int ret = btree_split(b, op, insert_keys, replace_key);
2378
2379                if (bch_keylist_empty(insert_keys))
2380                        return 0;
2381                else if (!ret)
2382                        return -EINTR;
2383                return ret;
2384        }
2385}
2386
2387int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2388                               struct bkey *check_key)
2389{
2390        int ret = -EINTR;
2391        uint64_t btree_ptr = b->key.ptr[0];
2392        unsigned long seq = b->seq;
2393        struct keylist insert;
2394        bool upgrade = op->lock == -1;
2395
2396        bch_keylist_init(&insert);
2397
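            /*
             * The caller may hold only a read lock (op->lock == -1).
             * Upgrade to a write lock, then use the bucket pointer and the
             * sequence number to check that the node wasn't freed or
             * written by someone else while it was unlocked (seq should
             * only have advanced for our own write lock); if it was,
             * record the level in op->lock and bail out so the caller can
             * retry.
             */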
2398        if (upgrade) {
2399                rw_unlock(false, b);
2400                rw_lock(true, b, b->level);
2401
2402                if (b->key.ptr[0] != btree_ptr ||
2403                    b->seq != seq + 1) {
2404                        op->lock = b->level;
2405                        goto out;
2406                }
2407        }
2408
2409        SET_KEY_PTRS(check_key, 1);
2410        get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2411
2412        SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2413
2414        bch_keylist_add(&insert, check_key);
2415
2416        ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2417
2418        BUG_ON(!ret && !bch_keylist_empty(&insert));
2419out:
2420        if (upgrade)
2421                downgrade_write(&b->lock);
2422        return ret;
2423}
2424
2425struct btree_insert_op {
2426        struct btree_op op;
2427        struct keylist  *keys;
2428        atomic_t        *journal_ref;
2429        struct bkey     *replace_key;
2430};
2431
2432static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2433{
2434        struct btree_insert_op *op = container_of(b_op,
2435                                        struct btree_insert_op, op);
2436
2437        int ret = bch_btree_insert_node(b, &op->op, op->keys,
2438                                        op->journal_ref, op->replace_key);
2439        if (ret && !bch_keylist_empty(op->keys))
2440                return ret;
2441        else
2442                return MAP_DONE;
2443}
2444
2445int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2446                     atomic_t *journal_ref, struct bkey *replace_key)
2447{
2448        struct btree_insert_op op;
2449        int ret = 0;
2450
2451        BUG_ON(current->bio_list);
2452        BUG_ON(bch_keylist_empty(keys));
2453
2454        bch_btree_op_init(&op.op, 0);
2455        op.keys         = keys;
2456        op.journal_ref  = journal_ref;
2457        op.replace_key  = replace_key;
2458
2459        while (!ret && !bch_keylist_empty(keys)) {
2460                op.op.lock = 0;
2461                ret = bch_btree_map_leaf_nodes(&op.op, c,
2462                                               &START_KEY(keys->keys),
2463                                               btree_insert_fn);
2464        }
2465
2466        if (ret) {
2467                struct bkey *k;
2468
2469                pr_err("error %i\n", ret);
2470
2471                while ((k = bch_keylist_pop(keys)))
2472                        bkey_put(c, k);
2473        } else if (op.op.insert_collision)
2474                ret = -ESRCH;
2475
2476        return ret;
2477}
2478
2479void bch_btree_set_root(struct btree *b)
2480{
2481        unsigned int i;
2482        struct closure cl;
2483
2484        closure_init_stack(&cl);
2485
2486        trace_bcache_btree_set_root(b);
2487
2488        BUG_ON(!b->written);
2489
2490        for (i = 0; i < KEY_PTRS(&b->key); i++)
2491                BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2492
2493        mutex_lock(&b->c->bucket_lock);
2494        list_del_init(&b->list);
2495        mutex_unlock(&b->c->bucket_lock);
2496
2497        b->c->root = b;
2498
2499        bch_journal_meta(b->c, &cl);
2500        closure_sync(&cl);
2501}
2502
2503/* Map across nodes or keys */
2504
2505static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2506                                       struct bkey *from,
2507                                       btree_map_nodes_fn *fn, int flags)
2508{
2509        int ret = MAP_CONTINUE;
2510
2511        if (b->level) {
2512                struct bkey *k;
2513                struct btree_iter iter;
2514
2515                bch_btree_iter_init(&b->keys, &iter, from);
2516
2517                while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
2518                                                       bch_ptr_bad))) {
2519                        ret = bcache_btree(map_nodes_recurse, k, b,
2520                                    op, from, fn, flags);
2521                        from = NULL;
2522
2523                        if (ret != MAP_CONTINUE)
2524                                return ret;
2525                }
2526        }
2527
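            /*
             * The callback runs on every leaf; interior nodes are only
             * passed to it when the caller asked for MAP_ALL_NODES.
             */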
2528        if (!b->level || flags == MAP_ALL_NODES)
2529                ret = fn(op, b);
2530
2531        return ret;
2532}
2533
2534int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2535                          struct bkey *from, btree_map_nodes_fn *fn, int flags)
2536{
2537        return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags);
2538}
2539
2540int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2541                                      struct bkey *from, btree_map_keys_fn *fn,
2542                                      int flags)
2543{
2544        int ret = MAP_CONTINUE;
2545        struct bkey *k;
2546        struct btree_iter iter;
2547
2548        bch_btree_iter_init(&b->keys, &iter, from);
2549
2550        while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
2551                ret = !b->level
2552                        ? fn(op, b, k)
2553                        : bcache_btree(map_keys_recurse, k,
2554                                       b, op, from, fn, flags);
2555                from = NULL;
2556
2557                if (ret != MAP_CONTINUE)
2558                        return ret;
2559        }
2560
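            /*
             * With MAP_END_KEY, the callback is also invoked on a synthetic
             * zero-size key at the end of each leaf, so callers such as
             * refill_keybuf_fn() can advance last_scanned even when no real
             * key matched.
             */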
2561        if (!b->level && (flags & MAP_END_KEY))
2562                ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2563                                     KEY_OFFSET(&b->key), 0));
2564
2565        return ret;
2566}
2567
2568int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2569                       struct bkey *from, btree_map_keys_fn *fn, int flags)
2570{
2571        return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags);
2572}
2573
2574/* Keybuf code */
2575
2576static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2577{
2578        /* Overlapping keys compare equal */
2579        if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2580                return -1;
2581        if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2582                return 1;
2583        return 0;
2584}
2585
2586static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2587                                            struct keybuf_key *r)
2588{
2589        return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2590}
2591
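    /*
     * Scratch state for one bch_refill_keybuf() pass: keys between
     * buf->last_scanned and 'end' that satisfy 'pred' are copied into the
     * keybuf until the range is exhausted or the freelist runs dry.
     */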
2592struct refill {
2593        struct btree_op op;
2594        unsigned int    nr_found;
2595        struct keybuf   *buf;
2596        struct bkey     *end;
2597        keybuf_pred_fn  *pred;
2598};
2599
2600static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2601                            struct bkey *k)
2602{
2603        struct refill *refill = container_of(op, struct refill, op);
2604        struct keybuf *buf = refill->buf;
2605        int ret = MAP_CONTINUE;
2606
2607        if (bkey_cmp(k, refill->end) > 0) {
2608                ret = MAP_DONE;
2609                goto out;
2610        }
2611
2612        if (!KEY_SIZE(k)) /* end key */
2613                goto out;
2614
2615        if (refill->pred(buf, k)) {
2616                struct keybuf_key *w;
2617
2618                spin_lock(&buf->lock);
2619
2620                w = array_alloc(&buf->freelist);
2621                if (!w) {
2622                        spin_unlock(&buf->lock);
2623                        return MAP_DONE;
2624                }
2625
2626                w->private = NULL;
2627                bkey_copy(&w->key, k);
2628
2629                if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2630                        array_free(&buf->freelist, w);
2631                else
2632                        refill->nr_found++;
2633
2634                if (array_freelist_empty(&buf->freelist))
2635                        ret = MAP_DONE;
2636
2637                spin_unlock(&buf->lock);
2638        }
2639out:
2640        buf->last_scanned = *k;
2641        return ret;
2642}
2643
2644void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2645                       struct bkey *end, keybuf_pred_fn *pred)
2646{
2647        struct bkey start = buf->last_scanned;
2648        struct refill refill;
2649
2650        cond_resched();
2651
2652        bch_btree_op_init(&refill.op, -1);
2653        refill.nr_found = 0;
2654        refill.buf      = buf;
2655        refill.end      = end;
2656        refill.pred     = pred;
2657
2658        bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2659                           refill_keybuf_fn, MAP_END_KEY);
2660
2661        trace_bcache_keyscan(refill.nr_found,
2662                             KEY_INODE(&start), KEY_OFFSET(&start),
2663                             KEY_INODE(&buf->last_scanned),
2664                             KEY_OFFSET(&buf->last_scanned));
2665
2666        spin_lock(&buf->lock);
2667
2668        if (!RB_EMPTY_ROOT(&buf->keys)) {
2669                struct keybuf_key *w;
2670
2671                w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2672                buf->start      = START_KEY(&w->key);
2673
2674                w = RB_LAST(&buf->keys, struct keybuf_key, node);
2675                buf->end        = w->key;
2676        } else {
2677                buf->start      = MAX_KEY;
2678                buf->end        = MAX_KEY;
2679        }
2680
2681        spin_unlock(&buf->lock);
2682}
2683
2684static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2685{
2686        rb_erase(&w->node, &buf->keys);
2687        array_free(&buf->freelist, w);
2688}
2689
2690void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2691{
2692        spin_lock(&buf->lock);
2693        __bch_keybuf_del(buf, w);
2694        spin_unlock(&buf->lock);
2695}
2696
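    /*
     * Returns true if any key overlapping [start, end) is currently being
     * worked on (w->private set); overlapping keys that are idle are
     * simply dropped from the buffer.
     */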
2697bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2698                                  struct bkey *end)
2699{
2700        bool ret = false;
2701        struct keybuf_key *p, *w, s;
2702
2703        s.key = *start;
2704
2705        if (bkey_cmp(end, &buf->start) <= 0 ||
2706            bkey_cmp(start, &buf->end) >= 0)
2707                return false;
2708
2709        spin_lock(&buf->lock);
2710        w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2711
2712        while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2713                p = w;
2714                w = RB_NEXT(w, node);
2715
2716                if (p->private)
2717                        ret = true;
2718                else
2719                        __bch_keybuf_del(buf, p);
2720        }
2721
2722        spin_unlock(&buf->lock);
2723        return ret;
2724}
2725
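    /*
     * Hand out the first key nobody is working on yet; setting w->private
     * marks it as busy so that it is neither returned again nor dropped by
     * bch_keybuf_check_overlapping().
     */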
2726struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2727{
2728        struct keybuf_key *w;
2729
2730        spin_lock(&buf->lock);
2731
2732        w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2733
2734        while (w && w->private)
2735                w = RB_NEXT(w, node);
2736
2737        if (w)
2738                w->private = ERR_PTR(-EINTR);
2739
2740        spin_unlock(&buf->lock);
2741        return w;
2742}
2743
2744struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2745                                          struct keybuf *buf,
2746                                          struct bkey *end,
2747                                          keybuf_pred_fn *pred)
2748{
2749        struct keybuf_key *ret;
2750
2751        while (1) {
2752                ret = bch_keybuf_next(buf);
2753                if (ret)
2754                        break;
2755
2756                if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2757                        pr_debug("scan finished\n");
2758                        break;
2759                }
2760
2761                bch_refill_keybuf(c, buf, end, pred);
2762        }
2763
2764        return ret;
2765}
2766
2767void bch_keybuf_init(struct keybuf *buf)
2768{
2769        buf->last_scanned       = MAX_KEY;
2770        buf->keys               = RB_ROOT;
2771
2772        spin_lock_init(&buf->lock);
2773        array_allocator_init(&buf->freelist);
2774}
2775