linux/drivers/md/bcache/bset.h
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _BCACHE_BSET_H
#define _BCACHE_BSET_H

#include <linux/bcache.h>
#include <linux/kernel.h>
#include <linux/types.h>

#include "util.h" /* for time_stats */

/*
 * BKEYS:
 *
 * A bkey contains a key, a size field, a variable number of pointers, and some
 * ancillary flag bits.
 *
 * We use two different functions for validating bkeys, bch_ptr_invalid() and
 * bch_ptr_bad().
 *
 * bch_ptr_invalid() primarily filters out keys and pointers that would be
 * invalid due to some sort of bug, whereas bch_ptr_bad() filters out keys and
 * pointers that occur in normal practice but don't point to real data.
 *
 * The one exception to the rule that ptr_invalid() filters out invalid keys is
 * that it also filters out keys of size 0 - these are keys that have been
 * completely overwritten. It'd be safe to delete these in memory while leaving
 * them on disk, just unnecessary work - so we filter them out when resorting
 * instead.
 *
 * We can't filter out stale keys when we're resorting, because garbage
 * collection needs to find them to ensure bucket gens don't wrap around -
 * unless we're rewriting the btree node those stale keys still exist on disk.
 *
 * We also implement functions here for removing some number of sectors from the
 * front or the back of a bkey - this is mainly used for fixing overlapping
 * extents, by removing the overlapping sectors from the older key.
 *
 * BSETS:
 *
 * A bset is an array of bkeys laid out contiguously in memory in sorted order,
 * along with a header. A btree node is made up of a number of these, written at
 * different times.
 *
 * There could be many of them on disk, but we never allow there to be more than
 * 4 in memory - we lazily resort as needed.
 *
 * We implement code here for creating and maintaining auxiliary search trees
 * (described below) for searching an individual bset, and on top of that we
 * implement a btree iterator.
 *
 * BTREE ITERATOR:
 *
 * Most of the code in bcache doesn't care about an individual bset - it needs
 * to search entire btree nodes and iterate over them in sorted order.
 *
 * The btree iterator code serves both functions; it iterates through the keys
 * in a btree node in sorted order, starting from either keys after a specific
 * point (if you pass it a search key) or the start of the btree node.
 *
 * AUXILIARY SEARCH TREES:
 *
 * Since keys are variable length, we can't use a binary search on a bset - we
 * wouldn't be able to find the start of the next key. But binary searches are
 * slow anyway, due to terrible cache behaviour; bcache originally used binary
 * searches and that code topped out at under 50k lookups/second.
 *
 * So we need to construct some sort of lookup table. Since we only insert keys
 * into the last (unwritten) set, most of the keys within a given btree node are
 * usually in sets that are mostly constant. We use two different types of
 * lookup tables to take advantage of this.
 *
 * Both lookup tables share in common that they don't index every key in the
 * set; they index one key every BSET_CACHELINE bytes, and then a linear search
 * is used for the rest.
 *
 * For sets that have been written to disk and are no longer being inserted
 * into, we construct a binary search tree in an array - traversing a binary
 * search tree in an array gives excellent locality of reference and is very
 * fast, since both children of any node are adjacent to each other in memory
 * (and their grandchildren, and great grandchildren...) - this means
 * prefetching can be used to great effect.
 *
 * It's quite useful performance-wise to keep these nodes small - not just
 * because they're more likely to be in L2, but also because we can prefetch
 * more nodes on a single cacheline and thus prefetch more iterations in advance
 * when traversing this tree.
 *
 * Nodes in the auxiliary search tree must contain both a key to compare against
 * (we don't want to fetch the key from the set, that would defeat the purpose),
 * and a pointer to the key. We use a few tricks to compress both of these.
 *
 * To compress the pointer, we take advantage of the fact that one node in the
 * search tree corresponds to precisely BSET_CACHELINE bytes in the set. We have
 * a function (to_inorder()) that takes the index of a node in a binary tree and
 * returns what its index would be in an inorder traversal, so we only have to
 * store the low bits of the offset.
 *
 * The key is 84 bits (KEY_INODE + key->key, the offset on the device). To
 * compress that, we take advantage of the fact that when we're traversing the
 * search tree at every iteration we know that both our search key and the key
 * we're looking for lie within some range - bounded by our previous
 * comparisons. (We special case the start of a search so that this is true even
 * at the root of the tree).
 *
 * So if we know the key we're looking for is between a and b, and a and b
 * don't differ above bit 50, we don't need to check anything higher than bit
 * 50.
 *
 * We don't usually need the rest of the bits, either; we only need enough bits
 * to partition the key range we're currently checking.  Consider key n - the
 * key our auxiliary search tree node corresponds to, and key p, the key
 * immediately preceding n.  The lowest bit we need to store in the auxiliary
 * search tree is the highest bit that differs between n and p.
 *
 * Note that this could be bit 0 - we might sometimes need all 84 bits to do the
 * comparison. But we'd really like our nodes in the auxiliary search tree to be
 * of fixed size.
 *
 * The solution is to make them fixed size, and when we're constructing a node
 * check if p and n differed in the bits we needed them to. If they didn't, we
 * flag that node, and when doing lookups we fall back to comparing against the
 * real key. As long as this doesn't happen too often (and it seems to reliably
 * happen a bit less than 1% of the time), we win - even on failures, that key
 * is then more likely to be in cache than if we were doing binary searches all
 * the way, since we're touching so much less memory.
 *
 * The keys in the auxiliary search tree are stored in (software) floating
 * point, with an exponent and a mantissa. The exponent needs to be big enough
 * to address all the bits in the original key, but the number of bits in the
 * mantissa is somewhat arbitrary; more bits just gets us fewer failures.
 *
 * We need 7 bits for the exponent and 3 bits for the key's offset (since keys
 * are 8 byte aligned); using 22 bits for the mantissa means a node is 4 bytes.
 * We need one node per 128 bytes in the btree node, which means the auxiliary
 * search trees take up 3% as much memory as the btree itself.
 *
 * Constructing these auxiliary search trees is moderately expensive, and we
 * don't want to be constantly rebuilding the search tree for the last set
 * whenever we insert another key into it. For the unwritten set, we use a much
 * simpler lookup table - it's just a flat array, so index i in the lookup table
 * corresponds to the ith range of BSET_CACHELINE bytes in the set. Indexing
 * within each byte range works the same as with the auxiliary search trees.
 *
 * These are much easier to keep up to date when we insert a key - we do it
 * somewhat lazily; when we shift a key up we usually just increment the pointer
 * to it, and only when it would overflow do we go to the trouble of finding the
 * first key in that range of bytes again.
 */

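/*
 * For reference, the 4-byte auxiliary search tree node described above is
 * packed in bset.c roughly as follows (7-bit exponent, 3-bit key offset,
 * 22-bit mantissa; the real definition and field names live in bset.c, not
 * in this header):
 *
 *	struct bkey_float {
 *		unsigned int	exp:7;
 *		unsigned int	m:3;
 *		unsigned int	mantissa:22;
 *	} __packed;
 */
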
struct btree_keys;
struct btree_iter;
struct btree_iter_set;
struct bkey_float;

#define MAX_BSETS               4U

struct bset_tree {
        /*
         * We construct a binary tree in an array as if the array
         * started at 1, so that things line up on the same cachelines
         * better: see comments in bset.c at cacheline_to_bkey() for
         * details
         */

        /* size of the binary tree and prev array */
        unsigned int            size;

        /* function of size - precalculated for to_inorder() */
        unsigned int            extra;

        /* copy of the last key in the set */
        struct bkey             end;
        struct bkey_float       *tree;

        /*
         * The nodes in the bset tree point to specific keys - this
         * array holds the sizes of the previous key.
         *
         * Conceptually it's a member of struct bkey_float, but we want
         * to keep bkey_float to 4 bytes and prev isn't used in the fast
         * path.
         */
        uint8_t                 *prev;

        /* The actual btree node, with pointers to each sorted set */
        struct bset             *data;
};

struct btree_keys_ops {
        bool            (*sort_cmp)(struct btree_iter_set l,
                                    struct btree_iter_set r);
        struct bkey     *(*sort_fixup)(struct btree_iter *iter,
                                       struct bkey *tmp);
        bool            (*insert_fixup)(struct btree_keys *b,
                                        struct bkey *insert,
                                        struct btree_iter *iter,
                                        struct bkey *replace_key);
        bool            (*key_invalid)(struct btree_keys *bk,
                                       const struct bkey *k);
        bool            (*key_bad)(struct btree_keys *bk,
                                   const struct bkey *k);
        bool            (*key_merge)(struct btree_keys *bk,
                                     struct bkey *l, struct bkey *r);
        void            (*key_to_text)(char *buf,
                                       size_t size,
                                       const struct bkey *k);
        void            (*key_dump)(struct btree_keys *keys,
                                    const struct bkey *k);

        /*
         * Only used for deciding whether to use START_KEY(k) or just the key
         * itself in a couple places
         */
        bool            is_extents;
};

struct btree_keys {
        const struct btree_keys_ops     *ops;
        uint8_t                 page_order;
        uint8_t                 nsets;
        unsigned int            last_set_unwritten:1;
        bool                    *expensive_debug_checks;

        /*
         * Sets of sorted keys - the real btree node - plus a binary search tree
         *
         * set[0] is special; set[0]->tree, set[0]->prev and set[0]->data point
         * to the memory we have allocated for this btree node. Additionally,
         * set[0]->data points to the entire btree node as it exists on disk.
         */
        struct bset_tree        set[MAX_BSETS];
};

static inline struct bset_tree *bset_tree_last(struct btree_keys *b)
{
        return b->set + b->nsets;
}

static inline bool bset_written(struct btree_keys *b, struct bset_tree *t)
{
        return t <= b->set + b->nsets - b->last_set_unwritten;
}

static inline bool bkey_written(struct btree_keys *b, struct bkey *k)
{
        return !b->last_set_unwritten || k < b->set[b->nsets].data->start;
}

static inline unsigned int bset_byte_offset(struct btree_keys *b,
                                            struct bset *i)
{
        return ((size_t) i) - ((size_t) b->set->data);
}

static inline unsigned int bset_sector_offset(struct btree_keys *b,
                                              struct bset *i)
{
        return bset_byte_offset(b, i) >> 9;
}

#define __set_bytes(i, k)       (sizeof(*(i)) + (k) * sizeof(uint64_t))
#define set_bytes(i)            __set_bytes(i, i->keys)

#define __set_blocks(i, k, block_bytes)                         \
        DIV_ROUND_UP(__set_bytes(i, k), block_bytes)
#define set_blocks(i, block_bytes)                              \
        __set_blocks(i, (i)->keys, block_bytes)

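/*
 * For example, with 512 byte blocks and a bset holding 100 u64s of key data,
 * __set_bytes() is sizeof(struct bset) + 800 bytes (832 assuming the usual
 * 32 byte bset header), so set_blocks() rounds that up to 2 blocks.
 */
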
static inline size_t bch_btree_keys_u64s_remaining(struct btree_keys *b)
{
        struct bset_tree *t = bset_tree_last(b);

        BUG_ON((PAGE_SIZE << b->page_order) <
               (bset_byte_offset(b, t->data) + set_bytes(t->data)));

        if (!b->last_set_unwritten)
                return 0;

        return ((PAGE_SIZE << b->page_order) -
                (bset_byte_offset(b, t->data) + set_bytes(t->data))) /
                sizeof(u64);
}

static inline struct bset *bset_next_set(struct btree_keys *b,
                                         unsigned int block_bytes)
{
        struct bset *i = bset_tree_last(b)->data;

        return ((void *) i) + roundup(set_bytes(i), block_bytes);
}

void bch_btree_keys_free(struct btree_keys *b);
int bch_btree_keys_alloc(struct btree_keys *b, unsigned int page_order,
                         gfp_t gfp);
void bch_btree_keys_init(struct btree_keys *b, const struct btree_keys_ops *ops,
                         bool *expensive_debug_checks);

void bch_bset_init_next(struct btree_keys *b, struct bset *i, uint64_t magic);
void bch_bset_build_written_tree(struct btree_keys *b);
void bch_bset_fix_invalidated_key(struct btree_keys *b, struct bkey *k);
bool bch_bkey_try_merge(struct btree_keys *b, struct bkey *l, struct bkey *r);
void bch_bset_insert(struct btree_keys *b, struct bkey *where,
                     struct bkey *insert);
unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
                                  struct bkey *replace_key);

enum {
        BTREE_INSERT_STATUS_NO_INSERT = 0,
        BTREE_INSERT_STATUS_INSERT,
        BTREE_INSERT_STATUS_BACK_MERGE,
        BTREE_INSERT_STATUS_OVERWROTE,
        BTREE_INSERT_STATUS_FRONT_MERGE,
};

/* Btree key iteration */

struct btree_iter {
        size_t size, used;
#ifdef CONFIG_BCACHE_DEBUG
        struct btree_keys *b;
#endif
        struct btree_iter_set {
                struct bkey *k, *end;
        } data[MAX_BSETS];
};

typedef bool (*ptr_filter_fn)(struct btree_keys *b, const struct bkey *k);

struct bkey *bch_btree_iter_next(struct btree_iter *iter);
struct bkey *bch_btree_iter_next_filter(struct btree_iter *iter,
                                        struct btree_keys *b,
                                        ptr_filter_fn fn);

void bch_btree_iter_push(struct btree_iter *iter, struct bkey *k,
                         struct bkey *end);
struct bkey *bch_btree_iter_init(struct btree_keys *b,
                                 struct btree_iter *iter,
                                 struct bkey *search);

struct bkey *__bch_bset_search(struct btree_keys *b, struct bset_tree *t,
                               const struct bkey *search);

/*
 * Returns the first key that is strictly greater than search
 */
static inline struct bkey *bch_bset_search(struct btree_keys *b,
                                           struct bset_tree *t,
                                           const struct bkey *search)
{
        return search ? __bch_bset_search(b, t, search) : t->data->start;
}

#define for_each_key_filter(b, k, iter, filter)                         \
        for (bch_btree_iter_init((b), (iter), NULL);                    \
             ((k) = bch_btree_iter_next_filter((iter), (b), filter));)

#define for_each_key(b, k, iter)                                        \
        for (bch_btree_iter_init((b), (iter), NULL);                    \
             ((k) = bch_btree_iter_next(iter));)

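/*
 * A typical walk over every key in a node that isn't bad might look like
 * this (handle_key() is just a placeholder for whatever the caller does
 * with each key; bk is an initialized struct btree_keys):
 *
 *	struct btree_iter iter;
 *	struct bkey *k;
 *
 *	for_each_key_filter(bk, k, &iter, bch_ptr_bad)
 *		handle_key(k);
 */
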
/* Sorting */

struct bset_sort_state {
        mempool_t               pool;

        unsigned int            page_order;
        unsigned int            crit_factor;

        struct time_stats       time;
};

void bch_bset_sort_state_free(struct bset_sort_state *state);
int bch_bset_sort_state_init(struct bset_sort_state *state,
                             unsigned int page_order);
void bch_btree_sort_lazy(struct btree_keys *b, struct bset_sort_state *state);
void bch_btree_sort_into(struct btree_keys *b, struct btree_keys *new,
                         struct bset_sort_state *state);
void bch_btree_sort_and_fix_extents(struct btree_keys *b,
                                    struct btree_iter *iter,
                                    struct bset_sort_state *state);
void bch_btree_sort_partial(struct btree_keys *b, unsigned int start,
                            struct bset_sort_state *state);

static inline void bch_btree_sort(struct btree_keys *b,
                                  struct bset_sort_state *state)
{
        bch_btree_sort_partial(b, 0, state);
}

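/*
 * A sort state is typically set up once and reused across many sorts, e.g.
 * (error handling omitted; bk is an initialized struct btree_keys):
 *
 *	struct bset_sort_state state;
 *
 *	bch_bset_sort_state_init(&state, page_order);
 *	...
 *	bch_btree_sort_lazy(bk, &state);
 *	...
 *	bch_bset_sort_state_free(&state);
 */
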
struct bset_stats {
        size_t sets_written, sets_unwritten;
        size_t bytes_written, bytes_unwritten;
        size_t floats, failed;
};

void bch_btree_keys_stats(struct btree_keys *b, struct bset_stats *state);

/* Bkey utility code */

#define bset_bkey_last(i)       bkey_idx((struct bkey *) (i)->d, \
                                         (unsigned int)(i)->keys)

static inline struct bkey *bset_bkey_idx(struct bset *i, unsigned int idx)
{
        return bkey_idx(i->start, idx);
}

static inline void bkey_init(struct bkey *k)
{
        *k = ZERO_KEY;
}

static __always_inline int64_t bkey_cmp(const struct bkey *l,
                                        const struct bkey *r)
{
        return unlikely(KEY_INODE(l) != KEY_INODE(r))
                ? (int64_t) KEY_INODE(l) - (int64_t) KEY_INODE(r)
                : (int64_t) KEY_OFFSET(l) - (int64_t) KEY_OFFSET(r);
}

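/*
 * Keys sort first by inode, then by offset: e.g. with l = KEY(1, 100, 0)
 * and r = KEY(1, 200, 0), bkey_cmp(&l, &r) is negative.
 */
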
void bch_bkey_copy_single_ptr(struct bkey *dest, const struct bkey *src,
                              unsigned int i);
bool __bch_cut_front(const struct bkey *where, struct bkey *k);
bool __bch_cut_back(const struct bkey *where, struct bkey *k);

static inline bool bch_cut_front(const struct bkey *where, struct bkey *k)
{
        BUG_ON(bkey_cmp(where, k) > 0);
        return __bch_cut_front(where, k);
}

static inline bool bch_cut_back(const struct bkey *where, struct bkey *k)
{
        BUG_ON(bkey_cmp(where, &START_KEY(k)) < 0);
        return __bch_cut_back(where, k);
}

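/*
 * These implement the overlapping-extent fixup described at the top of this
 * file: e.g. if an older extent k covers sectors 0..15 and a newer one covers
 * sectors 8..15, bch_cut_back() with a key ending at sector 8 trims k so that
 * it only covers sectors 0..7.
 */
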
/*
 * '*preceding_key_p' points to a memory object that will hold the key
 * preceding k. If no preceding key exists, '*preceding_key_p' is set to
 * NULL, so the caller of preceding_key() needs to take care of the memory
 * that '*preceding_key_p' pointed to before calling preceding_key().
 * Currently the only caller of preceding_key() is bch_btree_insert_key(),
 * which points it at an on-stack variable, so the memory release is handled
 * by the stack frame itself.
 */
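/*
 * For example, the key preceding (inode 1, offset 100) is (inode 1,
 * offset 99); the decrement borrows from the high word when the offset
 * is 0, and for the zero key there is no preceding key at all, so
 * '*preceding_key_p' is set to NULL.
 */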
static inline void preceding_key(struct bkey *k, struct bkey **preceding_key_p)
{
        if (KEY_INODE(k) || KEY_OFFSET(k)) {
                (**preceding_key_p) = KEY(KEY_INODE(k), KEY_OFFSET(k), 0);
                if (!(*preceding_key_p)->low)
                        (*preceding_key_p)->high--;
                (*preceding_key_p)->low--;
        } else {
                (*preceding_key_p) = NULL;
        }
}

static inline bool bch_ptr_invalid(struct btree_keys *b, const struct bkey *k)
{
        return b->ops->key_invalid(b, k);
}

static inline bool bch_ptr_bad(struct btree_keys *b, const struct bkey *k)
{
        return b->ops->key_bad(b, k);
}

static inline void bch_bkey_to_text(struct btree_keys *b, char *buf,
                                    size_t size, const struct bkey *k)
{
        return b->ops->key_to_text(buf, size, k);
}

static inline bool bch_bkey_equal_header(const struct bkey *l,
                                         const struct bkey *r)
{
        return (KEY_DIRTY(l) == KEY_DIRTY(r) &&
                KEY_PTRS(l) == KEY_PTRS(r) &&
                KEY_CSUM(l) == KEY_CSUM(r));
}

/* Keylists */

struct keylist {
        union {
                struct bkey             *keys;
                uint64_t                *keys_p;
        };
        union {
                struct bkey             *top;
                uint64_t                *top_p;
        };

        /* Enough room for btree_split's keys without realloc */
#define KEYLIST_INLINE          16
        uint64_t                inline_keys[KEYLIST_INLINE];
};

static inline void bch_keylist_init(struct keylist *l)
{
        l->top_p = l->keys_p = l->inline_keys;
}

static inline void bch_keylist_init_single(struct keylist *l, struct bkey *k)
{
        l->keys = k;
        l->top = bkey_next(k);
}

static inline void bch_keylist_push(struct keylist *l)
{
        l->top = bkey_next(l->top);
}

static inline void bch_keylist_add(struct keylist *l, struct bkey *k)
{
        bkey_copy(l->top, k);
        bch_keylist_push(l);
}

static inline bool bch_keylist_empty(struct keylist *l)
{
        return l->top == l->keys;
}

static inline void bch_keylist_reset(struct keylist *l)
{
        l->top = l->keys;
}

static inline void bch_keylist_free(struct keylist *l)
{
        if (l->keys_p != l->inline_keys)
                kfree(l->keys_p);
}

static inline size_t bch_keylist_nkeys(struct keylist *l)
{
        return l->top_p - l->keys_p;
}

static inline size_t bch_keylist_bytes(struct keylist *l)
{
        return bch_keylist_nkeys(l) * sizeof(uint64_t);
}

struct bkey *bch_keylist_pop(struct keylist *l);
void bch_keylist_pop_front(struct keylist *l);
int __bch_keylist_realloc(struct keylist *l, unsigned int u64s);

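/*
 * Typical keylist usage, building up a list of keys and then draining it
 * (error handling and the source of the key k are omitted):
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	if (__bch_keylist_realloc(&keys, bkey_u64s(k)))
 *		return -ENOMEM;
 *	bch_keylist_add(&keys, k);
 *	...
 *	while (!bch_keylist_empty(&keys))
 *		bch_keylist_pop_front(&keys);
 *	bch_keylist_free(&keys);
 */
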
/* Debug stuff */

#ifdef CONFIG_BCACHE_DEBUG

int __bch_count_data(struct btree_keys *b);
void __printf(2, 3) __bch_check_keys(struct btree_keys *b,
                                     const char *fmt,
                                     ...);
void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set);
void bch_dump_bucket(struct btree_keys *b);

#else

static inline int __bch_count_data(struct btree_keys *b) { return -1; }
static inline void __printf(2, 3)
        __bch_check_keys(struct btree_keys *b, const char *fmt, ...) {}
static inline void bch_dump_bucket(struct btree_keys *b) {}
void bch_dump_bset(struct btree_keys *b, struct bset *i, unsigned int set);

#endif

static inline bool btree_keys_expensive_checks(struct btree_keys *b)
{
#ifdef CONFIG_BCACHE_DEBUG
        return *b->expensive_debug_checks;
#else
        return false;
#endif
}

static inline int bch_count_data(struct btree_keys *b)
{
        return btree_keys_expensive_checks(b) ? __bch_count_data(b) : -1;
}

#define bch_check_keys(b, ...)                                          \
do {                                                                    \
        if (btree_keys_expensive_checks(b))                             \
                __bch_check_keys(b, __VA_ARGS__);                       \
} while (0)

#endif