linux/drivers/md/bcache/extents.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"

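/*
 * Advance iterator set @i to its next key; if the set is exhausted, replace
 * it with the last set in the iterator and shrink the heap.
 */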
static void sort_key_next(struct btree_iter *iter,
                          struct btree_iter_set *i)
{
        i->k = bkey_next(i->k);

        if (i->k == i->end)
                *i = iter->data[--iter->used];
}

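/*
 * Heap comparison for btree pointer keys: returns true if l > r. Ties are
 * broken by key address, so of two equal keys the copy from the newer bset
 * is returned by the iterator first.
 */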
static bool bch_key_sort_cmp(struct btree_iter_set l,
                             struct btree_iter_set r)
{
        int64_t c = bkey_cmp(l.k, r.k);

        return c ? c > 0 : l.k < r.k;
}

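/*
 * Return true if any available pointer in @k is out of range: the referenced
 * data spills past the end of its bucket, or the bucket number lies outside
 * the device.
 */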
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
        unsigned i;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (ptr_available(c, k, i)) {
                        struct cache *ca = PTR_CACHE(c, k, i);
                        size_t bucket = PTR_BUCKET_NR(c, k, i);
                        size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

                        if (KEY_SIZE(k) + r > c->sb.bucket_size ||
                            bucket <  ca->sb.first_bucket ||
                            bucket >= ca->sb.nbuckets)
                                return true;
                }

        return false;
}

/* Common among btree and extent ptrs */

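/*
 * Return a short human-readable description of what is wrong with @k, or an
 * empty string if the key looks fine.
 */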
static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
        unsigned i;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (ptr_available(c, k, i)) {
                        struct cache *ca = PTR_CACHE(c, k, i);
                        size_t bucket = PTR_BUCKET_NR(c, k, i);
                        size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

                        if (KEY_SIZE(k) + r > c->sb.bucket_size)
                                return "bad, length too big";
                        if (bucket <  ca->sb.first_bucket)
                                return "bad, short offset";
                        if (bucket >= ca->sb.nbuckets)
                                return "bad, offset past end of device";
                        if (ptr_stale(c, k, i))
                                return "stale";
                }

        if (!bkey_cmp(k, &ZERO_KEY))
                return "bad, null key";
        if (!KEY_PTRS(k))
                return "bad, no pointers";
        if (!KEY_SIZE(k))
                return "zeroed key";
        return "";
}

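/*
 * Format @k into @buf as "inode:start len size -> [dev:offset gen N, ...]",
 * appending the dirty and checksum flags when they are set.
 */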
void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
{
        unsigned i = 0;
        char *out = buf, *end = buf + size;

#define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))

        p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));

        for (i = 0; i < KEY_PTRS(k); i++) {
                if (i)
                        p(", ");

                if (PTR_DEV(k, i) == PTR_CHECK_DEV)
                        p("check dev");
                else
                        p("%llu:%llu gen %llu", PTR_DEV(k, i),
                          PTR_OFFSET(k, i), PTR_GEN(k, i));
        }

        p("]");

        if (KEY_DIRTY(k))
                p(" dirty");
        if (KEY_CSUM(k))
                p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
}

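/*
 * Debug dump: print @k, the bucket number behind each pointer, that bucket's
 * priority when the number is in range, and the key's status string.
 */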
static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
{
        struct btree *b = container_of(keys, struct btree, keys);
        unsigned j;
        char buf[80];

        bch_extent_to_text(buf, sizeof(buf), k);
        printk(" %s", buf);

        for (j = 0; j < KEY_PTRS(k); j++) {
                size_t n = PTR_BUCKET_NR(b->c, k, j);
                printk(" bucket %zu", n);

                if (n >= b->c->sb.first_bucket && n < b->c->sb.nbuckets)
                        printk(" prio %i",
                               PTR_BUCKET(b->c, k, j)->prio);
        }

        printk(" %s\n", bch_ptr_status(b->c, k));
}

/* Btree ptrs */

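/*
 * Return true (and complain via cache_bug()) if @k is not a valid btree node
 * pointer: it must have pointers and a size, must not be dirty, and its
 * pointers must be in range.
 */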
bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
        char buf[80];

        if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
                goto bad;

        if (__ptr_invalid(c, k))
                goto bad;

        return false;
bad:
        bch_extent_to_text(buf, sizeof(buf), k);
        cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
        return true;
}

static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        return __bch_btree_ptr_invalid(b->c, k);
}

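/*
 * Expensive debug check, done under bucket_lock when it can be taken: a btree
 * node key must not be dirty, and each bucket it points to must have
 * BTREE_PRIO and, when gc marks are valid, be marked as metadata.
 */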
static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
        unsigned i;
        char buf[80];
        struct bucket *g;

        if (mutex_trylock(&b->c->bucket_lock)) {
                for (i = 0; i < KEY_PTRS(k); i++)
                        if (ptr_available(b->c, k, i)) {
                                g = PTR_BUCKET(b->c, k, i);

                                if (KEY_DIRTY(k) ||
                                    g->prio != BTREE_PRIO ||
                                    (b->c->gc_mark_valid &&
                                     GC_MARK(g) != GC_MARK_METADATA))
                                        goto err;
                        }

                mutex_unlock(&b->c->bucket_lock);
        }

        return false;
err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
                  g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
}

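/*
 * Return true if @k should be skipped when walking the btree: a null key, no
 * pointers, an invalid key, an unavailable or stale pointer, or (with
 * expensive debug checks enabled) an inconsistent bucket.
 */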
static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned i;

        if (!bkey_cmp(k, &ZERO_KEY) ||
            !KEY_PTRS(k) ||
            bch_ptr_invalid(bk, k))
                return true;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (!ptr_available(b->c, k, i) ||
                    ptr_stale(b->c, k, i))
                        return true;

        if (expensive_debug_checks(b->c) &&
            btree_ptr_bad_expensive(b, k))
                return true;

        return false;
}

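/*
 * Insert fixup hook for btree node pointers: keys with a zero offset bump
 * prio_blocked on the node's current write; returning false means the insert
 * always proceeds.
 */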
static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
                                       struct bkey *insert,
                                       struct btree_iter *iter,
                                       struct bkey *replace_key)
{
        struct btree *b = container_of(bk, struct btree, keys);

        if (!KEY_OFFSET(insert))
                btree_current_write(b)->prio_blocked++;

        return false;
}

const struct btree_keys_ops bch_btree_keys_ops = {
        .sort_cmp       = bch_key_sort_cmp,
        .insert_fixup   = bch_btree_ptr_insert_fixup,
        .key_invalid    = bch_btree_ptr_invalid,
        .key_bad        = bch_btree_ptr_bad,
        .key_to_text    = bch_extent_to_text,
        .key_dump       = bch_bkey_dump,
};

/* Extents */

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
static bool bch_extent_sort_cmp(struct btree_iter_set l,
                                struct btree_iter_set r)
{
        int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));

        return c ? c > 0 : l.k < r.k;
}

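/*
 * Called while sorting to resolve overlaps between the next keys to be
 * emitted from different bsets: the copy from the newer bset wins, so the
 * older key is dropped or trimmed. If the newer key lands in the middle of
 * the older one, the older key is split and its front half is returned via
 * @tmp so it can be emitted first.
 */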
static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
                                          struct bkey *tmp)
{
        while (iter->used > 1) {
                struct btree_iter_set *top = iter->data, *i = top + 1;

                if (iter->used > 2 &&
                    bch_extent_sort_cmp(i[0], i[1]))
                        i++;

                if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
                        break;

                if (!KEY_SIZE(i->k)) {
                        sort_key_next(iter, i);
                        heap_sift(iter, i - top, bch_extent_sort_cmp);
                        continue;
                }

                if (top->k > i->k) {
                        if (bkey_cmp(top->k, i->k) >= 0)
                                sort_key_next(iter, i);
                        else
                                bch_cut_front(top->k, i->k);

                        heap_sift(iter, i - top, bch_extent_sort_cmp);
                } else {
                        /* can't happen because of comparison func */
                        BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));

                        if (bkey_cmp(i->k, top->k) < 0) {
                                bkey_copy(tmp, top->k);

                                bch_cut_back(&START_KEY(i->k), tmp);
                                bch_cut_front(i->k, top->k);
                                heap_sift(iter, 0, bch_extent_sort_cmp);

                                return tmp;
                        } else {
                                bch_cut_back(&START_KEY(i->k), top->k);
                        }
                }
        }

        return NULL;
}

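/*
 * If @k is dirty, subtract @sectors from the backing device's dirty sector
 * count for the range starting at @offset.
 */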
static void bch_subtract_dirty(struct bkey *k,
                               struct cache_set *c,
                               uint64_t offset,
                               int sectors)
{
        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
                                             offset, -sectors);
}

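/*
 * Handle overlaps between @insert and the existing extents returned by @iter:
 * overlapping keys are trimmed, split or emptied as needed and the dirty
 * sector counts are kept in sync. When @replace_key is given (cmpxchg-style
 * insert), the overlapped keys are checked against it; @insert is trimmed to
 * the sectors that matched, and true is returned (don't insert) if nothing
 * matched.
 */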
static bool bch_extent_insert_fixup(struct btree_keys *b,
                                    struct bkey *insert,
                                    struct btree_iter *iter,
                                    struct bkey *replace_key)
{
        struct cache_set *c = container_of(b, struct btree, keys)->c;

        uint64_t old_offset;
        unsigned old_size, sectors_found = 0;

        BUG_ON(!KEY_OFFSET(insert));
        BUG_ON(!KEY_SIZE(insert));

        while (1) {
                struct bkey *k = bch_btree_iter_next(iter);
                if (!k)
                        break;

                if (bkey_cmp(&START_KEY(k), insert) >= 0) {
                        if (KEY_SIZE(k))
                                break;
                        else
                                continue;
                }

                if (bkey_cmp(k, &START_KEY(insert)) <= 0)
                        continue;

                old_offset = KEY_START(k);
                old_size = KEY_SIZE(k);

                /*
                 * We might overlap with 0 size extents; we can't skip these
                 * because if they're in the set we're inserting to we have to
                 * adjust them so they don't overlap with the key we're
                 * inserting. But we don't want to check them for replace
                 * operations.
                 */

                if (replace_key && KEY_SIZE(k)) {
                        /*
                         * k might have been split since we inserted/found the
                         * key we're replacing
                         */
                        unsigned i;
                        uint64_t offset = KEY_START(k) -
                                KEY_START(replace_key);

                        /* But it must be a subset of the replace key */
                        if (KEY_START(k) < KEY_START(replace_key) ||
                            KEY_OFFSET(k) > KEY_OFFSET(replace_key))
                                goto check_failed;

                        /* We didn't find a key that we were supposed to */
                        if (KEY_START(k) > KEY_START(insert) + sectors_found)
                                goto check_failed;

                        if (!bch_bkey_equal_header(k, replace_key))
                                goto check_failed;

                        /* skip past gen */
                        offset <<= 8;

                        BUG_ON(!KEY_PTRS(replace_key));

                        for (i = 0; i < KEY_PTRS(replace_key); i++)
                                if (k->ptr[i] != replace_key->ptr[i] + offset)
                                        goto check_failed;

                        sectors_found = KEY_OFFSET(k) - KEY_START(insert);
                }

                if (bkey_cmp(insert, k) < 0 &&
                    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
                        /*
                         * We overlapped in the middle of an existing key: that
                         * means we have to split the old key. But we have to do
                         * slightly different things depending on whether the
                         * old key has been written out yet.
                         */

                        struct bkey *top;

                        bch_subtract_dirty(k, c, KEY_START(insert),
                                           KEY_SIZE(insert));

                        if (bkey_written(b, k)) {
                                /*
                                 * We insert a new key to cover the top of the
                                 * old key, and the old key is modified in place
                                 * to represent the bottom split.
                                 *
                                 * It's completely arbitrary whether the new key
                                 * is the top or the bottom, but it has to match
                                 * up with what btree_sort_fixup() does - it
                                 * doesn't check for this kind of overlap, it
                                 * depends on us inserting a new key for the top
                                 * here.
                                 */
                                top = bch_bset_search(b, bset_tree_last(b),
                                                      insert);
                                bch_bset_insert(b, top, k);
                        } else {
                                BKEY_PADDED(key) temp;
                                bkey_copy(&temp.key, k);
                                bch_bset_insert(b, k, &temp.key);
                                top = bkey_next(k);
                        }

                        bch_cut_front(insert, top);
                        bch_cut_back(&START_KEY(insert), k);
                        bch_bset_fix_invalidated_key(b, k);
                        goto out;
                }

                if (bkey_cmp(insert, k) < 0) {
                        bch_cut_front(insert, k);
                } else {
                        if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
                                old_offset = KEY_START(insert);

                        if (bkey_written(b, k) &&
                            bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
                                /*
                                 * Completely overwrote, so we don't have to
                                 * invalidate the binary search tree
                                 */
                                bch_cut_front(k, k);
                        } else {
                                __bch_cut_back(&START_KEY(insert), k);
                                bch_bset_fix_invalidated_key(b, k);
                        }
                }

                bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
        }

check_failed:
        if (replace_key) {
                if (!sectors_found) {
                        return true;
                } else if (sectors_found < KEY_SIZE(insert)) {
                        SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
                                       (KEY_SIZE(insert) - sectors_found));
                        SET_KEY_SIZE(insert, sectors_found);
                }
        }
out:
        if (KEY_DIRTY(insert))
                bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
                                             KEY_START(insert),
                                             KEY_SIZE(insert));

        return false;
}

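/*
 * Return true if @k is not a valid extent: zero-size keys are rejected
 * quietly, while a size larger than the offset or an out-of-range pointer
 * also triggers cache_bug().
 */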
bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
{
        char buf[80];

        if (!KEY_SIZE(k))
                return true;

        if (KEY_SIZE(k) > KEY_OFFSET(k))
                goto bad;

        if (__ptr_invalid(c, k))
                goto bad;

        return false;
bad:
        bch_extent_to_text(buf, sizeof(buf), k);
        cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
        return true;
}

static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        return __bch_extent_invalid(b->c, k);
}

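/*
 * Expensive debug check, done under bucket_lock when it can be taken: the
 * bucket behind pointer @ptr must have a gc mark consistent with the key's
 * dirty bit and must not look like a btree bucket; otherwise complain via
 * btree_bug().
 */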
static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
                                     unsigned ptr)
{
        struct bucket *g = PTR_BUCKET(b->c, k, ptr);
        char buf[80];

        if (mutex_trylock(&b->c->bucket_lock)) {
                if (b->c->gc_mark_valid &&
                    (!GC_MARK(g) ||
                     GC_MARK(g) == GC_MARK_METADATA ||
                     (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
                        goto err;

                if (g->prio == BTREE_PRIO)
                        goto err;

                mutex_unlock(&b->c->bucket_lock);
        }

        return false;
err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
                  g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
}

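/*
 * Return true if @k should be skipped when reading extents: no pointers, an
 * invalid key, or an unavailable or stale pointer. Dirty keys skip the
 * staleness checks unless expensive debug checks are enabled.
 */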
static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned i, stale;

        if (!KEY_PTRS(k) ||
            bch_extent_invalid(bk, k))
                return true;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (!ptr_available(b->c, k, i))
                        return true;

        if (!expensive_debug_checks(b->c) && KEY_DIRTY(k))
                return false;

        for (i = 0; i < KEY_PTRS(k); i++) {
                stale = ptr_stale(b->c, k, i);

                btree_bug_on(stale > 96, b,
                             "key too stale: %i, need_gc %u",
                             stale, b->c->need_gc);

                btree_bug_on(stale && KEY_DIRTY(k) && KEY_SIZE(k),
                             b, "stale dirty pointer");

                if (stale)
                        return true;

                if (expensive_debug_checks(b->c) &&
                    bch_extent_bad_expensive(b, k, i))
                        return true;
        }

        return false;
}

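/*
 * Combine the checksums of two extents being merged; the checksum lives in
 * the word after each key's pointers, and the top bit is masked off.
 */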
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
        return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
                ~((uint64_t)1 << 63);
}

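/*
 * Attempt to merge @r into @l: every pointer of @r must continue the
 * corresponding pointer of @l within the same bucket. Returns true if @l now
 * covers both keys; if the combined size would overflow KEY_SIZE, @l is grown
 * to the maximum, @r is trimmed accordingly, and false is returned so both
 * keys are kept.
 */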
static bool bch_extent_merge(struct btree_keys *bk, struct bkey *l, struct bkey *r)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned i;

        if (key_merging_disabled(b->c))
                return false;

        for (i = 0; i < KEY_PTRS(l); i++)
                if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
                    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
                        return false;

        /* Keys with no pointers aren't restricted to one bucket and could
         * overflow KEY_SIZE
         */
        if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
                SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
                SET_KEY_SIZE(l, USHRT_MAX);

                bch_cut_front(l, r);
                return false;
        }

        if (KEY_CSUM(l)) {
                if (KEY_CSUM(r))
                        l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
                else
                        SET_KEY_CSUM(l, 0);
        }

        SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
        SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

        return true;
}

const struct btree_keys_ops bch_extent_keys_ops = {
        .sort_cmp       = bch_extent_sort_cmp,
        .sort_fixup     = bch_extent_sort_fixup,
        .insert_fixup   = bch_extent_insert_fixup,
        .key_invalid    = bch_extent_invalid,
        .key_bad        = bch_extent_bad,
        .key_merge      = bch_extent_merge,
        .key_to_text    = bch_extent_to_text,
        .key_dump       = bch_bkey_dump,
        .is_extents     = true,
};