linux/drivers/md/bcache/extents.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst.
 */
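
/*
 * Illustrative example of the 8 bit generation counter described above
 * (values made up): bucket 100 has gen 3, so a key written into it carries
 * PTR_GEN() == 3.  When the bucket is invalidated and reused, its gen is
 * bumped to 4; ptr_stale() then sees the mismatch between the bucket's gen
 * and PTR_GEN(), so the old pointer is treated as garbage and eventually
 * dropped by garbage collection instead of being removed from the index at
 * free time.
 */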

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "writeback.h"

static void sort_key_next(struct btree_iter *iter,
                          struct btree_iter_set *i)
{
        i->k = bkey_next(i->k);

        if (i->k == i->end)
                *i = iter->data[--iter->used];
}

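/*
 * Heap comparison for whole btree pointer keys: returns true if l > r.
 * Identical keys from different sets fall back to comparing the keys'
 * addresses, the same tie-break bch_extent_sort_cmp() uses below, so
 * duplicates are visited in a consistent newest-to-oldest order.
 */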
static bool bch_key_sort_cmp(struct btree_iter_set l,
                             struct btree_iter_set r)
{
        int64_t c = bkey_cmp(l.k, r.k);

        return c ? c > 0 : l.k < r.k;
}

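/*
 * Basic pointer checks shared by the btree pointer and extent variants
 * below: each available pointer must stay within a single bucket
 * (KEY_SIZE() plus the offset into the bucket may not exceed the bucket
 * size) and the bucket number must lie in [first_bucket, nbuckets).
 */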
static bool __ptr_invalid(struct cache_set *c, const struct bkey *k)
{
        unsigned int i;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (ptr_available(c, k, i)) {
                        struct cache *ca = c->cache;
                        size_t bucket = PTR_BUCKET_NR(c, k, i);
                        size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

                        if (KEY_SIZE(k) + r > c->cache->sb.bucket_size ||
                            bucket <  ca->sb.first_bucket ||
                            bucket >= ca->sb.nbuckets)
                                return true;
                }

        return false;
}

/* Common among btree and extent ptrs */

static const char *bch_ptr_status(struct cache_set *c, const struct bkey *k)
{
        unsigned int i;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (ptr_available(c, k, i)) {
                        struct cache *ca = c->cache;
                        size_t bucket = PTR_BUCKET_NR(c, k, i);
                        size_t r = bucket_remainder(c, PTR_OFFSET(k, i));

                        if (KEY_SIZE(k) + r > c->cache->sb.bucket_size)
                                return "bad, length too big";
                        if (bucket <  ca->sb.first_bucket)
                                return "bad, short offset";
                        if (bucket >= ca->sb.nbuckets)
                                return "bad, offset past end of device";
                        if (ptr_stale(c, k, i))
                                return "stale";
                }

        if (!bkey_cmp(k, &ZERO_KEY))
                return "bad, null key";
        if (!KEY_PTRS(k))
                return "bad, no pointers";
        if (!KEY_SIZE(k))
                return "zeroed key";
        return "";
}

void bch_extent_to_text(char *buf, size_t size, const struct bkey *k)
{
        unsigned int i = 0;
        char *out = buf, *end = buf + size;

#define p(...)  (out += scnprintf(out, end - out, __VA_ARGS__))

        p("%llu:%llu len %llu -> [", KEY_INODE(k), KEY_START(k), KEY_SIZE(k));

        for (i = 0; i < KEY_PTRS(k); i++) {
                if (i)
                        p(", ");

                if (PTR_DEV(k, i) == PTR_CHECK_DEV)
                        p("check dev");
                else
                        p("%llu:%llu gen %llu", PTR_DEV(k, i),
                          PTR_OFFSET(k, i), PTR_GEN(k, i));
        }

        p("]");

        if (KEY_DIRTY(k))
                p(" dirty");
        if (KEY_CSUM(k))
                p(" cs%llu %llx", KEY_CSUM(k), k->ptr[1]);
#undef p
}
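
/*
 * Example of the text form produced by bch_extent_to_text(), with made-up
 * values: a key built as KEY(5, 2304, 256) with a single dirty pointer to
 * sector 131072 on cache device 0, gen 12, prints roughly as
 *
 *   "5:2048 len 256 -> [0:131072 gen 12] dirty"
 *
 * i.e. inode 5, a 256 sector extent starting at sector 2048 (KEY_START()),
 * backed by one cache pointer, holding data not yet written back.
 */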

static void bch_bkey_dump(struct btree_keys *keys, const struct bkey *k)
{
        struct btree *b = container_of(keys, struct btree, keys);
        unsigned int j;
        char buf[80];

        bch_extent_to_text(buf, sizeof(buf), k);
        pr_cont(" %s", buf);

        for (j = 0; j < KEY_PTRS(k); j++) {
                size_t n = PTR_BUCKET_NR(b->c, k, j);

                pr_cont(" bucket %zu", n);
                if (n >= b->c->cache->sb.first_bucket && n < b->c->cache->sb.nbuckets)
                        pr_cont(" prio %i",
                                PTR_BUCKET(b->c, k, j)->prio);
        }

        pr_cont(" %s\n", bch_ptr_status(b->c, k));
}

/* Btree ptrs */

bool __bch_btree_ptr_invalid(struct cache_set *c, const struct bkey *k)
{
        char buf[80];

        if (!KEY_PTRS(k) || !KEY_SIZE(k) || KEY_DIRTY(k))
                goto bad;

        if (__ptr_invalid(c, k))
                goto bad;

        return false;
bad:
        bch_extent_to_text(buf, sizeof(buf), k);
        cache_bug(c, "spotted btree ptr %s: %s", buf, bch_ptr_status(c, k));
        return true;
}

static bool bch_btree_ptr_invalid(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);

        return __bch_btree_ptr_invalid(b->c, k);
}

static bool btree_ptr_bad_expensive(struct btree *b, const struct bkey *k)
{
        unsigned int i;
        char buf[80];
        struct bucket *g;

        if (mutex_trylock(&b->c->bucket_lock)) {
                for (i = 0; i < KEY_PTRS(k); i++)
                        if (ptr_available(b->c, k, i)) {
                                g = PTR_BUCKET(b->c, k, i);

                                if (KEY_DIRTY(k) ||
                                    g->prio != BTREE_PRIO ||
                                    (b->c->gc_mark_valid &&
                                     GC_MARK(g) != GC_MARK_METADATA))
                                        goto err;
                        }

                mutex_unlock(&b->c->bucket_lock);
        }

        return false;
err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
"inconsistent btree pointer %s: bucket %zi pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, i), atomic_read(&g->pin),
                  g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
}

static bool bch_btree_ptr_bad(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned int i;

        if (!bkey_cmp(k, &ZERO_KEY) ||
            !KEY_PTRS(k) ||
            bch_ptr_invalid(bk, k))
                return true;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (!ptr_available(b->c, k, i) ||
                    ptr_stale(b->c, k, i))
                        return true;

        if (expensive_debug_checks(b->c) &&
            btree_ptr_bad_expensive(b, k))
                return true;

        return false;
}

static bool bch_btree_ptr_insert_fixup(struct btree_keys *bk,
                                       struct bkey *insert,
                                       struct btree_iter *iter,
                                       struct bkey *replace_key)
{
        struct btree *b = container_of(bk, struct btree, keys);

        if (!KEY_OFFSET(insert))
                btree_current_write(b)->prio_blocked++;

        return false;
}

const struct btree_keys_ops bch_btree_keys_ops = {
        .sort_cmp       = bch_key_sort_cmp,
        .insert_fixup   = bch_btree_ptr_insert_fixup,
        .key_invalid    = bch_btree_ptr_invalid,
        .key_bad        = bch_btree_ptr_bad,
        .key_to_text    = bch_extent_to_text,
        .key_dump       = bch_bkey_dump,
};

/* Extents */

/*
 * Returns true if l > r - unless l == r, in which case returns true if l is
 * older than r.
 *
 * Necessary for btree_sort_fixup() - if there are multiple keys that compare
 * equal in different sets, we have to process them newest to oldest.
 */
static bool bch_extent_sort_cmp(struct btree_iter_set l,
                                struct btree_iter_set r)
{
        int64_t c = bkey_cmp(&START_KEY(l.k), &START_KEY(r.k));

        return c ? c > 0 : l.k < r.k;
}

static struct bkey *bch_extent_sort_fixup(struct btree_iter *iter,
                                          struct bkey *tmp)
{
        while (iter->used > 1) {
                struct btree_iter_set *top = iter->data, *i = top + 1;

                if (iter->used > 2 &&
                    bch_extent_sort_cmp(i[0], i[1]))
                        i++;

                if (bkey_cmp(top->k, &START_KEY(i->k)) <= 0)
                        break;

                if (!KEY_SIZE(i->k)) {
                        sort_key_next(iter, i);
                        heap_sift(iter, i - top, bch_extent_sort_cmp);
                        continue;
                }

                if (top->k > i->k) {
                        if (bkey_cmp(top->k, i->k) >= 0)
                                sort_key_next(iter, i);
                        else
                                bch_cut_front(top->k, i->k);

                        heap_sift(iter, i - top, bch_extent_sort_cmp);
                } else {
                        /* can't happen because of comparison func */
                        BUG_ON(!bkey_cmp(&START_KEY(top->k), &START_KEY(i->k)));

                        if (bkey_cmp(i->k, top->k) < 0) {
                                bkey_copy(tmp, top->k);

                                bch_cut_back(&START_KEY(i->k), tmp);
                                bch_cut_front(i->k, top->k);
                                heap_sift(iter, 0, bch_extent_sort_cmp);

                                return tmp;
                        } else {
                                bch_cut_back(&START_KEY(i->k), top->k);
                        }
                }
        }

        return NULL;
}

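/*
 * Worked example (made-up values): an older set holds a key for sectors
 * 0..128 of some inode and a newer set holds an overwrite of sectors
 * 64..128.  While merging the sets, bch_extent_sort_fixup() trims the older
 * key back to 0..64, so the iterator emits 0..64 backed by the old data and
 * 64..128 backed by the new data - on overlap the newer key wins.  When the
 * newer key lands in the middle of an older one, the older key has to be
 * split in two, which is what the @tmp key passed in is for.
 */
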
static void bch_subtract_dirty(struct bkey *k,
                           struct cache_set *c,
                           uint64_t offset,
                           int sectors)
{
        if (KEY_DIRTY(k))
                bcache_dev_sectors_dirty_add(c, KEY_INODE(k),
                                             offset, -sectors);
}

static bool bch_extent_insert_fixup(struct btree_keys *b,
                                    struct bkey *insert,
                                    struct btree_iter *iter,
                                    struct bkey *replace_key)
{
        struct cache_set *c = container_of(b, struct btree, keys)->c;

        uint64_t old_offset;
        unsigned int old_size, sectors_found = 0;

        BUG_ON(!KEY_OFFSET(insert));
        BUG_ON(!KEY_SIZE(insert));

        while (1) {
                struct bkey *k = bch_btree_iter_next(iter);

                if (!k)
                        break;

                if (bkey_cmp(&START_KEY(k), insert) >= 0) {
                        if (KEY_SIZE(k))
                                break;
                        else
                                continue;
                }

                if (bkey_cmp(k, &START_KEY(insert)) <= 0)
                        continue;

                old_offset = KEY_START(k);
                old_size = KEY_SIZE(k);

                /*
                 * We might overlap with 0 size extents; we can't skip these
                 * because if they're in the set we're inserting to we have to
                 * adjust them so they don't overlap with the key we're
                 * inserting. But we don't want to check them for replace
                 * operations.
                 */

                if (replace_key && KEY_SIZE(k)) {
                        /*
                         * k might have been split since we inserted/found the
                         * key we're replacing
                         */
                        unsigned int i;
                        uint64_t offset = KEY_START(k) -
                                KEY_START(replace_key);

                        /* But it must be a subset of the replace key */
                        if (KEY_START(k) < KEY_START(replace_key) ||
                            KEY_OFFSET(k) > KEY_OFFSET(replace_key))
                                goto check_failed;

                        /* We didn't find a key that we were supposed to */
                        if (KEY_START(k) > KEY_START(insert) + sectors_found)
                                goto check_failed;

                        if (!bch_bkey_equal_header(k, replace_key))
                                goto check_failed;

                        /* skip past gen */
                        offset <<= 8;

                        BUG_ON(!KEY_PTRS(replace_key));

                        for (i = 0; i < KEY_PTRS(replace_key); i++)
                                if (k->ptr[i] != replace_key->ptr[i] + offset)
                                        goto check_failed;

                        sectors_found = KEY_OFFSET(k) - KEY_START(insert);
                }

                if (bkey_cmp(insert, k) < 0 &&
                    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
                        /*
                         * We overlapped in the middle of an existing key: that
                         * means we have to split the old key. But we have to do
                         * slightly different things depending on whether the
                         * old key has been written out yet.
                         */

                        struct bkey *top;

                        bch_subtract_dirty(k, c, KEY_START(insert),
                                       KEY_SIZE(insert));

                        if (bkey_written(b, k)) {
                                /*
                                 * We insert a new key to cover the top of the
                                 * old key, and the old key is modified in place
                                 * to represent the bottom split.
                                 *
                                 * It's completely arbitrary whether the new key
                                 * is the top or the bottom, but it has to match
                                 * up with what btree_sort_fixup() does - it
                                 * doesn't check for this kind of overlap, it
                                 * depends on us inserting a new key for the top
                                 * here.
                                 */
                                top = bch_bset_search(b, bset_tree_last(b),
                                                      insert);
                                bch_bset_insert(b, top, k);
                        } else {
                                BKEY_PADDED(key) temp;
                                bkey_copy(&temp.key, k);
                                bch_bset_insert(b, k, &temp.key);
                                top = bkey_next(k);
                        }

                        bch_cut_front(insert, top);
                        bch_cut_back(&START_KEY(insert), k);
                        bch_bset_fix_invalidated_key(b, k);
                        goto out;
                }

                if (bkey_cmp(insert, k) < 0) {
                        bch_cut_front(insert, k);
                } else {
                        if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
                                old_offset = KEY_START(insert);

                        if (bkey_written(b, k) &&
                            bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
                                /*
                                 * Completely overwrote, so we don't have to
                                 * invalidate the binary search tree
                                 */
                                bch_cut_front(k, k);
                        } else {
                                __bch_cut_back(&START_KEY(insert), k);
                                bch_bset_fix_invalidated_key(b, k);
                        }
                }

                bch_subtract_dirty(k, c, old_offset, old_size - KEY_SIZE(k));
        }

check_failed:
        if (replace_key) {
                if (!sectors_found) {
                        return true;
                } else if (sectors_found < KEY_SIZE(insert)) {
                        SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
                                       (KEY_SIZE(insert) - sectors_found));
                        SET_KEY_SIZE(insert, sectors_found);
                }
        }
out:
        if (KEY_DIRTY(insert))
                bcache_dev_sectors_dirty_add(c, KEY_INODE(insert),
                                             KEY_START(insert),
                                             KEY_SIZE(insert));

        return false;
}

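/*
 * Worked example for the middle-overlap split above (made-up values): an
 * existing key covers sectors 100..200 and the key being inserted covers
 * 130..170.  The old key is split into a bottom piece covering 100..130 and
 * a top piece covering 170..200 (which piece is newly inserted depends on
 * whether the old key had already been written out), leaving a hole for the
 * caller to insert the new 130..170 key into.  With a non-NULL @replace_key,
 * the insert only succeeds for the leading part of the range still backed by
 * @replace_key's pointers: if nothing matches, true is returned and nothing
 * is inserted; if only a prefix matches, @insert is shrunk to sectors_found
 * sectors and inserted as usual.
 */
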
bool __bch_extent_invalid(struct cache_set *c, const struct bkey *k)
{
        char buf[80];

        if (!KEY_SIZE(k))
                return true;

        if (KEY_SIZE(k) > KEY_OFFSET(k))
                goto bad;

        if (__ptr_invalid(c, k))
                goto bad;

        return false;
bad:
        bch_extent_to_text(buf, sizeof(buf), k);
        cache_bug(c, "spotted extent %s: %s", buf, bch_ptr_status(c, k));
        return true;
}

static bool bch_extent_invalid(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);

        return __bch_extent_invalid(b->c, k);
}

static bool bch_extent_bad_expensive(struct btree *b, const struct bkey *k,
                                     unsigned int ptr)
{
        struct bucket *g = PTR_BUCKET(b->c, k, ptr);
        char buf[80];

        if (mutex_trylock(&b->c->bucket_lock)) {
                if (b->c->gc_mark_valid &&
                    (!GC_MARK(g) ||
                     GC_MARK(g) == GC_MARK_METADATA ||
                     (GC_MARK(g) != GC_MARK_DIRTY && KEY_DIRTY(k))))
                        goto err;

                if (g->prio == BTREE_PRIO)
                        goto err;

                mutex_unlock(&b->c->bucket_lock);
        }

        return false;
err:
        mutex_unlock(&b->c->bucket_lock);
        bch_extent_to_text(buf, sizeof(buf), k);
        btree_bug(b,
"inconsistent extent pointer %s:\nbucket %zu pin %i prio %i gen %i last_gc %i mark %llu",
                  buf, PTR_BUCKET_NR(b->c, k, ptr), atomic_read(&g->pin),
                  g->prio, g->gen, g->last_gc, GC_MARK(g));
        return true;
}

static bool bch_extent_bad(struct btree_keys *bk, const struct bkey *k)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned int i, stale;
        char buf[80];

        if (!KEY_PTRS(k) ||
            bch_extent_invalid(bk, k))
                return true;

        for (i = 0; i < KEY_PTRS(k); i++)
                if (!ptr_available(b->c, k, i))
                        return true;

        for (i = 0; i < KEY_PTRS(k); i++) {
                stale = ptr_stale(b->c, k, i);

                if (stale && KEY_DIRTY(k)) {
                        bch_extent_to_text(buf, sizeof(buf), k);
                        pr_info("stale dirty pointer, stale %u, key: %s\n",
                                stale, buf);
                }

                btree_bug_on(stale > BUCKET_GC_GEN_MAX, b,
                             "key too stale: %i, need_gc %u",
                             stale, b->c->need_gc);

                if (stale)
                        return true;

                if (expensive_debug_checks(b->c) &&
                    bch_extent_bad_expensive(b, k, i))
                        return true;
        }

        return false;
}

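/*
 * When KEY_CSUM() is set, the checksum is stored in the u64 slot right
 * after the last pointer, k->ptr[KEY_PTRS(k)].  Merging two extents simply
 * adds their checksums, keeping the result within 63 bits.
 */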
static uint64_t merge_chksums(struct bkey *l, struct bkey *r)
{
        return (l->ptr[KEY_PTRS(l)] + r->ptr[KEY_PTRS(r)]) &
                ~((uint64_t)1 << 63);
}

static bool bch_extent_merge(struct btree_keys *bk,
                             struct bkey *l,
                             struct bkey *r)
{
        struct btree *b = container_of(bk, struct btree, keys);
        unsigned int i;

        if (key_merging_disabled(b->c))
                return false;

        for (i = 0; i < KEY_PTRS(l); i++)
                if (l->ptr[i] + MAKE_PTR(0, KEY_SIZE(l), 0) != r->ptr[i] ||
                    PTR_BUCKET_NR(b->c, l, i) != PTR_BUCKET_NR(b->c, r, i))
                        return false;

        /* Keys with no pointers aren't restricted to one bucket and could
         * overflow KEY_SIZE
         */
        if (KEY_SIZE(l) + KEY_SIZE(r) > USHRT_MAX) {
                SET_KEY_OFFSET(l, KEY_OFFSET(l) + USHRT_MAX - KEY_SIZE(l));
                SET_KEY_SIZE(l, USHRT_MAX);

                bch_cut_front(l, r);
                return false;
        }

        if (KEY_CSUM(l)) {
                if (KEY_CSUM(r))
                        l->ptr[KEY_PTRS(l)] = merge_chksums(l, r);
                else
                        SET_KEY_CSUM(l, 0);
        }

        SET_KEY_OFFSET(l, KEY_OFFSET(l) + KEY_SIZE(r));
        SET_KEY_SIZE(l, KEY_SIZE(l) + KEY_SIZE(r));

        return true;
}

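/*
 * Illustrative example for bch_extent_merge() (made-up values): l covers
 * sectors 0..64 and r covers 64..128 of the same inode (the caller has
 * already checked that l ends exactly where r starts), and each of r's
 * pointers is KEY_SIZE(l) sectors past the matching pointer in l, within
 * the same bucket.  The two keys are merged into one key covering 0..128.
 * If the combined size would overflow the 16 bit KEY_SIZE() field, l is
 * instead grown to USHRT_MAX sectors, r is cut down to the remainder, and
 * false is returned so both keys are kept.
 */
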
const struct btree_keys_ops bch_extent_keys_ops = {
        .sort_cmp       = bch_extent_sort_cmp,
        .sort_fixup     = bch_extent_sort_fixup,
        .insert_fixup   = bch_extent_insert_fixup,
        .key_invalid    = bch_extent_invalid,
        .key_bad        = bch_extent_bad,
        .key_merge      = bch_extent_merge,
        .key_to_text    = bch_extent_to_text,
        .key_dump       = bch_bkey_dump,
        .is_extents     = true,
};