linux/drivers/md/bcache/super.c
   1/*
   2 * bcache setup/teardown code, and some metadata io - read a superblock and
   3 * figure out what to do with it.
   4 *
   5 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
   6 * Copyright 2012 Google, Inc.
   7 */
   8
   9#include "bcache.h"
  10#include "btree.h"
  11#include "debug.h"
  12#include "extents.h"
  13#include "request.h"
  14#include "writeback.h"
  15
  16#include <linux/blkdev.h>
  17#include <linux/buffer_head.h>
  18#include <linux/debugfs.h>
  19#include <linux/genhd.h>
  20#include <linux/idr.h>
  21#include <linux/kthread.h>
  22#include <linux/module.h>
  23#include <linux/random.h>
  24#include <linux/reboot.h>
  25#include <linux/sysfs.h>
  26
  27MODULE_LICENSE("GPL");
  28MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
  29
  30static const char bcache_magic[] = {
  31        0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
  32        0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
  33};
  34
  35static const char invalid_uuid[] = {
  36        0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
  37        0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
  38};
  39
  40/* Default is -1; we skip past it for struct cached_dev's cache mode */
  41const char * const bch_cache_modes[] = {
  42        "default",
  43        "writethrough",
  44        "writeback",
  45        "writearound",
  46        "none",
  47        NULL
  48};
  49
  50static struct kobject *bcache_kobj;
  51struct mutex bch_register_lock;
  52LIST_HEAD(bch_cache_sets);
  53static LIST_HEAD(uncached_devices);
  54
  55static int bcache_major;
  56static DEFINE_IDA(bcache_minor);
  57static wait_queue_head_t unregister_wait;
  58struct workqueue_struct *bcache_wq;
  59
  60#define BTREE_MAX_PAGES         (256 * 1024 / PAGE_SIZE)
  61
  62/* Superblock */
  63
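/*
 * read_super() below pulls the superblock out of the buffer cache, converts
 * every field from little-endian, and validates it: offset and magic, journal
 * bucket count, checksum, UUID, block size, and the version-specific backing
 * device or cache device geometry. On success it returns NULL and hands back
 * a reference to the page holding the raw superblock via *res; on failure it
 * returns a human-readable error string. A minimal caller sketch
 * (hypothetical; the real call site is the register path in this file):
 *
 *	struct cache_sb sb;
 *	struct page *sb_page = NULL;
 *	const char *err = read_super(&sb, bdev, &sb_page);
 *
 *	if (err)
 *		pr_err("error reading superblock: %s", err);
 */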
  64static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
  65                              struct page **res)
  66{
  67        const char *err;
  68        struct cache_sb *s;
  69        struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
  70        unsigned i;
  71
  72        if (!bh)
  73                return "IO error";
  74
  75        s = (struct cache_sb *) bh->b_data;
  76
  77        sb->offset              = le64_to_cpu(s->offset);
  78        sb->version             = le64_to_cpu(s->version);
  79
  80        memcpy(sb->magic,       s->magic, 16);
  81        memcpy(sb->uuid,        s->uuid, 16);
  82        memcpy(sb->set_uuid,    s->set_uuid, 16);
  83        memcpy(sb->label,       s->label, SB_LABEL_SIZE);
  84
  85        sb->flags               = le64_to_cpu(s->flags);
  86        sb->seq                 = le64_to_cpu(s->seq);
  87        sb->last_mount          = le32_to_cpu(s->last_mount);
  88        sb->first_bucket        = le16_to_cpu(s->first_bucket);
  89        sb->keys                = le16_to_cpu(s->keys);
  90
  91        for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
  92                sb->d[i] = le64_to_cpu(s->d[i]);
  93
  94        pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
  95                 sb->version, sb->flags, sb->seq, sb->keys);
  96
  97        err = "Not a bcache superblock";
  98        if (sb->offset != SB_SECTOR)
  99                goto err;
 100
 101        if (memcmp(sb->magic, bcache_magic, 16))
 102                goto err;
 103
 104        err = "Too many journal buckets";
 105        if (sb->keys > SB_JOURNAL_BUCKETS)
 106                goto err;
 107
 108        err = "Bad checksum";
 109        if (s->csum != csum_set(s))
 110                goto err;
 111
 112        err = "Bad UUID";
 113        if (bch_is_zero(sb->uuid, 16))
 114                goto err;
 115
 116        sb->block_size  = le16_to_cpu(s->block_size);
 117
 118        err = "Superblock block size smaller than device block size";
 119        if (sb->block_size << 9 < bdev_logical_block_size(bdev))
 120                goto err;
 121
 122        switch (sb->version) {
 123        case BCACHE_SB_VERSION_BDEV:
 124                sb->data_offset = BDEV_DATA_START_DEFAULT;
 125                break;
 126        case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
 127                sb->data_offset = le64_to_cpu(s->data_offset);
 128
 129                err = "Bad data offset";
 130                if (sb->data_offset < BDEV_DATA_START_DEFAULT)
 131                        goto err;
 132
 133                break;
 134        case BCACHE_SB_VERSION_CDEV:
 135        case BCACHE_SB_VERSION_CDEV_WITH_UUID:
 136                sb->nbuckets    = le64_to_cpu(s->nbuckets);
 137                sb->bucket_size = le16_to_cpu(s->bucket_size);
 138
 139                sb->nr_in_set   = le16_to_cpu(s->nr_in_set);
 140                sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
 141
 142                err = "Too many buckets";
 143                if (sb->nbuckets > LONG_MAX)
 144                        goto err;
 145
 146                err = "Not enough buckets";
 147                if (sb->nbuckets < 1 << 7)
 148                        goto err;
 149
 150                err = "Bad block/bucket size";
 151                if (!is_power_of_2(sb->block_size) ||
 152                    sb->block_size > PAGE_SECTORS ||
 153                    !is_power_of_2(sb->bucket_size) ||
 154                    sb->bucket_size < PAGE_SECTORS)
 155                        goto err;
 156
 157                err = "Invalid superblock: device too small";
 158                if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
 159                        goto err;
 160
 161                err = "Bad UUID";
 162                if (bch_is_zero(sb->set_uuid, 16))
 163                        goto err;
 164
 165                err = "Bad cache device number in set";
 166                if (!sb->nr_in_set ||
 167                    sb->nr_in_set <= sb->nr_this_dev ||
 168                    sb->nr_in_set > MAX_CACHES_PER_SET)
 169                        goto err;
 170
 171                err = "Journal buckets not sequential";
 172                for (i = 0; i < sb->keys; i++)
 173                        if (sb->d[i] != sb->first_bucket + i)
 174                                goto err;
 175
 176                err = "Too many journal buckets";
 177                if (sb->first_bucket + sb->keys > sb->nbuckets)
 178                        goto err;
 179
 180                err = "Invalid superblock: first bucket comes before end of super";
 181                if (sb->first_bucket * sb->bucket_size < 16)
 182                        goto err;
 183
 184                break;
 185        default:
 186                err = "Unsupported superblock version";
 187                goto err;
 188        }
 189
 190        sb->last_mount = get_seconds();
 191        err = NULL;
 192
 193        get_page(bh->b_page);
 194        *res = bh->b_page;
 195err:
 196        put_bh(bh);
 197        return err;
 198}
 199
 200static void write_bdev_super_endio(struct bio *bio)
 201{
 202        struct cached_dev *dc = bio->bi_private;
 203        /* XXX: error checking */
 204
 205        closure_put(&dc->sb_write);
 206}
 207
 208static void __write_super(struct cache_sb *sb, struct bio *bio)
 209{
 210        struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
 211        unsigned i;
 212
 213        bio->bi_iter.bi_sector  = SB_SECTOR;
 214        bio->bi_iter.bi_size    = SB_SIZE;
 215        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
 216        bch_bio_map(bio, NULL);
 217
 218        out->offset             = cpu_to_le64(sb->offset);
 219        out->version            = cpu_to_le64(sb->version);
 220
 221        memcpy(out->uuid,       sb->uuid, 16);
 222        memcpy(out->set_uuid,   sb->set_uuid, 16);
 223        memcpy(out->label,      sb->label, SB_LABEL_SIZE);
 224
 225        out->flags              = cpu_to_le64(sb->flags);
 226        out->seq                = cpu_to_le64(sb->seq);
 227
 228        out->last_mount         = cpu_to_le32(sb->last_mount);
 229        out->first_bucket       = cpu_to_le16(sb->first_bucket);
 230        out->keys               = cpu_to_le16(sb->keys);
 231
 232        for (i = 0; i < sb->keys; i++)
 233                out->d[i] = cpu_to_le64(sb->d[i]);
 234
 235        out->csum = csum_set(out);
 236
 237        pr_debug("ver %llu, flags %llu, seq %llu",
 238                 sb->version, sb->flags, sb->seq);
 239
 240        submit_bio(bio);
 241}
 242
 243static void bch_write_bdev_super_unlock(struct closure *cl)
 244{
 245        struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
 246
 247        up(&dc->sb_write_mutex);
 248}
 249
 250void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
 251{
 252        struct closure *cl = &dc->sb_write;
 253        struct bio *bio = &dc->sb_bio;
 254
 255        down(&dc->sb_write_mutex);
 256        closure_init(cl, parent);
 257
 258        bio_reset(bio);
 259        bio->bi_bdev    = dc->bdev;
 260        bio->bi_end_io  = write_bdev_super_endio;
 261        bio->bi_private = dc;
 262
 263        closure_get(cl);
 264        __write_super(&dc->sb, bio);
 265
 266        closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
 267}
 268
 269static void write_super_endio(struct bio *bio)
 270{
 271        struct cache *ca = bio->bi_private;
 272
 273        bch_count_io_errors(ca, bio->bi_error, "writing superblock");
 274        closure_put(&ca->set->sb_write);
 275}
 276
 277static void bcache_write_super_unlock(struct closure *cl)
 278{
 279        struct cache_set *c = container_of(cl, struct cache_set, sb_write);
 280
 281        up(&c->sb_write_mutex);
 282}
 283
 284void bcache_write_super(struct cache_set *c)
 285{
 286        struct closure *cl = &c->sb_write;
 287        struct cache *ca;
 288        unsigned i;
 289
 290        down(&c->sb_write_mutex);
 291        closure_init(cl, &c->cl);
 292
 293        c->sb.seq++;
 294
 295        for_each_cache(ca, c, i) {
 296                struct bio *bio = &ca->sb_bio;
 297
 298                ca->sb.version          = BCACHE_SB_VERSION_CDEV_WITH_UUID;
 299                ca->sb.seq              = c->sb.seq;
 300                ca->sb.last_mount       = c->sb.last_mount;
 301
 302                SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
 303
 304                bio_reset(bio);
 305                bio->bi_bdev    = ca->bdev;
 306                bio->bi_end_io  = write_super_endio;
 307                bio->bi_private = ca;
 308
 309                closure_get(cl);
 310                __write_super(&ca->sb, bio);
 311        }
 312
 313        closure_return_with_destructor(cl, bcache_write_super_unlock);
 314}
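/*
 * Both superblock writers above follow the same pattern: take the sb_write
 * semaphore, initialize the sb_write closure against a parent, grab one
 * closure ref per bio submitted, and let the endio handler drop each ref.
 * closure_return_with_destructor() then releases the semaphore once every
 * bio has completed. A rough sketch of the pattern (illustrative only):
 *
 *	down(&sb_write_mutex);
 *	closure_init(cl, parent);
 *	for each cache device {
 *		closure_get(cl);
 *		__write_super(sb, bio);		// endio does closure_put(cl)
 *	}
 *	closure_return_with_destructor(cl, unlock_fn);	// up(&sb_write_mutex)
 */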
 315
 316/* UUID io */
 317
 318static void uuid_endio(struct bio *bio)
 319{
 320        struct closure *cl = bio->bi_private;
 321        struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 322
 323        cache_set_err_on(bio->bi_error, c, "accessing uuids");
 324        bch_bbio_free(bio, c);
 325        closure_put(cl);
 326}
 327
 328static void uuid_io_unlock(struct closure *cl)
 329{
 330        struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
 331
 332        up(&c->uuid_write_mutex);
 333}
 334
 335static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
 336                    struct bkey *k, struct closure *parent)
 337{
 338        struct closure *cl = &c->uuid_write;
 339        struct uuid_entry *u;
 340        unsigned i;
 341        char buf[80];
 342
 343        BUG_ON(!parent);
 344        down(&c->uuid_write_mutex);
 345        closure_init(cl, parent);
 346
 347        for (i = 0; i < KEY_PTRS(k); i++) {
 348                struct bio *bio = bch_bbio_alloc(c);
 349
 350                bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
 351                bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
 352
 353                bio->bi_end_io  = uuid_endio;
 354                bio->bi_private = cl;
 355                bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
 356                bch_bio_map(bio, c->uuids);
 357
 358                bch_submit_bbio(bio, c, k, i);
 359
 360                if (op != REQ_OP_WRITE)
 361                        break;
 362        }
 363
 364        bch_extent_to_text(buf, sizeof(buf), k);
 365        pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
 366
 367        for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
 368                if (!bch_is_zero(u->uuid, 16))
 369                        pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
 370                                 u - c->uuids, u->uuid, u->label,
 371                                 u->first_reg, u->last_reg, u->invalidated);
 372
 373        closure_return_with_destructor(cl, uuid_io_unlock);
 374}
 375
 376static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
 377{
 378        struct bkey *k = &j->uuid_bucket;
 379
 380        if (__bch_btree_ptr_invalid(c, k))
 381                return "bad uuid pointer";
 382
 383        bkey_copy(&c->uuid_bucket, k);
 384        uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl);
 385
 386        if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
 387                struct uuid_entry_v0    *u0 = (void *) c->uuids;
 388                struct uuid_entry       *u1 = (void *) c->uuids;
 389                int i;
 390
 391                closure_sync(cl);
 392
 393                /*
 394                 * Since the new uuid entry is bigger than the old, we have to
 395                 * convert starting at the highest memory address and work down
 396                 * in order to do it in place
 397                 */
 398
 399                for (i = c->nr_uuids - 1;
 400                     i >= 0;
 401                     --i) {
 402                        memcpy(u1[i].uuid,      u0[i].uuid, 16);
 403                        memcpy(u1[i].label,     u0[i].label, 32);
 404
 405                        u1[i].first_reg         = u0[i].first_reg;
 406                        u1[i].last_reg          = u0[i].last_reg;
 407                        u1[i].invalidated       = u0[i].invalidated;
 408
 409                        u1[i].flags     = 0;
 410                        u1[i].sectors   = 0;
 411                }
 412        }
 413
 414        return NULL;
 415}
 416
 417static int __uuid_write(struct cache_set *c)
 418{
 419        BKEY_PADDED(key) k;
 420        struct closure cl;
 421        closure_init_stack(&cl);
 422
 423        lockdep_assert_held(&bch_register_lock);
 424
 425        if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
 426                return 1;
 427
 428        SET_KEY_SIZE(&k.key, c->sb.bucket_size);
 429        uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
 430        closure_sync(&cl);
 431
 432        bkey_copy(&c->uuid_bucket, &k.key);
 433        bkey_put(c, &k.key);
 434        return 0;
 435}
 436
 437int bch_uuid_write(struct cache_set *c)
 438{
 439        int ret = __uuid_write(c);
 440
 441        if (!ret)
 442                bch_journal_meta(c, NULL);
 443
 444        return ret;
 445}
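/*
 * The uuid_entry array in c->uuids is the cache set's table of member
 * devices; it lives in a single bucket pointed to by c->uuid_bucket.
 * __uuid_write() allocates a fresh bucket and writes the whole array there,
 * and bch_uuid_write() then journals the change. A typical (hypothetical)
 * update sequence, mirroring the attach path further down:
 *
 *	struct uuid_entry *u = uuid_find_empty(c);
 *
 *	if (u) {
 *		memcpy(u->uuid, dc->sb.uuid, 16);
 *		bch_uuid_write(c);	// rewrites the array, journals it
 *	}
 */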
 446
 447static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
 448{
 449        struct uuid_entry *u;
 450
 451        for (u = c->uuids;
 452             u < c->uuids + c->nr_uuids; u++)
 453                if (!memcmp(u->uuid, uuid, 16))
 454                        return u;
 455
 456        return NULL;
 457}
 458
 459static struct uuid_entry *uuid_find_empty(struct cache_set *c)
 460{
 461        static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
 462        return uuid_find(c, zero_uuid);
 463}
 464
 465/*
 466 * Bucket priorities/gens:
 467 *
 468 * For each bucket, we store on disk its
  469 *    8 bit gen
  470 *   16 bit priority
 471 *
 472 * See alloc.c for an explanation of the gen. The priority is used to implement
 473 * lru (and in the future other) cache replacement policies; for most purposes
 474 * it's just an opaque integer.
 475 *
 476 * The gens and the priorities don't have a whole lot to do with each other, and
 477 * it's actually the gens that must be written out at specific times - it's no
 478 * big deal if the priorities don't get written, if we lose them we just reuse
 479 * buckets in suboptimal order.
 480 *
  481 * On disk they're stored in a packed array, in as many buckets as are required
 482 * to fit them all. The buckets we use to store them form a list; the journal
 483 * header points to the first bucket, the first bucket points to the second
 484 * bucket, et cetera.
 485 *
 486 * This code is used by the allocation code; periodically (whenever it runs out
 487 * of buckets to allocate from) the allocation code will invalidate some
 488 * buckets, but it can't use those buckets until their new gens are safely on
 489 * disk.
 490 */
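/*
 * Pieced together from bch_prio_write()/prio_read() below, the on-disk layout
 * of a priority bucket is roughly (see bcache.h for the real definitions):
 *
 *	struct prio_set {
 *		csum;		// crc64 from &magic to the end of the bucket
 *		magic;		// pset_magic(sb)
 *		seq;
 *		next_bucket;	// forms the linked list described above
 *		struct bucket_disk {
 *			__le16	prio;
 *			__u8	gen;
 *		} data[];	// prios_per_bucket() entries
 *	};
 */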
 491
 492static void prio_endio(struct bio *bio)
 493{
 494        struct cache *ca = bio->bi_private;
 495
 496        cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
 497        bch_bbio_free(bio, ca->set);
 498        closure_put(&ca->prio);
 499}
 500
 501static void prio_io(struct cache *ca, uint64_t bucket, int op,
 502                    unsigned long op_flags)
 503{
 504        struct closure *cl = &ca->prio;
 505        struct bio *bio = bch_bbio_alloc(ca->set);
 506
 507        closure_init_stack(cl);
 508
 509        bio->bi_iter.bi_sector  = bucket * ca->sb.bucket_size;
 510        bio->bi_bdev            = ca->bdev;
 511        bio->bi_iter.bi_size    = bucket_bytes(ca);
 512
 513        bio->bi_end_io  = prio_endio;
 514        bio->bi_private = ca;
 515        bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
 516        bch_bio_map(bio, ca->disk_buckets);
 517
 518        closure_bio_submit(bio, &ca->prio);
 519        closure_sync(cl);
 520}
 521
 522void bch_prio_write(struct cache *ca)
 523{
 524        int i;
 525        struct bucket *b;
 526        struct closure cl;
 527
 528        closure_init_stack(&cl);
 529
 530        lockdep_assert_held(&ca->set->bucket_lock);
 531
 532        ca->disk_buckets->seq++;
 533
 534        atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
 535                        &ca->meta_sectors_written);
 536
 537        //pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
 538        //       fifo_used(&ca->free_inc), fifo_used(&ca->unused));
 539
 540        for (i = prio_buckets(ca) - 1; i >= 0; --i) {
 541                long bucket;
 542                struct prio_set *p = ca->disk_buckets;
 543                struct bucket_disk *d = p->data;
 544                struct bucket_disk *end = d + prios_per_bucket(ca);
 545
 546                for (b = ca->buckets + i * prios_per_bucket(ca);
 547                     b < ca->buckets + ca->sb.nbuckets && d < end;
 548                     b++, d++) {
 549                        d->prio = cpu_to_le16(b->prio);
 550                        d->gen = b->gen;
 551                }
 552
 553                p->next_bucket  = ca->prio_buckets[i + 1];
 554                p->magic        = pset_magic(&ca->sb);
 555                p->csum         = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
 556
 557                bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
 558                BUG_ON(bucket == -1);
 559
 560                mutex_unlock(&ca->set->bucket_lock);
 561                prio_io(ca, bucket, REQ_OP_WRITE, 0);
 562                mutex_lock(&ca->set->bucket_lock);
 563
 564                ca->prio_buckets[i] = bucket;
 565                atomic_dec_bug(&ca->buckets[bucket].pin);
 566        }
 567
 568        mutex_unlock(&ca->set->bucket_lock);
 569
 570        bch_journal_meta(ca->set, &cl);
 571        closure_sync(&cl);
 572
 573        mutex_lock(&ca->set->bucket_lock);
 574
 575        /*
 576         * Don't want the old priorities to get garbage collected until after we
 577         * finish writing the new ones, and they're journalled
 578         */
 579        for (i = 0; i < prio_buckets(ca); i++) {
 580                if (ca->prio_last_buckets[i])
 581                        __bch_bucket_free(ca,
 582                                &ca->buckets[ca->prio_last_buckets[i]]);
 583
 584                ca->prio_last_buckets[i] = ca->prio_buckets[i];
 585        }
 586}
 587
 588static void prio_read(struct cache *ca, uint64_t bucket)
 589{
 590        struct prio_set *p = ca->disk_buckets;
 591        struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
 592        struct bucket *b;
 593        unsigned bucket_nr = 0;
 594
 595        for (b = ca->buckets;
 596             b < ca->buckets + ca->sb.nbuckets;
 597             b++, d++) {
 598                if (d == end) {
 599                        ca->prio_buckets[bucket_nr] = bucket;
 600                        ca->prio_last_buckets[bucket_nr] = bucket;
 601                        bucket_nr++;
 602
 603                        prio_io(ca, bucket, REQ_OP_READ, READ_SYNC);
 604
 605                        if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
 606                                pr_warn("bad csum reading priorities");
 607
 608                        if (p->magic != pset_magic(&ca->sb))
 609                                pr_warn("bad magic reading priorities");
 610
 611                        bucket = p->next_bucket;
 612                        d = p->data;
 613                }
 614
 615                b->prio = le16_to_cpu(d->prio);
 616                b->gen = b->last_gc = d->gen;
 617        }
 618}
 619
 620/* Bcache device */
 621
 622static int open_dev(struct block_device *b, fmode_t mode)
 623{
 624        struct bcache_device *d = b->bd_disk->private_data;
 625        if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
 626                return -ENXIO;
 627
 628        closure_get(&d->cl);
 629        return 0;
 630}
 631
 632static void release_dev(struct gendisk *b, fmode_t mode)
 633{
 634        struct bcache_device *d = b->private_data;
 635        closure_put(&d->cl);
 636}
 637
 638static int ioctl_dev(struct block_device *b, fmode_t mode,
 639                     unsigned int cmd, unsigned long arg)
 640{
 641        struct bcache_device *d = b->bd_disk->private_data;
 642        return d->ioctl(d, mode, cmd, arg);
 643}
 644
 645static const struct block_device_operations bcache_ops = {
 646        .open           = open_dev,
 647        .release        = release_dev,
 648        .ioctl          = ioctl_dev,
 649        .owner          = THIS_MODULE,
 650};
 651
 652void bcache_device_stop(struct bcache_device *d)
 653{
 654        if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
 655                closure_queue(&d->cl);
 656}
 657
 658static void bcache_device_unlink(struct bcache_device *d)
 659{
 660        lockdep_assert_held(&bch_register_lock);
 661
 662        if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
 663                unsigned i;
 664                struct cache *ca;
 665
 666                sysfs_remove_link(&d->c->kobj, d->name);
 667                sysfs_remove_link(&d->kobj, "cache");
 668
 669                for_each_cache(ca, d->c, i)
 670                        bd_unlink_disk_holder(ca->bdev, d->disk);
 671        }
 672}
 673
 674static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
 675                               const char *name)
 676{
 677        unsigned i;
 678        struct cache *ca;
 679
 680        for_each_cache(ca, d->c, i)
 681                bd_link_disk_holder(ca->bdev, d->disk);
 682
 683        snprintf(d->name, BCACHEDEVNAME_SIZE,
 684                 "%s%u", name, d->id);
 685
 686        WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
 687             sysfs_create_link(&c->kobj, &d->kobj, d->name),
 688             "Couldn't create device <-> cache set symlinks");
 689
 690        clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
 691}
 692
 693static void bcache_device_detach(struct bcache_device *d)
 694{
 695        lockdep_assert_held(&bch_register_lock);
 696
 697        if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
 698                struct uuid_entry *u = d->c->uuids + d->id;
 699
 700                SET_UUID_FLASH_ONLY(u, 0);
 701                memcpy(u->uuid, invalid_uuid, 16);
 702                u->invalidated = cpu_to_le32(get_seconds());
 703                bch_uuid_write(d->c);
 704        }
 705
 706        bcache_device_unlink(d);
 707
 708        d->c->devices[d->id] = NULL;
 709        closure_put(&d->c->caching);
 710        d->c = NULL;
 711}
 712
 713static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
 714                                 unsigned id)
 715{
 716        d->id = id;
 717        d->c = c;
 718        c->devices[id] = d;
 719
 720        closure_get(&c->caching);
 721}
 722
 723static void bcache_device_free(struct bcache_device *d)
 724{
 725        lockdep_assert_held(&bch_register_lock);
 726
 727        pr_info("%s stopped", d->disk->disk_name);
 728
 729        if (d->c)
 730                bcache_device_detach(d);
 731        if (d->disk && d->disk->flags & GENHD_FL_UP)
 732                del_gendisk(d->disk);
 733        if (d->disk && d->disk->queue)
 734                blk_cleanup_queue(d->disk->queue);
 735        if (d->disk) {
 736                ida_simple_remove(&bcache_minor, d->disk->first_minor);
 737                put_disk(d->disk);
 738        }
 739
 740        if (d->bio_split)
 741                bioset_free(d->bio_split);
 742        kvfree(d->full_dirty_stripes);
 743        kvfree(d->stripe_sectors_dirty);
 744
 745        closure_debug_destroy(&d->cl);
 746}
 747
 748static int bcache_device_init(struct bcache_device *d, unsigned block_size,
 749                              sector_t sectors)
 750{
 751        struct request_queue *q;
 752        size_t n;
 753        int minor;
 754
 755        if (!d->stripe_size)
 756                d->stripe_size = 1 << 31;
 757
 758        d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
 759
 760        if (!d->nr_stripes ||
 761            d->nr_stripes > INT_MAX ||
 762            d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
 763                pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
 764                        (unsigned)d->nr_stripes);
 765                return -ENOMEM;
 766        }
 767
 768        n = d->nr_stripes * sizeof(atomic_t);
 769        d->stripe_sectors_dirty = n < PAGE_SIZE << 6
 770                ? kzalloc(n, GFP_KERNEL)
 771                : vzalloc(n);
 772        if (!d->stripe_sectors_dirty)
 773                return -ENOMEM;
 774
 775        n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
 776        d->full_dirty_stripes = n < PAGE_SIZE << 6
 777                ? kzalloc(n, GFP_KERNEL)
 778                : vzalloc(n);
 779        if (!d->full_dirty_stripes)
 780                return -ENOMEM;
 781
 782        minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
 783        if (minor < 0)
 784                return minor;
 785
 786        if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
 787            !(d->disk = alloc_disk(1))) {
 788                ida_simple_remove(&bcache_minor, minor);
 789                return -ENOMEM;
 790        }
 791
 792        set_capacity(d->disk, sectors);
 793        snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);
 794
 795        d->disk->major          = bcache_major;
 796        d->disk->first_minor    = minor;
 797        d->disk->fops           = &bcache_ops;
 798        d->disk->private_data   = d;
 799
 800        q = blk_alloc_queue(GFP_KERNEL);
 801        if (!q)
 802                return -ENOMEM;
 803
 804        blk_queue_make_request(q, NULL);
 805        d->disk->queue                  = q;
 806        q->queuedata                    = d;
 807        q->backing_dev_info.congested_data = d;
 808        q->limits.max_hw_sectors        = UINT_MAX;
 809        q->limits.max_sectors           = UINT_MAX;
 810        q->limits.max_segment_size      = UINT_MAX;
 811        q->limits.max_segments          = BIO_MAX_PAGES;
 812        blk_queue_max_discard_sectors(q, UINT_MAX);
 813        q->limits.discard_granularity   = 512;
 814        q->limits.io_min                = block_size;
 815        q->limits.logical_block_size    = block_size;
 816        q->limits.physical_block_size   = block_size;
 817        set_bit(QUEUE_FLAG_NONROT,      &d->disk->queue->queue_flags);
 818        clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
 819        set_bit(QUEUE_FLAG_DISCARD,     &d->disk->queue->queue_flags);
 820
 821        blk_queue_write_cache(q, true, true);
 822
 823        return 0;
 824}
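/*
 * bcache_device_init() above covers everything common to backing devices and
 * flash-only volumes: the dirty-stripe accounting arrays, a minor number from
 * the bcache_minor ida, the gendisk, and a request queue with its limits
 * (non-rotational, discard enabled, write cache and FUA advertised via
 * blk_queue_write_cache()). Callers wire up their request functions and call
 * add_disk() afterwards, as bch_cached_dev_run() and flash_dev_run() do.
 */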
 825
 826/* Cached device */
 827
 828static void calc_cached_dev_sectors(struct cache_set *c)
 829{
 830        uint64_t sectors = 0;
 831        struct cached_dev *dc;
 832
 833        list_for_each_entry(dc, &c->cached_devs, list)
 834                sectors += bdev_sectors(dc->bdev);
 835
 836        c->cached_dev_sectors = sectors;
 837}
 838
 839void bch_cached_dev_run(struct cached_dev *dc)
 840{
 841        struct bcache_device *d = &dc->disk;
 842        char buf[SB_LABEL_SIZE + 1];
 843        char *env[] = {
 844                "DRIVER=bcache",
 845                kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
 846                NULL,
 847                NULL,
 848        };
 849
 850        memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
 851        buf[SB_LABEL_SIZE] = '\0';
 852        env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
 853
 854        if (atomic_xchg(&dc->running, 1)) {
 855                kfree(env[1]);
 856                kfree(env[2]);
 857                return;
 858        }
 859
 860        if (!d->c &&
 861            BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
 862                struct closure cl;
 863                closure_init_stack(&cl);
 864
 865                SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
 866                bch_write_bdev_super(dc, &cl);
 867                closure_sync(&cl);
 868        }
 869
 870        add_disk(d->disk);
 871        bd_link_disk_holder(dc->bdev, dc->disk.disk);
  872        /* won't show up in the uevent file, use udevadm monitor -e instead;
  873         * only class / kset properties are persistent */
 874        kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
 875        kfree(env[1]);
 876        kfree(env[2]);
 877
 878        if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
 879            sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
 880                pr_debug("error creating sysfs link");
 881}
 882
 883static void cached_dev_detach_finish(struct work_struct *w)
 884{
 885        struct cached_dev *dc = container_of(w, struct cached_dev, detach);
 886        char buf[BDEVNAME_SIZE];
 887        struct closure cl;
 888        closure_init_stack(&cl);
 889
 890        BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
 891        BUG_ON(atomic_read(&dc->count));
 892
 893        mutex_lock(&bch_register_lock);
 894
 895        memset(&dc->sb.set_uuid, 0, 16);
 896        SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
 897
 898        bch_write_bdev_super(dc, &cl);
 899        closure_sync(&cl);
 900
 901        bcache_device_detach(&dc->disk);
 902        list_move(&dc->list, &uncached_devices);
 903
 904        clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
 905        clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
 906
 907        mutex_unlock(&bch_register_lock);
 908
 909        pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
 910
 911        /* Drop ref we took in cached_dev_detach() */
 912        closure_put(&dc->disk.cl);
 913}
 914
 915void bch_cached_dev_detach(struct cached_dev *dc)
 916{
 917        lockdep_assert_held(&bch_register_lock);
 918
 919        if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
 920                return;
 921
 922        if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
 923                return;
 924
 925        /*
 926         * Block the device from being closed and freed until we're finished
 927         * detaching
 928         */
 929        closure_get(&dc->disk.cl);
 930
 931        bch_writeback_queue(dc);
 932        cached_dev_put(dc);
 933}
 934
 935int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
 936{
 937        uint32_t rtime = cpu_to_le32(get_seconds());
 938        struct uuid_entry *u;
 939        char buf[BDEVNAME_SIZE];
 940
 941        bdevname(dc->bdev, buf);
 942
 943        if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
 944                return -ENOENT;
 945
 946        if (dc->disk.c) {
 947                pr_err("Can't attach %s: already attached", buf);
 948                return -EINVAL;
 949        }
 950
 951        if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
 952                pr_err("Can't attach %s: shutting down", buf);
 953                return -EINVAL;
 954        }
 955
 956        if (dc->sb.block_size < c->sb.block_size) {
 957                /* Will die */
 958                pr_err("Couldn't attach %s: block size less than set's block size",
 959                       buf);
 960                return -EINVAL;
 961        }
 962
 963        u = uuid_find(c, dc->sb.uuid);
 964
 965        if (u &&
 966            (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
 967             BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
 968                memcpy(u->uuid, invalid_uuid, 16);
 969                u->invalidated = cpu_to_le32(get_seconds());
 970                u = NULL;
 971        }
 972
 973        if (!u) {
 974                if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
 975                        pr_err("Couldn't find uuid for %s in set", buf);
 976                        return -ENOENT;
 977                }
 978
 979                u = uuid_find_empty(c);
 980                if (!u) {
 981                        pr_err("Not caching %s, no room for UUID", buf);
 982                        return -EINVAL;
 983                }
 984        }
 985
 986        /* Deadlocks since we're called via sysfs...
 987        sysfs_remove_file(&dc->kobj, &sysfs_attach);
 988         */
 989
 990        if (bch_is_zero(u->uuid, 16)) {
 991                struct closure cl;
 992                closure_init_stack(&cl);
 993
 994                memcpy(u->uuid, dc->sb.uuid, 16);
 995                memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
 996                u->first_reg = u->last_reg = rtime;
 997                bch_uuid_write(c);
 998
 999                memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
1000                SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1001
1002                bch_write_bdev_super(dc, &cl);
1003                closure_sync(&cl);
1004        } else {
1005                u->last_reg = rtime;
1006                bch_uuid_write(c);
1007        }
1008
1009        bcache_device_attach(&dc->disk, c, u - c->uuids);
1010        list_move(&dc->list, &c->cached_devs);
1011        calc_cached_dev_sectors(c);
1012
1013        smp_wmb();
1014        /*
1015         * dc->c must be set before dc->count != 0 - paired with the mb in
1016         * cached_dev_get()
1017         */
1018        atomic_set(&dc->count, 1);
1019
1020        /* Block writeback thread, but spawn it */
1021        down_write(&dc->writeback_lock);
1022        if (bch_cached_dev_writeback_start(dc)) {
1023                up_write(&dc->writeback_lock);
1024                return -ENOMEM;
1025        }
1026
1027        if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1028                bch_sectors_dirty_init(dc);
1029                atomic_set(&dc->has_dirty, 1);
1030                atomic_inc(&dc->count);
1031                bch_writeback_queue(dc);
1032        }
1033
1034        bch_cached_dev_run(dc);
1035        bcache_device_link(&dc->disk, c, "bdev");
1036
1037        /* Allow the writeback thread to proceed */
1038        up_write(&dc->writeback_lock);
1039
1040        pr_info("Caching %s as %s on set %pU",
1041                bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
1042                dc->disk.c->sb.set_uuid);
1043        return 0;
1044}
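/*
 * bch_cached_dev_attach() is normally reached from userspace via sysfs;
 * assuming the standard interface described in the bcache documentation,
 * something like:
 *
 *	echo <cache-set-uuid> > /sys/block/bcache0/bcache/attach
 *
 * Note the ordering above: dc->c is published before dc->count becomes
 * nonzero (paired with the barrier in cached_dev_get()), and the writeback
 * lock is held across bch_cached_dev_writeback_start() so the writeback
 * thread can't run until the device is fully linked into the cache set.
 */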
1045
1046void bch_cached_dev_release(struct kobject *kobj)
1047{
1048        struct cached_dev *dc = container_of(kobj, struct cached_dev,
1049                                             disk.kobj);
1050        kfree(dc);
1051        module_put(THIS_MODULE);
1052}
1053
1054static void cached_dev_free(struct closure *cl)
1055{
1056        struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1057
1058        cancel_delayed_work_sync(&dc->writeback_rate_update);
1059        if (!IS_ERR_OR_NULL(dc->writeback_thread))
1060                kthread_stop(dc->writeback_thread);
1061
1062        mutex_lock(&bch_register_lock);
1063
1064        if (atomic_read(&dc->running))
1065                bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1066        bcache_device_free(&dc->disk);
1067        list_del(&dc->list);
1068
1069        mutex_unlock(&bch_register_lock);
1070
1071        if (!IS_ERR_OR_NULL(dc->bdev))
1072                blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1073
1074        wake_up(&unregister_wait);
1075
1076        kobject_put(&dc->disk.kobj);
1077}
1078
1079static void cached_dev_flush(struct closure *cl)
1080{
1081        struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1082        struct bcache_device *d = &dc->disk;
1083
1084        mutex_lock(&bch_register_lock);
1085        bcache_device_unlink(d);
1086        mutex_unlock(&bch_register_lock);
1087
1088        bch_cache_accounting_destroy(&dc->accounting);
1089        kobject_del(&d->kobj);
1090
1091        continue_at(cl, cached_dev_free, system_wq);
1092}
1093
1094static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1095{
1096        int ret;
1097        struct io *io;
1098        struct request_queue *q = bdev_get_queue(dc->bdev);
1099
1100        __module_get(THIS_MODULE);
1101        INIT_LIST_HEAD(&dc->list);
1102        closure_init(&dc->disk.cl, NULL);
1103        set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1104        kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1105        INIT_WORK(&dc->detach, cached_dev_detach_finish);
1106        sema_init(&dc->sb_write_mutex, 1);
1107        INIT_LIST_HEAD(&dc->io_lru);
1108        spin_lock_init(&dc->io_lock);
1109        bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1110
1111        dc->sequential_cutoff           = 4 << 20;
1112
1113        for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1114                list_add(&io->lru, &dc->io_lru);
1115                hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1116        }
1117
1118        dc->disk.stripe_size = q->limits.io_opt >> 9;
1119
1120        if (dc->disk.stripe_size)
1121                dc->partial_stripes_expensive =
1122                        q->limits.raid_partial_stripes_expensive;
1123
1124        ret = bcache_device_init(&dc->disk, block_size,
1125                         dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1126        if (ret)
1127                return ret;
1128
1129        set_capacity(dc->disk.disk,
1130                     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1131
1132        dc->disk.disk->queue->backing_dev_info.ra_pages =
1133                max(dc->disk.disk->queue->backing_dev_info.ra_pages,
1134                    q->backing_dev_info.ra_pages);
1135
1136        bch_cached_dev_request_init(dc);
1137        bch_cached_dev_writeback_init(dc);
1138        return 0;
1139}
1140
1141/* Cached device - bcache superblock */
1142
1143static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1144                                 struct block_device *bdev,
1145                                 struct cached_dev *dc)
1146{
1147        char name[BDEVNAME_SIZE];
1148        const char *err = "cannot allocate memory";
1149        struct cache_set *c;
1150
1151        memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1152        dc->bdev = bdev;
1153        dc->bdev->bd_holder = dc;
1154
1155        bio_init(&dc->sb_bio);
1156        dc->sb_bio.bi_max_vecs  = 1;
1157        dc->sb_bio.bi_io_vec    = dc->sb_bio.bi_inline_vecs;
1158        dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
1159        get_page(sb_page);
1160
1161        if (cached_dev_init(dc, sb->block_size << 9))
1162                goto err;
1163
1164        err = "error creating kobject";
1165        if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1166                        "bcache"))
1167                goto err;
1168        if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1169                goto err;
1170
1171        pr_info("registered backing device %s", bdevname(bdev, name));
1172
1173        list_add(&dc->list, &uncached_devices);
1174        list_for_each_entry(c, &bch_cache_sets, list)
1175                bch_cached_dev_attach(dc, c);
1176
1177        if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1178            BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1179                bch_cached_dev_run(dc);
1180
1181        return;
1182err:
1183        pr_notice("error opening %s: %s", bdevname(bdev, name), err);
1184        bcache_device_stop(&dc->disk);
1185}
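/*
 * register_bdev() is the backing-device half of registration: it copies the
 * superblock, sets up the kobject hierarchy under the block device, and then
 * tries to attach to every known cache set. Registration itself is normally
 * triggered from userspace, e.g. (per the usual bcache documentation):
 *
 *	echo /dev/sdX > /sys/fs/bcache/register
 */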
1186
1187/* Flash only volumes */
1188
1189void bch_flash_dev_release(struct kobject *kobj)
1190{
1191        struct bcache_device *d = container_of(kobj, struct bcache_device,
1192                                               kobj);
1193        kfree(d);
1194}
1195
1196static void flash_dev_free(struct closure *cl)
1197{
1198        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1199        mutex_lock(&bch_register_lock);
1200        bcache_device_free(d);
1201        mutex_unlock(&bch_register_lock);
1202        kobject_put(&d->kobj);
1203}
1204
1205static void flash_dev_flush(struct closure *cl)
1206{
1207        struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1208
1209        mutex_lock(&bch_register_lock);
1210        bcache_device_unlink(d);
1211        mutex_unlock(&bch_register_lock);
1212        kobject_del(&d->kobj);
1213        continue_at(cl, flash_dev_free, system_wq);
1214}
1215
1216static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1217{
1218        struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1219                                          GFP_KERNEL);
1220        if (!d)
1221                return -ENOMEM;
1222
1223        closure_init(&d->cl, NULL);
1224        set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1225
1226        kobject_init(&d->kobj, &bch_flash_dev_ktype);
1227
1228        if (bcache_device_init(d, block_bytes(c), u->sectors))
1229                goto err;
1230
1231        bcache_device_attach(d, c, u - c->uuids);
1232        bch_flash_dev_request_init(d);
1233        add_disk(d->disk);
1234
1235        if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1236                goto err;
1237
1238        bcache_device_link(d, c, "volume");
1239
1240        return 0;
1241err:
1242        kobject_put(&d->kobj);
1243        return -ENOMEM;
1244}
1245
1246static int flash_devs_run(struct cache_set *c)
1247{
1248        int ret = 0;
1249        struct uuid_entry *u;
1250
1251        for (u = c->uuids;
1252             u < c->uuids + c->nr_uuids && !ret;
1253             u++)
1254                if (UUID_FLASH_ONLY(u))
1255                        ret = flash_dev_run(c, u);
1256
1257        return ret;
1258}
1259
1260int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1261{
1262        struct uuid_entry *u;
1263
1264        if (test_bit(CACHE_SET_STOPPING, &c->flags))
1265                return -EINTR;
1266
1267        if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1268                return -EPERM;
1269
1270        u = uuid_find_empty(c);
1271        if (!u) {
1272                pr_err("Can't create volume, no room for UUID");
1273                return -EINVAL;
1274        }
1275
1276        get_random_bytes(u->uuid, 16);
1277        memset(u->label, 0, 32);
1278        u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
1279
1280        SET_UUID_FLASH_ONLY(u, 1);
1281        u->sectors = size >> 9;
1282
1283        bch_uuid_write(c);
1284
1285        return flash_dev_run(c, u);
1286}
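/*
 * Flash-only volumes are block devices backed entirely by the cache set:
 * bch_flash_dev_create() claims a uuid_entry, marks it UUID_FLASH_ONLY and
 * records the size in sectors before running the device. Userspace typically
 * creates one through the cache set's sysfs directory, e.g. (per the bcache
 * documentation):
 *
 *	echo 1G > /sys/fs/bcache/<cset-uuid>/flash_vol_create
 */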
1287
1288/* Cache set */
1289
1290__printf(2, 3)
1291bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1292{
1293        va_list args;
1294
1295        if (c->on_error != ON_ERROR_PANIC &&
1296            test_bit(CACHE_SET_STOPPING, &c->flags))
1297                return false;
1298
1299        /* XXX: we can be called from atomic context
1300        acquire_console_sem();
1301        */
1302
1303        printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
1304
1305        va_start(args, fmt);
1306        vprintk(fmt, args);
1307        va_end(args);
1308
1309        printk(", disabling caching\n");
1310
1311        if (c->on_error == ON_ERROR_PANIC)
1312                panic("panic forced after error\n");
1313
1314        bch_cache_set_unregister(c);
1315        return true;
1316}
1317
1318void bch_cache_set_release(struct kobject *kobj)
1319{
1320        struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1321        kfree(c);
1322        module_put(THIS_MODULE);
1323}
1324
1325static void cache_set_free(struct closure *cl)
1326{
1327        struct cache_set *c = container_of(cl, struct cache_set, cl);
1328        struct cache *ca;
1329        unsigned i;
1330
1331        if (!IS_ERR_OR_NULL(c->debug))
1332                debugfs_remove(c->debug);
1333
1334        bch_open_buckets_free(c);
1335        bch_btree_cache_free(c);
1336        bch_journal_free(c);
1337
1338        for_each_cache(ca, c, i)
1339                if (ca) {
1340                        ca->set = NULL;
1341                        c->cache[ca->sb.nr_this_dev] = NULL;
1342                        kobject_put(&ca->kobj);
1343                }
1344
1345        bch_bset_sort_state_free(&c->sort);
1346        free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1347
1348        if (c->moving_gc_wq)
1349                destroy_workqueue(c->moving_gc_wq);
1350        if (c->bio_split)
1351                bioset_free(c->bio_split);
1352        if (c->fill_iter)
1353                mempool_destroy(c->fill_iter);
1354        if (c->bio_meta)
1355                mempool_destroy(c->bio_meta);
1356        if (c->search)
1357                mempool_destroy(c->search);
1358        kfree(c->devices);
1359
1360        mutex_lock(&bch_register_lock);
1361        list_del(&c->list);
1362        mutex_unlock(&bch_register_lock);
1363
1364        pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1365        wake_up(&unregister_wait);
1366
1367        closure_debug_destroy(&c->cl);
1368        kobject_put(&c->kobj);
1369}
1370
1371static void cache_set_flush(struct closure *cl)
1372{
1373        struct cache_set *c = container_of(cl, struct cache_set, caching);
1374        struct cache *ca;
1375        struct btree *b;
1376        unsigned i;
1377
1378        if (!c)
1379                closure_return(cl);
1380
1381        bch_cache_accounting_destroy(&c->accounting);
1382
1383        kobject_put(&c->internal);
1384        kobject_del(&c->kobj);
1385
1386        if (c->gc_thread)
1387                kthread_stop(c->gc_thread);
1388
1389        if (!IS_ERR_OR_NULL(c->root))
1390                list_add(&c->root->list, &c->btree_cache);
1391
1392        /* Should skip this if we're unregistering because of an error */
1393        list_for_each_entry(b, &c->btree_cache, list) {
1394                mutex_lock(&b->write_lock);
1395                if (btree_node_dirty(b))
1396                        __bch_btree_node_write(b, NULL);
1397                mutex_unlock(&b->write_lock);
1398        }
1399
1400        for_each_cache(ca, c, i)
1401                if (ca->alloc_thread)
1402                        kthread_stop(ca->alloc_thread);
1403
1404        if (c->journal.cur) {
1405                cancel_delayed_work_sync(&c->journal.work);
1406                /* flush last journal entry if needed */
1407                c->journal.work.work.func(&c->journal.work.work);
1408        }
1409
1410        closure_return(cl);
1411}
1412
1413static void __cache_set_unregister(struct closure *cl)
1414{
1415        struct cache_set *c = container_of(cl, struct cache_set, caching);
1416        struct cached_dev *dc;
1417        size_t i;
1418
1419        mutex_lock(&bch_register_lock);
1420
1421        for (i = 0; i < c->nr_uuids; i++)
1422                if (c->devices[i]) {
1423                        if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1424                            test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1425                                dc = container_of(c->devices[i],
1426                                                  struct cached_dev, disk);
1427                                bch_cached_dev_detach(dc);
1428                        } else {
1429                                bcache_device_stop(c->devices[i]);
1430                        }
1431                }
1432
1433        mutex_unlock(&bch_register_lock);
1434
1435        continue_at(cl, cache_set_flush, system_wq);
1436}
1437
1438void bch_cache_set_stop(struct cache_set *c)
1439{
1440        if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1441                closure_queue(&c->caching);
1442}
1443
1444void bch_cache_set_unregister(struct cache_set *c)
1445{
1446        set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1447        bch_cache_set_stop(c);
1448}
1449
1450#define alloc_bucket_pages(gfp, c)                      \
1451        ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1452
1453struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1454{
1455        int iter_size;
1456        struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1457        if (!c)
1458                return NULL;
1459
1460        __module_get(THIS_MODULE);
1461        closure_init(&c->cl, NULL);
1462        set_closure_fn(&c->cl, cache_set_free, system_wq);
1463
1464        closure_init(&c->caching, &c->cl);
1465        set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1466
1467        /* Maybe create continue_at_noreturn() and use it here? */
1468        closure_set_stopped(&c->cl);
1469        closure_put(&c->cl);
1470
1471        kobject_init(&c->kobj, &bch_cache_set_ktype);
1472        kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1473
1474        bch_cache_accounting_init(&c->accounting, &c->cl);
1475
1476        memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1477        c->sb.block_size        = sb->block_size;
1478        c->sb.bucket_size       = sb->bucket_size;
1479        c->sb.nr_in_set         = sb->nr_in_set;
1480        c->sb.last_mount        = sb->last_mount;
1481        c->bucket_bits          = ilog2(sb->bucket_size);
1482        c->block_bits           = ilog2(sb->block_size);
1483        c->nr_uuids             = bucket_bytes(c) / sizeof(struct uuid_entry);
1484
1485        c->btree_pages          = bucket_pages(c);
1486        if (c->btree_pages > BTREE_MAX_PAGES)
1487                c->btree_pages = max_t(int, c->btree_pages / 4,
1488                                       BTREE_MAX_PAGES);
1489
1490        sema_init(&c->sb_write_mutex, 1);
1491        mutex_init(&c->bucket_lock);
1492        init_waitqueue_head(&c->btree_cache_wait);
1493        init_waitqueue_head(&c->bucket_wait);
1494        sema_init(&c->uuid_write_mutex, 1);
1495
1496        spin_lock_init(&c->btree_gc_time.lock);
1497        spin_lock_init(&c->btree_split_time.lock);
1498        spin_lock_init(&c->btree_read_time.lock);
1499
1500        bch_moving_init_cache_set(c);
1501
1502        INIT_LIST_HEAD(&c->list);
1503        INIT_LIST_HEAD(&c->cached_devs);
1504        INIT_LIST_HEAD(&c->btree_cache);
1505        INIT_LIST_HEAD(&c->btree_cache_freeable);
1506        INIT_LIST_HEAD(&c->btree_cache_freed);
1507        INIT_LIST_HEAD(&c->data_buckets);
1508
1509        c->search = mempool_create_slab_pool(32, bch_search_cache);
1510        if (!c->search)
1511                goto err;
1512
1513        iter_size = (sb->bucket_size / sb->block_size + 1) *
1514                sizeof(struct btree_iter_set);
1515
1516        if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
1517            !(c->bio_meta = mempool_create_kmalloc_pool(2,
1518                                sizeof(struct bbio) + sizeof(struct bio_vec) *
1519                                bucket_pages(c))) ||
1520            !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
1521            !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
1522            !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1523            !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1524                                                WQ_MEM_RECLAIM, 0)) ||
1525            bch_journal_alloc(c) ||
1526            bch_btree_cache_alloc(c) ||
1527            bch_open_buckets_alloc(c) ||
1528            bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1529                goto err;
1530
1531        c->congested_read_threshold_us  = 2000;
1532        c->congested_write_threshold_us = 20000;
1533        c->error_limit  = 8 << IO_ERROR_SHIFT;
1534
1535        return c;
1536err:
1537        bch_cache_set_unregister(c);
1538        return NULL;
1539}
1540
1541static void run_cache_set(struct cache_set *c)
1542{
1543        const char *err = "cannot allocate memory";
1544        struct cached_dev *dc, *t;
1545        struct cache *ca;
1546        struct closure cl;
1547        unsigned i;
1548
1549        closure_init_stack(&cl);
1550
1551        for_each_cache(ca, c, i)
1552                c->nbuckets += ca->sb.nbuckets;
1553
1554        if (CACHE_SYNC(&c->sb)) {
1555                LIST_HEAD(journal);
1556                struct bkey *k;
1557                struct jset *j;
1558
1559                err = "cannot allocate memory for journal";
1560                if (bch_journal_read(c, &journal))
1561                        goto err;
1562
1563                pr_debug("btree_journal_read() done");
1564
1565                err = "no journal entries found";
1566                if (list_empty(&journal))
1567                        goto err;
1568
1569                j = &list_entry(journal.prev, struct journal_replay, list)->j;
1570
1571                err = "IO error reading priorities";
1572                for_each_cache(ca, c, i)
1573                        prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1574
1575                /*
1576                 * If prio_read() fails it'll call cache_set_error and we'll
1577                 * tear everything down right away, but if we perhaps checked
1578                 * sooner we could avoid journal replay.
1579                 */
1580
1581                k = &j->btree_root;
1582
1583                err = "bad btree root";
1584                if (__bch_btree_ptr_invalid(c, k))
1585                        goto err;
1586
1587                err = "error reading btree root";
1588                c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
1589                if (IS_ERR_OR_NULL(c->root))
1590                        goto err;
1591
1592                list_del_init(&c->root->list);
1593                rw_unlock(true, c->root);
1594
1595                err = uuid_read(c, j, &cl);
1596                if (err)
1597                        goto err;
1598
1599                err = "error in recovery";
1600                if (bch_btree_check(c))
1601                        goto err;
1602
1603                bch_journal_mark(c, &journal);
1604                bch_initial_gc_finish(c);
1605                pr_debug("btree_check() done");
1606
1607                /*
1608                 * bcache_journal_next() can't happen sooner, or
1609                 * btree_gc_finish() will give spurious errors about last_gc >
1610                 * gc_gen - this is a hack but oh well.
1611                 */
1612                bch_journal_next(&c->journal);
1613
1614                err = "error starting allocator thread";
1615                for_each_cache(ca, c, i)
1616                        if (bch_cache_allocator_start(ca))
1617                                goto err;
1618
1619                /*
1620                 * First place it's safe to allocate: btree_check() and
1621                 * btree_gc_finish() have to run before we have buckets to
1622                 * allocate, and bch_bucket_alloc_set() might cause a journal
1623                 * entry to be written so bcache_journal_next() has to be called
1624                 * first.
1625                 *
1626                 * If the uuids were in the old format we have to rewrite them
1627                 * before the next journal entry is written:
1628                 */
1629                if (j->version < BCACHE_JSET_VERSION_UUID)
1630                        __uuid_write(c);
1631
1632                bch_journal_replay(c, &journal);
1633        } else {
1634                pr_notice("invalidating existing data");
1635
1636                for_each_cache(ca, c, i) {
1637                        unsigned j;
1638
1639                        ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1640                                              2, SB_JOURNAL_BUCKETS);
1641
1642                        for (j = 0; j < ca->sb.keys; j++)
1643                                ca->sb.d[j] = ca->sb.first_bucket + j;
1644                }
1645
1646                bch_initial_gc_finish(c);
1647
1648                err = "error starting allocator thread";
1649                for_each_cache(ca, c, i)
1650                        if (bch_cache_allocator_start(ca))
1651                                goto err;
1652
1653                mutex_lock(&c->bucket_lock);
1654                for_each_cache(ca, c, i)
1655                        bch_prio_write(ca);
1656                mutex_unlock(&c->bucket_lock);
1657
1658                err = "cannot allocate new UUID bucket";
1659                if (__uuid_write(c))
1660                        goto err;
1661
1662                err = "cannot allocate new btree root";
1663                c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
1664                if (IS_ERR_OR_NULL(c->root))
1665                        goto err;
1666
1667                mutex_lock(&c->root->write_lock);
1668                bkey_copy_key(&c->root->key, &MAX_KEY);
1669                bch_btree_node_write(c->root, &cl);
1670                mutex_unlock(&c->root->write_lock);
1671
1672                bch_btree_set_root(c->root);
1673                rw_unlock(true, c->root);
1674
1675                /*
1676                 * We don't want to write the first journal entry until
1677                 * everything is set up - fortunately journal entries won't be
1678                 * written until the SET_CACHE_SYNC() here:
1679                 */
1680                SET_CACHE_SYNC(&c->sb, true);
1681
1682                bch_journal_next(&c->journal);
1683                bch_journal_meta(c, &cl);
1684        }
1685
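            /*
             * Both branches converge here: start garbage collection, stamp
             * the mount time into the superblock, attach any backing
             * devices that registered before this set was ready, and bring
             * up flash-only volumes.
             */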
1686        err = "error starting gc thread";
1687        if (bch_gc_thread_start(c))
1688                goto err;
1689
1690        closure_sync(&cl);
1691        c->sb.last_mount = get_seconds();
1692        bcache_write_super(c);
1693
1694        list_for_each_entry_safe(dc, t, &uncached_devices, list)
1695                bch_cached_dev_attach(dc, c);
1696
1697        flash_devs_run(c);
1698
1699        set_bit(CACHE_SET_RUNNING, &c->flags);
1700        return;
1701err:
1702        closure_sync(&cl);
1703        /* XXX: test this, it's broken */
1704        bch_cache_set_error(c, "%s", err);
1705}
1706
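    /*
     * A cache may only join a set whose on-disk geometry it matches:
     * block size, bucket size and the number of devices in the set all
     * have to agree with the set's superblock.
     */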
1707static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1708{
1709        return ca->sb.block_size        == c->sb.block_size &&
1710                ca->sb.bucket_size      == c->sb.bucket_size &&
1711                ca->sb.nr_in_set        == c->sb.nr_in_set;
1712}
1713
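    /*
     * Add @ca to the cache set named by its set_uuid, allocating and
     * registering a new struct cache_set if this is the first member
     * we've seen.  Returns NULL on success or an error string.  Once
     * every member recorded in the superblock (nr_in_set) has been
     * loaded, the whole set is started via run_cache_set().
     */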
1714static const char *register_cache_set(struct cache *ca)
1715{
1716        char buf[12];
1717        const char *err = "cannot allocate memory";
1718        struct cache_set *c;
1719
1720        list_for_each_entry(c, &bch_cache_sets, list)
1721                if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1722                        if (c->cache[ca->sb.nr_this_dev])
1723                                return "duplicate cache set member";
1724
1725                        if (!can_attach_cache(ca, c))
1726                                return "cache sb does not match set";
1727
1728                        if (!CACHE_SYNC(&ca->sb))
1729                                SET_CACHE_SYNC(&c->sb, false);
1730
1731                        goto found;
1732                }
1733
1734        c = bch_cache_set_alloc(&ca->sb);
1735        if (!c)
1736                return err;
1737
1738        err = "error creating kobject";
1739        if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1740            kobject_add(&c->internal, &c->kobj, "internal"))
1741                goto err;
1742
1743        if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1744                goto err;
1745
1746        bch_debug_init_cache_set(c);
1747
1748        list_add(&c->list, &bch_cache_sets);
1749found:
1750        sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1751        if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1752            sysfs_create_link(&c->kobj, &ca->kobj, buf))
1753                goto err;
1754
1755        if (ca->sb.seq > c->sb.seq) {
1756                c->sb.version           = ca->sb.version;
1757                memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1758                c->sb.flags             = ca->sb.flags;
1759                c->sb.seq               = ca->sb.seq;
1760                pr_debug("set version = %llu", c->sb.version);
1761        }
1762
1763        kobject_get(&ca->kobj);
1764        ca->set = c;
1765        ca->set->cache[ca->sb.nr_this_dev] = ca;
1766        c->cache_by_alloc[c->caches_loaded++] = ca;
1767
1768        if (c->caches_loaded == c->sb.nr_in_set)
1769                run_cache_set(c);
1770
1771        return NULL;
1772err:
1773        bch_cache_set_unregister(c);
1774        return err;
1775}
1776
1777/* Cache device */
1778
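    /*
     * kobject release for a struct cache: detach it from its set, free
     * the allocations made in cache_alloc() in reverse order, drop the
     * superblock page and the block device, then release the module
     * reference taken when the cache was set up.
     */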
1779void bch_cache_release(struct kobject *kobj)
1780{
1781        struct cache *ca = container_of(kobj, struct cache, kobj);
1782        unsigned i;
1783
1784        if (ca->set) {
1785                BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
1786                ca->set->cache[ca->sb.nr_this_dev] = NULL;
1787        }
1788
1789        free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1790        kfree(ca->prio_buckets);
1791        vfree(ca->buckets);
1792
1793        free_heap(&ca->heap);
1794        free_fifo(&ca->free_inc);
1795
1796        for (i = 0; i < RESERVE_NR; i++)
1797                free_fifo(&ca->free[i]);
1798
1799        if (ca->sb_bio.bi_inline_vecs[0].bv_page)
1800                put_page(ca->sb_bio.bi_io_vec[0].bv_page);
1801
1802        if (!IS_ERR_OR_NULL(ca->bdev))
1803                blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1804
1805        kfree(ca);
1806        module_put(THIS_MODULE);
1807}
1808
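    /*
     * Allocate the in-memory state for a cache device.  Reserve sizing is
     * derived from the bucket count; as a worked example, a hypothetical
     * device with sb.nbuckets == 2000000 gives:
     *
     *   free                   = roundup_pow_of_two(2000000) >> 10 = 2048
     *   free[RESERVE_BTREE]    = 8
     *   free[RESERVE_PRIO]     = prio_buckets(ca)  (exact-sized fifo)
     *   free[RESERVE_MOVINGGC] = free              = 2048
     *   free[RESERVE_NONE]     = free              = 2048
     *   free_inc               = free << 2         = 8192
     *   heap                   = free << 3         = 16384
     *
     * Returns 0 on success or -ENOMEM if any allocation fails.
     */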
1809static int cache_alloc(struct cache *ca)
1810{
1811        size_t free;
1812        struct bucket *b;
1813
1814        __module_get(THIS_MODULE);
1815        kobject_init(&ca->kobj, &bch_cache_ktype);
1816
1817        bio_init(&ca->journal.bio);
1818        ca->journal.bio.bi_max_vecs = 8;
1819        ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;
1820
1821        free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
1822
1823        if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
1824            !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
1825            !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
1826            !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
1827            !init_fifo(&ca->free_inc,   free << 2, GFP_KERNEL) ||
1828            !init_heap(&ca->heap,       free << 3, GFP_KERNEL) ||
1829            !(ca->buckets       = vzalloc(sizeof(struct bucket) *
1830                                          ca->sb.nbuckets)) ||
1831            !(ca->prio_buckets  = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
1832                                          2, GFP_KERNEL)) ||
1833            !(ca->disk_buckets  = alloc_bucket_pages(GFP_KERNEL, ca)))
1834                return -ENOMEM;
1835
1836        ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
1837
1838        for_each_bucket(b, ca)
1839                atomic_set(&b->pin, 0);
1840
1841        return 0;
1842}
1843
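    /*
     * Bind a freshly read superblock to its block device: copy the sb
     * into the cache, hold a reference on the superblock page for later
     * superblock writes, honour the discard flag if the queue supports
     * it, allocate runtime state and finally try to join (or create) the
     * owning cache set under bch_register_lock.
     */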
1844static int register_cache(struct cache_sb *sb, struct page *sb_page,
1845                                struct block_device *bdev, struct cache *ca)
1846{
1847        char name[BDEVNAME_SIZE];
1848        const char *err = NULL; /* must be set for any error case */
1849        int ret = 0;
1850
1851        memcpy(&ca->sb, sb, sizeof(struct cache_sb));
1852        ca->bdev = bdev;
1853        ca->bdev->bd_holder = ca;
1854
1855        bio_init(&ca->sb_bio);
1856        ca->sb_bio.bi_max_vecs  = 1;
1857        ca->sb_bio.bi_io_vec    = ca->sb_bio.bi_inline_vecs;
1858        ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
1859        get_page(sb_page);
1860
1861        if (blk_queue_discard(bdev_get_queue(ca->bdev)))
1862                ca->discard = CACHE_DISCARD(&ca->sb);
1863
1864        ret = cache_alloc(ca);
1865        if (ret != 0) {
1866                if (ret == -ENOMEM)
1867                        err = "cache_alloc(): -ENOMEM";
1868                else
1869                        err = "cache_alloc(): unknown error";
1870                goto err;
1871        }
1872
1873        if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
1874                err = "error calling kobject_add";
1875                ret = -ENOMEM;
1876                goto out;
1877        }
1878
1879        mutex_lock(&bch_register_lock);
1880        err = register_cache_set(ca);
1881        mutex_unlock(&bch_register_lock);
1882
1883        if (err) {
1884                ret = -ENODEV;
1885                goto out;
1886        }
1887
1888        pr_info("registered cache device %s", bdevname(bdev, name));
1889
1890out:
1891        kobject_put(&ca->kobj);
1892
1893err:
1894        if (err)
1895                pr_notice("error opening %s: %s", bdevname(bdev, name), err);
1896
1897        return ret;
1898}
1899
1900/* Global interfaces/init */
1901
1902static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
1903                               const char *, size_t);
1904
1905kobj_attribute_write(register,          register_bcache);
1906kobj_attribute_write(register_quiet,    register_bcache);
1907
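    /*
     * bch_is_open_backing()/bch_is_open_cache()/bch_is_open() check
     * whether a block device is already claimed by bcache, either as a
     * backing device or as a cache.  register_bcache() uses this to turn
     * -EBUSY from blkdev_get_by_path() into the friendlier "device
     * already registered" message.
     */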
1908static bool bch_is_open_backing(struct block_device *bdev)
    {
1909        struct cache_set *c, *tc;
1910        struct cached_dev *dc, *t;
1911
1912        list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1913                list_for_each_entry_safe(dc, t, &c->cached_devs, list)
1914                        if (dc->bdev == bdev)
1915                                return true;
1916        list_for_each_entry_safe(dc, t, &uncached_devices, list)
1917                if (dc->bdev == bdev)
1918                        return true;
1919        return false;
1920}
1921
1922static bool bch_is_open_cache(struct block_device *bdev)
    {
1923        struct cache_set *c, *tc;
1924        struct cache *ca;
1925        unsigned i;
1926
1927        list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
1928                for_each_cache(ca, c, i)
1929                        if (ca->bdev == bdev)
1930                                return true;
1931        return false;
1932}
1933
1934static bool bch_is_open(struct block_device *bdev)
    {
1935        return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
1936}
1937
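    /*
     * Handler for writes to /sys/fs/bcache/register (and register_quiet):
     * the buffer holds the path of a block device, e.g. from userspace
     *
     *   echo /dev/sdc > /sys/fs/bcache/register
     *
     * (/dev/sdc is just an example device here.)  The superblock is read
     * and, depending on SB_IS_BDEV(), the device is registered either as
     * a backing device or as a cache.  Returns the number of bytes
     * consumed on success and -EINVAL on error; register_quiet differs
     * only in staying silent when the device is busy or already
     * registered.
     */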
1938static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
1939                               const char *buffer, size_t size)
1940{
1941        ssize_t ret = size;
1942        const char *err = "cannot allocate memory";
1943        char *path = NULL;
1944        struct cache_sb *sb = NULL;
1945        struct block_device *bdev = NULL;
1946        struct page *sb_page = NULL;
1947
1948        if (!try_module_get(THIS_MODULE))
1949                return -EBUSY;
1950
1951        if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
1952            !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
1953                goto err;
1954
1955        err = "failed to open device";
1956        bdev = blkdev_get_by_path(strim(path),
1957                                  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
1958                                  sb);
1959        if (IS_ERR(bdev)) {
1960                if (bdev == ERR_PTR(-EBUSY)) {
1961                        bdev = lookup_bdev(strim(path));
1962                        mutex_lock(&bch_register_lock);
1963                        if (!IS_ERR(bdev) && bch_is_open(bdev))
1964                                err = "device already registered";
1965                        else
1966                                err = "device busy";
1967                        mutex_unlock(&bch_register_lock);
1968                        if (attr == &ksysfs_register_quiet)
1969                                goto out;
1970                }
1971                goto err;
1972        }
1973
1974        err = "failed to set blocksize";
1975        if (set_blocksize(bdev, 4096))
1976                goto err_close;
1977
1978        err = read_super(sb, bdev, &sb_page);
1979        if (err)
1980                goto err_close;
1981
1982        if (SB_IS_BDEV(sb)) {
1983                struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
1984                if (!dc)
1985                        goto err_close;
1986
1987                mutex_lock(&bch_register_lock);
1988                register_bdev(sb, sb_page, bdev, dc);
1989                mutex_unlock(&bch_register_lock);
1990        } else {
1991                struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
1992                if (!ca)
1993                        goto err_close;
1994
1995                if (register_cache(sb, sb_page, bdev, ca) != 0)
1996                        goto err_close;
1997        }
1998out:
1999        if (sb_page)
2000                put_page(sb_page);
2001        kfree(sb);
2002        kfree(path);
2003        module_put(THIS_MODULE);
2004        return ret;
2005
2006err_close:
2007        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2008err:
2009        pr_info("error opening %s: %s", path, err);
2010        ret = -EINVAL;
2011        goto out;
2012}
2013
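    /*
     * Reboot notifier: on shutdown, halt or power off, ask every cache
     * set and every unattached backing device to stop, then wait up to
     * two seconds for them to finish so they go down cleanly.
     */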
2014static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2015{
2016        if (code == SYS_DOWN ||
2017            code == SYS_HALT ||
2018            code == SYS_POWER_OFF) {
2019                DEFINE_WAIT(wait);
2020                unsigned long start = jiffies;
2021                bool stopped = false;
2022
2023                struct cache_set *c, *tc;
2024                struct cached_dev *dc, *tdc;
2025
2026                mutex_lock(&bch_register_lock);
2027
2028                if (list_empty(&bch_cache_sets) &&
2029                    list_empty(&uncached_devices))
2030                        goto out;
2031
2032                pr_info("Stopping all devices:");
2033
2034                list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2035                        bch_cache_set_stop(c);
2036
2037                list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2038                        bcache_device_stop(&dc->disk);
2039
2040                /* Open-coded wait: poll until everything has stopped, with a two second timeout */
2041                while (1) {
2042                        long timeout = start + 2 * HZ - jiffies;
2043
2044                        stopped = list_empty(&bch_cache_sets) &&
2045                                list_empty(&uncached_devices);
2046
2047                        if (timeout < 0 || stopped)
2048                                break;
2049
2050                        prepare_to_wait(&unregister_wait, &wait,
2051                                        TASK_UNINTERRUPTIBLE);
2052
2053                        mutex_unlock(&bch_register_lock);
2054                        schedule_timeout(timeout);
2055                        mutex_lock(&bch_register_lock);
2056                }
2057
2058                finish_wait(&unregister_wait, &wait);
2059
2060                if (stopped)
2061                        pr_info("All devices stopped");
2062                else
2063                        pr_notice("Timeout waiting for devices to be closed");
2064out:
2065                mutex_unlock(&bch_register_lock);
2066        }
2067
2068        return NOTIFY_DONE;
2069}
2070
2071static struct notifier_block reboot = {
2072        .notifier_call  = bcache_reboot,
2073        .priority       = INT_MAX, /* before any real devices */
2074};
2075
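    /*
     * Undo everything bcache_init() set up.  This is also bcache_init()'s
     * error path, so each step has to tolerate the corresponding init
     * step not having run - hence the checks on the kobject, workqueue
     * and major number.
     */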
2076static void bcache_exit(void)
2077{
2078        bch_debug_exit();
2079        bch_request_exit();
2080        if (bcache_kobj)
2081                kobject_put(bcache_kobj);
2082        if (bcache_wq)
2083                destroy_workqueue(bcache_wq);
2084        if (bcache_major)
2085                unregister_blkdev(bcache_major, "bcache");
2086        unregister_reboot_notifier(&reboot);
2087}
2088
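    /*
     * Module init.  The reboot notifier and block major are registered
     * first; if register_blkdev() fails only the notifier needs undoing,
     * which is why it is unregistered inline rather than through
     * bcache_exit().  Everything after that shares bcache_exit() as the
     * error path.
     */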
2089static int __init bcache_init(void)
2090{
2091        static const struct attribute *files[] = {
2092                &ksysfs_register.attr,
2093                &ksysfs_register_quiet.attr,
2094                NULL
2095        };
2096
2097        mutex_init(&bch_register_lock);
2098        init_waitqueue_head(&unregister_wait);
2099        register_reboot_notifier(&reboot);
2100        closure_debug_init();
2101
2102        bcache_major = register_blkdev(0, "bcache");
2103        if (bcache_major < 0) {
2104                unregister_reboot_notifier(&reboot);
2105                return bcache_major;
2106        }
2107
2108        if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
2109            !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
2110            sysfs_create_files(bcache_kobj, files) ||
2111            bch_request_init() ||
2112            bch_debug_init(bcache_kobj))
2113                goto err;
2114
2115        return 0;
2116err:
2117        bcache_exit();
2118        return -ENOMEM;
2119}
2120
2121module_exit(bcache_exit);
2122module_init(bcache_init);
2123