// SPDX-License-Identifier: GPL-2.0
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
9#include "bcache.h"
10#include "btree.h"
11#include "debug.h"
12#include "extents.h"
13#include "request.h"
14#include "writeback.h"
15
16#include <linux/blkdev.h>
17#include <linux/buffer_head.h>
18#include <linux/debugfs.h>
19#include <linux/genhd.h>
20#include <linux/idr.h>
21#include <linux/kthread.h>
22#include <linux/module.h>
23#include <linux/random.h>
24#include <linux/reboot.h>
25#include <linux/sysfs.h>
26
27MODULE_LICENSE("GPL");
28MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
29
30static const char bcache_magic[] = {
31 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
32 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
33};
34
35static const char invalid_uuid[] = {
36 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
37 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
38};
39
40static struct kobject *bcache_kobj;
41struct mutex bch_register_lock;
42LIST_HEAD(bch_cache_sets);
43static LIST_HEAD(uncached_devices);
44
45static int bcache_major;
46static DEFINE_IDA(bcache_device_idx);
47static wait_queue_head_t unregister_wait;
48struct workqueue_struct *bcache_wq;
49
50#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
51
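/* limitation of partitions number on single bcache device */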
52#define BCACHE_MINORS 128
53
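/* limitation of bcache devices number on single system */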
54#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
55
/* Superblock */

58static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
59 struct page **res)
60{
61 const char *err;
62 struct cache_sb *s;
63 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
64 unsigned i;
65
66 if (!bh)
67 return "IO error";
68
69 s = (struct cache_sb *) bh->b_data;
70
71 sb->offset = le64_to_cpu(s->offset);
72 sb->version = le64_to_cpu(s->version);
73
74 memcpy(sb->magic, s->magic, 16);
75 memcpy(sb->uuid, s->uuid, 16);
76 memcpy(sb->set_uuid, s->set_uuid, 16);
77 memcpy(sb->label, s->label, SB_LABEL_SIZE);
78
79 sb->flags = le64_to_cpu(s->flags);
80 sb->seq = le64_to_cpu(s->seq);
81 sb->last_mount = le32_to_cpu(s->last_mount);
82 sb->first_bucket = le16_to_cpu(s->first_bucket);
83 sb->keys = le16_to_cpu(s->keys);
84
85 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
86 sb->d[i] = le64_to_cpu(s->d[i]);
87
88 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
89 sb->version, sb->flags, sb->seq, sb->keys);
90
91 err = "Not a bcache superblock";
92 if (sb->offset != SB_SECTOR)
93 goto err;
94
95 if (memcmp(sb->magic, bcache_magic, 16))
96 goto err;
97
98 err = "Too many journal buckets";
99 if (sb->keys > SB_JOURNAL_BUCKETS)
100 goto err;
101
102 err = "Bad checksum";
103 if (s->csum != csum_set(s))
104 goto err;
105
106 err = "Bad UUID";
107 if (bch_is_zero(sb->uuid, 16))
108 goto err;
109
110 sb->block_size = le16_to_cpu(s->block_size);
111
112 err = "Superblock block size smaller than device block size";
113 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
114 goto err;
115
116 switch (sb->version) {
117 case BCACHE_SB_VERSION_BDEV:
118 sb->data_offset = BDEV_DATA_START_DEFAULT;
119 break;
120 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
121 sb->data_offset = le64_to_cpu(s->data_offset);
122
123 err = "Bad data offset";
124 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
125 goto err;
126
127 break;
128 case BCACHE_SB_VERSION_CDEV:
129 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
130 sb->nbuckets = le64_to_cpu(s->nbuckets);
131 sb->bucket_size = le16_to_cpu(s->bucket_size);
132
133 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
134 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
135
136 err = "Too many buckets";
137 if (sb->nbuckets > LONG_MAX)
138 goto err;
139
140 err = "Not enough buckets";
141 if (sb->nbuckets < 1 << 7)
142 goto err;
143
144 err = "Bad block/bucket size";
145 if (!is_power_of_2(sb->block_size) ||
146 sb->block_size > PAGE_SECTORS ||
147 !is_power_of_2(sb->bucket_size) ||
148 sb->bucket_size < PAGE_SECTORS)
149 goto err;
150
151 err = "Invalid superblock: device too small";
152 if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
153 goto err;
154
155 err = "Bad UUID";
156 if (bch_is_zero(sb->set_uuid, 16))
157 goto err;
158
159 err = "Bad cache device number in set";
160 if (!sb->nr_in_set ||
161 sb->nr_in_set <= sb->nr_this_dev ||
162 sb->nr_in_set > MAX_CACHES_PER_SET)
163 goto err;
164
165 err = "Journal buckets not sequential";
166 for (i = 0; i < sb->keys; i++)
167 if (sb->d[i] != sb->first_bucket + i)
168 goto err;
169
170 err = "Too many journal buckets";
171 if (sb->first_bucket + sb->keys > sb->nbuckets)
172 goto err;
173
174 err = "Invalid superblock: first bucket comes before end of super";
175 if (sb->first_bucket * sb->bucket_size < 16)
176 goto err;
177
178 break;
179 default:
180 err = "Unsupported superblock version";
181 goto err;
182 }
183
184 sb->last_mount = get_seconds();
185 err = NULL;
186
187 get_page(bh->b_page);
188 *res = bh->b_page;
189err:
190 put_bh(bh);
191 return err;
192}
193
194static void write_bdev_super_endio(struct bio *bio)
195{
196 struct cached_dev *dc = bio->bi_private;

 /* XXX: error checking */
199 closure_put(&dc->sb_write);
200}
201
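/*
 * Copy the in-memory superblock into the bio's page in on-disk (little
 * endian) format, checksum it and submit the write.
 */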
202static void __write_super(struct cache_sb *sb, struct bio *bio)
203{
204 struct cache_sb *out = page_address(bio_first_page_all(bio));
205 unsigned i;
206
207 bio->bi_iter.bi_sector = SB_SECTOR;
208 bio->bi_iter.bi_size = SB_SIZE;
209 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
210 bch_bio_map(bio, NULL);
211
212 out->offset = cpu_to_le64(sb->offset);
213 out->version = cpu_to_le64(sb->version);
214
215 memcpy(out->uuid, sb->uuid, 16);
216 memcpy(out->set_uuid, sb->set_uuid, 16);
217 memcpy(out->label, sb->label, SB_LABEL_SIZE);
218
219 out->flags = cpu_to_le64(sb->flags);
220 out->seq = cpu_to_le64(sb->seq);
221
222 out->last_mount = cpu_to_le32(sb->last_mount);
223 out->first_bucket = cpu_to_le16(sb->first_bucket);
224 out->keys = cpu_to_le16(sb->keys);
225
226 for (i = 0; i < sb->keys; i++)
227 out->d[i] = cpu_to_le64(sb->d[i]);
228
229 out->csum = csum_set(out);
230
231 pr_debug("ver %llu, flags %llu, seq %llu",
232 sb->version, sb->flags, sb->seq);
233
234 submit_bio(bio);
235}
236
237static void bch_write_bdev_super_unlock(struct closure *cl)
238{
239 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
240
241 up(&dc->sb_write_mutex);
242}
243
244void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
245{
246 struct closure *cl = &dc->sb_write;
247 struct bio *bio = &dc->sb_bio;
248
249 down(&dc->sb_write_mutex);
250 closure_init(cl, parent);
251
252 bio_reset(bio);
253 bio_set_dev(bio, dc->bdev);
254 bio->bi_end_io = write_bdev_super_endio;
255 bio->bi_private = dc;
256
257 closure_get(cl);
258
259 __write_super(&dc->sb, bio);
260
261 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
262}
263
264static void write_super_endio(struct bio *bio)
265{
266 struct cache *ca = bio->bi_private;
267
 /* is_read = 0 */
269 bch_count_io_errors(ca, bio->bi_status, 0,
270 "writing superblock");
271 closure_put(&ca->set->sb_write);
272}
273
274static void bcache_write_super_unlock(struct closure *cl)
275{
276 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
277
278 up(&c->sb_write_mutex);
279}
280
281void bcache_write_super(struct cache_set *c)
282{
283 struct closure *cl = &c->sb_write;
284 struct cache *ca;
285 unsigned i;
286
287 down(&c->sb_write_mutex);
288 closure_init(cl, &c->cl);
289
290 c->sb.seq++;
291
292 for_each_cache(ca, c, i) {
293 struct bio *bio = &ca->sb_bio;
294
295 ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
296 ca->sb.seq = c->sb.seq;
297 ca->sb.last_mount = c->sb.last_mount;
298
299 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
300
301 bio_reset(bio);
302 bio_set_dev(bio, ca->bdev);
303 bio->bi_end_io = write_super_endio;
304 bio->bi_private = ca;
305
306 closure_get(cl);
307 __write_super(&ca->sb, bio);
308 }
309
310 closure_return_with_destructor(cl, bcache_write_super_unlock);
311}
312
/* UUID io */

315static void uuid_endio(struct bio *bio)
316{
317 struct closure *cl = bio->bi_private;
318 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
319
320 cache_set_err_on(bio->bi_status, c, "accessing uuids");
321 bch_bbio_free(bio, c);
322 closure_put(cl);
323}
324
325static void uuid_io_unlock(struct closure *cl)
326{
327 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
328
329 up(&c->uuid_write_mutex);
330}
331
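/*
 * Read or write the uuid_entry array from/to the bucket(s) pointed at by @k.
 * Writes go to every replica, reads only use the first pointer; the whole
 * operation is serialized by c->uuid_write_mutex and completion is signalled
 * through the parent closure.
 */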
332static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
333 struct bkey *k, struct closure *parent)
334{
335 struct closure *cl = &c->uuid_write;
336 struct uuid_entry *u;
337 unsigned i;
338 char buf[80];
339
340 BUG_ON(!parent);
341 down(&c->uuid_write_mutex);
342 closure_init(cl, parent);
343
344 for (i = 0; i < KEY_PTRS(k); i++) {
345 struct bio *bio = bch_bbio_alloc(c);
346
347 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
348 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
349
350 bio->bi_end_io = uuid_endio;
351 bio->bi_private = cl;
352 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
353 bch_bio_map(bio, c->uuids);
354
355 bch_submit_bbio(bio, c, k, i);
356
357 if (op != REQ_OP_WRITE)
358 break;
359 }
360
361 bch_extent_to_text(buf, sizeof(buf), k);
362 pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
363
364 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
365 if (!bch_is_zero(u->uuid, 16))
366 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
367 u - c->uuids, u->uuid, u->label,
368 u->first_reg, u->last_reg, u->invalidated);
369
370 closure_return_with_destructor(cl, uuid_io_unlock);
371}
372
373static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
374{
375 struct bkey *k = &j->uuid_bucket;
376
377 if (__bch_btree_ptr_invalid(c, k))
378 return "bad uuid pointer";
379
380 bkey_copy(&c->uuid_bucket, k);
381 uuid_io(c, REQ_OP_READ, 0, k, cl);
382
383 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
384 struct uuid_entry_v0 *u0 = (void *) c->uuids;
385 struct uuid_entry *u1 = (void *) c->uuids;
386 int i;
387
388 closure_sync(cl);

 /*
  * Since the new uuid entry is bigger than the old, we have to
  * convert starting at the highest memory address and work down
  * in order to do it in place
  */

396 for (i = c->nr_uuids - 1;
397 i >= 0;
398 --i) {
399 memcpy(u1[i].uuid, u0[i].uuid, 16);
400 memcpy(u1[i].label, u0[i].label, 32);
401
402 u1[i].first_reg = u0[i].first_reg;
403 u1[i].last_reg = u0[i].last_reg;
404 u1[i].invalidated = u0[i].invalidated;
405
406 u1[i].flags = 0;
407 u1[i].sectors = 0;
408 }
409 }
410
411 return NULL;
412}
413
414static int __uuid_write(struct cache_set *c)
415{
416 BKEY_PADDED(key) k;
417 struct closure cl;
418 closure_init_stack(&cl);
419
420 lockdep_assert_held(&bch_register_lock);
421
422 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
423 return 1;
424
425 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
426 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
427 closure_sync(&cl);
428
429 bkey_copy(&c->uuid_bucket, &k.key);
430 bkey_put(c, &k.key);
431 return 0;
432}
433
434int bch_uuid_write(struct cache_set *c)
435{
436 int ret = __uuid_write(c);
437
438 if (!ret)
439 bch_journal_meta(c, NULL);
440
441 return ret;
442}
443
444static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
445{
446 struct uuid_entry *u;
447
448 for (u = c->uuids;
449 u < c->uuids + c->nr_uuids; u++)
450 if (!memcmp(u->uuid, uuid, 16))
451 return u;
452
453 return NULL;
454}
455
456static struct uuid_entry *uuid_find_empty(struct cache_set *c)
457{
458 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
459 return uuid_find(c, zero_uuid);
460}
461
/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *    8 bit gen
 *   16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * an lru (and in the future other) cache replacement policies; for most
 * purposes it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in closer to lru order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */
489static void prio_endio(struct bio *bio)
490{
491 struct cache *ca = bio->bi_private;
492
493 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
494 bch_bbio_free(bio, ca->set);
495 closure_put(&ca->prio);
496}
497
498static void prio_io(struct cache *ca, uint64_t bucket, int op,
499 unsigned long op_flags)
500{
501 struct closure *cl = &ca->prio;
502 struct bio *bio = bch_bbio_alloc(ca->set);
503
504 closure_init_stack(cl);
505
506 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
507 bio_set_dev(bio, ca->bdev);
508 bio->bi_iter.bi_size = bucket_bytes(ca);
509
510 bio->bi_end_io = prio_endio;
511 bio->bi_private = ca;
512 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
513 bch_bio_map(bio, ca->disk_buckets);
514
515 closure_bio_submit(ca->set, bio, &ca->prio);
516 closure_sync(cl);
517}
518
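/*
 * Pack every bucket's priority and gen into prio_buckets(ca) freshly
 * allocated buckets, chaining them together via next_bucket, then write a
 * journal entry so the location of the first prio bucket is persistent.
 */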
519void bch_prio_write(struct cache *ca)
520{
521 int i;
522 struct bucket *b;
523 struct closure cl;
524
525 closure_init_stack(&cl);
526
527 lockdep_assert_held(&ca->set->bucket_lock);
528
529 ca->disk_buckets->seq++;
530
531 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
532 &ca->meta_sectors_written);
533
534
535
536
537 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
538 long bucket;
539 struct prio_set *p = ca->disk_buckets;
540 struct bucket_disk *d = p->data;
541 struct bucket_disk *end = d + prios_per_bucket(ca);
542
543 for (b = ca->buckets + i * prios_per_bucket(ca);
544 b < ca->buckets + ca->sb.nbuckets && d < end;
545 b++, d++) {
546 d->prio = cpu_to_le16(b->prio);
547 d->gen = b->gen;
548 }
549
550 p->next_bucket = ca->prio_buckets[i + 1];
551 p->magic = pset_magic(&ca->sb);
552 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
553
554 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
555 BUG_ON(bucket == -1);
556
557 mutex_unlock(&ca->set->bucket_lock);
558 prio_io(ca, bucket, REQ_OP_WRITE, 0);
559 mutex_lock(&ca->set->bucket_lock);
560
561 ca->prio_buckets[i] = bucket;
562 atomic_dec_bug(&ca->buckets[bucket].pin);
563 }
564
565 mutex_unlock(&ca->set->bucket_lock);
566
567 bch_journal_meta(ca->set, &cl);
568 closure_sync(&cl);
569
570 mutex_lock(&ca->set->bucket_lock);
571
 /*
  * Don't want the old priorities to get garbage collected until after we
  * finish writing the new ones, and they're journalled
  */
576 for (i = 0; i < prio_buckets(ca); i++) {
577 if (ca->prio_last_buckets[i])
578 __bch_bucket_free(ca,
579 &ca->buckets[ca->prio_last_buckets[i]]);
580
581 ca->prio_last_buckets[i] = ca->prio_buckets[i];
582 }
583}
584
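/*
 * Read the packed priorities/gens back in at registration time, walking the
 * on-disk chain of prio buckets starting from the bucket recorded in the
 * journal.
 */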
585static void prio_read(struct cache *ca, uint64_t bucket)
586{
587 struct prio_set *p = ca->disk_buckets;
588 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
589 struct bucket *b;
590 unsigned bucket_nr = 0;
591
592 for (b = ca->buckets;
593 b < ca->buckets + ca->sb.nbuckets;
594 b++, d++) {
595 if (d == end) {
596 ca->prio_buckets[bucket_nr] = bucket;
597 ca->prio_last_buckets[bucket_nr] = bucket;
598 bucket_nr++;
599
600 prio_io(ca, bucket, REQ_OP_READ, 0);
601
602 if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
603 pr_warn("bad csum reading priorities");
604
605 if (p->magic != pset_magic(&ca->sb))
606 pr_warn("bad magic reading priorities");
607
608 bucket = p->next_bucket;
609 d = p->data;
610 }
611
612 b->prio = le16_to_cpu(d->prio);
613 b->gen = b->last_gc = d->gen;
614 }
615}
616
/* Bcache device */

619static int open_dev(struct block_device *b, fmode_t mode)
620{
621 struct bcache_device *d = b->bd_disk->private_data;
622 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
623 return -ENXIO;
624
625 closure_get(&d->cl);
626 return 0;
627}
628
629static void release_dev(struct gendisk *b, fmode_t mode)
630{
631 struct bcache_device *d = b->private_data;
632 closure_put(&d->cl);
633}
634
635static int ioctl_dev(struct block_device *b, fmode_t mode,
636 unsigned int cmd, unsigned long arg)
637{
638 struct bcache_device *d = b->bd_disk->private_data;
639 struct cached_dev *dc = container_of(d, struct cached_dev, disk);
640
641 if (dc->io_disable)
642 return -EIO;
643
644 return d->ioctl(d, mode, cmd, arg);
645}
646
647static const struct block_device_operations bcache_ops = {
648 .open = open_dev,
649 .release = release_dev,
650 .ioctl = ioctl_dev,
651 .owner = THIS_MODULE,
652};
653
654void bcache_device_stop(struct bcache_device *d)
655{
656 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
657 closure_queue(&d->cl);
658}
659
660static void bcache_device_unlink(struct bcache_device *d)
661{
662 lockdep_assert_held(&bch_register_lock);
663
664 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
665 unsigned i;
666 struct cache *ca;
667
668 sysfs_remove_link(&d->c->kobj, d->name);
669 sysfs_remove_link(&d->kobj, "cache");
670
671 for_each_cache(ca, d->c, i)
672 bd_unlink_disk_holder(ca->bdev, d->disk);
673 }
674}
675
676static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
677 const char *name)
678{
679 unsigned i;
680 struct cache *ca;
681
682 for_each_cache(ca, d->c, i)
683 bd_link_disk_holder(ca->bdev, d->disk);
684
685 snprintf(d->name, BCACHEDEVNAME_SIZE,
686 "%s%u", name, d->id);
687
688 WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
689 sysfs_create_link(&c->kobj, &d->kobj, d->name),
690 "Couldn't create device <-> cache set symlinks");
691
692 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
693}
694
695static void bcache_device_detach(struct bcache_device *d)
696{
697 lockdep_assert_held(&bch_register_lock);
698
699 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
700 struct uuid_entry *u = d->c->uuids + d->id;
701
702 SET_UUID_FLASH_ONLY(u, 0);
703 memcpy(u->uuid, invalid_uuid, 16);
704 u->invalidated = cpu_to_le32(get_seconds());
705 bch_uuid_write(d->c);
706 }
707
708 bcache_device_unlink(d);
709
710 d->c->devices[d->id] = NULL;
711 closure_put(&d->c->caching);
712 d->c = NULL;
713}
714
715static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
716 unsigned id)
717{
718 d->id = id;
719 d->c = c;
720 c->devices[id] = d;
721
722 if (id >= c->devices_max_used)
723 c->devices_max_used = id + 1;
724
725 closure_get(&c->caching);
726}
727
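/*
 * Each bcache device owns BCACHE_MINORS consecutive minor numbers; these
 * helpers convert between a device index and its first minor.
 */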
728static inline int first_minor_to_idx(int first_minor)
729{
730 return (first_minor/BCACHE_MINORS);
731}
732
733static inline int idx_to_first_minor(int idx)
734{
735 return (idx * BCACHE_MINORS);
736}
737
738static void bcache_device_free(struct bcache_device *d)
739{
740 lockdep_assert_held(&bch_register_lock);
741
742 pr_info("%s stopped", d->disk->disk_name);
743
744 if (d->c)
745 bcache_device_detach(d);
746 if (d->disk && d->disk->flags & GENHD_FL_UP)
747 del_gendisk(d->disk);
748 if (d->disk && d->disk->queue)
749 blk_cleanup_queue(d->disk->queue);
750 if (d->disk) {
751 ida_simple_remove(&bcache_device_idx,
752 first_minor_to_idx(d->disk->first_minor));
753 put_disk(d->disk);
754 }
755
756 bioset_exit(&d->bio_split);
757 kvfree(d->full_dirty_stripes);
758 kvfree(d->stripe_sectors_dirty);
759
760 closure_debug_destroy(&d->cl);
761}
762
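/*
 * Set up the generic bcache_device pieces: dirty stripe bookkeeping, the
 * bio_split bioset, and a gendisk (with BCACHE_MINORS minors) plus its
 * request queue and limits.
 */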
763static int bcache_device_init(struct bcache_device *d, unsigned block_size,
764 sector_t sectors)
765{
766 struct request_queue *q;
767 const size_t max_stripes = min_t(size_t, INT_MAX,
768 SIZE_MAX / sizeof(atomic_t));
769 size_t n;
770 int idx;
771
772 if (!d->stripe_size)
773 d->stripe_size = 1 << 31;
774
775 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
776
777 if (!d->nr_stripes || d->nr_stripes > max_stripes) {
778 pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
779 (unsigned)d->nr_stripes);
780 return -ENOMEM;
781 }
782
783 n = d->nr_stripes * sizeof(atomic_t);
784 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
785 if (!d->stripe_sectors_dirty)
786 return -ENOMEM;
787
788 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
789 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
790 if (!d->full_dirty_stripes)
791 return -ENOMEM;
792
793 idx = ida_simple_get(&bcache_device_idx, 0,
794 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
795 if (idx < 0)
796 return idx;
797
798 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
799 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
800 !(d->disk = alloc_disk(BCACHE_MINORS))) {
801 ida_simple_remove(&bcache_device_idx, idx);
802 return -ENOMEM;
803 }
804
805 set_capacity(d->disk, sectors);
806 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
807
808 d->disk->major = bcache_major;
809 d->disk->first_minor = idx_to_first_minor(idx);
810 d->disk->fops = &bcache_ops;
811 d->disk->private_data = d;
812
813 q = blk_alloc_queue(GFP_KERNEL);
814 if (!q)
815 return -ENOMEM;
816
817 blk_queue_make_request(q, NULL);
818 d->disk->queue = q;
819 q->queuedata = d;
820 q->backing_dev_info->congested_data = d;
821 q->limits.max_hw_sectors = UINT_MAX;
822 q->limits.max_sectors = UINT_MAX;
823 q->limits.max_segment_size = UINT_MAX;
824 q->limits.max_segments = BIO_MAX_PAGES;
825 blk_queue_max_discard_sectors(q, UINT_MAX);
826 q->limits.discard_granularity = 512;
827 q->limits.io_min = block_size;
828 q->limits.logical_block_size = block_size;
829 q->limits.physical_block_size = block_size;
830 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
831 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
832 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
833
834 blk_queue_write_cache(q, true, true);
835
836 return 0;
837}
838
/* Cached device */

841static void calc_cached_dev_sectors(struct cache_set *c)
842{
843 uint64_t sectors = 0;
844 struct cached_dev *dc;
845
846 list_for_each_entry(dc, &c->cached_devs, list)
847 sectors += bdev_sectors(dc->bdev);
848
849 c->cached_dev_sectors = sectors;
850}
851
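/* Seconds a backing device may stay offline before its bcache device is stopped */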
852#define BACKING_DEV_OFFLINE_TIMEOUT 5
853static int cached_dev_status_update(void *arg)
854{
855 struct cached_dev *dc = arg;
856 struct request_queue *q;
857
 /*
  * Once a second, check whether the backing device's request queue is
  * dying. After BACKING_DEV_OFFLINE_TIMEOUT consecutive seconds offline,
  * disable I/O to the bcache device and stop it.
  */
863 while (!kthread_should_stop() && !dc->io_disable) {
864 q = bdev_get_queue(dc->bdev);
865 if (blk_queue_dying(q))
866 dc->offline_seconds++;
867 else
868 dc->offline_seconds = 0;
869
870 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
871 pr_err("%s: device offline for %d seconds",
872 dc->backing_dev_name,
873 BACKING_DEV_OFFLINE_TIMEOUT);
874 pr_err("%s: disable I/O request due to backing "
875 "device offline", dc->disk.name);
876 dc->io_disable = true;
 /* let others know earlier that io_disable is true */
878 smp_mb();
879 bcache_device_stop(&dc->disk);
880 break;
881 }
882 schedule_timeout_interruptible(HZ);
883 }
884
885 wait_for_kthread_stop();
886 return 0;
887}
888
889
890void bch_cached_dev_run(struct cached_dev *dc)
891{
892 struct bcache_device *d = &dc->disk;
893 char buf[SB_LABEL_SIZE + 1];
894 char *env[] = {
895 "DRIVER=bcache",
896 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
897 NULL,
898 NULL,
899 };
900
901 memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
902 buf[SB_LABEL_SIZE] = '\0';
903 env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);
904
905 if (atomic_xchg(&dc->running, 1)) {
906 kfree(env[1]);
907 kfree(env[2]);
908 return;
909 }
910
911 if (!d->c &&
912 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
913 struct closure cl;
914 closure_init_stack(&cl);
915
916 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
917 bch_write_bdev_super(dc, &cl);
918 closure_sync(&cl);
919 }
920
921 add_disk(d->disk);
922 bd_link_disk_holder(dc->bdev, dc->disk.disk);
 /* won't show up in the uevent file, use udevadm monitor -e instead
  * only class / kset properties are persistent */
925 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
926 kfree(env[1]);
927 kfree(env[2]);
928
929 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
930 sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
931 pr_debug("error creating sysfs link");
932
933 dc->status_update_thread = kthread_run(cached_dev_status_update,
934 dc, "bcache_status_update");
935 if (IS_ERR(dc->status_update_thread)) {
936 pr_warn("failed to create bcache_status_update kthread, "
937 "continue to run without monitoring backing "
938 "device status");
939 }
940}
941
/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means routine of the delayed
 * work dc->writeback_rate_update is running. Wait until the routine
 * quits (BCACHE_DEV_RATE_DW_RUNNING is clear), then continue to
 * cancel it. If BCACHE_DEV_RATE_DW_RUNNING is not clear after time_out
 * seconds, give up waiting here and continue to cancel it too.
 */
949static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
950{
951 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
952
953 do {
954 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
955 &dc->disk.flags))
956 break;
957 time_out--;
958 schedule_timeout_interruptible(1);
959 } while (time_out > 0);
960
961 if (time_out == 0)
962 pr_warn("give up waiting for dc->writeback_write_update to quit");
963
964 cancel_delayed_work_sync(&dc->writeback_rate_update);
965}
966
967static void cached_dev_detach_finish(struct work_struct *w)
968{
969 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
970 struct closure cl;
971 closure_init_stack(&cl);
972
973 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
974 BUG_ON(refcount_read(&dc->count));
975
976 mutex_lock(&bch_register_lock);
977
978 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
979 cancel_writeback_rate_update_dwork(dc);
980
981 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
982 kthread_stop(dc->writeback_thread);
983 dc->writeback_thread = NULL;
984 }
985
986 memset(&dc->sb.set_uuid, 0, 16);
987 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
988
989 bch_write_bdev_super(dc, &cl);
990 closure_sync(&cl);
991
992 bcache_device_detach(&dc->disk);
993 list_move(&dc->list, &uncached_devices);
994
995 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
996 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
997
998 mutex_unlock(&bch_register_lock);
999
1000 pr_info("Caching disabled for %s", dc->backing_dev_name);
1001
 /* Drop ref we took in cached_dev_detach() */
1003 closure_put(&dc->disk.cl);
1004}
1005
1006void bch_cached_dev_detach(struct cached_dev *dc)
1007{
1008 lockdep_assert_held(&bch_register_lock);
1009
1010 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1011 return;
1012
1013 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1014 return;
1015
 /*
  * Block the device from being closed and freed until we're finished
  * detaching
  */
1020 closure_get(&dc->disk.cl);
1021
1022 bch_writeback_queue(dc);
1023
1024 cached_dev_put(dc);
1025}
1026
1027int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1028 uint8_t *set_uuid)
1029{
1030 uint32_t rtime = cpu_to_le32(get_seconds());
1031 struct uuid_entry *u;
1032 struct cached_dev *exist_dc, *t;
1033
1034 if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
1035 (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
1036 return -ENOENT;
1037
1038 if (dc->disk.c) {
1039 pr_err("Can't attach %s: already attached",
1040 dc->backing_dev_name);
1041 return -EINVAL;
1042 }
1043
1044 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1045 pr_err("Can't attach %s: shutting down",
1046 dc->backing_dev_name);
1047 return -EINVAL;
1048 }
1049
1050 if (dc->sb.block_size < c->sb.block_size) {
 /* Will die */
1052 pr_err("Couldn't attach %s: block size less than set's block size",
1053 dc->backing_dev_name);
1054 return -EINVAL;
1055 }
1056
 /* Check whether already attached */
1058 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1059 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1060 pr_err("Tried to attach %s but duplicate UUID already attached",
1061 dc->backing_dev_name);
1062
1063 return -EINVAL;
1064 }
1065 }
1066
1067 u = uuid_find(c, dc->sb.uuid);
1068
1069 if (u &&
1070 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1071 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1072 memcpy(u->uuid, invalid_uuid, 16);
1073 u->invalidated = cpu_to_le32(get_seconds());
1074 u = NULL;
1075 }
1076
1077 if (!u) {
1078 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1079 pr_err("Couldn't find uuid for %s in set",
1080 dc->backing_dev_name);
1081 return -ENOENT;
1082 }
1083
1084 u = uuid_find_empty(c);
1085 if (!u) {
1086 pr_err("Not caching %s, no room for UUID",
1087 dc->backing_dev_name);
1088 return -EINVAL;
1089 }
1090 }
1091
 /*
  * Deadlocks since we're called via sysfs...
  * sysfs_remove_file(&dc->kobj, &sysfs_attach);
  */
1096 if (bch_is_zero(u->uuid, 16)) {
1097 struct closure cl;
1098 closure_init_stack(&cl);
1099
1100 memcpy(u->uuid, dc->sb.uuid, 16);
1101 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1102 u->first_reg = u->last_reg = rtime;
1103 bch_uuid_write(c);
1104
1105 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
1106 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1107
1108 bch_write_bdev_super(dc, &cl);
1109 closure_sync(&cl);
1110 } else {
1111 u->last_reg = rtime;
1112 bch_uuid_write(c);
1113 }
1114
1115 bcache_device_attach(&dc->disk, c, u - c->uuids);
1116 list_move(&dc->list, &c->cached_devs);
1117 calc_cached_dev_sectors(c);
1118
1119 smp_wmb();
 /*
  * dc->c must be set before dc->count != 0 - paired with the mb in
  * cached_dev_get()
  */
1124 refcount_set(&dc->count, 1);
1125
 /* Block writeback thread, but spawn it */
1127 down_write(&dc->writeback_lock);
1128 if (bch_cached_dev_writeback_start(dc)) {
1129 up_write(&dc->writeback_lock);
1130 return -ENOMEM;
1131 }
1132
1133 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1134 bch_sectors_dirty_init(&dc->disk);
1135 atomic_set(&dc->has_dirty, 1);
1136 bch_writeback_queue(dc);
1137 }
1138
1139 bch_cached_dev_run(dc);
1140 bcache_device_link(&dc->disk, c, "bdev");
1141
 /* Allow the writeback thread to proceed */
1143 up_write(&dc->writeback_lock);
1144
1145 pr_info("Caching %s as %s on set %pU",
1146 dc->backing_dev_name,
1147 dc->disk.disk->disk_name,
1148 dc->disk.c->sb.set_uuid);
1149 return 0;
1150}
1151
1152void bch_cached_dev_release(struct kobject *kobj)
1153{
1154 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1155 disk.kobj);
1156 kfree(dc);
1157 module_put(THIS_MODULE);
1158}
1159
1160static void cached_dev_free(struct closure *cl)
1161{
1162 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1163
1164 mutex_lock(&bch_register_lock);
1165
1166 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1167 cancel_writeback_rate_update_dwork(dc);
1168
1169 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1170 kthread_stop(dc->writeback_thread);
1171 if (dc->writeback_write_wq)
1172 destroy_workqueue(dc->writeback_write_wq);
1173 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1174 kthread_stop(dc->status_update_thread);
1175
1176 if (atomic_read(&dc->running))
1177 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1178 bcache_device_free(&dc->disk);
1179 list_del(&dc->list);
1180
1181 mutex_unlock(&bch_register_lock);
1182
1183 if (!IS_ERR_OR_NULL(dc->bdev))
1184 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1185
1186 wake_up(&unregister_wait);
1187
1188 kobject_put(&dc->disk.kobj);
1189}
1190
1191static void cached_dev_flush(struct closure *cl)
1192{
1193 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1194 struct bcache_device *d = &dc->disk;
1195
1196 mutex_lock(&bch_register_lock);
1197 bcache_device_unlink(d);
1198 mutex_unlock(&bch_register_lock);
1199
1200 bch_cache_accounting_destroy(&dc->accounting);
1201 kobject_del(&d->kobj);
1202
1203 continue_at(cl, cached_dev_free, system_wq);
1204}
1205
1206static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
1207{
1208 int ret;
1209 struct io *io;
1210 struct request_queue *q = bdev_get_queue(dc->bdev);
1211
1212 __module_get(THIS_MODULE);
1213 INIT_LIST_HEAD(&dc->list);
1214 closure_init(&dc->disk.cl, NULL);
1215 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1216 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1217 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1218 sema_init(&dc->sb_write_mutex, 1);
1219 INIT_LIST_HEAD(&dc->io_lru);
1220 spin_lock_init(&dc->io_lock);
1221 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1222
1223 dc->sequential_cutoff = 4 << 20;
1224
1225 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1226 list_add(&io->lru, &dc->io_lru);
1227 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1228 }
1229
1230 dc->disk.stripe_size = q->limits.io_opt >> 9;
1231
1232 if (dc->disk.stripe_size)
1233 dc->partial_stripes_expensive =
1234 q->limits.raid_partial_stripes_expensive;
1235
1236 ret = bcache_device_init(&dc->disk, block_size,
1237 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1238 if (ret)
1239 return ret;
1240
1241 dc->disk.disk->queue->backing_dev_info->ra_pages =
1242 max(dc->disk.disk->queue->backing_dev_info->ra_pages,
1243 q->backing_dev_info->ra_pages);
1244
1245 atomic_set(&dc->io_errors, 0);
1246 dc->io_disable = false;
1247 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1248
1249 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1250
1251 bch_cached_dev_request_init(dc);
1252 bch_cached_dev_writeback_init(dc);
1253 return 0;
1254}
1255
/* Cached device - bcache superblock */

1258static void register_bdev(struct cache_sb *sb, struct page *sb_page,
1259 struct block_device *bdev,
1260 struct cached_dev *dc)
1261{
1262 const char *err = "cannot allocate memory";
1263 struct cache_set *c;
1264
1265 bdevname(bdev, dc->backing_dev_name);
1266 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1267 dc->bdev = bdev;
1268 dc->bdev->bd_holder = dc;
1269
1270 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1271 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1272 get_page(sb_page);
1273
1274
1275 if (cached_dev_init(dc, sb->block_size << 9))
1276 goto err;
1277
1278 err = "error creating kobject";
1279 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1280 "bcache"))
1281 goto err;
1282 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1283 goto err;
1284
1285 pr_info("registered backing device %s", dc->backing_dev_name);
1286
1287 list_add(&dc->list, &uncached_devices);
1288 list_for_each_entry(c, &bch_cache_sets, list)
1289 bch_cached_dev_attach(dc, c, NULL);
1290
1291 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1292 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
1293 bch_cached_dev_run(dc);
1294
1295 return;
1296err:
1297 pr_notice("error %s: %s", dc->backing_dev_name, err);
1298 bcache_device_stop(&dc->disk);
1299}
1300
/* Flash only volumes */

1303void bch_flash_dev_release(struct kobject *kobj)
1304{
1305 struct bcache_device *d = container_of(kobj, struct bcache_device,
1306 kobj);
1307 kfree(d);
1308}
1309
1310static void flash_dev_free(struct closure *cl)
1311{
1312 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1313 mutex_lock(&bch_register_lock);
1314 bcache_device_free(d);
1315 mutex_unlock(&bch_register_lock);
1316 kobject_put(&d->kobj);
1317}
1318
1319static void flash_dev_flush(struct closure *cl)
1320{
1321 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1322
1323 mutex_lock(&bch_register_lock);
1324 bcache_device_unlink(d);
1325 mutex_unlock(&bch_register_lock);
1326 kobject_del(&d->kobj);
1327 continue_at(cl, flash_dev_free, system_wq);
1328}
1329
1330static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1331{
1332 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1333 GFP_KERNEL);
1334 if (!d)
1335 return -ENOMEM;
1336
1337 closure_init(&d->cl, NULL);
1338 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1339
1340 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1341
1342 if (bcache_device_init(d, block_bytes(c), u->sectors))
1343 goto err;
1344
1345 bcache_device_attach(d, c, u - c->uuids);
1346 bch_sectors_dirty_init(d);
1347 bch_flash_dev_request_init(d);
1348 add_disk(d->disk);
1349
1350 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1351 goto err;
1352
1353 bcache_device_link(d, c, "volume");
1354
1355 return 0;
1356err:
1357 kobject_put(&d->kobj);
1358 return -ENOMEM;
1359}
1360
1361static int flash_devs_run(struct cache_set *c)
1362{
1363 int ret = 0;
1364 struct uuid_entry *u;
1365
1366 for (u = c->uuids;
1367 u < c->uuids + c->nr_uuids && !ret;
1368 u++)
1369 if (UUID_FLASH_ONLY(u))
1370 ret = flash_dev_run(c, u);
1371
1372 return ret;
1373}
1374
1375int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1376{
1377 struct uuid_entry *u;
1378
1379 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1380 return -EINTR;
1381
1382 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1383 return -EPERM;
1384
1385 u = uuid_find_empty(c);
1386 if (!u) {
1387 pr_err("Can't create volume, no room for UUID");
1388 return -EINVAL;
1389 }
1390
1391 get_random_bytes(u->uuid, 16);
1392 memset(u->label, 0, 32);
1393 u->first_reg = u->last_reg = cpu_to_le32(get_seconds());
1394
1395 SET_UUID_FLASH_ONLY(u, 1);
1396 u->sectors = size >> 9;
1397
1398 bch_uuid_write(c);
1399
1400 return flash_dev_run(c, u);
1401}
1402
1403bool bch_cached_dev_error(struct cached_dev *dc)
1404{
1405 struct cache_set *c;
1406
1407 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1408 return false;
1409
1410 dc->io_disable = true;
1411
1412 smp_mb();
1413
1414 pr_err("stop %s: too many IO errors on backing device %s\n",
1415 dc->disk.disk->disk_name, dc->backing_dev_name);
1416
 /*
  * If the cached device is still attached to a cache set,
  * even though dc->io_disable is true and no more I/O requests
  * are accepted, cache-set internal I/O (writeback scan or
  * garbage collection) may still prevent the bcache device from
  * being stopped. So CACHE_SET_IO_DISABLE is set on c->flags
  * too, to make the internal I/O to the cache device be
  * rejected and stopped immediately.
  * If c is NULL, the bcache device is not attached to any
  * cache set, so no CACHE_SET_IO_DISABLE bit is needed.
  */
1428 c = dc->disk.c;
1429 if (c && test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1430 pr_info("CACHE_SET_IO_DISABLE already set");
1431
1432 bcache_device_stop(&dc->disk);
1433 return true;
1434}
1435
/* Cache set */

1438__printf(2, 3)
1439bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1440{
1441 va_list args;
1442
1443 if (c->on_error != ON_ERROR_PANIC &&
1444 test_bit(CACHE_SET_STOPPING, &c->flags))
1445 return false;
1446
1447 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1448 pr_info("CACHE_SET_IO_DISABLE already set");
1449
 /*
  * XXX: we can be called from atomic context
  * acquire_console_sem();
  */
1454 printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);
1455
1456 va_start(args, fmt);
1457 vprintk(fmt, args);
1458 va_end(args);
1459
1460 printk(", disabling caching\n");
1461
1462 if (c->on_error == ON_ERROR_PANIC)
1463 panic("panic forced after error\n");
1464
1465 bch_cache_set_unregister(c);
1466 return true;
1467}
1468
1469void bch_cache_set_release(struct kobject *kobj)
1470{
1471 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1472 kfree(c);
1473 module_put(THIS_MODULE);
1474}
1475
1476static void cache_set_free(struct closure *cl)
1477{
1478 struct cache_set *c = container_of(cl, struct cache_set, cl);
1479 struct cache *ca;
1480 unsigned i;
1481
1482 if (!IS_ERR_OR_NULL(c->debug))
1483 debugfs_remove(c->debug);
1484
1485 bch_open_buckets_free(c);
1486 bch_btree_cache_free(c);
1487 bch_journal_free(c);
1488
1489 for_each_cache(ca, c, i)
1490 if (ca) {
1491 ca->set = NULL;
1492 c->cache[ca->sb.nr_this_dev] = NULL;
1493 kobject_put(&ca->kobj);
1494 }
1495
1496 bch_bset_sort_state_free(&c->sort);
1497 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1498
1499 if (c->moving_gc_wq)
1500 destroy_workqueue(c->moving_gc_wq);
1501 bioset_exit(&c->bio_split);
1502 mempool_exit(&c->fill_iter);
1503 mempool_exit(&c->bio_meta);
1504 mempool_exit(&c->search);
1505 kfree(c->devices);
1506
1507 mutex_lock(&bch_register_lock);
1508 list_del(&c->list);
1509 mutex_unlock(&bch_register_lock);
1510
1511 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1512 wake_up(&unregister_wait);
1513
1514 closure_debug_destroy(&c->cl);
1515 kobject_put(&c->kobj);
1516}
1517
1518static void cache_set_flush(struct closure *cl)
1519{
1520 struct cache_set *c = container_of(cl, struct cache_set, caching);
1521 struct cache *ca;
1522 struct btree *b;
1523 unsigned i;
1524
1525 bch_cache_accounting_destroy(&c->accounting);
1526
1527 kobject_put(&c->internal);
1528 kobject_del(&c->kobj);
1529
1530 if (c->gc_thread)
1531 kthread_stop(c->gc_thread);
1532
1533 if (!IS_ERR_OR_NULL(c->root))
1534 list_add(&c->root->list, &c->btree_cache);
1535
 /* Should skip this if we're unregistering because of an error */
1537 list_for_each_entry(b, &c->btree_cache, list) {
1538 mutex_lock(&b->write_lock);
1539 if (btree_node_dirty(b))
1540 __bch_btree_node_write(b, NULL);
1541 mutex_unlock(&b->write_lock);
1542 }
1543
1544 for_each_cache(ca, c, i)
1545 if (ca->alloc_thread)
1546 kthread_stop(ca->alloc_thread);
1547
1548 if (c->journal.cur) {
1549 cancel_delayed_work_sync(&c->journal.work);
 /* flush last journal entry if needed */
1551 c->journal.work.work.func(&c->journal.work.work);
1552 }
1553
1554 closure_return(cl);
1555}
/*
 * Only called when CACHE_SET_IO_DISABLE is set, which means the cache set
 * is unregistering due to too many I/O errors. Whether the attached bcache
 * device gets stopped depends on dc->stop_when_cache_set_failed:
 *
 * - If stop_when_cache_set_failed is "always", the bcache device is always
 *   stopped along with the broken cache set.
 *
 * - If stop_when_cache_set_failed is "auto" and the cache is dirty, the
 *   bcache device is stopped too: the dirty data on the broken cache can no
 *   longer be flushed back, so continuing to serve I/O could expose stale
 *   data on the backing device.
 *
 * - If stop_when_cache_set_failed is "auto" and the cache is clean, the
 *   bcache device is kept alive; all data is already on the backing device,
 *   so it can keep running without the cache.
 */
1573static void conditional_stop_bcache_device(struct cache_set *c,
1574 struct bcache_device *d,
1575 struct cached_dev *dc)
1576{
1577 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1578 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
1579 d->disk->disk_name, c->sb.set_uuid);
1580 bcache_device_stop(d);
1581 } else if (atomic_read(&dc->has_dirty)) {
 /*
  * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
  * and dc->has_dirty == 1
  */
1586 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
1587 d->disk->disk_name);
1588
 /*
  * There is a small time gap where the cache set is released
  * but the bcache device is not: during that gap, regular I/O
  * requests go directly to the backing device with no cache set
  * attached, which in writeback mode with a dirty cache could
  * leave inconsistent data on the backing device. Therefore,
  * before calling bcache_device_stop() because of a broken
  * cache set, dc->io_disable is explicitly set to true.
  */
1599 dc->io_disable = true;
 /* make others know io_disable is true earlier */
1601 smp_mb();
1602 bcache_device_stop(d);
1603 } else {
 /*
  * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
  * and dc->has_dirty == 0
  */
1608 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
1609 d->disk->disk_name);
1610 }
1611}
1612
1613static void __cache_set_unregister(struct closure *cl)
1614{
1615 struct cache_set *c = container_of(cl, struct cache_set, caching);
1616 struct cached_dev *dc;
1617 struct bcache_device *d;
1618 size_t i;
1619
1620 mutex_lock(&bch_register_lock);
1621
1622 for (i = 0; i < c->devices_max_used; i++) {
1623 d = c->devices[i];
1624 if (!d)
1625 continue;
1626
1627 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1628 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1629 dc = container_of(d, struct cached_dev, disk);
1630 bch_cached_dev_detach(dc);
1631 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1632 conditional_stop_bcache_device(c, d, dc);
1633 } else {
1634 bcache_device_stop(d);
1635 }
1636 }
1637
1638 mutex_unlock(&bch_register_lock);
1639
1640 continue_at(cl, cache_set_flush, system_wq);
1641}
1642
1643void bch_cache_set_stop(struct cache_set *c)
1644{
1645 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1646 closure_queue(&c->caching);
1647}
1648
1649void bch_cache_set_unregister(struct cache_set *c)
1650{
1651 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1652 bch_cache_set_stop(c);
1653}
1654
1655#define alloc_bucket_pages(gfp, c) \
1656 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1657
1658struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1659{
1660 int iter_size;
1661 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1662 if (!c)
1663 return NULL;
1664
1665 __module_get(THIS_MODULE);
1666 closure_init(&c->cl, NULL);
1667 set_closure_fn(&c->cl, cache_set_free, system_wq);
1668
1669 closure_init(&c->caching, &c->cl);
1670 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1671
 /* Maybe create continuation closure? */
1673 closure_set_stopped(&c->cl);
1674 closure_put(&c->cl);
1675
1676 kobject_init(&c->kobj, &bch_cache_set_ktype);
1677 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1678
1679 bch_cache_accounting_init(&c->accounting, &c->cl);
1680
1681 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1682 c->sb.block_size = sb->block_size;
1683 c->sb.bucket_size = sb->bucket_size;
1684 c->sb.nr_in_set = sb->nr_in_set;
1685 c->sb.last_mount = sb->last_mount;
1686 c->bucket_bits = ilog2(sb->bucket_size);
1687 c->block_bits = ilog2(sb->block_size);
1688 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1689 c->devices_max_used = 0;
1690 c->btree_pages = bucket_pages(c);
1691 if (c->btree_pages > BTREE_MAX_PAGES)
1692 c->btree_pages = max_t(int, c->btree_pages / 4,
1693 BTREE_MAX_PAGES);
1694
1695 sema_init(&c->sb_write_mutex, 1);
1696 mutex_init(&c->bucket_lock);
1697 init_waitqueue_head(&c->btree_cache_wait);
1698 init_waitqueue_head(&c->bucket_wait);
1699 init_waitqueue_head(&c->gc_wait);
1700 sema_init(&c->uuid_write_mutex, 1);
1701
1702 spin_lock_init(&c->btree_gc_time.lock);
1703 spin_lock_init(&c->btree_split_time.lock);
1704 spin_lock_init(&c->btree_read_time.lock);
1705
1706 bch_moving_init_cache_set(c);
1707
1708 INIT_LIST_HEAD(&c->list);
1709 INIT_LIST_HEAD(&c->cached_devs);
1710 INIT_LIST_HEAD(&c->btree_cache);
1711 INIT_LIST_HEAD(&c->btree_cache_freeable);
1712 INIT_LIST_HEAD(&c->btree_cache_freed);
1713 INIT_LIST_HEAD(&c->data_buckets);
1714
1715 iter_size = (sb->bucket_size / sb->block_size + 1) *
1716 sizeof(struct btree_iter_set);
1717
1718 if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
1719 mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
1720 mempool_init_kmalloc_pool(&c->bio_meta, 2,
1721 sizeof(struct bbio) + sizeof(struct bio_vec) *
1722 bucket_pages(c)) ||
1723 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1724 bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1725 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
1726 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1727 !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1728 WQ_MEM_RECLAIM, 0)) ||
1729 bch_journal_alloc(c) ||
1730 bch_btree_cache_alloc(c) ||
1731 bch_open_buckets_alloc(c) ||
1732 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1733 goto err;
1734
1735 c->congested_read_threshold_us = 2000;
1736 c->congested_write_threshold_us = 20000;
1737 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1738 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1739
1740 return c;
1741err:
1742 bch_cache_set_unregister(c);
1743 return NULL;
1744}
1745
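/*
 * Bring a fully assembled cache set online: read the journal and bucket
 * priorities (or initialize a brand new set), start the allocator and gc
 * threads, then attach any waiting backing devices and flash volumes.
 */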
1746static void run_cache_set(struct cache_set *c)
1747{
1748 const char *err = "cannot allocate memory";
1749 struct cached_dev *dc, *t;
1750 struct cache *ca;
1751 struct closure cl;
1752 unsigned i;
1753
1754 closure_init_stack(&cl);
1755
1756 for_each_cache(ca, c, i)
1757 c->nbuckets += ca->sb.nbuckets;
1758 set_gc_sectors(c);
1759
1760 if (CACHE_SYNC(&c->sb)) {
1761 LIST_HEAD(journal);
1762 struct bkey *k;
1763 struct jset *j;
1764
1765 err = "cannot allocate memory for journal";
1766 if (bch_journal_read(c, &journal))
1767 goto err;
1768
1769 pr_debug("btree_journal_read() done");
1770
1771 err = "no journal entries found";
1772 if (list_empty(&journal))
1773 goto err;
1774
1775 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1776
1777 err = "IO error reading priorities";
1778 for_each_cache(ca, c, i)
1779 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1780
 /*
  * If prio_read() fails it'll call cache_set_error and we'll
  * tear everything down right away, but if we perhaps checked
  * sooner we could avoid journal replay.
  */

1787 k = &j->btree_root;
1788
1789 err = "bad btree root";
1790 if (__bch_btree_ptr_invalid(c, k))
1791 goto err;
1792
1793 err = "error reading btree root";
1794 c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
1795 if (IS_ERR_OR_NULL(c->root))
1796 goto err;
1797
1798 list_del_init(&c->root->list);
1799 rw_unlock(true, c->root);
1800
1801 err = uuid_read(c, j, &cl);
1802 if (err)
1803 goto err;
1804
1805 err = "error in recovery";
1806 if (bch_btree_check(c))
1807 goto err;
1808
1809 bch_journal_mark(c, &journal);
1810 bch_initial_gc_finish(c);
1811 pr_debug("btree_check() done");
1812
 /*
  * bcache_journal_next() can't happen sooner, or
  * btree_gc_finish() will give spurious errors about last_gc >
  * gc_gen - this is a hack but oh well.
  */
1818 bch_journal_next(&c->journal);
1819
1820 err = "error starting allocator thread";
1821 for_each_cache(ca, c, i)
1822 if (bch_cache_allocator_start(ca))
1823 goto err;
1824
 /*
  * First place it's safe to allocate: btree_check() and
  * btree_gc_finish() have to run before we have buckets to
  * allocate, and bch_bucket_alloc_set() might cause a journal
  * entry to be written so bcache_journal_next() has to be called
  * first.
  *
  * If the uuids were in the old format we have to rewrite them
  * before the next journal entry is written:
  */
1835 if (j->version < BCACHE_JSET_VERSION_UUID)
1836 __uuid_write(c);
1837
1838 bch_journal_replay(c, &journal);
1839 } else {
1840 pr_notice("invalidating existing data");
1841
1842 for_each_cache(ca, c, i) {
1843 unsigned j;
1844
1845 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1846 2, SB_JOURNAL_BUCKETS);
1847
1848 for (j = 0; j < ca->sb.keys; j++)
1849 ca->sb.d[j] = ca->sb.first_bucket + j;
1850 }
1851
1852 bch_initial_gc_finish(c);
1853
1854 err = "error starting allocator thread";
1855 for_each_cache(ca, c, i)
1856 if (bch_cache_allocator_start(ca))
1857 goto err;
1858
1859 mutex_lock(&c->bucket_lock);
1860 for_each_cache(ca, c, i)
1861 bch_prio_write(ca);
1862 mutex_unlock(&c->bucket_lock);
1863
1864 err = "cannot allocate new UUID bucket";
1865 if (__uuid_write(c))
1866 goto err;
1867
1868 err = "cannot allocate new btree root";
1869 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
1870 if (IS_ERR_OR_NULL(c->root))
1871 goto err;
1872
1873 mutex_lock(&c->root->write_lock);
1874 bkey_copy_key(&c->root->key, &MAX_KEY);
1875 bch_btree_node_write(c->root, &cl);
1876 mutex_unlock(&c->root->write_lock);
1877
1878 bch_btree_set_root(c->root);
1879 rw_unlock(true, c->root);
1880
 /*
  * We don't want to write the first journal entry until
  * everything is set up - fortunately journal entries won't be
  * written until the SET_CACHE_SYNC() here:
  */
1886 SET_CACHE_SYNC(&c->sb, true);
1887
1888 bch_journal_next(&c->journal);
1889 bch_journal_meta(c, &cl);
1890 }
1891
1892 err = "error starting gc thread";
1893 if (bch_gc_thread_start(c))
1894 goto err;
1895
1896 closure_sync(&cl);
1897 c->sb.last_mount = get_seconds();
1898 bcache_write_super(c);
1899
1900 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1901 bch_cached_dev_attach(dc, c, NULL);
1902
1903 flash_devs_run(c);
1904
1905 set_bit(CACHE_SET_RUNNING, &c->flags);
1906 return;
1907err:
1908 closure_sync(&cl);
1909
1910 bch_cache_set_error(c, "%s", err);
1911}
1912
1913static bool can_attach_cache(struct cache *ca, struct cache_set *c)
1914{
1915 return ca->sb.block_size == c->sb.block_size &&
1916 ca->sb.bucket_size == c->sb.bucket_size &&
1917 ca->sb.nr_in_set == c->sb.nr_in_set;
1918}
1919
1920static const char *register_cache_set(struct cache *ca)
1921{
1922 char buf[12];
1923 const char *err = "cannot allocate memory";
1924 struct cache_set *c;
1925
1926 list_for_each_entry(c, &bch_cache_sets, list)
1927 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
1928 if (c->cache[ca->sb.nr_this_dev])
1929 return "duplicate cache set member";
1930
1931 if (!can_attach_cache(ca, c))
1932 return "cache sb does not match set";
1933
1934 if (!CACHE_SYNC(&ca->sb))
1935 SET_CACHE_SYNC(&c->sb, false);
1936
1937 goto found;
1938 }
1939
1940 c = bch_cache_set_alloc(&ca->sb);
1941 if (!c)
1942 return err;
1943
1944 err = "error creating kobject";
1945 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
1946 kobject_add(&c->internal, &c->kobj, "internal"))
1947 goto err;
1948
1949 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
1950 goto err;
1951
1952 bch_debug_init_cache_set(c);
1953
1954 list_add(&c->list, &bch_cache_sets);
1955found:
1956 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
1957 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
1958 sysfs_create_link(&c->kobj, &ca->kobj, buf))
1959 goto err;
1960
1961 if (ca->sb.seq > c->sb.seq) {
1962 c->sb.version = ca->sb.version;
1963 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
1964 c->sb.flags = ca->sb.flags;
1965 c->sb.seq = ca->sb.seq;
1966 pr_debug("set version = %llu", c->sb.version);
1967 }
1968
1969 kobject_get(&ca->kobj);
1970 ca->set = c;
1971 ca->set->cache[ca->sb.nr_this_dev] = ca;
1972 c->cache_by_alloc[c->caches_loaded++] = ca;
1973
1974 if (c->caches_loaded == c->sb.nr_in_set)
1975 run_cache_set(c);
1976
1977 return NULL;
1978err:
1979 bch_cache_set_unregister(c);
1980 return err;
1981}
1982
/* Cache device */

1985void bch_cache_release(struct kobject *kobj)
1986{
1987 struct cache *ca = container_of(kobj, struct cache, kobj);
1988 unsigned i;
1989
1990 if (ca->set) {
1991 BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
1992 ca->set->cache[ca->sb.nr_this_dev] = NULL;
1993 }
1994
1995 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
1996 kfree(ca->prio_buckets);
1997 vfree(ca->buckets);
1998
1999 free_heap(&ca->heap);
2000 free_fifo(&ca->free_inc);
2001
2002 for (i = 0; i < RESERVE_NR; i++)
2003 free_fifo(&ca->free[i]);
2004
2005 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
2006 put_page(bio_first_page_all(&ca->sb_bio));
2007
2008 if (!IS_ERR_OR_NULL(ca->bdev))
2009 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2010
2011 kfree(ca);
2012 module_put(THIS_MODULE);
2013}
2014
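/*
 * Allocate the in-memory structures for a cache device: the free bucket
 * fifos, free_inc, the heap used for invalidation, the bucket array and the
 * prio/disk bucket buffers.
 */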
2015static int cache_alloc(struct cache *ca)
2016{
2017 size_t free;
2018 size_t btree_buckets;
2019 struct bucket *b;
2020
2021 __module_get(THIS_MODULE);
2022 kobject_init(&ca->kobj, &bch_cache_ktype);
2023
2024 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2025
 /*
  * When ca->sb.njournal_buckets is not zero, a journal exists and
  * bch_journal_replay() may split btree nodes, so buckets of the
  * RESERVE_BTREE type are needed. In the worst case every journal
  * bucket holds valid journal entries and all keys need to be
  * replayed, so reserve as many RESERVE_BTREE buckets as there are
  * journal buckets.
  */
2035 btree_buckets = ca->sb.njournal_buckets ?: 8;
2036 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2037
2038 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets, GFP_KERNEL) ||
2039 !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
2040 !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
2041 !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
2042 !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
2043 !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
2044 !(ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2045 ca->sb.nbuckets))) ||
2046 !(ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2047 prio_buckets(ca), 2),
2048 GFP_KERNEL)) ||
2049 !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
2050 return -ENOMEM;
2051
2052 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2053
2054 for_each_bucket(b, ca)
2055 atomic_set(&b->pin, 0);
2056
2057 return 0;
2058}
2059
2060static int register_cache(struct cache_sb *sb, struct page *sb_page,
2061 struct block_device *bdev, struct cache *ca)
2062{
2063 const char *err = NULL;
2064 int ret = 0;
2065
2066 bdevname(bdev, ca->cache_dev_name);
2067 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2068 ca->bdev = bdev;
2069 ca->bdev->bd_holder = ca;
2070
2071 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
2072 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
2073 get_page(sb_page);
2074
2075 if (blk_queue_discard(bdev_get_queue(bdev)))
2076 ca->discard = CACHE_DISCARD(&ca->sb);
2077
2078 ret = cache_alloc(ca);
2079 if (ret != 0) {
2080 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2081 if (ret == -ENOMEM)
2082 err = "cache_alloc(): -ENOMEM";
2083 else
2084 err = "cache_alloc(): unknown error";
2085 goto err;
2086 }
2087
2088 if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
2089 err = "error calling kobject_add";
2090 ret = -ENOMEM;
2091 goto out;
2092 }
2093
2094 mutex_lock(&bch_register_lock);
2095 err = register_cache_set(ca);
2096 mutex_unlock(&bch_register_lock);
2097
2098 if (err) {
2099 ret = -ENODEV;
2100 goto out;
2101 }
2102
2103 pr_info("registered cache device %s", ca->cache_dev_name);
2104
2105out:
2106 kobject_put(&ca->kobj);
2107
2108err:
2109 if (err)
2110 pr_notice("error %s: %s", ca->cache_dev_name, err);
2111
2112 return ret;
2113}
2114
/* Global interfaces/init */

2117static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
2118 const char *, size_t);
2119
2120kobj_attribute_write(register, register_bcache);
2121kobj_attribute_write(register_quiet, register_bcache);
2122
2123static bool bch_is_open_backing(struct block_device *bdev) {
2124 struct cache_set *c, *tc;
2125 struct cached_dev *dc, *t;
2126
2127 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2128 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2129 if (dc->bdev == bdev)
2130 return true;
2131 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2132 if (dc->bdev == bdev)
2133 return true;
2134 return false;
2135}
2136
2137static bool bch_is_open_cache(struct block_device *bdev) {
2138 struct cache_set *c, *tc;
2139 struct cache *ca;
2140 unsigned i;
2141
2142 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2143 for_each_cache(ca, c, i)
2144 if (ca->bdev == bdev)
2145 return true;
2146 return false;
2147}
2148
2149static bool bch_is_open(struct block_device *bdev) {
2150 return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
2151}
2152
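/*
 * sysfs write handler for /sys/fs/bcache/register (and register_quiet):
 * open the named block device, read its superblock, and register it as
 * either a backing device or a cache device.
 */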
2153static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2154 const char *buffer, size_t size)
2155{
2156 ssize_t ret = size;
2157 const char *err = "cannot allocate memory";
2158 char *path = NULL;
2159 struct cache_sb *sb = NULL;
2160 struct block_device *bdev = NULL;
2161 struct page *sb_page = NULL;
2162
2163 if (!try_module_get(THIS_MODULE))
2164 return -EBUSY;
2165
2166 if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
2167 !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
2168 goto err;
2169
2170 err = "failed to open device";
2171 bdev = blkdev_get_by_path(strim(path),
2172 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2173 sb);
2174 if (IS_ERR(bdev)) {
2175 if (bdev == ERR_PTR(-EBUSY)) {
2176 bdev = lookup_bdev(strim(path));
2177 mutex_lock(&bch_register_lock);
2178 if (!IS_ERR(bdev) && bch_is_open(bdev))
2179 err = "device already registered";
2180 else
2181 err = "device busy";
2182 mutex_unlock(&bch_register_lock);
2183 if (!IS_ERR(bdev))
2184 bdput(bdev);
2185 if (attr == &ksysfs_register_quiet)
2186 goto out;
2187 }
2188 goto err;
2189 }
2190
2191 err = "failed to set blocksize";
2192 if (set_blocksize(bdev, 4096))
2193 goto err_close;
2194
2195 err = read_super(sb, bdev, &sb_page);
2196 if (err)
2197 goto err_close;
2198
2199 err = "failed to register device";
2200 if (SB_IS_BDEV(sb)) {
2201 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2202 if (!dc)
2203 goto err_close;
2204
2205 mutex_lock(&bch_register_lock);
2206 register_bdev(sb, sb_page, bdev, dc);
2207 mutex_unlock(&bch_register_lock);
2208 } else {
2209 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2210 if (!ca)
2211 goto err_close;
2212
2213 if (register_cache(sb, sb_page, bdev, ca) != 0)
2214 goto err;
2215 }
2216out:
2217 if (sb_page)
2218 put_page(sb_page);
2219 kfree(sb);
2220 kfree(path);
2221 module_put(THIS_MODULE);
2222 return ret;
2223
2224err_close:
2225 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2226err:
2227 pr_info("error %s: %s", path, err);
2228 ret = -EINVAL;
2229 goto out;
2230}
2231
2232static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2233{
2234 if (code == SYS_DOWN ||
2235 code == SYS_HALT ||
2236 code == SYS_POWER_OFF) {
2237 DEFINE_WAIT(wait);
2238 unsigned long start = jiffies;
2239 bool stopped = false;
2240
2241 struct cache_set *c, *tc;
2242 struct cached_dev *dc, *tdc;
2243
2244 mutex_lock(&bch_register_lock);
2245
2246 if (list_empty(&bch_cache_sets) &&
2247 list_empty(&uncached_devices))
2248 goto out;
2249
2250 pr_info("Stopping all devices:");
2251
2252 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2253 bch_cache_set_stop(c);
2254
2255 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2256 bcache_device_stop(&dc->disk);
2257
 /* Wait for devices to shut down, but give up after two seconds */
2259 while (1) {
2260 long timeout = start + 2 * HZ - jiffies;
2261
2262 stopped = list_empty(&bch_cache_sets) &&
2263 list_empty(&uncached_devices);
2264
2265 if (timeout < 0 || stopped)
2266 break;
2267
2268 prepare_to_wait(&unregister_wait, &wait,
2269 TASK_UNINTERRUPTIBLE);
2270
2271 mutex_unlock(&bch_register_lock);
2272 schedule_timeout(timeout);
2273 mutex_lock(&bch_register_lock);
2274 }
2275
2276 finish_wait(&unregister_wait, &wait);
2277
2278 if (stopped)
2279 pr_info("All devices stopped");
2280 else
2281 pr_notice("Timeout waiting for devices to be closed");
2282out:
2283 mutex_unlock(&bch_register_lock);
2284 }
2285
2286 return NOTIFY_DONE;
2287}
2288
2289static struct notifier_block reboot = {
2290 .notifier_call = bcache_reboot,
2291 .priority = INT_MAX,
2292};
2293
2294static void bcache_exit(void)
2295{
2296 bch_debug_exit();
2297 bch_request_exit();
2298 if (bcache_kobj)
2299 kobject_put(bcache_kobj);
2300 if (bcache_wq)
2301 destroy_workqueue(bcache_wq);
2302 if (bcache_major)
2303 unregister_blkdev(bcache_major, "bcache");
2304 unregister_reboot_notifier(&reboot);
2305 mutex_destroy(&bch_register_lock);
2306}
2307
2308static int __init bcache_init(void)
2309{
2310 static const struct attribute *files[] = {
2311 &ksysfs_register.attr,
2312 &ksysfs_register_quiet.attr,
2313 NULL
2314 };
2315
2316 mutex_init(&bch_register_lock);
2317 init_waitqueue_head(&unregister_wait);
2318 register_reboot_notifier(&reboot);
2319
2320 bcache_major = register_blkdev(0, "bcache");
2321 if (bcache_major < 0) {
2322 unregister_reboot_notifier(&reboot);
2323 mutex_destroy(&bch_register_lock);
2324 return bcache_major;
2325 }
2326
2327 if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
2328 !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
2329 bch_request_init() ||
2330 bch_debug_init(bcache_kobj) || closure_debug_init() ||
2331 sysfs_create_files(bcache_kobj, files))
2332 goto err;
2333
2334 return 0;
2335err:
2336 bcache_exit();
2337 return -ENOMEM;
2338}
2339
2340module_exit(bcache_exit);
2341module_init(bcache_init);
2342