10#include "bcache.h"
11#include "btree.h"
12#include "debug.h"
13#include "extents.h"
14#include "request.h"
15#include "writeback.h"
16
17#include <linux/blkdev.h>
18#include <linux/buffer_head.h>
19#include <linux/debugfs.h>
20#include <linux/genhd.h>
21#include <linux/idr.h>
22#include <linux/kthread.h>
23#include <linux/module.h>
24#include <linux/random.h>
25#include <linux/reboot.h>
26#include <linux/sysfs.h>
27
28unsigned int bch_cutoff_writeback;
29unsigned int bch_cutoff_writeback_sync;
30
31static const char bcache_magic[] = {
32 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
33 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
34};
35
36static const char invalid_uuid[] = {
37 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
38 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
39};
40
41static struct kobject *bcache_kobj;
42struct mutex bch_register_lock;
43bool bcache_is_reboot;
44LIST_HEAD(bch_cache_sets);
45static LIST_HEAD(uncached_devices);
46
47static int bcache_major;
48static DEFINE_IDA(bcache_device_idx);
49static wait_queue_head_t unregister_wait;
50struct workqueue_struct *bcache_wq;
51struct workqueue_struct *bch_journal_wq;
52
53
54#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
55
56#define BCACHE_MINORS 128
57
58#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
59
60
61
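/*
 * read_super() - read the bcache superblock from SB_SECTOR of @bdev into
 * @sb, converting the little-endian on-disk fields and sanity checking the
 * magic, checksum and block/bucket geometry. Returns NULL on success (and
 * hands back the page holding the raw superblock via @res) or a static
 * error string on failure.
 */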
62static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
63 struct page **res)
64{
65 const char *err;
66 struct cache_sb *s;
67 struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
68 unsigned int i;
69
70 if (!bh)
71 return "IO error";
72
73 s = (struct cache_sb *) bh->b_data;
74
75 sb->offset = le64_to_cpu(s->offset);
76 sb->version = le64_to_cpu(s->version);
77
78 memcpy(sb->magic, s->magic, 16);
79 memcpy(sb->uuid, s->uuid, 16);
80 memcpy(sb->set_uuid, s->set_uuid, 16);
81 memcpy(sb->label, s->label, SB_LABEL_SIZE);
82
83 sb->flags = le64_to_cpu(s->flags);
84 sb->seq = le64_to_cpu(s->seq);
85 sb->last_mount = le32_to_cpu(s->last_mount);
86 sb->first_bucket = le16_to_cpu(s->first_bucket);
87 sb->keys = le16_to_cpu(s->keys);
88
89 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
90 sb->d[i] = le64_to_cpu(s->d[i]);
91
92 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
93 sb->version, sb->flags, sb->seq, sb->keys);
94
95 err = "Not a bcache superblock";
96 if (sb->offset != SB_SECTOR)
97 goto err;
98
99 if (memcmp(sb->magic, bcache_magic, 16))
100 goto err;
101
102 err = "Too many journal buckets";
103 if (sb->keys > SB_JOURNAL_BUCKETS)
104 goto err;
105
106 err = "Bad checksum";
107 if (s->csum != csum_set(s))
108 goto err;
109
110 err = "Bad UUID";
111 if (bch_is_zero(sb->uuid, 16))
112 goto err;
113
114 sb->block_size = le16_to_cpu(s->block_size);
115
116 err = "Superblock block size smaller than device block size";
117 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
118 goto err;
119
120 switch (sb->version) {
121 case BCACHE_SB_VERSION_BDEV:
122 sb->data_offset = BDEV_DATA_START_DEFAULT;
123 break;
124 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
125 sb->data_offset = le64_to_cpu(s->data_offset);
126
127 err = "Bad data offset";
128 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
129 goto err;
130
131 break;
132 case BCACHE_SB_VERSION_CDEV:
133 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
134 sb->nbuckets = le64_to_cpu(s->nbuckets);
135 sb->bucket_size = le16_to_cpu(s->bucket_size);
136
137 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
138 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
139
140 err = "Too many buckets";
141 if (sb->nbuckets > LONG_MAX)
142 goto err;
143
144 err = "Not enough buckets";
145 if (sb->nbuckets < 1 << 7)
146 goto err;
147
148 err = "Bad block/bucket size";
149 if (!is_power_of_2(sb->block_size) ||
150 sb->block_size > PAGE_SECTORS ||
151 !is_power_of_2(sb->bucket_size) ||
152 sb->bucket_size < PAGE_SECTORS)
153 goto err;
154
155 err = "Invalid superblock: device too small";
156 if (get_capacity(bdev->bd_disk) <
157 sb->bucket_size * sb->nbuckets)
158 goto err;
159
160 err = "Bad UUID";
161 if (bch_is_zero(sb->set_uuid, 16))
162 goto err;
163
164 err = "Bad cache device number in set";
165 if (!sb->nr_in_set ||
166 sb->nr_in_set <= sb->nr_this_dev ||
167 sb->nr_in_set > MAX_CACHES_PER_SET)
168 goto err;
169
170 err = "Journal buckets not sequential";
171 for (i = 0; i < sb->keys; i++)
172 if (sb->d[i] != sb->first_bucket + i)
173 goto err;
174
175 err = "Too many journal buckets";
176 if (sb->first_bucket + sb->keys > sb->nbuckets)
177 goto err;
178
179 err = "Invalid superblock: first bucket comes before end of super";
180 if (sb->first_bucket * sb->bucket_size < 16)
181 goto err;
182
183 break;
184 default:
185 err = "Unsupported superblock version";
186 goto err;
187 }
188
189 sb->last_mount = (u32)ktime_get_real_seconds();
190 err = NULL;
191
192 get_page(bh->b_page);
193 *res = bh->b_page;
194err:
195 put_bh(bh);
196 return err;
197}
198
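/*
 * Superblock write path: __write_super() serialises the in-memory cache_sb
 * back into the bio's first page and submits it as a REQ_SYNC|REQ_META write
 * to SB_SECTOR; bch_write_bdev_super() (backing devices) and
 * bcache_write_super() (cache devices) wrap it in the owner's sb_write
 * closure, serialised by sb_write_mutex.
 */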
199static void write_bdev_super_endio(struct bio *bio)
200{
201 struct cached_dev *dc = bio->bi_private;
202
203 if (bio->bi_status)
204 bch_count_backing_io_errors(dc, bio);
205
206 closure_put(&dc->sb_write);
207}
208
209static void __write_super(struct cache_sb *sb, struct bio *bio)
210{
211 struct cache_sb *out = page_address(bio_first_page_all(bio));
212 unsigned int i;
213
214 bio->bi_iter.bi_sector = SB_SECTOR;
215 bio->bi_iter.bi_size = SB_SIZE;
216 bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
217 bch_bio_map(bio, NULL);
218
219 out->offset = cpu_to_le64(sb->offset);
220 out->version = cpu_to_le64(sb->version);
221
222 memcpy(out->uuid, sb->uuid, 16);
223 memcpy(out->set_uuid, sb->set_uuid, 16);
224 memcpy(out->label, sb->label, SB_LABEL_SIZE);
225
226 out->flags = cpu_to_le64(sb->flags);
227 out->seq = cpu_to_le64(sb->seq);
228
229 out->last_mount = cpu_to_le32(sb->last_mount);
230 out->first_bucket = cpu_to_le16(sb->first_bucket);
231 out->keys = cpu_to_le16(sb->keys);
232
233 for (i = 0; i < sb->keys; i++)
234 out->d[i] = cpu_to_le64(sb->d[i]);
235
236 out->csum = csum_set(out);
237
238 pr_debug("ver %llu, flags %llu, seq %llu",
239 sb->version, sb->flags, sb->seq);
240
241 submit_bio(bio);
242}
243
244static void bch_write_bdev_super_unlock(struct closure *cl)
245{
246 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
247
248 up(&dc->sb_write_mutex);
249}
250
251void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
252{
253 struct closure *cl = &dc->sb_write;
254 struct bio *bio = &dc->sb_bio;
255
256 down(&dc->sb_write_mutex);
257 closure_init(cl, parent);
258
259 bio_reset(bio);
260 bio_set_dev(bio, dc->bdev);
261 bio->bi_end_io = write_bdev_super_endio;
262 bio->bi_private = dc;
263
264 closure_get(cl);
265
266 __write_super(&dc->sb, bio);
267
268 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
269}
270
271static void write_super_endio(struct bio *bio)
272{
273 struct cache *ca = bio->bi_private;
274
	/* is_read = 0 */
276 bch_count_io_errors(ca, bio->bi_status, 0,
277 "writing superblock");
278 closure_put(&ca->set->sb_write);
279}
280
281static void bcache_write_super_unlock(struct closure *cl)
282{
283 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
284
285 up(&c->sb_write_mutex);
286}
287
288void bcache_write_super(struct cache_set *c)
289{
290 struct closure *cl = &c->sb_write;
291 struct cache *ca;
292 unsigned int i;
293
294 down(&c->sb_write_mutex);
295 closure_init(cl, &c->cl);
296
297 c->sb.seq++;
298
299 for_each_cache(ca, c, i) {
300 struct bio *bio = &ca->sb_bio;
301
302 ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
303 ca->sb.seq = c->sb.seq;
304 ca->sb.last_mount = c->sb.last_mount;
305
306 SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));
307
308 bio_reset(bio);
309 bio_set_dev(bio, ca->bdev);
310 bio->bi_end_io = write_super_endio;
311 bio->bi_private = ca;
312
313 closure_get(cl);
314 __write_super(&ca->sb, bio);
315 }
316
317 closure_return_with_destructor(cl, bcache_write_super_unlock);
318}

/*
 * UUID io: the array of uuid_entry structs describing attached (and
 * previously attached) devices lives in its own bucket; uuid_io() reads or
 * writes the whole array, serialised by uuid_write_mutex and completing
 * against the caller's closure.
 */
322static void uuid_endio(struct bio *bio)
323{
324 struct closure *cl = bio->bi_private;
325 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
326
327 cache_set_err_on(bio->bi_status, c, "accessing uuids");
328 bch_bbio_free(bio, c);
329 closure_put(cl);
330}
331
332static void uuid_io_unlock(struct closure *cl)
333{
334 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
335
336 up(&c->uuid_write_mutex);
337}
338
339static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
340 struct bkey *k, struct closure *parent)
341{
342 struct closure *cl = &c->uuid_write;
343 struct uuid_entry *u;
344 unsigned int i;
345 char buf[80];
346
347 BUG_ON(!parent);
348 down(&c->uuid_write_mutex);
349 closure_init(cl, parent);
350
351 for (i = 0; i < KEY_PTRS(k); i++) {
352 struct bio *bio = bch_bbio_alloc(c);
353
354 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
355 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
356
357 bio->bi_end_io = uuid_endio;
358 bio->bi_private = cl;
359 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
360 bch_bio_map(bio, c->uuids);
361
362 bch_submit_bbio(bio, c, k, i);
363
364 if (op != REQ_OP_WRITE)
365 break;
366 }
367
368 bch_extent_to_text(buf, sizeof(buf), k);
369 pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);
370
371 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
372 if (!bch_is_zero(u->uuid, 16))
373 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
374 u - c->uuids, u->uuid, u->label,
375 u->first_reg, u->last_reg, u->invalidated);
376
377 closure_return_with_destructor(cl, uuid_io_unlock);
378}
379
380static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
381{
382 struct bkey *k = &j->uuid_bucket;
383
384 if (__bch_btree_ptr_invalid(c, k))
385 return "bad uuid pointer";
386
387 bkey_copy(&c->uuid_bucket, k);
388 uuid_io(c, REQ_OP_READ, 0, k, cl);
389
390 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
391 struct uuid_entry_v0 *u0 = (void *) c->uuids;
392 struct uuid_entry *u1 = (void *) c->uuids;
393 int i;
394
395 closure_sync(cl);

		/*
		 * struct uuid_entry is bigger than the old uuid_entry_v0, and
		 * both views alias c->uuids, so convert in place starting at
		 * the highest index and work downwards to avoid overwriting
		 * entries that haven't been converted yet.
		 */
403 for (i = c->nr_uuids - 1;
404 i >= 0;
405 --i) {
406 memcpy(u1[i].uuid, u0[i].uuid, 16);
407 memcpy(u1[i].label, u0[i].label, 32);
408
409 u1[i].first_reg = u0[i].first_reg;
410 u1[i].last_reg = u0[i].last_reg;
411 u1[i].invalidated = u0[i].invalidated;
412
413 u1[i].flags = 0;
414 u1[i].sectors = 0;
415 }
416 }
417
418 return NULL;
419}
420
421static int __uuid_write(struct cache_set *c)
422{
423 BKEY_PADDED(key) k;
424 struct closure cl;
425 struct cache *ca;
426
427 closure_init_stack(&cl);
428 lockdep_assert_held(&bch_register_lock);
429
430 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
431 return 1;
432
433 SET_KEY_SIZE(&k.key, c->sb.bucket_size);
434 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
435 closure_sync(&cl);
436
437
438 ca = PTR_CACHE(c, &k.key, 0);
439 atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
440
441 bkey_copy(&c->uuid_bucket, &k.key);
442 bkey_put(c, &k.key);
443 return 0;
444}
445
446int bch_uuid_write(struct cache_set *c)
447{
448 int ret = __uuid_write(c);
449
450 if (!ret)
451 bch_journal_meta(c, NULL);
452
453 return ret;
454}
455
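/*
 * uuid_find()/uuid_find_empty(): look up a slot in the in-memory uuid_entry
 * array by device uuid; an all-zero uuid marks a free slot.
 */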
456static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
457{
458 struct uuid_entry *u;
459
460 for (u = c->uuids;
461 u < c->uuids + c->nr_uuids; u++)
462 if (!memcmp(u->uuid, uuid, 16))
463 return u;
464
465 return NULL;
466}
467
468static struct uuid_entry *uuid_find_empty(struct cache_set *c)
469{
470 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
471
472 return uuid_find(c, zero_uuid);
473}
474
/*
 * Bucket priorities/gens:
 *
 * For each bucket we keep an 8 bit gen and a 16 bit priority on disk, packed
 * into struct prio_set buckets as struct bucket_disk entries. Each prio
 * bucket holds prios_per_bucket(ca) entries plus the index of the next prio
 * bucket, forming a linked list that bch_prio_write() rewrites and
 * prio_read() walks at registration time; each prio bucket carries a magic
 * value and a crc64 checksum that prio_read() verifies.
 */
502static void prio_endio(struct bio *bio)
503{
504 struct cache *ca = bio->bi_private;
505
506 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
507 bch_bbio_free(bio, ca->set);
508 closure_put(&ca->prio);
509}
510
511static void prio_io(struct cache *ca, uint64_t bucket, int op,
512 unsigned long op_flags)
513{
514 struct closure *cl = &ca->prio;
515 struct bio *bio = bch_bbio_alloc(ca->set);
516
517 closure_init_stack(cl);
518
519 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
520 bio_set_dev(bio, ca->bdev);
521 bio->bi_iter.bi_size = bucket_bytes(ca);
522
523 bio->bi_end_io = prio_endio;
524 bio->bi_private = ca;
525 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
526 bch_bio_map(bio, ca->disk_buckets);
527
528 closure_bio_submit(ca->set, bio, &ca->prio);
529 closure_sync(cl);
530}
531
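/*
 * bch_prio_write() allocates fresh buckets from RESERVE_PRIO, packs every
 * bucket's prio/gen into them, journals the update and only then frees the
 * buckets that held the previous priorities.
 */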
532void bch_prio_write(struct cache *ca)
533{
534 int i;
535 struct bucket *b;
536 struct closure cl;
537
538 closure_init_stack(&cl);
539
540 lockdep_assert_held(&ca->set->bucket_lock);
541
542 ca->disk_buckets->seq++;
543
544 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
545 &ca->meta_sectors_written);
546
	/*
	 * Write the prio buckets in reverse order, so that each prio_set can
	 * record in ->next_bucket the bucket already allocated for the
	 * following group of entries.
	 */
550 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
551 long bucket;
552 struct prio_set *p = ca->disk_buckets;
553 struct bucket_disk *d = p->data;
554 struct bucket_disk *end = d + prios_per_bucket(ca);
555
556 for (b = ca->buckets + i * prios_per_bucket(ca);
557 b < ca->buckets + ca->sb.nbuckets && d < end;
558 b++, d++) {
559 d->prio = cpu_to_le16(b->prio);
560 d->gen = b->gen;
561 }
562
563 p->next_bucket = ca->prio_buckets[i + 1];
564 p->magic = pset_magic(&ca->sb);
565 p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);
566
567 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
568 BUG_ON(bucket == -1);
569
570 mutex_unlock(&ca->set->bucket_lock);
571 prio_io(ca, bucket, REQ_OP_WRITE, 0);
572 mutex_lock(&ca->set->bucket_lock);
573
574 ca->prio_buckets[i] = bucket;
575 atomic_dec_bug(&ca->buckets[bucket].pin);
576 }
577
578 mutex_unlock(&ca->set->bucket_lock);
579
580 bch_journal_meta(ca->set, &cl);
581 closure_sync(&cl);
582
583 mutex_lock(&ca->set->bucket_lock);
584
	/*
	 * Now that the new priorities are written and journalled it is safe
	 * to free the buckets that held the old ones, and to remember the new
	 * buckets as prio_last_buckets for the next rewrite.
	 */
589 for (i = 0; i < prio_buckets(ca); i++) {
590 if (ca->prio_last_buckets[i])
591 __bch_bucket_free(ca,
592 &ca->buckets[ca->prio_last_buckets[i]]);
593
594 ca->prio_last_buckets[i] = ca->prio_buckets[i];
595 }
596}
597
598static void prio_read(struct cache *ca, uint64_t bucket)
599{
600 struct prio_set *p = ca->disk_buckets;
601 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
602 struct bucket *b;
603 unsigned int bucket_nr = 0;
604
605 for (b = ca->buckets;
606 b < ca->buckets + ca->sb.nbuckets;
607 b++, d++) {
608 if (d == end) {
609 ca->prio_buckets[bucket_nr] = bucket;
610 ca->prio_last_buckets[bucket_nr] = bucket;
611 bucket_nr++;
612
613 prio_io(ca, bucket, REQ_OP_READ, 0);
614
615 if (p->csum !=
616 bch_crc64(&p->magic, bucket_bytes(ca) - 8))
617 pr_warn("bad csum reading priorities");
618
619 if (p->magic != pset_magic(&ca->sb))
620 pr_warn("bad magic reading priorities");
621
622 bucket = p->next_bucket;
623 d = p->data;
624 }
625
626 b->prio = le16_to_cpu(d->prio);
627 b->gen = b->last_gc = d->gen;
628 }
629}
630
/* Bcache device */

633static int open_dev(struct block_device *b, fmode_t mode)
634{
635 struct bcache_device *d = b->bd_disk->private_data;
636
637 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
638 return -ENXIO;
639
640 closure_get(&d->cl);
641 return 0;
642}
643
644static void release_dev(struct gendisk *b, fmode_t mode)
645{
646 struct bcache_device *d = b->private_data;
647
648 closure_put(&d->cl);
649}
650
651static int ioctl_dev(struct block_device *b, fmode_t mode,
652 unsigned int cmd, unsigned long arg)
653{
654 struct bcache_device *d = b->bd_disk->private_data;
655
656 return d->ioctl(d, mode, cmd, arg);
657}
658
659static const struct block_device_operations bcache_ops = {
660 .open = open_dev,
661 .release = release_dev,
662 .ioctl = ioctl_dev,
663 .owner = THIS_MODULE,
664};
665
666void bcache_device_stop(struct bcache_device *d)
667{
668 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		/*
		 * The closure function was set when the device was created:
		 * - cached device: cached_dev_flush()
		 * - flash only volume: flash_dev_flush()
		 */
674 closure_queue(&d->cl);
675}
676
677static void bcache_device_unlink(struct bcache_device *d)
678{
679 lockdep_assert_held(&bch_register_lock);
680
681 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
682 unsigned int i;
683 struct cache *ca;
684
685 sysfs_remove_link(&d->c->kobj, d->name);
686 sysfs_remove_link(&d->kobj, "cache");
687
688 for_each_cache(ca, d->c, i)
689 bd_unlink_disk_holder(ca->bdev, d->disk);
690 }
691}
692
693static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
694 const char *name)
695{
696 unsigned int i;
697 struct cache *ca;
698 int ret;
699
700 for_each_cache(ca, d->c, i)
701 bd_link_disk_holder(ca->bdev, d->disk);
702
703 snprintf(d->name, BCACHEDEVNAME_SIZE,
704 "%s%u", name, d->id);
705
706 ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
707 if (ret < 0)
708 pr_err("Couldn't create device -> cache set symlink");
709
710 ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
711 if (ret < 0)
712 pr_err("Couldn't create cache set -> device symlink");
713
714 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
715}
716
717static void bcache_device_detach(struct bcache_device *d)
718{
719 lockdep_assert_held(&bch_register_lock);
720
721 atomic_dec(&d->c->attached_dev_nr);
722
723 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
724 struct uuid_entry *u = d->c->uuids + d->id;
725
726 SET_UUID_FLASH_ONLY(u, 0);
727 memcpy(u->uuid, invalid_uuid, 16);
728 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
729 bch_uuid_write(d->c);
730 }
731
732 bcache_device_unlink(d);
733
734 d->c->devices[d->id] = NULL;
735 closure_put(&d->c->caching);
736 d->c = NULL;
737}
738
739static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
740 unsigned int id)
741{
742 d->id = id;
743 d->c = c;
744 c->devices[id] = d;
745
746 if (id >= c->devices_max_used)
747 c->devices_max_used = id + 1;
748
749 closure_get(&c->caching);
750}
751
752static inline int first_minor_to_idx(int first_minor)
753{
754 return (first_minor/BCACHE_MINORS);
755}
756
757static inline int idx_to_first_minor(int idx)
758{
759 return (idx * BCACHE_MINORS);
760}
761
762static void bcache_device_free(struct bcache_device *d)
763{
764 lockdep_assert_held(&bch_register_lock);
765
766 pr_info("%s stopped", d->disk->disk_name);
767
768 if (d->c)
769 bcache_device_detach(d);
770 if (d->disk && d->disk->flags & GENHD_FL_UP)
771 del_gendisk(d->disk);
772 if (d->disk && d->disk->queue)
773 blk_cleanup_queue(d->disk->queue);
774 if (d->disk) {
775 ida_simple_remove(&bcache_device_idx,
776 first_minor_to_idx(d->disk->first_minor));
777 put_disk(d->disk);
778 }
779
780 bioset_exit(&d->bio_split);
781 kvfree(d->full_dirty_stripes);
782 kvfree(d->stripe_sectors_dirty);
783
784 closure_debug_destroy(&d->cl);
785}
786
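/*
 * bcache_device_init() - setup shared by cached devices and flash only
 * volumes: allocate the dirty-stripe accounting arrays, reserve a minor
 * range, and create the gendisk and its request queue with bcache's queue
 * limits.
 */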
787static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
788 sector_t sectors)
789{
790 struct request_queue *q;
791 const size_t max_stripes = min_t(size_t, INT_MAX,
792 SIZE_MAX / sizeof(atomic_t));
793 size_t n;
794 int idx;
795
796 if (!d->stripe_size)
797 d->stripe_size = 1 << 31;
798
799 d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
800
801 if (!d->nr_stripes || d->nr_stripes > max_stripes) {
802 pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
803 (unsigned int)d->nr_stripes);
804 return -ENOMEM;
805 }
806
807 n = d->nr_stripes * sizeof(atomic_t);
808 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
809 if (!d->stripe_sectors_dirty)
810 return -ENOMEM;
811
812 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
813 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
814 if (!d->full_dirty_stripes)
815 return -ENOMEM;
816
817 idx = ida_simple_get(&bcache_device_idx, 0,
818 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
819 if (idx < 0)
820 return idx;
821
822 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
823 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
824 goto err;
825
826 d->disk = alloc_disk(BCACHE_MINORS);
827 if (!d->disk)
828 goto err;
829
830 set_capacity(d->disk, sectors);
831 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
832
833 d->disk->major = bcache_major;
834 d->disk->first_minor = idx_to_first_minor(idx);
835 d->disk->fops = &bcache_ops;
836 d->disk->private_data = d;
837
838 q = blk_alloc_queue(GFP_KERNEL);
839 if (!q)
840 return -ENOMEM;
841
842 blk_queue_make_request(q, NULL);
843 d->disk->queue = q;
844 q->queuedata = d;
845 q->backing_dev_info->congested_data = d;
846 q->limits.max_hw_sectors = UINT_MAX;
847 q->limits.max_sectors = UINT_MAX;
848 q->limits.max_segment_size = UINT_MAX;
849 q->limits.max_segments = BIO_MAX_PAGES;
850 blk_queue_max_discard_sectors(q, UINT_MAX);
851 q->limits.discard_granularity = 512;
852 q->limits.io_min = block_size;
853 q->limits.logical_block_size = block_size;
854 q->limits.physical_block_size = block_size;
855 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
856 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
857 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
858
859 blk_queue_write_cache(q, true, true);
860
861 return 0;
862
863err:
864 ida_simple_remove(&bcache_device_idx, idx);
865 return -ENOMEM;
866
867}
868
869
870
871static void calc_cached_dev_sectors(struct cache_set *c)
872{
873 uint64_t sectors = 0;
874 struct cached_dev *dc;
875
876 list_for_each_entry(dc, &c->cached_devs, list)
877 sectors += bdev_sectors(dc->bdev);
878
879 c->cached_dev_sectors = sectors;
880}
881
882#define BACKING_DEV_OFFLINE_TIMEOUT 5
883static int cached_dev_status_update(void *arg)
884{
885 struct cached_dev *dc = arg;
886 struct request_queue *q;
887
	/*
	 * Poll the backing device's request queue roughly once a second; if
	 * it has been dying for BACKING_DEV_OFFLINE_TIMEOUT consecutive
	 * seconds, disable further I/O and stop the bcache device.
	 */
893 while (!kthread_should_stop() && !dc->io_disable) {
894 q = bdev_get_queue(dc->bdev);
895 if (blk_queue_dying(q))
896 dc->offline_seconds++;
897 else
898 dc->offline_seconds = 0;
899
900 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
901 pr_err("%s: device offline for %d seconds",
902 dc->backing_dev_name,
903 BACKING_DEV_OFFLINE_TIMEOUT);
904 pr_err("%s: disable I/O request due to backing "
905 "device offline", dc->disk.name);
906 dc->io_disable = true;
907
908 smp_mb();
909 bcache_device_stop(&dc->disk);
910 break;
911 }
912 schedule_timeout_interruptible(HZ);
913 }
914
915 wait_for_kthread_stop();
916 return 0;
917}
918
919
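/*
 * bch_cached_dev_run() - make the bcache device visible: add the gendisk,
 * emit a uevent carrying the cached device's uuid and label, create the
 * sysfs links and start the backing-device status poller. Returns -EBUSY if
 * the device is already running and -EIO if I/O has been disabled on it.
 */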
920int bch_cached_dev_run(struct cached_dev *dc)
921{
922 struct bcache_device *d = &dc->disk;
923 char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
924 char *env[] = {
925 "DRIVER=bcache",
926 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
927 kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
928 NULL,
929 };
930
931 if (dc->io_disable) {
932 pr_err("I/O disabled on cached dev %s",
933 dc->backing_dev_name);
934 kfree(env[1]);
935 kfree(env[2]);
936 kfree(buf);
937 return -EIO;
938 }
939
940 if (atomic_xchg(&dc->running, 1)) {
941 kfree(env[1]);
942 kfree(env[2]);
943 kfree(buf);
944 pr_info("cached dev %s is running already",
945 dc->backing_dev_name);
946 return -EBUSY;
947 }
948
949 if (!d->c &&
950 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
951 struct closure cl;
952
953 closure_init_stack(&cl);
954
955 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
956 bch_write_bdev_super(dc, &cl);
957 closure_sync(&cl);
958 }
959
960 add_disk(d->disk);
961 bd_link_disk_holder(dc->bdev, dc->disk.disk);
	/*
	 * These environment variables won't show up in the uevent file; use
	 * "udevadm monitor -e" to see them, since only class/kset properties
	 * are persistent.
	 */
966 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
967 kfree(env[1]);
968 kfree(env[2]);
969 kfree(buf);
970
971 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
972 sysfs_create_link(&disk_to_dev(d->disk)->kobj,
973 &d->kobj, "bcache")) {
974 pr_err("Couldn't create bcache dev <-> disk sysfs symlinks");
975 return -ENOMEM;
976 }
977
978 dc->status_update_thread = kthread_run(cached_dev_status_update,
979 dc, "bcache_status_update");
980 if (IS_ERR(dc->status_update_thread)) {
981 pr_warn("failed to create bcache_status_update kthread, "
982 "continue to run without monitoring backing "
983 "device status");
984 }
985
986 return 0;
987}
988
/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, the writeback_rate_update delayed
 * work is currently executing. Wait up to WRITEBACK_RATE_UPDATE_SECS_MAX
 * seconds for it to clear the bit before cancelling the delayed work; if it
 * never clears, warn and cancel it anyway.
 */
996static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
997{
998 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
999
1000 do {
1001 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1002 &dc->disk.flags))
1003 break;
1004 time_out--;
1005 schedule_timeout_interruptible(1);
1006 } while (time_out > 0);
1007
1008 if (time_out == 0)
1009 pr_warn("give up waiting for dc->writeback_write_update to quit");
1010
1011 cancel_delayed_work_sync(&dc->writeback_rate_update);
1012}
1013
1014static void cached_dev_detach_finish(struct work_struct *w)
1015{
1016 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1017 struct closure cl;
1018
1019 closure_init_stack(&cl);
1020
1021 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1022 BUG_ON(refcount_read(&dc->count));
1023
1024
1025 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1026 cancel_writeback_rate_update_dwork(dc);
1027
1028 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1029 kthread_stop(dc->writeback_thread);
1030 dc->writeback_thread = NULL;
1031 }
1032
1033 memset(&dc->sb.set_uuid, 0, 16);
1034 SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
1035
1036 bch_write_bdev_super(dc, &cl);
1037 closure_sync(&cl);
1038
1039 mutex_lock(&bch_register_lock);
1040
1041 calc_cached_dev_sectors(dc->disk.c);
1042 bcache_device_detach(&dc->disk);
1043 list_move(&dc->list, &uncached_devices);
1044
1045 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1046 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1047
1048 mutex_unlock(&bch_register_lock);
1049
1050 pr_info("Caching disabled for %s", dc->backing_dev_name);
1051
1052
1053 closure_put(&dc->disk.cl);
1054}
1055
1056void bch_cached_dev_detach(struct cached_dev *dc)
1057{
1058 lockdep_assert_held(&bch_register_lock);
1059
1060 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1061 return;
1062
1063 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1064 return;
1065
	/*
	 * Block this device from being closed and freed until we're finished
	 * detaching; cached_dev_detach_finish() drops this reference.
	 */
1070 closure_get(&dc->disk.cl);
1071
1072 bch_writeback_queue(dc);
1073
1074 cached_dev_put(dc);
1075}
1076
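/*
 * bch_cached_dev_attach() - bind a backing device to cache set @c: find or
 * allocate its uuid_entry, update both superblocks, start the writeback
 * machinery and finally run the device. Fails with -ENOENT if the device
 * doesn't belong to @c.
 */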
1077int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1078 uint8_t *set_uuid)
1079{
1080 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1081 struct uuid_entry *u;
1082 struct cached_dev *exist_dc, *t;
1083 int ret = 0;
1084
1085 if ((set_uuid && memcmp(set_uuid, c->sb.set_uuid, 16)) ||
1086 (!set_uuid && memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16)))
1087 return -ENOENT;
1088
1089 if (dc->disk.c) {
1090 pr_err("Can't attach %s: already attached",
1091 dc->backing_dev_name);
1092 return -EINVAL;
1093 }
1094
1095 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1096 pr_err("Can't attach %s: shutting down",
1097 dc->backing_dev_name);
1098 return -EINVAL;
1099 }
1100
1101 if (dc->sb.block_size < c->sb.block_size) {
1102
1103 pr_err("Couldn't attach %s: block size less than set's block size",
1104 dc->backing_dev_name);
1105 return -EINVAL;
1106 }
1107
1108
1109 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1110 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1111 pr_err("Tried to attach %s but duplicate UUID already attached",
1112 dc->backing_dev_name);
1113
1114 return -EINVAL;
1115 }
1116 }
1117
1118 u = uuid_find(c, dc->sb.uuid);
1119
1120 if (u &&
1121 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1122 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1123 memcpy(u->uuid, invalid_uuid, 16);
1124 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1125 u = NULL;
1126 }
1127
1128 if (!u) {
1129 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1130 pr_err("Couldn't find uuid for %s in set",
1131 dc->backing_dev_name);
1132 return -ENOENT;
1133 }
1134
1135 u = uuid_find_empty(c);
1136 if (!u) {
1137 pr_err("Not caching %s, no room for UUID",
1138 dc->backing_dev_name);
1139 return -EINVAL;
1140 }
1141 }
1142
	/*
	 * An all-zero uuid_entry means this backing device has never been
	 * attached before: record its uuid and label, point its superblock at
	 * this cache set and mark it clean. Otherwise just bump last_reg.
	 */
1148 if (bch_is_zero(u->uuid, 16)) {
1149 struct closure cl;
1150
1151 closure_init_stack(&cl);
1152
1153 memcpy(u->uuid, dc->sb.uuid, 16);
1154 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1155 u->first_reg = u->last_reg = rtime;
1156 bch_uuid_write(c);
1157
1158 memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
1159 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1160
1161 bch_write_bdev_super(dc, &cl);
1162 closure_sync(&cl);
1163 } else {
1164 u->last_reg = rtime;
1165 bch_uuid_write(c);
1166 }
1167
1168 bcache_device_attach(&dc->disk, c, u - c->uuids);
1169 list_move(&dc->list, &c->cached_devs);
1170 calc_cached_dev_sectors(c);
1171
	/*
	 * dc->disk.c must be visible before dc->count becomes non-zero, so
	 * that anyone who sees the refcount also sees the attached cache set.
	 */
1176 smp_wmb();
1177 refcount_set(&dc->count, 1);
1178
1179
1180 down_write(&dc->writeback_lock);
1181 if (bch_cached_dev_writeback_start(dc)) {
1182 up_write(&dc->writeback_lock);
1183 pr_err("Couldn't start writeback facilities for %s",
1184 dc->disk.disk->disk_name);
1185 return -ENOMEM;
1186 }
1187
1188 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1189 atomic_set(&dc->has_dirty, 1);
1190 bch_writeback_queue(dc);
1191 }
1192
1193 bch_sectors_dirty_init(&dc->disk);
1194
1195 ret = bch_cached_dev_run(dc);
1196 if (ret && (ret != -EBUSY)) {
1197 up_write(&dc->writeback_lock);
1198
		/*
		 * Undo what bch_cached_dev_writeback_start() set up: stop the
		 * writeback thread and cancel the writeback rate update
		 * worker before failing the attach.
		 */
1204 kthread_stop(dc->writeback_thread);
1205 cancel_writeback_rate_update_dwork(dc);
1206 pr_err("Couldn't run cached device %s",
1207 dc->backing_dev_name);
1208 return ret;
1209 }
1210
1211 bcache_device_link(&dc->disk, c, "bdev");
1212 atomic_inc(&c->attached_dev_nr);
1213
1214
1215 up_write(&dc->writeback_lock);
1216
1217 pr_info("Caching %s as %s on set %pU",
1218 dc->backing_dev_name,
1219 dc->disk.disk->disk_name,
1220 dc->disk.c->sb.set_uuid);
1221 return 0;
1222}
1223
1224
1225void bch_cached_dev_release(struct kobject *kobj)
1226{
1227 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1228 disk.kobj);
1229 kfree(dc);
1230 module_put(THIS_MODULE);
1231}
1232
1233static void cached_dev_free(struct closure *cl)
1234{
1235 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1236
1237 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1238 cancel_writeback_rate_update_dwork(dc);
1239
1240 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1241 kthread_stop(dc->writeback_thread);
1242 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1243 kthread_stop(dc->status_update_thread);
1244
1245 mutex_lock(&bch_register_lock);
1246
1247 if (atomic_read(&dc->running))
1248 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1249 bcache_device_free(&dc->disk);
1250 list_del(&dc->list);
1251
1252 mutex_unlock(&bch_register_lock);
1253
1254 if (!IS_ERR_OR_NULL(dc->bdev))
1255 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1256
1257 wake_up(&unregister_wait);
1258
1259 kobject_put(&dc->disk.kobj);
1260}
1261
1262static void cached_dev_flush(struct closure *cl)
1263{
1264 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1265 struct bcache_device *d = &dc->disk;
1266
1267 mutex_lock(&bch_register_lock);
1268 bcache_device_unlink(d);
1269 mutex_unlock(&bch_register_lock);
1270
1271 bch_cache_accounting_destroy(&dc->accounting);
1272 kobject_del(&d->kobj);
1273
1274 continue_at(cl, cached_dev_free, system_wq);
1275}
1276
1277static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1278{
1279 int ret;
1280 struct io *io;
1281 struct request_queue *q = bdev_get_queue(dc->bdev);
1282
1283 __module_get(THIS_MODULE);
1284 INIT_LIST_HEAD(&dc->list);
1285 closure_init(&dc->disk.cl, NULL);
1286 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1287 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1288 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1289 sema_init(&dc->sb_write_mutex, 1);
1290 INIT_LIST_HEAD(&dc->io_lru);
1291 spin_lock_init(&dc->io_lock);
1292 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1293
1294 dc->sequential_cutoff = 4 << 20;
1295
1296 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1297 list_add(&io->lru, &dc->io_lru);
1298 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1299 }
1300
1301 dc->disk.stripe_size = q->limits.io_opt >> 9;
1302
1303 if (dc->disk.stripe_size)
1304 dc->partial_stripes_expensive =
1305 q->limits.raid_partial_stripes_expensive;
1306
1307 ret = bcache_device_init(&dc->disk, block_size,
1308 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
1309 if (ret)
1310 return ret;
1311
1312 dc->disk.disk->queue->backing_dev_info->ra_pages =
1313 max(dc->disk.disk->queue->backing_dev_info->ra_pages,
1314 q->backing_dev_info->ra_pages);
1315
1316 atomic_set(&dc->io_errors, 0);
1317 dc->io_disable = false;
1318 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1319
1320 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1321
1322 bch_cached_dev_request_init(dc);
1323 bch_cached_dev_writeback_init(dc);
1324 return 0;
1325}
1326
/*
 * register_bdev() - tie a freshly read backing device superblock to a new
 * struct cached_dev, expose it in sysfs, attach it to any already registered
 * cache set, and run it right away if its on-disk state allows.
 */

1329static int register_bdev(struct cache_sb *sb, struct page *sb_page,
1330 struct block_device *bdev,
1331 struct cached_dev *dc)
1332{
1333 const char *err = "cannot allocate memory";
1334 struct cache_set *c;
1335 int ret = -ENOMEM;
1336
1337 bdevname(bdev, dc->backing_dev_name);
1338 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1339 dc->bdev = bdev;
1340 dc->bdev->bd_holder = dc;
1341
1342 bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
1343 bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
1344 get_page(sb_page);
1345
1346
1347 if (cached_dev_init(dc, sb->block_size << 9))
1348 goto err;
1349
1350 err = "error creating kobject";
1351 if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
1352 "bcache"))
1353 goto err;
1354 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1355 goto err;
1356
1357 pr_info("registered backing device %s", dc->backing_dev_name);
1358
1359 list_add(&dc->list, &uncached_devices);
1360
1361 list_for_each_entry(c, &bch_cache_sets, list)
1362 bch_cached_dev_attach(dc, c, NULL);
1363
1364 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1365 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1366 err = "failed to run cached device";
1367 ret = bch_cached_dev_run(dc);
1368 if (ret)
1369 goto err;
1370 }
1371
1372 return 0;
1373err:
1374 pr_notice("error %s: %s", dc->backing_dev_name, err);
1375 bcache_device_stop(&dc->disk);
1376 return ret;
1377}
1378
/* Flash only volumes */

1382void bch_flash_dev_release(struct kobject *kobj)
1383{
1384 struct bcache_device *d = container_of(kobj, struct bcache_device,
1385 kobj);
1386 kfree(d);
1387}
1388
1389static void flash_dev_free(struct closure *cl)
1390{
1391 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1392
1393 mutex_lock(&bch_register_lock);
1394 atomic_long_sub(bcache_dev_sectors_dirty(d),
1395 &d->c->flash_dev_dirty_sectors);
1396 bcache_device_free(d);
1397 mutex_unlock(&bch_register_lock);
1398 kobject_put(&d->kobj);
1399}
1400
1401static void flash_dev_flush(struct closure *cl)
1402{
1403 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1404
1405 mutex_lock(&bch_register_lock);
1406 bcache_device_unlink(d);
1407 mutex_unlock(&bch_register_lock);
1408 kobject_del(&d->kobj);
1409 continue_at(cl, flash_dev_free, system_wq);
1410}
1411
1412static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1413{
1414 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1415 GFP_KERNEL);
1416 if (!d)
1417 return -ENOMEM;
1418
1419 closure_init(&d->cl, NULL);
1420 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1421
1422 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1423
1424 if (bcache_device_init(d, block_bytes(c), u->sectors))
1425 goto err;
1426
1427 bcache_device_attach(d, c, u - c->uuids);
1428 bch_sectors_dirty_init(d);
1429 bch_flash_dev_request_init(d);
1430 add_disk(d->disk);
1431
1432 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1433 goto err;
1434
1435 bcache_device_link(d, c, "volume");
1436
1437 return 0;
1438err:
1439 kobject_put(&d->kobj);
1440 return -ENOMEM;
1441}
1442
1443static int flash_devs_run(struct cache_set *c)
1444{
1445 int ret = 0;
1446 struct uuid_entry *u;
1447
1448 for (u = c->uuids;
1449 u < c->uuids + c->nr_uuids && !ret;
1450 u++)
1451 if (UUID_FLASH_ONLY(u))
1452 ret = flash_dev_run(c, u);
1453
1454 return ret;
1455}
1456
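/*
 * bch_flash_dev_create() - create a new flash only volume of @size bytes on
 * cache set @c by claiming a free uuid_entry, flagging it UUID_FLASH_ONLY
 * and running a bcache device on top of it.
 */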
1457int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1458{
1459 struct uuid_entry *u;
1460
1461 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1462 return -EINTR;
1463
1464 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1465 return -EPERM;
1466
1467 u = uuid_find_empty(c);
1468 if (!u) {
1469 pr_err("Can't create volume, no room for UUID");
1470 return -EINVAL;
1471 }
1472
1473 get_random_bytes(u->uuid, 16);
1474 memset(u->label, 0, 32);
1475 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1476
1477 SET_UUID_FLASH_ONLY(u, 1);
1478 u->sectors = size >> 9;
1479
1480 bch_uuid_write(c);
1481
1482 return flash_dev_run(c, u);
1483}
1484
1485bool bch_cached_dev_error(struct cached_dev *dc)
1486{
1487 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1488 return false;
1489
1490 dc->io_disable = true;
1491
1492 smp_mb();
1493
1494 pr_err("stop %s: too many IO errors on backing device %s\n",
1495 dc->disk.disk->disk_name, dc->backing_dev_name);
1496
1497 bcache_device_stop(&dc->disk);
1498 return true;
1499}
1500
1501
1502
1503__printf(2, 3)
1504bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1505{
1506 va_list args;
1507
1508 if (c->on_error != ON_ERROR_PANIC &&
1509 test_bit(CACHE_SET_STOPPING, &c->flags))
1510 return false;
1511
1512 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1513 pr_info("CACHE_SET_IO_DISABLE already set");
1514
	/* XXX: this can be called from atomic context */
1520 pr_err("bcache: error on %pU: ", c->sb.set_uuid);
1521
1522 va_start(args, fmt);
1523 vprintk(fmt, args);
1524 va_end(args);
1525
1526 pr_err(", disabling caching\n");
1527
1528 if (c->on_error == ON_ERROR_PANIC)
1529 panic("panic forced after error\n");
1530
1531 bch_cache_set_unregister(c);
1532 return true;
1533}
1534
1535
1536void bch_cache_set_release(struct kobject *kobj)
1537{
1538 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1539
1540 kfree(c);
1541 module_put(THIS_MODULE);
1542}
1543
1544static void cache_set_free(struct closure *cl)
1545{
1546 struct cache_set *c = container_of(cl, struct cache_set, cl);
1547 struct cache *ca;
1548 unsigned int i;
1549
1550 debugfs_remove(c->debug);
1551
1552 bch_open_buckets_free(c);
1553 bch_btree_cache_free(c);
1554 bch_journal_free(c);
1555
1556 mutex_lock(&bch_register_lock);
1557 for_each_cache(ca, c, i)
1558 if (ca) {
1559 ca->set = NULL;
1560 c->cache[ca->sb.nr_this_dev] = NULL;
1561 kobject_put(&ca->kobj);
1562 }
1563
1564 bch_bset_sort_state_free(&c->sort);
1565 free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
1566
1567 if (c->moving_gc_wq)
1568 destroy_workqueue(c->moving_gc_wq);
1569 bioset_exit(&c->bio_split);
1570 mempool_exit(&c->fill_iter);
1571 mempool_exit(&c->bio_meta);
1572 mempool_exit(&c->search);
1573 kfree(c->devices);
1574
1575 list_del(&c->list);
1576 mutex_unlock(&bch_register_lock);
1577
1578 pr_info("Cache set %pU unregistered", c->sb.set_uuid);
1579 wake_up(&unregister_wait);
1580
1581 closure_debug_destroy(&c->cl);
1582 kobject_put(&c->kobj);
1583}
1584
1585static void cache_set_flush(struct closure *cl)
1586{
1587 struct cache_set *c = container_of(cl, struct cache_set, caching);
1588 struct cache *ca;
1589 struct btree *b;
1590 unsigned int i;
1591
1592 bch_cache_accounting_destroy(&c->accounting);
1593
1594 kobject_put(&c->internal);
1595 kobject_del(&c->kobj);
1596
1597 if (!IS_ERR_OR_NULL(c->gc_thread))
1598 kthread_stop(c->gc_thread);
1599
1600 if (!IS_ERR_OR_NULL(c->root))
1601 list_add(&c->root->list, &c->btree_cache);
1602
	/*
	 * Avoid flushing cached btree nodes if the cache set is retiring
	 * because too many I/O errors were detected.
	 */
1607 if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1608 list_for_each_entry(b, &c->btree_cache, list) {
1609 mutex_lock(&b->write_lock);
1610 if (btree_node_dirty(b))
1611 __bch_btree_node_write(b, NULL);
1612 mutex_unlock(&b->write_lock);
1613 }
1614
1615 for_each_cache(ca, c, i)
1616 if (ca->alloc_thread)
1617 kthread_stop(ca->alloc_thread);
1618
1619 if (c->journal.cur) {
1620 cancel_delayed_work_sync(&c->journal.work);
		/* Flush the last journal entry if needed. */
1622 c->journal.work.work.func(&c->journal.work.work);
1623 }
1624
1625 closure_return(cl);
1626}
1627
/*
 * When a cache set is retiring because of too many I/O errors, decide what
 * to do with each attached bcache device based on its
 * stop_when_cache_set_failed setting:
 *
 *  - "always": stop the bcache device unconditionally.
 *  - "auto" with dirty data on the cache: there is no way to keep the device
 *    consistent without the cache, so disable I/O on it (dc->io_disable) and
 *    stop it to avoid handing out corrupt data.
 *  - "auto" with a clean cache: the backing device alone still holds all the
 *    data, so leave the bcache device running without its cache set.
 */
1644static void conditional_stop_bcache_device(struct cache_set *c,
1645 struct bcache_device *d,
1646 struct cached_dev *dc)
1647{
1648 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1649 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.",
1650 d->disk->disk_name, c->sb.set_uuid);
1651 bcache_device_stop(d);
1652 } else if (atomic_read(&dc->has_dirty)) {
1653
1654
1655
1656
1657 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.",
1658 d->disk->disk_name);
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670 dc->io_disable = true;
1671
1672 smp_mb();
1673 bcache_device_stop(d);
1674 } else {
1675
1676
1677
1678
1679 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.",
1680 d->disk->disk_name);
1681 }
1682}
1683
1684static void __cache_set_unregister(struct closure *cl)
1685{
1686 struct cache_set *c = container_of(cl, struct cache_set, caching);
1687 struct cached_dev *dc;
1688 struct bcache_device *d;
1689 size_t i;
1690
1691 mutex_lock(&bch_register_lock);
1692
1693 for (i = 0; i < c->devices_max_used; i++) {
1694 d = c->devices[i];
1695 if (!d)
1696 continue;
1697
1698 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1699 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1700 dc = container_of(d, struct cached_dev, disk);
1701 bch_cached_dev_detach(dc);
1702 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1703 conditional_stop_bcache_device(c, d, dc);
1704 } else {
1705 bcache_device_stop(d);
1706 }
1707 }
1708
1709 mutex_unlock(&bch_register_lock);
1710
1711 continue_at(cl, cache_set_flush, system_wq);
1712}
1713
1714void bch_cache_set_stop(struct cache_set *c)
1715{
1716 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
1717
1718 closure_queue(&c->caching);
1719}
1720
1721void bch_cache_set_unregister(struct cache_set *c)
1722{
1723 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1724 bch_cache_set_stop(c);
1725}
1726
1727#define alloc_bucket_pages(gfp, c) \
1728 ((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))
1729
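/*
 * bch_cache_set_alloc() - allocate and minimally initialise a cache_set from
 * a member cache's superblock; the set isn't usable until run_cache_set().
 */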
1730struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1731{
1732 int iter_size;
1733 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1734
1735 if (!c)
1736 return NULL;
1737
1738 __module_get(THIS_MODULE);
1739 closure_init(&c->cl, NULL);
1740 set_closure_fn(&c->cl, cache_set_free, system_wq);
1741
1742 closure_init(&c->caching, &c->cl);
1743 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1744
1745
1746 closure_set_stopped(&c->cl);
1747 closure_put(&c->cl);
1748
1749 kobject_init(&c->kobj, &bch_cache_set_ktype);
1750 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1751
1752 bch_cache_accounting_init(&c->accounting, &c->cl);
1753
1754 memcpy(c->sb.set_uuid, sb->set_uuid, 16);
1755 c->sb.block_size = sb->block_size;
1756 c->sb.bucket_size = sb->bucket_size;
1757 c->sb.nr_in_set = sb->nr_in_set;
1758 c->sb.last_mount = sb->last_mount;
1759 c->bucket_bits = ilog2(sb->bucket_size);
1760 c->block_bits = ilog2(sb->block_size);
1761 c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
1762 c->devices_max_used = 0;
1763 atomic_set(&c->attached_dev_nr, 0);
1764 c->btree_pages = bucket_pages(c);
1765 if (c->btree_pages > BTREE_MAX_PAGES)
1766 c->btree_pages = max_t(int, c->btree_pages / 4,
1767 BTREE_MAX_PAGES);
1768
1769 sema_init(&c->sb_write_mutex, 1);
1770 mutex_init(&c->bucket_lock);
1771 init_waitqueue_head(&c->btree_cache_wait);
1772 init_waitqueue_head(&c->bucket_wait);
1773 init_waitqueue_head(&c->gc_wait);
1774 sema_init(&c->uuid_write_mutex, 1);
1775
1776 spin_lock_init(&c->btree_gc_time.lock);
1777 spin_lock_init(&c->btree_split_time.lock);
1778 spin_lock_init(&c->btree_read_time.lock);
1779
1780 bch_moving_init_cache_set(c);
1781
1782 INIT_LIST_HEAD(&c->list);
1783 INIT_LIST_HEAD(&c->cached_devs);
1784 INIT_LIST_HEAD(&c->btree_cache);
1785 INIT_LIST_HEAD(&c->btree_cache_freeable);
1786 INIT_LIST_HEAD(&c->btree_cache_freed);
1787 INIT_LIST_HEAD(&c->data_buckets);
1788
1789 iter_size = (sb->bucket_size / sb->block_size + 1) *
1790 sizeof(struct btree_iter_set);
1791
1792 if (!(c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL)) ||
1793 mempool_init_slab_pool(&c->search, 32, bch_search_cache) ||
1794 mempool_init_kmalloc_pool(&c->bio_meta, 2,
1795 sizeof(struct bbio) + sizeof(struct bio_vec) *
1796 bucket_pages(c)) ||
1797 mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) ||
1798 bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1799 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER) ||
1800 !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
1801 !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
1802 WQ_MEM_RECLAIM, 0)) ||
1803 bch_journal_alloc(c) ||
1804 bch_btree_cache_alloc(c) ||
1805 bch_open_buckets_alloc(c) ||
1806 bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1807 goto err;
1808
1809 c->congested_read_threshold_us = 2000;
1810 c->congested_write_threshold_us = 20000;
1811 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1812 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1813
1814 return c;
1815err:
1816 bch_cache_set_unregister(c);
1817 return NULL;
1818}
1819
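/*
 * run_cache_set() - bring a fully populated cache set online. For a set that
 * was previously in sync this means reading the journal, priorities, uuids
 * and btree root and then replaying the journal; for a brand new set it
 * means invalidating existing data and writing out fresh metadata.
 */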
1820static int run_cache_set(struct cache_set *c)
1821{
1822 const char *err = "cannot allocate memory";
1823 struct cached_dev *dc, *t;
1824 struct cache *ca;
1825 struct closure cl;
1826 unsigned int i;
1827 LIST_HEAD(journal);
1828 struct journal_replay *l;
1829
1830 closure_init_stack(&cl);
1831
1832 for_each_cache(ca, c, i)
1833 c->nbuckets += ca->sb.nbuckets;
1834 set_gc_sectors(c);
1835
1836 if (CACHE_SYNC(&c->sb)) {
1837 struct bkey *k;
1838 struct jset *j;
1839
1840 err = "cannot allocate memory for journal";
1841 if (bch_journal_read(c, &journal))
1842 goto err;
1843
1844 pr_debug("btree_journal_read() done");
1845
1846 err = "no journal entries found";
1847 if (list_empty(&journal))
1848 goto err;
1849
1850 j = &list_entry(journal.prev, struct journal_replay, list)->j;
1851
1852 err = "IO error reading priorities";
1853 for_each_cache(ca, c, i)
1854 prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);
1855
1856
1857
1858
1859
1860
1861
1862 k = &j->btree_root;
1863
1864 err = "bad btree root";
1865 if (__bch_btree_ptr_invalid(c, k))
1866 goto err;
1867
1868 err = "error reading btree root";
1869 c->root = bch_btree_node_get(c, NULL, k,
1870 j->btree_level,
1871 true, NULL);
1872 if (IS_ERR_OR_NULL(c->root))
1873 goto err;
1874
1875 list_del_init(&c->root->list);
1876 rw_unlock(true, c->root);
1877
1878 err = uuid_read(c, j, &cl);
1879 if (err)
1880 goto err;
1881
1882 err = "error in recovery";
1883 if (bch_btree_check(c))
1884 goto err;
1885
		/*
		 * bch_btree_check() can leave a very large btree node cache
		 * behind; shrink it proactively here so that registration
		 * doesn't keep all that memory pinned.
		 */
1892 if (!c->shrinker_disabled) {
1893 struct shrink_control sc;
1894
1895 sc.gfp_mask = GFP_KERNEL;
1896 sc.nr_to_scan = c->btree_cache_used * c->btree_pages;
			/* First run clears the accessed bit on cached nodes. */
1898 c->shrink.scan_objects(&c->shrink, &sc);
			/* Second run reaps nodes whose accessed bit is now clear. */
1900 c->shrink.scan_objects(&c->shrink, &sc);
1901 }
1902
1903 bch_journal_mark(c, &journal);
1904 bch_initial_gc_finish(c);
1905 pr_debug("btree_check() done");
1906
1907
1908
1909
1910
1911
1912 bch_journal_next(&c->journal);
1913
1914 err = "error starting allocator thread";
1915 for_each_cache(ca, c, i)
1916 if (bch_cache_allocator_start(ca))
1917 goto err;
1918
		/*
		 * This is the first place it's safe to allocate buckets:
		 * bch_btree_check() and bch_initial_gc_finish() have to run
		 * before there is anything to allocate from, and
		 * bch_bucket_alloc_set() can cause a journal entry to be
		 * written, so bch_journal_next() had to be called first.
		 *
		 * If the uuids were in the old format, rewrite them before
		 * the next journal entry is written:
		 */
1929 if (j->version < BCACHE_JSET_VERSION_UUID)
1930 __uuid_write(c);
1931
1932 err = "bcache: replay journal failed";
1933 if (bch_journal_replay(c, &journal))
1934 goto err;
1935 } else {
1936 pr_notice("invalidating existing data");
1937
1938 for_each_cache(ca, c, i) {
1939 unsigned int j;
1940
1941 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
1942 2, SB_JOURNAL_BUCKETS);
1943
1944 for (j = 0; j < ca->sb.keys; j++)
1945 ca->sb.d[j] = ca->sb.first_bucket + j;
1946 }
1947
1948 bch_initial_gc_finish(c);
1949
1950 err = "error starting allocator thread";
1951 for_each_cache(ca, c, i)
1952 if (bch_cache_allocator_start(ca))
1953 goto err;
1954
1955 mutex_lock(&c->bucket_lock);
1956 for_each_cache(ca, c, i)
1957 bch_prio_write(ca);
1958 mutex_unlock(&c->bucket_lock);
1959
1960 err = "cannot allocate new UUID bucket";
1961 if (__uuid_write(c))
1962 goto err;
1963
1964 err = "cannot allocate new btree root";
1965 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
1966 if (IS_ERR_OR_NULL(c->root))
1967 goto err;
1968
1969 mutex_lock(&c->root->write_lock);
1970 bkey_copy_key(&c->root->key, &MAX_KEY);
1971 bch_btree_node_write(c->root, &cl);
1972 mutex_unlock(&c->root->write_lock);
1973
1974 bch_btree_set_root(c->root);
1975 rw_unlock(true, c->root);
1976
		/*
		 * The first journal entry must not be written until
		 * everything is set up; conveniently nothing gets journalled
		 * until CACHE_SYNC is set here.
		 */
1982 SET_CACHE_SYNC(&c->sb, true);
1983
1984 bch_journal_next(&c->journal);
1985 bch_journal_meta(c, &cl);
1986 }
1987
1988 err = "error starting gc thread";
1989 if (bch_gc_thread_start(c))
1990 goto err;
1991
1992 closure_sync(&cl);
1993 c->sb.last_mount = (u32)ktime_get_real_seconds();
1994 bcache_write_super(c);
1995
1996 list_for_each_entry_safe(dc, t, &uncached_devices, list)
1997 bch_cached_dev_attach(dc, c, NULL);
1998
1999 flash_devs_run(c);
2000
2001 set_bit(CACHE_SET_RUNNING, &c->flags);
2002 return 0;
2003err:
2004 while (!list_empty(&journal)) {
2005 l = list_first_entry(&journal, struct journal_replay, list);
2006 list_del(&l->list);
2007 kfree(l);
2008 }
2009
2010 closure_sync(&cl);
2011
2012 bch_cache_set_error(c, "%s", err);
2013
2014 return -EIO;
2015}
2016
2017static bool can_attach_cache(struct cache *ca, struct cache_set *c)
2018{
2019 return ca->sb.block_size == c->sb.block_size &&
2020 ca->sb.bucket_size == c->sb.bucket_size &&
2021 ca->sb.nr_in_set == c->sb.nr_in_set;
2022}
2023
2024static const char *register_cache_set(struct cache *ca)
2025{
2026 char buf[12];
2027 const char *err = "cannot allocate memory";
2028 struct cache_set *c;
2029
2030 list_for_each_entry(c, &bch_cache_sets, list)
2031 if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
2032 if (c->cache[ca->sb.nr_this_dev])
2033 return "duplicate cache set member";
2034
2035 if (!can_attach_cache(ca, c))
2036 return "cache sb does not match set";
2037
2038 if (!CACHE_SYNC(&ca->sb))
2039 SET_CACHE_SYNC(&c->sb, false);
2040
2041 goto found;
2042 }
2043
2044 c = bch_cache_set_alloc(&ca->sb);
2045 if (!c)
2046 return err;
2047
2048 err = "error creating kobject";
2049 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
2050 kobject_add(&c->internal, &c->kobj, "internal"))
2051 goto err;
2052
2053 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2054 goto err;
2055
2056 bch_debug_init_cache_set(c);
2057
2058 list_add(&c->list, &bch_cache_sets);
2059found:
2060 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2061 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2062 sysfs_create_link(&c->kobj, &ca->kobj, buf))
2063 goto err;
2064
2065 if (ca->sb.seq > c->sb.seq) {
2066 c->sb.version = ca->sb.version;
2067 memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
2068 c->sb.flags = ca->sb.flags;
2069 c->sb.seq = ca->sb.seq;
2070 pr_debug("set version = %llu", c->sb.version);
2071 }
2072
2073 kobject_get(&ca->kobj);
2074 ca->set = c;
2075 ca->set->cache[ca->sb.nr_this_dev] = ca;
2076 c->cache_by_alloc[c->caches_loaded++] = ca;
2077
2078 if (c->caches_loaded == c->sb.nr_in_set) {
2079 err = "failed to run cache set";
2080 if (run_cache_set(c) < 0)
2081 goto err;
2082 }
2083
2084 return NULL;
2085err:
2086 bch_cache_set_unregister(c);
2087 return err;
2088}
2089
2090
2091
2092
2093void bch_cache_release(struct kobject *kobj)
2094{
2095 struct cache *ca = container_of(kobj, struct cache, kobj);
2096 unsigned int i;
2097
2098 if (ca->set) {
2099 BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
2100 ca->set->cache[ca->sb.nr_this_dev] = NULL;
2101 }
2102
2103 free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
2104 kfree(ca->prio_buckets);
2105 vfree(ca->buckets);
2106
2107 free_heap(&ca->heap);
2108 free_fifo(&ca->free_inc);
2109
2110 for (i = 0; i < RESERVE_NR; i++)
2111 free_fifo(&ca->free[i]);
2112
2113 if (ca->sb_bio.bi_inline_vecs[0].bv_page)
2114 put_page(bio_first_page_all(&ca->sb_bio));
2115
2116 if (!IS_ERR_OR_NULL(ca->bdev))
2117 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2118
2119 kfree(ca);
2120 module_put(THIS_MODULE);
2121}
2122
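/*
 * cache_alloc() - allocate the per-cache in-memory structures (free-bucket
 * fifos, heap, bucket array, prio buffers) sized from the superblock's
 * nbuckets. Returns -EPERM if the device is too small to be useful.
 */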
2123static int cache_alloc(struct cache *ca)
2124{
2125 size_t free;
2126 size_t btree_buckets;
2127 struct bucket *b;
2128 int ret = -ENOMEM;
2129 const char *err = NULL;
2130
2131 __module_get(THIS_MODULE);
2132 kobject_init(&ca->kobj, &bch_cache_ktype);
2133
2134 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2135
	/*
	 * If ca->sb.njournal_buckets is non-zero a journal exists, and
	 * replaying it in bch_journal_replay() may split btree nodes, which
	 * requires RESERVE_BTREE buckets. In the worst case every journal
	 * bucket holds valid entries that all need replaying, so reserve as
	 * many RESERVE_BTREE buckets as there are journal buckets.
	 */
2145 btree_buckets = ca->sb.njournal_buckets ?: 8;
2146 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2147 if (!free) {
2148 ret = -EPERM;
2149 err = "ca->sb.nbuckets is too small";
2150 goto err_free;
2151 }
2152
2153 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2154 GFP_KERNEL)) {
2155 err = "ca->free[RESERVE_BTREE] alloc failed";
2156 goto err_btree_alloc;
2157 }
2158
2159 if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2160 GFP_KERNEL)) {
2161 err = "ca->free[RESERVE_PRIO] alloc failed";
2162 goto err_prio_alloc;
2163 }
2164
2165 if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2166 err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2167 goto err_movinggc_alloc;
2168 }
2169
2170 if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2171 err = "ca->free[RESERVE_NONE] alloc failed";
2172 goto err_none_alloc;
2173 }
2174
2175 if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2176 err = "ca->free_inc alloc failed";
2177 goto err_free_inc_alloc;
2178 }
2179
2180 if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2181 err = "ca->heap alloc failed";
2182 goto err_heap_alloc;
2183 }
2184
2185 ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2186 ca->sb.nbuckets));
2187 if (!ca->buckets) {
2188 err = "ca->buckets alloc failed";
2189 goto err_buckets_alloc;
2190 }
2191
2192 ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2193 prio_buckets(ca), 2),
2194 GFP_KERNEL);
2195 if (!ca->prio_buckets) {
2196 err = "ca->prio_buckets alloc failed";
2197 goto err_prio_buckets_alloc;
2198 }
2199
2200 ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca);
2201 if (!ca->disk_buckets) {
2202 err = "ca->disk_buckets alloc failed";
2203 goto err_disk_buckets_alloc;
2204 }
2205
2206 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2207
2208 for_each_bucket(b, ca)
2209 atomic_set(&b->pin, 0);
2210 return 0;
2211
2212err_disk_buckets_alloc:
2213 kfree(ca->prio_buckets);
2214err_prio_buckets_alloc:
2215 vfree(ca->buckets);
2216err_buckets_alloc:
2217 free_heap(&ca->heap);
2218err_heap_alloc:
2219 free_fifo(&ca->free_inc);
2220err_free_inc_alloc:
2221 free_fifo(&ca->free[RESERVE_NONE]);
2222err_none_alloc:
2223 free_fifo(&ca->free[RESERVE_MOVINGGC]);
2224err_movinggc_alloc:
2225 free_fifo(&ca->free[RESERVE_PRIO]);
2226err_prio_alloc:
2227 free_fifo(&ca->free[RESERVE_BTREE]);
2228err_btree_alloc:
2229err_free:
2230 module_put(THIS_MODULE);
2231 if (err)
2232 pr_notice("error %s: %s", ca->cache_dev_name, err);
2233 return ret;
2234}
2235
2236static int register_cache(struct cache_sb *sb, struct page *sb_page,
2237 struct block_device *bdev, struct cache *ca)
2238{
2239 const char *err = NULL;
2240 int ret = 0;
2241
2242 bdevname(bdev, ca->cache_dev_name);
2243 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2244 ca->bdev = bdev;
2245 ca->bdev->bd_holder = ca;
2246
2247 bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
2248 bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
2249 get_page(sb_page);
2250
2251 if (blk_queue_discard(bdev_get_queue(bdev)))
2252 ca->discard = CACHE_DISCARD(&ca->sb);
2253
2254 ret = cache_alloc(ca);
2255 if (ret != 0) {
		/*
		 * cache_alloc() failed and we jump past the kobject_put()
		 * below, so bch_cache_release() will never run for this
		 * cache; drop the exclusive reference on bdev here so it
		 * isn't leaked.
		 */
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
                if (ret == -ENOMEM)
                        err = "cache_alloc(): -ENOMEM";
                else if (ret == -EPERM)
                        err = "cache_alloc(): cache device is too small";
                else
                        err = "cache_alloc(): unknown error";
                goto err;
        }

        if (kobject_add(&ca->kobj,
                        &part_to_dev(bdev->bd_part)->kobj,
                        "bcache")) {
                err = "error calling kobject_add";
                ret = -ENOMEM;
                goto out;
        }

        mutex_lock(&bch_register_lock);
        err = register_cache_set(ca);
        mutex_unlock(&bch_register_lock);

        if (err) {
                ret = -ENODEV;
                goto out;
        }

        pr_info("registered cache device %s", ca->cache_dev_name);

out:
        kobject_put(&ca->kobj);

err:
        if (err)
                pr_notice("error %s: %s", ca->cache_dev_name, err);

        return ret;
}

/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
                               const char *buffer, size_t size);
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
                                         struct kobj_attribute *attr,
                                         const char *buffer, size_t size);

kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);
kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
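
/*
 * Illustrative usage of the sysfs files declared above (device names are
 * examples only):
 *
 *   echo /dev/sdc > /sys/fs/bcache/register        # register, log errors
 *   echo /dev/sdc > /sys/fs/bcache/register_quiet  # same, but stay quiet
 *                                                  # if it is already open
 *   echo 1 > /sys/fs/bcache/pendings_cleanup       # stop backing devices
 *                                                  # still waiting for a
 *                                                  # cache set
 */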

/*
 * These helpers walk the global cache set and uncached device lists, so
 * the caller must hold bch_register_lock.
 */
static bool bch_is_open_backing(struct block_device *bdev)
{
        struct cache_set *c, *tc;
        struct cached_dev *dc, *t;

        list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
                list_for_each_entry_safe(dc, t, &c->cached_devs, list)
                        if (dc->bdev == bdev)
                                return true;
        list_for_each_entry_safe(dc, t, &uncached_devices, list)
                if (dc->bdev == bdev)
                        return true;
        return false;
}

static bool bch_is_open_cache(struct block_device *bdev)
{
        struct cache_set *c, *tc;
        struct cache *ca;
        unsigned int i;

        list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
                for_each_cache(ca, c, i)
                        if (ca->bdev == bdev)
                                return true;
        return false;
}

static bool bch_is_open(struct block_device *bdev)
{
        return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
                               const char *buffer, size_t size)
{
        ssize_t ret = -EINVAL;
        const char *err = "cannot allocate memory";
        char *path = NULL;
        struct cache_sb *sb = NULL;
        struct block_device *bdev = NULL;
        struct page *sb_page = NULL;

        if (!try_module_get(THIS_MODULE))
                return -EBUSY;

        /* For the latest state of bcache_is_reboot */
        smp_mb();
        if (bcache_is_reboot) {
                /* Don't leak the module reference taken above */
                module_put(THIS_MODULE);
                return -EBUSY;
        }

        path = kstrndup(buffer, size, GFP_KERNEL);
        if (!path)
                goto err;

        sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
        if (!sb)
                goto err;

        err = "failed to open device";
        bdev = blkdev_get_by_path(strim(path),
                                  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
                                  sb);
        if (IS_ERR(bdev)) {
                if (bdev == ERR_PTR(-EBUSY)) {
                        /*
                         * Someone else holds the device exclusively; report
                         * whether that someone is bcache itself.
                         */
                        bdev = lookup_bdev(strim(path));
                        mutex_lock(&bch_register_lock);
                        if (!IS_ERR(bdev) && bch_is_open(bdev))
                                err = "device already registered";
                        else
                                err = "device busy";
                        mutex_unlock(&bch_register_lock);
                        if (!IS_ERR(bdev))
                                bdput(bdev);
                        if (attr == &ksysfs_register_quiet)
                                goto quiet_out;
                }
                goto err;
        }

        err = "failed to set blocksize";
        if (set_blocksize(bdev, 4096))
                goto err_close;

        err = read_super(sb, bdev, &sb_page);
        if (err)
                goto err_close;

        err = "failed to register device";
        if (SB_IS_BDEV(sb)) {
                struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

                if (!dc)
                        goto err_close;

                mutex_lock(&bch_register_lock);
                ret = register_bdev(sb, sb_page, bdev, dc);
                mutex_unlock(&bch_register_lock);

                if (ret < 0)
                        goto err;
        } else {
                struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

                if (!ca)
                        goto err_close;

                /*
                 * register_cache() owns the bdev from here: it (or the cache
                 * kobject release path) calls blkdev_put(), so on failure we
                 * must not go through err_close below.
                 */
                if (register_cache(sb, sb_page, bdev, ca) != 0)
                        goto err;
        }
quiet_out:
        ret = size;
out:
        if (sb_page)
                put_page(sb_page);
        kfree(sb);
        kfree(path);
        module_put(THIS_MODULE);
        return ret;

err_close:
        blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
        pr_info("error %s: %s", path, err);
        goto out;
}

/* A backing device still waiting for its cache set ("pending") */
struct pdev {
        struct list_head list;
        struct cached_dev *dc;
};
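
/*
 * Typical use: a backing device registered here never gets its cache set
 * (e.g. the cache device is missing or broken) and would otherwise sit on
 * uncached_devices indefinitely; writing to pendings_cleanup stops such
 * devices without a reboot.
 */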
static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
                                         struct kobj_attribute *attr,
                                         const char *buffer,
                                         size_t size)
{
        LIST_HEAD(pending_devs);
        ssize_t ret = size;
        struct cached_dev *dc, *tdc;
        struct pdev *pdev, *tpdev;
        struct cache_set *c, *tc;

        mutex_lock(&bch_register_lock);

        /* Collect every backing device that is not attached to a cache set */
        list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
                pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
                if (!pdev)
                        break;
                pdev->dc = dc;
                list_add(&pdev->list, &pending_devs);
        }

        /* Skip the ones whose cache set is actually registered */
        list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
                list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
                        char *pdev_set_uuid = pdev->dc->sb.set_uuid;
                        char *set_uuid = c->sb.uuid;

                        if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
                                list_del(&pdev->list);
                                kfree(pdev);
                                break;
                        }
                }
        }
        mutex_unlock(&bch_register_lock);

        /* Stop whatever is left waiting for a cache set that never came */
        list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
                pr_info("delete pdev %p", pdev);
                list_del(&pdev->list);
                bcache_device_stop(&pdev->dc->disk);
                kfree(pdev);
        }

        return ret;
}

static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
        if (bcache_is_reboot)
                return NOTIFY_DONE;

        if (code == SYS_DOWN ||
            code == SYS_HALT ||
            code == SYS_POWER_OFF) {
                DEFINE_WAIT(wait);
                unsigned long start = jiffies;
                bool stopped = false;

                struct cache_set *c, *tc;
                struct cached_dev *dc, *tdc;

                mutex_lock(&bch_register_lock);

                if (bcache_is_reboot)
                        goto out;

                /* New registration is rejected since now */
                bcache_is_reboot = true;
                /*
                 * Make sure a registering caller on another CPU sees
                 * bcache_is_reboot == true before we go on.
                 */
                smp_mb();

                if (list_empty(&bch_cache_sets) &&
                    list_empty(&uncached_devices))
                        goto out;

                mutex_unlock(&bch_register_lock);

                pr_info("Stopping all devices:");
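
                /*
                 * bch_register_lock is deliberately not held around
                 * bch_cache_set_stop() and bcache_device_stop() below: the
                 * stopping paths take bch_register_lock themselves, so
                 * holding it here could deadlock during reboot. This is
                 * safe because bcache_is_reboot is already true, so
                 * register_bcache() rejects new registrations and
                 * bcache_reboot() cannot be re-entered, leaving no one to
                 * race with the list walks below.
                 */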
                list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
                        bch_cache_set_stop(c);

                list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
                        bcache_device_stop(&dc->disk);

                /*
                 * Give other kthreads and kworkers an early chance to stop
                 * themselves.
                 */
                schedule();

                /* Wait up to 10 seconds for everything to finish stopping */
                while (1) {
                        long timeout = start + 10 * HZ - jiffies;

                        mutex_lock(&bch_register_lock);
                        stopped = list_empty(&bch_cache_sets) &&
                                  list_empty(&uncached_devices);

                        if (timeout < 0 || stopped)
                                break;

                        prepare_to_wait(&unregister_wait, &wait,
                                        TASK_UNINTERRUPTIBLE);

                        mutex_unlock(&bch_register_lock);
                        schedule_timeout(timeout);
                }

                finish_wait(&unregister_wait, &wait);

                if (stopped)
                        pr_info("All devices stopped");
                else
                        pr_notice("Timeout waiting for devices to be closed");
out:
                mutex_unlock(&bch_register_lock);
        }

        return NOTIFY_DONE;
}

static struct notifier_block reboot = {
        .notifier_call  = bcache_reboot,
        .priority       = INT_MAX, /* run before other reboot notifiers */
};

static void bcache_exit(void)
{
        bch_debug_exit();
        bch_request_exit();
        if (bcache_kobj)
                kobject_put(bcache_kobj);
        if (bcache_wq)
                destroy_workqueue(bcache_wq);
        if (bch_journal_wq)
                destroy_workqueue(bch_journal_wq);

        if (bcache_major)
                unregister_blkdev(bcache_major, "bcache");
        unregister_reboot_notifier(&reboot);
        mutex_destroy(&bch_register_lock);
}

/* Check and fixup module parameters */
static void check_module_parameters(void)
{
        if (bch_cutoff_writeback_sync == 0)
                bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
        else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
                pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u",
                        bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
                bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
        }

        if (bch_cutoff_writeback == 0)
                bch_cutoff_writeback = CUTOFF_WRITEBACK;
        else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
                pr_warn("set bch_cutoff_writeback (%u) to max value %u",
                        bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
                bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
        }

        /* The soft cutoff must not exceed the sync (hard) cutoff */
        if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
                pr_warn("set bch_cutoff_writeback (%u) to %u",
                        bch_cutoff_writeback, bch_cutoff_writeback_sync);
                bch_cutoff_writeback = bch_cutoff_writeback_sync;
        }
}
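
/*
 * Illustrative only: the cutoffs can be set at load time and are clamped
 * by check_module_parameters() above, e.g.
 *
 *   modprobe bcache bch_cutoff_writeback=40 bch_cutoff_writeback_sync=70
 *
 * (the values here are arbitrary examples, not recommendations).
 */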

static int __init bcache_init(void)
{
        static const struct attribute *files[] = {
                &ksysfs_register.attr,
                &ksysfs_register_quiet.attr,
                &ksysfs_pendings_cleanup.attr,
                NULL
        };

        check_module_parameters();

        mutex_init(&bch_register_lock);
        init_waitqueue_head(&unregister_wait);
        register_reboot_notifier(&reboot);

        bcache_major = register_blkdev(0, "bcache");
        if (bcache_major < 0) {
                unregister_reboot_notifier(&reboot);
                mutex_destroy(&bch_register_lock);
                return bcache_major;
        }

        bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
        if (!bcache_wq)
                goto err;

        bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
        if (!bch_journal_wq)
                goto err;

        bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
        if (!bcache_kobj)
                goto err;

        if (bch_request_init() ||
            sysfs_create_files(bcache_kobj, files))
                goto err;

        bch_debug_init();
        closure_debug_init();

        bcache_is_reboot = false;

        return 0;
err:
        bcache_exit();
        return -ENOMEM;
}

/*
 * Module hooks
 */
module_exit(bcache_exit);
module_init(bcache_init);

module_param(bch_cutoff_writeback, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");

module_param(bch_cutoff_writeback_sync, uint, 0);
MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");

MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
MODULE_LICENSE("GPL");