// SPDX-License-Identifier: GPL-2.0
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

10#include "bcache.h"
11#include "btree.h"
12#include "debug.h"
13#include "extents.h"
14#include "request.h"
15#include "writeback.h"
16#include "features.h"
17
18#include <linux/blkdev.h>
19#include <linux/pagemap.h>
20#include <linux/debugfs.h>
21#include <linux/genhd.h>
22#include <linux/idr.h>
23#include <linux/kthread.h>
24#include <linux/workqueue.h>
25#include <linux/module.h>
26#include <linux/random.h>
27#include <linux/reboot.h>
28#include <linux/sysfs.h>
29
30unsigned int bch_cutoff_writeback;
31unsigned int bch_cutoff_writeback_sync;
32
33static const char bcache_magic[] = {
34 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
35 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
36};
37
38static const char invalid_uuid[] = {
39 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
40 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
41};
42
43static struct kobject *bcache_kobj;
44struct mutex bch_register_lock;
45bool bcache_is_reboot;
46LIST_HEAD(bch_cache_sets);
47static LIST_HEAD(uncached_devices);
48
49static int bcache_major;
50static DEFINE_IDA(bcache_device_idx);
51static wait_queue_head_t unregister_wait;
52struct workqueue_struct *bcache_wq;
53struct workqueue_struct *bch_flush_wq;
54struct workqueue_struct *bch_journal_wq;
55

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS		128

/* limitation of bcache devices number on single system */
#define BCACHE_DEVICE_IDX_MAX	((1U << MINORBITS)/BCACHE_MINORS)

/* Superblock */

65static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
66{
67 unsigned int bucket_size = le16_to_cpu(s->bucket_size);
68
69 if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
70 if (bch_has_feature_large_bucket(sb)) {
71 unsigned int max, order;
72
73 max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
74 order = le16_to_cpu(s->bucket_size);
			/*
			 * bcache-tools ensures the bucket size order cannot
			 * overflow; an error message here is enough if it
			 * ever does.
			 */
79 if (order > max)
80 pr_err("Bucket size (1 << %u) overflows\n",
81 order);
82 bucket_size = 1 << order;
83 } else if (bch_has_feature_obso_large_bucket(sb)) {
84 bucket_size +=
85 le16_to_cpu(s->obso_bucket_size_hi) << 16;
86 }
87 }
88
89 return bucket_size;
90}
91
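/*
 * Decode and sanity-check the superblock fields shared by all cache device
 * superblock versions: bucket geometry, journal bucket layout, set
 * membership and UUIDs. Returns NULL on success or an error string.
 */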
92static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
93 struct cache_sb_disk *s)
94{
95 const char *err;
96 unsigned int i;
97
	sb->first_bucket = le16_to_cpu(s->first_bucket);
99 sb->nbuckets = le64_to_cpu(s->nbuckets);
100 sb->bucket_size = get_bucket_size(sb, s);
101
102 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
103 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
104
105 err = "Too many journal buckets";
106 if (sb->keys > SB_JOURNAL_BUCKETS)
107 goto err;
108
109 err = "Too many buckets";
110 if (sb->nbuckets > LONG_MAX)
111 goto err;
112
113 err = "Not enough buckets";
114 if (sb->nbuckets < 1 << 7)
115 goto err;
116
117 err = "Bad block size (not power of 2)";
118 if (!is_power_of_2(sb->block_size))
119 goto err;
120
121 err = "Bad block size (larger than page size)";
122 if (sb->block_size > PAGE_SECTORS)
123 goto err;
124
125 err = "Bad bucket size (not power of 2)";
126 if (!is_power_of_2(sb->bucket_size))
127 goto err;
128
129 err = "Bad bucket size (smaller than page size)";
130 if (sb->bucket_size < PAGE_SECTORS)
131 goto err;
132
133 err = "Invalid superblock: device too small";
134 if (get_capacity(bdev->bd_disk) <
135 sb->bucket_size * sb->nbuckets)
136 goto err;
137
138 err = "Bad UUID";
139 if (bch_is_zero(sb->set_uuid, 16))
140 goto err;
141
142 err = "Bad cache device number in set";
143 if (!sb->nr_in_set ||
144 sb->nr_in_set <= sb->nr_this_dev ||
145 sb->nr_in_set > MAX_CACHES_PER_SET)
146 goto err;
147
148 err = "Journal buckets not sequential";
149 for (i = 0; i < sb->keys; i++)
150 if (sb->d[i] != sb->first_bucket + i)
151 goto err;
152
153 err = "Too many journal buckets";
154 if (sb->first_bucket + sb->keys > sb->nbuckets)
155 goto err;
156
157 err = "Invalid superblock: first bucket comes before end of super";
158 if (sb->first_bucket * sb->bucket_size < 16)
159 goto err;
160
161 err = NULL;
162err:
163 return err;
164}
165
166
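/*
 * Read the on-disk superblock from @bdev and decode it into @sb. On
 * success *res points at the raw struct cache_sb_disk inside a page cache
 * page; the page reference is dropped here on error, and by the caller
 * once it is done with *res on success.
 */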
167static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
168 struct cache_sb_disk **res)
169{
170 const char *err;
171 struct cache_sb_disk *s;
172 struct page *page;
173 unsigned int i;
174
175 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
176 SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
177 if (IS_ERR(page))
178 return "IO error";
179 s = page_address(page) + offset_in_page(SB_OFFSET);
180
181 sb->offset = le64_to_cpu(s->offset);
182 sb->version = le64_to_cpu(s->version);
183
184 memcpy(sb->magic, s->magic, 16);
185 memcpy(sb->uuid, s->uuid, 16);
186 memcpy(sb->set_uuid, s->set_uuid, 16);
187 memcpy(sb->label, s->label, SB_LABEL_SIZE);
188
189 sb->flags = le64_to_cpu(s->flags);
190 sb->seq = le64_to_cpu(s->seq);
191 sb->last_mount = le32_to_cpu(s->last_mount);
192 sb->keys = le16_to_cpu(s->keys);
193
194 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
195 sb->d[i] = le64_to_cpu(s->d[i]);
196
197 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
198 sb->version, sb->flags, sb->seq, sb->keys);
199
200 err = "Not a bcache superblock (bad offset)";
201 if (sb->offset != SB_SECTOR)
202 goto err;
203
204 err = "Not a bcache superblock (bad magic)";
205 if (memcmp(sb->magic, bcache_magic, 16))
206 goto err;
207
208 err = "Bad checksum";
209 if (s->csum != csum_set(s))
210 goto err;
211
212 err = "Bad UUID";
213 if (bch_is_zero(sb->uuid, 16))
214 goto err;
215
216 sb->block_size = le16_to_cpu(s->block_size);
217
218 err = "Superblock block size smaller than device block size";
219 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
220 goto err;
221
222 switch (sb->version) {
223 case BCACHE_SB_VERSION_BDEV:
224 sb->data_offset = BDEV_DATA_START_DEFAULT;
225 break;
226 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
227 case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
228 sb->data_offset = le64_to_cpu(s->data_offset);
229
230 err = "Bad data offset";
231 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
232 goto err;
233
234 break;
235 case BCACHE_SB_VERSION_CDEV:
236 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
237 err = read_super_common(sb, bdev, s);
238 if (err)
239 goto err;
240 break;
241 case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
		/*
		 * Feature bits are needed in read_super_common(),
		 * convert them into struct cache_sb first.
		 */
246 sb->feature_compat = le64_to_cpu(s->feature_compat);
247 sb->feature_incompat = le64_to_cpu(s->feature_incompat);
248 sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
249
		/* Reject feature bits this kernel does not understand */
251 err = "Unsupported compatible feature found";
252 if (bch_has_unknown_compat_features(sb))
253 goto err;
254
255 err = "Unsupported read-only compatible feature found";
256 if (bch_has_unknown_ro_compat_features(sb))
257 goto err;
258
259 err = "Unsupported incompatible feature found";
260 if (bch_has_unknown_incompat_features(sb))
261 goto err;
262
263 err = read_super_common(sb, bdev, s);
264 if (err)
265 goto err;
266 break;
267 default:
268 err = "Unsupported superblock version";
269 goto err;
270 }
271
272 sb->last_mount = (u32)ktime_get_real_seconds();
273 *res = s;
274 return NULL;
275err:
276 put_page(page);
277 return err;
278}
279
280static void write_bdev_super_endio(struct bio *bio)
281{
282 struct cached_dev *dc = bio->bi_private;
283
284 if (bio->bi_status)
285 bch_count_backing_io_errors(dc, bio);
286
287 closure_put(&dc->sb_write);
288}
289
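/*
 * Encode the in-memory superblock into the on-disk format and submit a
 * synchronous metadata write for it. The caller supplies the bio and is
 * responsible for completion handling via bio->bi_end_io.
 */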
290static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
291 struct bio *bio)
292{
293 unsigned int i;
294
295 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
296 bio->bi_iter.bi_sector = SB_SECTOR;
297 __bio_add_page(bio, virt_to_page(out), SB_SIZE,
298 offset_in_page(out));
299
300 out->offset = cpu_to_le64(sb->offset);
301
302 memcpy(out->uuid, sb->uuid, 16);
303 memcpy(out->set_uuid, sb->set_uuid, 16);
304 memcpy(out->label, sb->label, SB_LABEL_SIZE);
305
306 out->flags = cpu_to_le64(sb->flags);
307 out->seq = cpu_to_le64(sb->seq);
308
309 out->last_mount = cpu_to_le32(sb->last_mount);
310 out->first_bucket = cpu_to_le16(sb->first_bucket);
311 out->keys = cpu_to_le16(sb->keys);
312
313 for (i = 0; i < sb->keys; i++)
314 out->d[i] = cpu_to_le64(sb->d[i]);
315
316 if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
317 out->feature_compat = cpu_to_le64(sb->feature_compat);
318 out->feature_incompat = cpu_to_le64(sb->feature_incompat);
319 out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
320 }
321
322 out->version = cpu_to_le64(sb->version);
323 out->csum = csum_set(out);
324
325 pr_debug("ver %llu, flags %llu, seq %llu\n",
326 sb->version, sb->flags, sb->seq);
327
328 submit_bio(bio);
329}
330
331static void bch_write_bdev_super_unlock(struct closure *cl)
332{
333 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
334
335 up(&dc->sb_write_mutex);
336}
337
338void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
339{
340 struct closure *cl = &dc->sb_write;
341 struct bio *bio = &dc->sb_bio;
342
343 down(&dc->sb_write_mutex);
344 closure_init(cl, parent);
345
346 bio_init(bio, dc->sb_bv, 1);
347 bio_set_dev(bio, dc->bdev);
348 bio->bi_end_io = write_bdev_super_endio;
349 bio->bi_private = dc;
350
351 closure_get(cl);
352
353 __write_super(&dc->sb, dc->sb_disk, bio);
354
355 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
356}
357
358static void write_super_endio(struct bio *bio)
359{
360 struct cache *ca = bio->bi_private;
361
362
363 bch_count_io_errors(ca, bio->bi_status, 0,
364 "writing superblock");
365 closure_put(&ca->set->sb_write);
366}
367
368static void bcache_write_super_unlock(struct closure *cl)
369{
370 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
371
372 up(&c->sb_write_mutex);
373}
374
375void bcache_write_super(struct cache_set *c)
376{
377 struct closure *cl = &c->sb_write;
378 struct cache *ca = c->cache;
379 struct bio *bio = &ca->sb_bio;
380 unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
381
382 down(&c->sb_write_mutex);
383 closure_init(cl, &c->cl);
384
385 ca->sb.seq++;
386
387 if (ca->sb.version < version)
388 ca->sb.version = version;
389
390 bio_init(bio, ca->sb_bv, 1);
391 bio_set_dev(bio, ca->bdev);
392 bio->bi_end_io = write_super_endio;
393 bio->bi_private = ca;
394
395 closure_get(cl);
396 __write_super(&ca->sb, ca->sb_disk, bio);
397
398 closure_return_with_destructor(cl, bcache_write_super_unlock);
399}
400
/* UUID io */

403static void uuid_endio(struct bio *bio)
404{
405 struct closure *cl = bio->bi_private;
406 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
407
408 cache_set_err_on(bio->bi_status, c, "accessing uuids");
409 bch_bbio_free(bio, c);
410 closure_put(cl);
411}
412
413static void uuid_io_unlock(struct closure *cl)
414{
415 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
416
417 up(&c->uuid_write_mutex);
418}
419
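/*
 * Read or write the uuid entries stored in the bucket(s) pointed to by @k.
 * For writes one bio is submitted per pointer in the key so every copy is
 * updated; for reads a single copy is enough.
 */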
420static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
421 struct bkey *k, struct closure *parent)
422{
423 struct closure *cl = &c->uuid_write;
424 struct uuid_entry *u;
425 unsigned int i;
426 char buf[80];
427
428 BUG_ON(!parent);
429 down(&c->uuid_write_mutex);
430 closure_init(cl, parent);
431
432 for (i = 0; i < KEY_PTRS(k); i++) {
433 struct bio *bio = bch_bbio_alloc(c);
434
435 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
436 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
437
438 bio->bi_end_io = uuid_endio;
439 bio->bi_private = cl;
440 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
441 bch_bio_map(bio, c->uuids);
442
443 bch_submit_bbio(bio, c, k, i);
444
445 if (op != REQ_OP_WRITE)
446 break;
447 }
448
449 bch_extent_to_text(buf, sizeof(buf), k);
450 pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf);
451
452 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
453 if (!bch_is_zero(u->uuid, 16))
454 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
455 u - c->uuids, u->uuid, u->label,
456 u->first_reg, u->last_reg, u->invalidated);
457
458 closure_return_with_destructor(cl, uuid_io_unlock);
459}
460
461static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
462{
463 struct bkey *k = &j->uuid_bucket;
464
465 if (__bch_btree_ptr_invalid(c, k))
466 return "bad uuid pointer";
467
468 bkey_copy(&c->uuid_bucket, k);
469 uuid_io(c, REQ_OP_READ, 0, k, cl);
470
471 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
472 struct uuid_entry_v0 *u0 = (void *) c->uuids;
473 struct uuid_entry *u1 = (void *) c->uuids;
474 int i;
475
476 closure_sync(cl);
477
		/*
		 * Since the new uuid entry is bigger than the old one, we
		 * have to convert starting at the highest memory address
		 * and work backwards so that we don't overwrite entries
		 * that haven't been converted yet.
		 */
484 for (i = c->nr_uuids - 1;
485 i >= 0;
486 --i) {
487 memcpy(u1[i].uuid, u0[i].uuid, 16);
488 memcpy(u1[i].label, u0[i].label, 32);
489
490 u1[i].first_reg = u0[i].first_reg;
491 u1[i].last_reg = u0[i].last_reg;
492 u1[i].invalidated = u0[i].invalidated;
493
494 u1[i].flags = 0;
495 u1[i].sectors = 0;
496 }
497 }
498
499 return NULL;
500}
501
502static int __uuid_write(struct cache_set *c)
503{
504 BKEY_PADDED(key) k;
505 struct closure cl;
506 struct cache *ca = c->cache;
507 unsigned int size;
508
509 closure_init_stack(&cl);
510 lockdep_assert_held(&bch_register_lock);
511
512 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
513 return 1;
514
515 size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
516 SET_KEY_SIZE(&k.key, size);
517 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
518 closure_sync(&cl);
519
	/* Only one bucket used for uuid write */
521 atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
522
523 bkey_copy(&c->uuid_bucket, &k.key);
524 bkey_put(c, &k.key);
525 return 0;
526}
527
528int bch_uuid_write(struct cache_set *c)
529{
530 int ret = __uuid_write(c);
531
532 if (!ret)
533 bch_journal_meta(c, NULL);
534
535 return ret;
536}
537
538static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
539{
540 struct uuid_entry *u;
541
542 for (u = c->uuids;
543 u < c->uuids + c->nr_uuids; u++)
544 if (!memcmp(u->uuid, uuid, 16))
545 return u;
546
547 return NULL;
548}
549
550static struct uuid_entry *uuid_find_empty(struct cache_set *c)
551{
552 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
553
554 return uuid_find(c, zero_uuid);
555}
556
/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *    8 bit gen
 *   16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * an LRU (and in the future other) cache replacement policies; for most
 * purposes it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other, and
 * it's actually the gens that must be written out at specific times - it's no
 * big deal if the priorities don't get written, if we lose them we just reuse
 * buckets in closer to optimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */

584static void prio_endio(struct bio *bio)
585{
586 struct cache *ca = bio->bi_private;
587
588 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
589 bch_bbio_free(bio, ca->set);
590 closure_put(&ca->prio);
591}
592
593static void prio_io(struct cache *ca, uint64_t bucket, int op,
594 unsigned long op_flags)
595{
596 struct closure *cl = &ca->prio;
597 struct bio *bio = bch_bbio_alloc(ca->set);
598
599 closure_init_stack(cl);
600
601 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
602 bio_set_dev(bio, ca->bdev);
603 bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
604
605 bio->bi_end_io = prio_endio;
606 bio->bi_private = ca;
607 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
608 bch_bio_map(bio, ca->disk_buckets);
609
610 closure_bio_submit(ca->set, bio, &ca->prio);
611 closure_sync(cl);
612}
613
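/*
 * Write out the current bucket priorities and gens: pack them into
 * ca->disk_buckets a bucket's worth at a time, allocate a fresh bucket for
 * each batch, chain the buckets together via p->next_bucket and journal
 * the result. The previous prio buckets are freed only after the new ones
 * and the journal entry are safely on disk.
 */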
614int bch_prio_write(struct cache *ca, bool wait)
615{
616 int i;
617 struct bucket *b;
618 struct closure cl;
619
620 pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
621 fifo_used(&ca->free[RESERVE_PRIO]),
622 fifo_used(&ca->free[RESERVE_NONE]),
623 fifo_used(&ca->free_inc));

	/*
	 * Pre-check that there are enough free buckets for the non-blocking
	 * case; it is better to fail early here than to hit the BUG_ON()
	 * on bucket allocation failure further down.
	 */
630 if (!wait) {
631 size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
632 fifo_used(&ca->free[RESERVE_NONE]);
633 if (prio_buckets(ca) > avail)
634 return -ENOMEM;
635 }
636
637 closure_init_stack(&cl);
638
639 lockdep_assert_held(&ca->set->bucket_lock);
640
641 ca->disk_buckets->seq++;
642
643 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
644 &ca->meta_sectors_written);
645
646 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
647 long bucket;
648 struct prio_set *p = ca->disk_buckets;
649 struct bucket_disk *d = p->data;
650 struct bucket_disk *end = d + prios_per_bucket(ca);
651
652 for (b = ca->buckets + i * prios_per_bucket(ca);
653 b < ca->buckets + ca->sb.nbuckets && d < end;
654 b++, d++) {
655 d->prio = cpu_to_le16(b->prio);
656 d->gen = b->gen;
657 }
658
659 p->next_bucket = ca->prio_buckets[i + 1];
660 p->magic = pset_magic(&ca->sb);
661 p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
662
663 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
664 BUG_ON(bucket == -1);
665
666 mutex_unlock(&ca->set->bucket_lock);
667 prio_io(ca, bucket, REQ_OP_WRITE, 0);
668 mutex_lock(&ca->set->bucket_lock);
669
670 ca->prio_buckets[i] = bucket;
671 atomic_dec_bug(&ca->buckets[bucket].pin);
672 }
673
674 mutex_unlock(&ca->set->bucket_lock);
675
676 bch_journal_meta(ca->set, &cl);
677 closure_sync(&cl);
678
679 mutex_lock(&ca->set->bucket_lock);

	/*
	 * Don't want the old priorities to get garbage collected until after
	 * we finish writing the new ones, and they're journalled.
	 */
685 for (i = 0; i < prio_buckets(ca); i++) {
686 if (ca->prio_last_buckets[i])
687 __bch_bucket_free(ca,
688 &ca->buckets[ca->prio_last_buckets[i]]);
689
690 ca->prio_last_buckets[i] = ca->prio_buckets[i];
691 }
692 return 0;
693}
694
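/*
 * Read the bucket priorities and gens back in at cache set startup,
 * following the chain of prio buckets starting from @bucket (taken from
 * the journal) and verifying the checksum and magic of each one.
 */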
695static int prio_read(struct cache *ca, uint64_t bucket)
696{
697 struct prio_set *p = ca->disk_buckets;
698 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
699 struct bucket *b;
700 unsigned int bucket_nr = 0;
701 int ret = -EIO;
702
703 for (b = ca->buckets;
704 b < ca->buckets + ca->sb.nbuckets;
705 b++, d++) {
706 if (d == end) {
707 ca->prio_buckets[bucket_nr] = bucket;
708 ca->prio_last_buckets[bucket_nr] = bucket;
709 bucket_nr++;
710
711 prio_io(ca, bucket, REQ_OP_READ, 0);
712
713 if (p->csum !=
714 bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
715 pr_warn("bad csum reading priorities\n");
716 goto out;
717 }
718
719 if (p->magic != pset_magic(&ca->sb)) {
720 pr_warn("bad magic reading priorities\n");
721 goto out;
722 }
723
724 bucket = p->next_bucket;
725 d = p->data;
726 }
727
728 b->prio = le16_to_cpu(d->prio);
729 b->gen = b->last_gc = d->gen;
730 }
731
732 ret = 0;
733out:
734 return ret;
735}
736
/* Bcache device */

739static int open_dev(struct block_device *b, fmode_t mode)
740{
741 struct bcache_device *d = b->bd_disk->private_data;
742
743 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
744 return -ENXIO;
745
746 closure_get(&d->cl);
747 return 0;
748}
749
750static void release_dev(struct gendisk *b, fmode_t mode)
751{
752 struct bcache_device *d = b->private_data;
753
754 closure_put(&d->cl);
755}
756
757static int ioctl_dev(struct block_device *b, fmode_t mode,
758 unsigned int cmd, unsigned long arg)
759{
760 struct bcache_device *d = b->bd_disk->private_data;
761
762 return d->ioctl(d, mode, cmd, arg);
763}
764
765static const struct block_device_operations bcache_cached_ops = {
766 .submit_bio = cached_dev_submit_bio,
767 .open = open_dev,
768 .release = release_dev,
769 .ioctl = ioctl_dev,
770 .owner = THIS_MODULE,
771};
772
773static const struct block_device_operations bcache_flash_ops = {
774 .submit_bio = flash_dev_submit_bio,
775 .open = open_dev,
776 .release = release_dev,
777 .ioctl = ioctl_dev,
778 .owner = THIS_MODULE,
779};
780
781void bcache_device_stop(struct bcache_device *d)
782{
783 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		/*
		 * closure_fn set to
		 * - cached device: cached_dev_flush()
		 * - flash dev: flash_dev_flush()
		 */
789 closure_queue(&d->cl);
790}
791
792static void bcache_device_unlink(struct bcache_device *d)
793{
794 lockdep_assert_held(&bch_register_lock);
795
796 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
797 struct cache *ca = d->c->cache;
798
799 sysfs_remove_link(&d->c->kobj, d->name);
800 sysfs_remove_link(&d->kobj, "cache");
801
802 bd_unlink_disk_holder(ca->bdev, d->disk);
803 }
804}
805
806static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
807 const char *name)
808{
809 struct cache *ca = c->cache;
810 int ret;
811
812 bd_link_disk_holder(ca->bdev, d->disk);
813
814 snprintf(d->name, BCACHEDEVNAME_SIZE,
815 "%s%u", name, d->id);
816
817 ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
818 if (ret < 0)
819 pr_err("Couldn't create device -> cache set symlink\n");
820
821 ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
822 if (ret < 0)
823 pr_err("Couldn't create cache set -> device symlink\n");
824
825 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
826}
827
828static void bcache_device_detach(struct bcache_device *d)
829{
830 lockdep_assert_held(&bch_register_lock);
831
832 atomic_dec(&d->c->attached_dev_nr);
833
834 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
835 struct uuid_entry *u = d->c->uuids + d->id;
836
837 SET_UUID_FLASH_ONLY(u, 0);
838 memcpy(u->uuid, invalid_uuid, 16);
839 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
840 bch_uuid_write(d->c);
841 }
842
843 bcache_device_unlink(d);
844
845 d->c->devices[d->id] = NULL;
846 closure_put(&d->c->caching);
847 d->c = NULL;
848}
849
850static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
851 unsigned int id)
852{
853 d->id = id;
854 d->c = c;
855 c->devices[id] = d;
856
857 if (id >= c->devices_max_used)
858 c->devices_max_used = id + 1;
859
860 closure_get(&c->caching);
861}
862
863static inline int first_minor_to_idx(int first_minor)
864{
865 return (first_minor/BCACHE_MINORS);
866}
867
868static inline int idx_to_first_minor(int idx)
869{
870 return (idx * BCACHE_MINORS);
871}
872
873static void bcache_device_free(struct bcache_device *d)
874{
875 struct gendisk *disk = d->disk;
876
877 lockdep_assert_held(&bch_register_lock);
878
879 if (disk)
880 pr_info("%s stopped\n", disk->disk_name);
881 else
882 pr_err("bcache device (NULL gendisk) stopped\n");
883
884 if (d->c)
885 bcache_device_detach(d);
886
887 if (disk) {
888 blk_cleanup_disk(disk);
889 ida_simple_remove(&bcache_device_idx,
890 first_minor_to_idx(disk->first_minor));
891 }
892
893 bioset_exit(&d->bio_split);
894 kvfree(d->full_dirty_stripes);
895 kvfree(d->stripe_sectors_dirty);
896
897 closure_debug_destroy(&d->cl);
898}
899
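/*
 * Common initialization for cached devices and flash-only volumes:
 * allocate the dirty-stripe bookkeeping, a minor range and a gendisk,
 * and set up conservative queue limits for the virtual bcache device.
 */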
900static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
901 sector_t sectors, struct block_device *cached_bdev,
902 const struct block_device_operations *ops)
903{
904 struct request_queue *q;
905 const size_t max_stripes = min_t(size_t, INT_MAX,
906 SIZE_MAX / sizeof(atomic_t));
907 uint64_t n;
908 int idx;
909
910 if (!d->stripe_size)
911 d->stripe_size = 1 << 31;
912
913 n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
914 if (!n || n > max_stripes) {
915 pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
916 n);
917 return -ENOMEM;
918 }
919 d->nr_stripes = n;
920
921 n = d->nr_stripes * sizeof(atomic_t);
922 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
923 if (!d->stripe_sectors_dirty)
924 return -ENOMEM;
925
926 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
927 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
928 if (!d->full_dirty_stripes)
929 goto out_free_stripe_sectors_dirty;
930
931 idx = ida_simple_get(&bcache_device_idx, 0,
932 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
933 if (idx < 0)
934 goto out_free_full_dirty_stripes;
935
936 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
937 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
938 goto out_ida_remove;
939
940 d->disk = blk_alloc_disk(NUMA_NO_NODE);
941 if (!d->disk)
942 goto out_bioset_exit;
943
944 set_capacity(d->disk, sectors);
945 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
946
947 d->disk->major = bcache_major;
948 d->disk->first_minor = idx_to_first_minor(idx);
949 d->disk->minors = BCACHE_MINORS;
950 d->disk->fops = ops;
951 d->disk->private_data = d;
952
953 q = d->disk->queue;
954 q->limits.max_hw_sectors = UINT_MAX;
955 q->limits.max_sectors = UINT_MAX;
956 q->limits.max_segment_size = UINT_MAX;
957 q->limits.max_segments = BIO_MAX_VECS;
958 blk_queue_max_discard_sectors(q, UINT_MAX);
959 q->limits.discard_granularity = 512;
960 q->limits.io_min = block_size;
961 q->limits.logical_block_size = block_size;
962 q->limits.physical_block_size = block_size;
963
964 if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
		/*
		 * This should only happen with BCACHE_SB_VERSION_BDEV.
		 * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
		 */
969 pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
970 d->disk->disk_name, q->limits.logical_block_size,
971 PAGE_SIZE, bdev_logical_block_size(cached_bdev));

		/* This also adjusts physical block size/min io size if needed */
974 blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
975 }
976
977 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
978 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
979 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
980
981 blk_queue_write_cache(q, true, true);
982
983 return 0;
984
985out_bioset_exit:
986 bioset_exit(&d->bio_split);
987out_ida_remove:
988 ida_simple_remove(&bcache_device_idx, idx);
989out_free_full_dirty_stripes:
990 kvfree(d->full_dirty_stripes);
991out_free_stripe_sectors_dirty:
992 kvfree(d->stripe_sectors_dirty);
993 return -ENOMEM;
994
995}
996
/* Cached device */

999static void calc_cached_dev_sectors(struct cache_set *c)
1000{
1001 uint64_t sectors = 0;
1002 struct cached_dev *dc;
1003
1004 list_for_each_entry(dc, &c->cached_devs, list)
1005 sectors += bdev_sectors(dc->bdev);
1006
1007 c->cached_dev_sectors = sectors;
1008}
1009
1010#define BACKING_DEV_OFFLINE_TIMEOUT 5
1011static int cached_dev_status_update(void *arg)
1012{
1013 struct cached_dev *dc = arg;
1014 struct request_queue *q;
1015
	/*
	 * If this delayed worker is stopping outside, directly quit here.
	 * dc->io_disable might be set via the sysfs interface, so check it
	 * here too.
	 */
1021 while (!kthread_should_stop() && !dc->io_disable) {
1022 q = bdev_get_queue(dc->bdev);
1023 if (blk_queue_dying(q))
1024 dc->offline_seconds++;
1025 else
1026 dc->offline_seconds = 0;
1027
1028 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
1029 pr_err("%s: device offline for %d seconds\n",
1030 dc->backing_dev_name,
1031 BACKING_DEV_OFFLINE_TIMEOUT);
1032 pr_err("%s: disable I/O request due to backing device offline\n",
1033 dc->disk.name);
1034 dc->io_disable = true;
1035
1036 smp_mb();
1037 bcache_device_stop(&dc->disk);
1038 break;
1039 }
1040 schedule_timeout_interruptible(HZ);
1041 }
1042
1043 wait_for_kthread_stop();
1044 return 0;
1045}
1046
1047
1048int bch_cached_dev_run(struct cached_dev *dc)
1049{
1050 int ret = 0;
1051 struct bcache_device *d = &dc->disk;
1052 char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
1053 char *env[] = {
1054 "DRIVER=bcache",
1055 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
1056 kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
1057 NULL,
1058 };
1059
1060 if (dc->io_disable) {
1061 pr_err("I/O disabled on cached dev %s\n",
1062 dc->backing_dev_name);
1063 ret = -EIO;
1064 goto out;
1065 }
1066
1067 if (atomic_xchg(&dc->running, 1)) {
1068 pr_info("cached dev %s is running already\n",
1069 dc->backing_dev_name);
1070 ret = -EBUSY;
1071 goto out;
1072 }
1073
1074 if (!d->c &&
1075 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
1076 struct closure cl;
1077
1078 closure_init_stack(&cl);
1079
1080 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
1081 bch_write_bdev_super(dc, &cl);
1082 closure_sync(&cl);
1083 }
1084
1085 add_disk(d->disk);
1086 bd_link_disk_holder(dc->bdev, dc->disk.disk);

	/*
	 * Won't show up in the uevent file, use udevadm monitor -e instead;
	 * only class / kset properties are persistent.
	 */
1091 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
1092
1093 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
1094 sysfs_create_link(&disk_to_dev(d->disk)->kobj,
1095 &d->kobj, "bcache")) {
1096 pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
1097 ret = -ENOMEM;
1098 goto out;
1099 }
1100
1101 dc->status_update_thread = kthread_run(cached_dev_status_update,
1102 dc, "bcache_status_update");
1103 if (IS_ERR(dc->status_update_thread)) {
1104 pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
1105 }
1106
1107out:
1108 kfree(env[1]);
1109 kfree(env[2]);
1110 kfree(buf);
1111 return ret;
1112}
1113
/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, it means the routine of the delayed
 * work dc->writeback_rate_update is running. Wait until the routine quits
 * (BCACHE_DEV_RATE_DW_RUNNING is cleared), then continue to cancel it. If
 * BCACHE_DEV_RATE_DW_RUNNING is still set after time_out iterations, give
 * up waiting and cancel it anyway.
 */
1121static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
1122{
1123 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
1124
1125 do {
1126 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1127 &dc->disk.flags))
1128 break;
1129 time_out--;
1130 schedule_timeout_interruptible(1);
1131 } while (time_out > 0);
1132
1133 if (time_out == 0)
		pr_warn("give up waiting for dc->writeback_rate_update to quit\n");
1135
1136 cancel_delayed_work_sync(&dc->writeback_rate_update);
1137}
1138
1139static void cached_dev_detach_finish(struct work_struct *w)
1140{
1141 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1142
1143 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1144 BUG_ON(refcount_read(&dc->count));
1145
1146
1147 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1148 cancel_writeback_rate_update_dwork(dc);
1149
1150 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1151 kthread_stop(dc->writeback_thread);
1152 dc->writeback_thread = NULL;
1153 }
1154
1155 mutex_lock(&bch_register_lock);
1156
1157 calc_cached_dev_sectors(dc->disk.c);
1158 bcache_device_detach(&dc->disk);
1159 list_move(&dc->list, &uncached_devices);
1160
1161 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1162 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1163
1164 mutex_unlock(&bch_register_lock);
1165
1166 pr_info("Caching disabled for %s\n", dc->backing_dev_name);
1167
1168
1169 closure_put(&dc->disk.cl);
1170}
1171
1172void bch_cached_dev_detach(struct cached_dev *dc)
1173{
1174 lockdep_assert_held(&bch_register_lock);
1175
1176 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1177 return;
1178
1179 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1180 return;
1181
	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching.
	 */
1186 closure_get(&dc->disk.cl);
1187
1188 bch_writeback_queue(dc);
1189
1190 cached_dev_put(dc);
1191}
1192
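/*
 * Attach a backing device to a cache set: validate the set, find or
 * allocate a uuid entry for the backing device, persist the new state in
 * both superblocks, then start writeback and expose the bcache device.
 */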
1193int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1194 uint8_t *set_uuid)
1195{
1196 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1197 struct uuid_entry *u;
1198 struct cached_dev *exist_dc, *t;
1199 int ret = 0;
1200
1201 if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
1202 (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
1203 return -ENOENT;
1204
1205 if (dc->disk.c) {
1206 pr_err("Can't attach %s: already attached\n",
1207 dc->backing_dev_name);
1208 return -EINVAL;
1209 }
1210
1211 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1212 pr_err("Can't attach %s: shutting down\n",
1213 dc->backing_dev_name);
1214 return -EINVAL;
1215 }
1216
1217 if (dc->sb.block_size < c->cache->sb.block_size) {
1218
1219 pr_err("Couldn't attach %s: block size less than set's block size\n",
1220 dc->backing_dev_name);
1221 return -EINVAL;
1222 }
1223
	/* Check whether already attached */
1225 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1226 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1227 pr_err("Tried to attach %s but duplicate UUID already attached\n",
1228 dc->backing_dev_name);
1229
1230 return -EINVAL;
1231 }
1232 }
1233
1234 u = uuid_find(c, dc->sb.uuid);
1235
1236 if (u &&
1237 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1238 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1239 memcpy(u->uuid, invalid_uuid, 16);
1240 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1241 u = NULL;
1242 }
1243
1244 if (!u) {
1245 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1246 pr_err("Couldn't find uuid for %s in set\n",
1247 dc->backing_dev_name);
1248 return -ENOENT;
1249 }
1250
1251 u = uuid_find_empty(c);
1252 if (!u) {
1253 pr_err("Not caching %s, no room for UUID\n",
1254 dc->backing_dev_name);
1255 return -EINVAL;
1256 }
1257 }
1258

	/*
	 * Deadlocks since we're called via sysfs...
	 * sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */
1264 if (bch_is_zero(u->uuid, 16)) {
1265 struct closure cl;
1266
1267 closure_init_stack(&cl);
1268
1269 memcpy(u->uuid, dc->sb.uuid, 16);
1270 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1271 u->first_reg = u->last_reg = rtime;
1272 bch_uuid_write(c);
1273
1274 memcpy(dc->sb.set_uuid, c->set_uuid, 16);
1275 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1276
1277 bch_write_bdev_super(dc, &cl);
1278 closure_sync(&cl);
1279 } else {
1280 u->last_reg = rtime;
1281 bch_uuid_write(c);
1282 }
1283
1284 bcache_device_attach(&dc->disk, c, u - c->uuids);
1285 list_move(&dc->list, &c->cached_devs);
1286 calc_cached_dev_sectors(c);
1287
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
1292 smp_wmb();
1293 refcount_set(&dc->count, 1);
1294
	/* Block writeback thread, but spawn it */
1296 down_write(&dc->writeback_lock);
1297 if (bch_cached_dev_writeback_start(dc)) {
1298 up_write(&dc->writeback_lock);
1299 pr_err("Couldn't start writeback facilities for %s\n",
1300 dc->disk.disk->disk_name);
1301 return -ENOMEM;
1302 }
1303
1304 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1305 atomic_set(&dc->has_dirty, 1);
1306 bch_writeback_queue(dc);
1307 }
1308
1309 bch_sectors_dirty_init(&dc->disk);
1310
1311 ret = bch_cached_dev_run(dc);
1312 if (ret && (ret != -EBUSY)) {
1313 up_write(&dc->writeback_lock);
		/*
		 * bch_register_lock is held, bcache_device_stop() is not
		 * able to be directly called. The kthread and kworker
		 * created previously in bch_cached_dev_writeback_start()
		 * have to be stopped manually here.
		 */
1320 kthread_stop(dc->writeback_thread);
1321 cancel_writeback_rate_update_dwork(dc);
1322 pr_err("Couldn't run cached device %s\n",
1323 dc->backing_dev_name);
1324 return ret;
1325 }
1326
1327 bcache_device_link(&dc->disk, c, "bdev");
1328 atomic_inc(&c->attached_dev_nr);
1329
1330 if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
1331 pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
1332 pr_err("Please update to the latest bcache-tools to create the cache device\n");
1333 set_disk_ro(dc->disk.disk, 1);
1334 }
1335
	/* Allow the writeback thread to proceed */
1337 up_write(&dc->writeback_lock);
1338
1339 pr_info("Caching %s as %s on set %pU\n",
1340 dc->backing_dev_name,
1341 dc->disk.disk->disk_name,
1342 dc->disk.c->set_uuid);
1343 return 0;
1344}

/* Cached device - bcache superblock/devices */
1347void bch_cached_dev_release(struct kobject *kobj)
1348{
1349 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1350 disk.kobj);
1351 kfree(dc);
1352 module_put(THIS_MODULE);
1353}
1354
1355static void cached_dev_free(struct closure *cl)
1356{
1357 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1358
1359 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1360 cancel_writeback_rate_update_dwork(dc);
1361
1362 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1363 kthread_stop(dc->writeback_thread);
1364 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1365 kthread_stop(dc->status_update_thread);
1366
1367 mutex_lock(&bch_register_lock);
1368
1369 if (atomic_read(&dc->running)) {
1370 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1371 del_gendisk(dc->disk.disk);
1372 }
1373 bcache_device_free(&dc->disk);
1374 list_del(&dc->list);
1375
1376 mutex_unlock(&bch_register_lock);
1377
1378 if (dc->sb_disk)
1379 put_page(virt_to_page(dc->sb_disk));
1380
1381 if (!IS_ERR_OR_NULL(dc->bdev))
1382 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1383
1384 wake_up(&unregister_wait);
1385
1386 kobject_put(&dc->disk.kobj);
1387}
1388
1389static void cached_dev_flush(struct closure *cl)
1390{
1391 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1392 struct bcache_device *d = &dc->disk;
1393
1394 mutex_lock(&bch_register_lock);
1395 bcache_device_unlink(d);
1396 mutex_unlock(&bch_register_lock);
1397
1398 bch_cache_accounting_destroy(&dc->accounting);
1399 kobject_del(&d->kobj);
1400
1401 continue_at(cl, cached_dev_free, system_wq);
1402}
1403
1404static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1405{
1406 int ret;
1407 struct io *io;
1408 struct request_queue *q = bdev_get_queue(dc->bdev);
1409
1410 __module_get(THIS_MODULE);
1411 INIT_LIST_HEAD(&dc->list);
1412 closure_init(&dc->disk.cl, NULL);
1413 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1414 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1415 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1416 sema_init(&dc->sb_write_mutex, 1);
1417 INIT_LIST_HEAD(&dc->io_lru);
1418 spin_lock_init(&dc->io_lock);
1419 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1420
1421 dc->sequential_cutoff = 4 << 20;
1422
1423 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1424 list_add(&io->lru, &dc->io_lru);
1425 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1426 }
1427
1428 dc->disk.stripe_size = q->limits.io_opt >> 9;
1429
1430 if (dc->disk.stripe_size)
1431 dc->partial_stripes_expensive =
1432 q->limits.raid_partial_stripes_expensive;
1433
1434 ret = bcache_device_init(&dc->disk, block_size,
1435 bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
1436 dc->bdev, &bcache_cached_ops);
1437 if (ret)
1438 return ret;
1439
1440 blk_queue_io_opt(dc->disk.disk->queue,
1441 max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
1442
1443 atomic_set(&dc->io_errors, 0);
1444 dc->io_disable = false;
1445 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1446
1447 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1448
1449 bch_cached_dev_request_init(dc);
1450 bch_cached_dev_writeback_init(dc);
1451 return 0;
1452}
1453
1454
1455
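/*
 * Register a backing device: take over the block device, create the sysfs
 * objects, and attach it to any already-registered cache set it belongs to.
 * Returns 0 on success; on error the half-constructed device is stopped.
 */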
1456static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
1457 struct block_device *bdev,
1458 struct cached_dev *dc)
1459{
1460 const char *err = "cannot allocate memory";
1461 struct cache_set *c;
1462 int ret = -ENOMEM;
1463
1464 bdevname(bdev, dc->backing_dev_name);
1465 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1466 dc->bdev = bdev;
1467 dc->bdev->bd_holder = dc;
1468 dc->sb_disk = sb_disk;
1469
1470 if (cached_dev_init(dc, sb->block_size << 9))
1471 goto err;
1472
1473 err = "error creating kobject";
1474 if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
1475 goto err;
1476 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1477 goto err;
1478
1479 pr_info("registered backing device %s\n", dc->backing_dev_name);
1480
1481 list_add(&dc->list, &uncached_devices);
1482
1483 list_for_each_entry(c, &bch_cache_sets, list)
1484 bch_cached_dev_attach(dc, c, NULL);
1485
1486 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1487 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1488 err = "failed to run cached device";
1489 ret = bch_cached_dev_run(dc);
1490 if (ret)
1491 goto err;
1492 }
1493
1494 return 0;
1495err:
1496 pr_notice("error %s: %s\n", dc->backing_dev_name, err);
1497 bcache_device_stop(&dc->disk);
1498 return ret;
1499}
1500
/* Flash only volumes */

1504void bch_flash_dev_release(struct kobject *kobj)
1505{
1506 struct bcache_device *d = container_of(kobj, struct bcache_device,
1507 kobj);
1508 kfree(d);
1509}
1510
1511static void flash_dev_free(struct closure *cl)
1512{
1513 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1514
1515 mutex_lock(&bch_register_lock);
1516 atomic_long_sub(bcache_dev_sectors_dirty(d),
1517 &d->c->flash_dev_dirty_sectors);
1518 del_gendisk(d->disk);
1519 bcache_device_free(d);
1520 mutex_unlock(&bch_register_lock);
1521 kobject_put(&d->kobj);
1522}
1523
1524static void flash_dev_flush(struct closure *cl)
1525{
1526 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1527
1528 mutex_lock(&bch_register_lock);
1529 bcache_device_unlink(d);
1530 mutex_unlock(&bch_register_lock);
1531 kobject_del(&d->kobj);
1532 continue_at(cl, flash_dev_free, system_wq);
1533}
1534
1535static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1536{
1537 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1538 GFP_KERNEL);
1539 if (!d)
1540 return -ENOMEM;
1541
1542 closure_init(&d->cl, NULL);
1543 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1544
1545 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1546
1547 if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
1548 NULL, &bcache_flash_ops))
1549 goto err;
1550
1551 bcache_device_attach(d, c, u - c->uuids);
1552 bch_sectors_dirty_init(d);
1553 bch_flash_dev_request_init(d);
1554 add_disk(d->disk);
1555
1556 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1557 goto err;
1558
1559 bcache_device_link(d, c, "volume");
1560
1561 if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
1562 pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
1563 pr_err("Please update to the latest bcache-tools to create the cache device\n");
1564 set_disk_ro(d->disk, 1);
1565 }
1566
1567 return 0;
1568err:
1569 kobject_put(&d->kobj);
1570 return -ENOMEM;
1571}
1572
1573static int flash_devs_run(struct cache_set *c)
1574{
1575 int ret = 0;
1576 struct uuid_entry *u;
1577
1578 for (u = c->uuids;
1579 u < c->uuids + c->nr_uuids && !ret;
1580 u++)
1581 if (UUID_FLASH_ONLY(u))
1582 ret = flash_dev_run(c, u);
1583
1584 return ret;
1585}
1586
1587int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1588{
1589 struct uuid_entry *u;
1590
1591 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1592 return -EINTR;
1593
1594 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1595 return -EPERM;
1596
1597 u = uuid_find_empty(c);
1598 if (!u) {
1599 pr_err("Can't create volume, no room for UUID\n");
1600 return -EINVAL;
1601 }
1602
1603 get_random_bytes(u->uuid, 16);
1604 memset(u->label, 0, 32);
1605 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1606
1607 SET_UUID_FLASH_ONLY(u, 1);
1608 u->sectors = size >> 9;
1609
1610 bch_uuid_write(c);
1611
1612 return flash_dev_run(c, u);
1613}
1614
1615bool bch_cached_dev_error(struct cached_dev *dc)
1616{
1617 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1618 return false;
1619
1620 dc->io_disable = true;
1621
1622 smp_mb();
1623
1624 pr_err("stop %s: too many IO errors on backing device %s\n",
1625 dc->disk.disk->disk_name, dc->backing_dev_name);
1626
1627 bcache_device_stop(&dc->disk);
1628 return true;
1629}
1630
1631
1632
1633__printf(2, 3)
1634bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1635{
1636 struct va_format vaf;
1637 va_list args;
1638
1639 if (c->on_error != ON_ERROR_PANIC &&
1640 test_bit(CACHE_SET_STOPPING, &c->flags))
1641 return false;
1642
1643 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1644 pr_info("CACHE_SET_IO_DISABLE already set\n");
1645
1646
1647
1648
1649
1650
1651 va_start(args, fmt);
1652
1653 vaf.fmt = fmt;
1654 vaf.va = &args;
1655
1656 pr_err("error on %pU: %pV, disabling caching\n",
1657 c->set_uuid, &vaf);
1658
1659 va_end(args);
1660
1661 if (c->on_error == ON_ERROR_PANIC)
1662 panic("panic forced after error\n");
1663
1664 bch_cache_set_unregister(c);
1665 return true;
1666}
1667
1668
1669void bch_cache_set_release(struct kobject *kobj)
1670{
1671 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1672
1673 kfree(c);
1674 module_put(THIS_MODULE);
1675}
1676
1677static void cache_set_free(struct closure *cl)
1678{
1679 struct cache_set *c = container_of(cl, struct cache_set, cl);
1680 struct cache *ca;
1681
1682 debugfs_remove(c->debug);
1683
1684 bch_open_buckets_free(c);
1685 bch_btree_cache_free(c);
1686 bch_journal_free(c);
1687
1688 mutex_lock(&bch_register_lock);
1689 bch_bset_sort_state_free(&c->sort);
1690 free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
1691
1692 ca = c->cache;
1693 if (ca) {
1694 ca->set = NULL;
1695 c->cache = NULL;
1696 kobject_put(&ca->kobj);
1697 }
1698
1699
1700 if (c->moving_gc_wq)
1701 destroy_workqueue(c->moving_gc_wq);
1702 bioset_exit(&c->bio_split);
1703 mempool_exit(&c->fill_iter);
1704 mempool_exit(&c->bio_meta);
1705 mempool_exit(&c->search);
1706 kfree(c->devices);
1707
1708 list_del(&c->list);
1709 mutex_unlock(&bch_register_lock);
1710
1711 pr_info("Cache set %pU unregistered\n", c->set_uuid);
1712 wake_up(&unregister_wait);
1713
1714 closure_debug_destroy(&c->cl);
1715 kobject_put(&c->kobj);
1716}
1717
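/*
 * First stage of cache set teardown: stop the gc and allocator threads,
 * write out any dirty btree nodes (unless the set is being retired because
 * of too many I/O errors) and flush the journal before cache_set_free()
 * runs.
 */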
1718static void cache_set_flush(struct closure *cl)
1719{
1720 struct cache_set *c = container_of(cl, struct cache_set, caching);
1721 struct cache *ca = c->cache;
1722 struct btree *b;
1723
1724 bch_cache_accounting_destroy(&c->accounting);
1725
1726 kobject_put(&c->internal);
1727 kobject_del(&c->kobj);
1728
1729 if (!IS_ERR_OR_NULL(c->gc_thread))
1730 kthread_stop(c->gc_thread);
1731
1732 if (!IS_ERR_OR_NULL(c->root))
1733 list_add(&c->root->list, &c->btree_cache);
1734
	/*
	 * Avoid flushing cached nodes if cache set is retiring
	 * due to too many I/O errors detected.
	 */
1739 if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1740 list_for_each_entry(b, &c->btree_cache, list) {
1741 mutex_lock(&b->write_lock);
1742 if (btree_node_dirty(b))
1743 __bch_btree_node_write(b, NULL);
1744 mutex_unlock(&b->write_lock);
1745 }
1746
1747 if (ca->alloc_thread)
1748 kthread_stop(ca->alloc_thread);
1749
1750 if (c->journal.cur) {
1751 cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
1753 c->journal.work.work.func(&c->journal.work.work);
1754 }
1755
1756 closure_return(cl);
1757}
1758
/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * the cache set is unregistering due to too many I/O errors. In this
 * condition the bcache device might be stopped; it depends on the
 * stop_when_cache_set_failed value and whether the broken cache has dirty
 * data:
 *
 * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
 *  BCH_CACHED_DEV_STOP_AUTO           0               NO
 *  BCH_CACHED_DEV_STOP_AUTO           1               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
 *
 * The expected behavior is: if stop_when_cache_set_failed is configured to
 * "auto" via sysfs, the bcache device is not stopped as long as the backing
 * device is clean on the broken cache device.
 */
1775static void conditional_stop_bcache_device(struct cache_set *c,
1776 struct bcache_device *d,
1777 struct cached_dev *dc)
1778{
1779 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1780 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
1781 d->disk->disk_name, c->set_uuid);
1782 bcache_device_stop(d);
1783 } else if (atomic_read(&dc->has_dirty)) {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
		 * and dc->has_dirty == 1
		 */
1788 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
1789 d->disk->disk_name);
			d->disk->disk_name);
		/*
		 * There might be a small time gap in which the cache set is
		 * released but the bcache device is not. Inside this gap,
		 * regular I/O requests go directly to the backing device
		 * because no cache set is attached, which may introduce
		 * inconsistent data in writeback mode while the cache is
		 * dirty. Therefore, before calling bcache_device_stop() due
		 * to a broken cache device, dc->io_disable should be
		 * explicitly set to true.
		 */
1801 dc->io_disable = true;
1802
1803 smp_mb();
1804 bcache_device_stop(d);
1805 } else {
		/*
		 * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
		 * and dc->has_dirty == 0
		 */
1810 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
1811 d->disk->disk_name);
1812 }
1813}
1814
1815static void __cache_set_unregister(struct closure *cl)
1816{
1817 struct cache_set *c = container_of(cl, struct cache_set, caching);
1818 struct cached_dev *dc;
1819 struct bcache_device *d;
1820 size_t i;
1821
1822 mutex_lock(&bch_register_lock);
1823
1824 for (i = 0; i < c->devices_max_used; i++) {
1825 d = c->devices[i];
1826 if (!d)
1827 continue;
1828
1829 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1830 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1831 dc = container_of(d, struct cached_dev, disk);
1832 bch_cached_dev_detach(dc);
1833 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1834 conditional_stop_bcache_device(c, d, dc);
1835 } else {
1836 bcache_device_stop(d);
1837 }
1838 }
1839
1840 mutex_unlock(&bch_register_lock);
1841
1842 continue_at(cl, cache_set_flush, system_wq);
1843}
1844
1845void bch_cache_set_stop(struct cache_set *c)
1846{
1847 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		/* closure_fn set to __cache_set_unregister() */
1849 closure_queue(&c->caching);
1850}
1851
1852void bch_cache_set_unregister(struct cache_set *c)
1853{
1854 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1855 bch_cache_set_stop(c);
1856}
1857
1858#define alloc_meta_bucket_pages(gfp, sb) \
1859 ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
1860
1861struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1862{
1863 int iter_size;
1864 struct cache *ca = container_of(sb, struct cache, sb);
1865 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1866
1867 if (!c)
1868 return NULL;
1869
1870 __module_get(THIS_MODULE);
1871 closure_init(&c->cl, NULL);
1872 set_closure_fn(&c->cl, cache_set_free, system_wq);
1873
1874 closure_init(&c->caching, &c->cl);
1875 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1876
1877
1878 closure_set_stopped(&c->cl);
1879 closure_put(&c->cl);
1880
1881 kobject_init(&c->kobj, &bch_cache_set_ktype);
1882 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1883
1884 bch_cache_accounting_init(&c->accounting, &c->cl);
1885
1886 memcpy(c->set_uuid, sb->set_uuid, 16);
1887
1888 c->cache = ca;
1889 c->cache->set = c;
1890 c->bucket_bits = ilog2(sb->bucket_size);
1891 c->block_bits = ilog2(sb->block_size);
1892 c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
1893 c->devices_max_used = 0;
1894 atomic_set(&c->attached_dev_nr, 0);
1895 c->btree_pages = meta_bucket_pages(sb);
1896 if (c->btree_pages > BTREE_MAX_PAGES)
1897 c->btree_pages = max_t(int, c->btree_pages / 4,
1898 BTREE_MAX_PAGES);
1899
1900 sema_init(&c->sb_write_mutex, 1);
1901 mutex_init(&c->bucket_lock);
1902 init_waitqueue_head(&c->btree_cache_wait);
1903 spin_lock_init(&c->btree_cannibalize_lock);
1904 init_waitqueue_head(&c->bucket_wait);
1905 init_waitqueue_head(&c->gc_wait);
1906 sema_init(&c->uuid_write_mutex, 1);
1907
1908 spin_lock_init(&c->btree_gc_time.lock);
1909 spin_lock_init(&c->btree_split_time.lock);
1910 spin_lock_init(&c->btree_read_time.lock);
1911
1912 bch_moving_init_cache_set(c);
1913
1914 INIT_LIST_HEAD(&c->list);
1915 INIT_LIST_HEAD(&c->cached_devs);
1916 INIT_LIST_HEAD(&c->btree_cache);
1917 INIT_LIST_HEAD(&c->btree_cache_freeable);
1918 INIT_LIST_HEAD(&c->btree_cache_freed);
1919 INIT_LIST_HEAD(&c->data_buckets);
1920
1921 iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
1922 sizeof(struct btree_iter_set);
1923
1924 c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
1925 if (!c->devices)
1926 goto err;
1927
1928 if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
1929 goto err;
1930
1931 if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
1932 sizeof(struct bbio) +
1933 sizeof(struct bio_vec) * meta_bucket_pages(sb)))
1934 goto err;
1935
1936 if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
1937 goto err;
1938
1939 if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1940 BIOSET_NEED_RESCUER))
1941 goto err;
1942
1943 c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
1944 if (!c->uuids)
1945 goto err;
1946
1947 c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
1948 if (!c->moving_gc_wq)
1949 goto err;
1950
1951 if (bch_journal_alloc(c))
1952 goto err;
1953
1954 if (bch_btree_cache_alloc(c))
1955 goto err;
1956
1957 if (bch_open_buckets_alloc(c))
1958 goto err;
1959
1960 if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1961 goto err;
1962
1963 c->congested_read_threshold_us = 2000;
1964 c->congested_write_threshold_us = 20000;
1965 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1966 c->idle_max_writeback_rate_enabled = 1;
1967 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1968
1969 return c;
1970err:
1971 bch_cache_set_unregister(c);
1972 return NULL;
1973}
1974
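/*
 * Bring a cache set online. For a set that was previously in sync, read
 * back the priorities, uuids and btree root, run the initial consistency
 * check and replay the journal; for a brand new set, invalidate existing
 * data and allocate a fresh uuid bucket and btree root.
 */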
1975static int run_cache_set(struct cache_set *c)
1976{
1977 const char *err = "cannot allocate memory";
1978 struct cached_dev *dc, *t;
1979 struct cache *ca = c->cache;
1980 struct closure cl;
1981 LIST_HEAD(journal);
1982 struct journal_replay *l;
1983
1984 closure_init_stack(&cl);
1985
1986 c->nbuckets = ca->sb.nbuckets;
1987 set_gc_sectors(c);
1988
1989 if (CACHE_SYNC(&c->cache->sb)) {
1990 struct bkey *k;
1991 struct jset *j;
1992
1993 err = "cannot allocate memory for journal";
1994 if (bch_journal_read(c, &journal))
1995 goto err;
1996
1997 pr_debug("btree_journal_read() done\n");
1998
1999 err = "no journal entries found";
2000 if (list_empty(&journal))
2001 goto err;
2002
2003 j = &list_entry(journal.prev, struct journal_replay, list)->j;
2004
2005 err = "IO error reading priorities";
2006 if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
2007 goto err;
2008
		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

2015 k = &j->btree_root;
2016
2017 err = "bad btree root";
2018 if (__bch_btree_ptr_invalid(c, k))
2019 goto err;
2020
2021 err = "error reading btree root";
2022 c->root = bch_btree_node_get(c, NULL, k,
2023 j->btree_level,
2024 true, NULL);
2025 if (IS_ERR_OR_NULL(c->root))
2026 goto err;
2027
2028 list_del_init(&c->root->list);
2029 rw_unlock(true, c->root);
2030
2031 err = uuid_read(c, j, &cl);
2032 if (err)
2033 goto err;
2034
2035 err = "error in recovery";
2036 if (bch_btree_check(c))
2037 goto err;
2038
2039 bch_journal_mark(c, &journal);
2040 bch_initial_gc_finish(c);
2041 pr_debug("btree_check() done\n");
2042
		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
2048 bch_journal_next(&c->journal);
2049
2050 err = "error starting allocator thread";
2051 if (bch_cache_allocator_start(ca))
2052 goto err;
2053
		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
2064 if (j->version < BCACHE_JSET_VERSION_UUID)
2065 __uuid_write(c);
2066
2067 err = "bcache: replay journal failed";
2068 if (bch_journal_replay(c, &journal))
2069 goto err;
2070 } else {
2071 unsigned int j;
2072
2073 pr_notice("invalidating existing data\n");
2074 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
2075 2, SB_JOURNAL_BUCKETS);
2076
2077 for (j = 0; j < ca->sb.keys; j++)
2078 ca->sb.d[j] = ca->sb.first_bucket + j;
2079
2080 bch_initial_gc_finish(c);
2081
2082 err = "error starting allocator thread";
2083 if (bch_cache_allocator_start(ca))
2084 goto err;
2085
2086 mutex_lock(&c->bucket_lock);
2087 bch_prio_write(ca, true);
2088 mutex_unlock(&c->bucket_lock);
2089
2090 err = "cannot allocate new UUID bucket";
2091 if (__uuid_write(c))
2092 goto err;
2093
2094 err = "cannot allocate new btree root";
2095 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
2096 if (IS_ERR_OR_NULL(c->root))
2097 goto err;
2098
2099 mutex_lock(&c->root->write_lock);
2100 bkey_copy_key(&c->root->key, &MAX_KEY);
2101 bch_btree_node_write(c->root, &cl);
2102 mutex_unlock(&c->root->write_lock);
2103
2104 bch_btree_set_root(c->root);
2105 rw_unlock(true, c->root);
2106
		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here.
		 */
2112 SET_CACHE_SYNC(&c->cache->sb, true);
2113
2114 bch_journal_next(&c->journal);
2115 bch_journal_meta(c, &cl);
2116 }
2117
2118 err = "error starting gc thread";
2119 if (bch_gc_thread_start(c))
2120 goto err;
2121
2122 closure_sync(&cl);
2123 c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
2124 bcache_write_super(c);
2125
2126 if (bch_has_feature_obso_large_bucket(&c->cache->sb))
2127 pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");
2128
2129 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2130 bch_cached_dev_attach(dc, c, NULL);
2131
2132 flash_devs_run(c);
2133
2134 set_bit(CACHE_SET_RUNNING, &c->flags);
2135 return 0;
2136err:
2137 while (!list_empty(&journal)) {
2138 l = list_first_entry(&journal, struct journal_replay, list);
2139 list_del(&l->list);
2140 kfree(l);
2141 }
2142
2143 closure_sync(&cl);
2144
2145 bch_cache_set_error(c, "%s", err);
2146
2147 return -EIO;
2148}
2149
2150static const char *register_cache_set(struct cache *ca)
2151{
2152 char buf[12];
2153 const char *err = "cannot allocate memory";
2154 struct cache_set *c;
2155
2156 list_for_each_entry(c, &bch_cache_sets, list)
2157 if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
2158 if (c->cache)
2159 return "duplicate cache set member";
2160
2161 goto found;
2162 }
2163
2164 c = bch_cache_set_alloc(&ca->sb);
2165 if (!c)
2166 return err;
2167
2168 err = "error creating kobject";
2169 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
2170 kobject_add(&c->internal, &c->kobj, "internal"))
2171 goto err;
2172
2173 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2174 goto err;
2175
2176 bch_debug_init_cache_set(c);
2177
2178 list_add(&c->list, &bch_cache_sets);
2179found:
2180 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2181 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2182 sysfs_create_link(&c->kobj, &ca->kobj, buf))
2183 goto err;
2184
2185 kobject_get(&ca->kobj);
2186 ca->set = c;
2187 ca->set->cache = ca;
2188
2189 err = "failed to run cache set";
2190 if (run_cache_set(c) < 0)
2191 goto err;
2192
2193 return NULL;
2194err:
2195 bch_cache_set_unregister(c);
2196 return err;
2197}
2198
/* Cache device */

2202void bch_cache_release(struct kobject *kobj)
2203{
2204 struct cache *ca = container_of(kobj, struct cache, kobj);
2205 unsigned int i;
2206
2207 if (ca->set) {
2208 BUG_ON(ca->set->cache != ca);
2209 ca->set->cache = NULL;
2210 }
2211
2212 free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
2213 kfree(ca->prio_buckets);
2214 vfree(ca->buckets);
2215
2216 free_heap(&ca->heap);
2217 free_fifo(&ca->free_inc);
2218
2219 for (i = 0; i < RESERVE_NR; i++)
2220 free_fifo(&ca->free[i]);
2221
2222 if (ca->sb_disk)
2223 put_page(virt_to_page(ca->sb_disk));
2224
2225 if (!IS_ERR_OR_NULL(ca->bdev))
2226 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2227
2228 kfree(ca);
2229 module_put(THIS_MODULE);
2230}
2231
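/*
 * Allocate the per-cache in-memory data structures: the free-bucket fifos
 * for each reserve, free_inc, the bucket heap, the bucket array and the
 * prio/disk bucket buffers. Sizes are derived from ca->sb.nbuckets.
 */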
2232static int cache_alloc(struct cache *ca)
2233{
2234 size_t free;
2235 size_t btree_buckets;
2236 struct bucket *b;
2237 int ret = -ENOMEM;
2238 const char *err = NULL;
2239
2240 __module_get(THIS_MODULE);
2241 kobject_init(&ca->kobj, &bch_cache_ktype);
2242
2243 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2244
	/*
	 * When ca->sb.njournal_buckets is not zero, a journal exists and
	 * btree nodes may split during bch_journal_replay(), so buckets of
	 * RESERVE_BTREE type are needed. In the worst case all journal
	 * buckets contain valid journal entries and every key needs to be
	 * replayed, so the number of RESERVE_BTREE buckets should be as
	 * large as the number of journal buckets.
	 */
2254 btree_buckets = ca->sb.njournal_buckets ?: 8;
2255 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2256 if (!free) {
2257 ret = -EPERM;
2258 err = "ca->sb.nbuckets is too small";
2259 goto err_free;
2260 }
2261
2262 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2263 GFP_KERNEL)) {
2264 err = "ca->free[RESERVE_BTREE] alloc failed";
2265 goto err_btree_alloc;
2266 }
2267
2268 if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2269 GFP_KERNEL)) {
2270 err = "ca->free[RESERVE_PRIO] alloc failed";
2271 goto err_prio_alloc;
2272 }
2273
2274 if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2275 err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2276 goto err_movinggc_alloc;
2277 }
2278
2279 if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2280 err = "ca->free[RESERVE_NONE] alloc failed";
2281 goto err_none_alloc;
2282 }
2283
2284 if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2285 err = "ca->free_inc alloc failed";
2286 goto err_free_inc_alloc;
2287 }
2288
2289 if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2290 err = "ca->heap alloc failed";
2291 goto err_heap_alloc;
2292 }
2293
2294 ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2295 ca->sb.nbuckets));
2296 if (!ca->buckets) {
2297 err = "ca->buckets alloc failed";
2298 goto err_buckets_alloc;
2299 }
2300
2301 ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2302 prio_buckets(ca), 2),
2303 GFP_KERNEL);
2304 if (!ca->prio_buckets) {
2305 err = "ca->prio_buckets alloc failed";
2306 goto err_prio_buckets_alloc;
2307 }
2308
2309 ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
2310 if (!ca->disk_buckets) {
2311 err = "ca->disk_buckets alloc failed";
2312 goto err_disk_buckets_alloc;
2313 }
2314
2315 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2316
2317 for_each_bucket(b, ca)
2318 atomic_set(&b->pin, 0);
2319 return 0;
2320
2321err_disk_buckets_alloc:
2322 kfree(ca->prio_buckets);
2323err_prio_buckets_alloc:
2324 vfree(ca->buckets);
2325err_buckets_alloc:
2326 free_heap(&ca->heap);
2327err_heap_alloc:
2328 free_fifo(&ca->free_inc);
2329err_free_inc_alloc:
2330 free_fifo(&ca->free[RESERVE_NONE]);
2331err_none_alloc:
2332 free_fifo(&ca->free[RESERVE_MOVINGGC]);
2333err_movinggc_alloc:
2334 free_fifo(&ca->free[RESERVE_PRIO]);
2335err_prio_alloc:
2336 free_fifo(&ca->free[RESERVE_BTREE]);
2337err_btree_alloc:
2338err_free:
2339 module_put(THIS_MODULE);
2340 if (err)
2341 pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2342 return ret;
2343}
2344
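/*
 * Bind @bdev and its on-disk superblock to cache @ca, allocate its
 * in-memory state and add it to a cache set.
 */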
2345static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
2346 struct block_device *bdev, struct cache *ca)
2347{
2348 const char *err = NULL;
2349 int ret = 0;
2350
2351 bdevname(bdev, ca->cache_dev_name);
2352 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2353 ca->bdev = bdev;
2354 ca->bdev->bd_holder = ca;
2355 ca->sb_disk = sb_disk;
2356
2357 if (blk_queue_discard(bdev_get_queue(bdev)))
2358 ca->discard = CACHE_DISCARD(&ca->sb);
2359
2360 ret = cache_alloc(ca);
2361 if (ret != 0) {
		/*
		 * On this error path ca->kobj is never handed to kobject_put(),
		 * so bch_cache_release() will not run and release the bdev;
		 * put it explicitly here.
		 */
2368 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2369 if (ret == -ENOMEM)
2370 err = "cache_alloc(): -ENOMEM";
2371 else if (ret == -EPERM)
2372 err = "cache_alloc(): cache device is too small";
2373 else
2374 err = "cache_alloc(): unknown error";
2375 goto err;
2376 }
2377
2378 if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
2379 err = "error calling kobject_add";
2380 ret = -ENOMEM;
2381 goto out;
2382 }
2383
2384 mutex_lock(&bch_register_lock);
2385 err = register_cache_set(ca);
2386 mutex_unlock(&bch_register_lock);
2387
2388 if (err) {
2389 ret = -ENODEV;
2390 goto out;
2391 }
2392
2393 pr_info("registered cache device %s\n", ca->cache_dev_name);
2394
2395out:
2396 kobject_put(&ca->kobj);
2397
2398err:
2399 if (err)
2400 pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2401
2402 return ret;
2403}
2404
/* Global interfaces/init */

2407static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2408 const char *buffer, size_t size);
2409static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2410 struct kobj_attribute *attr,
2411 const char *buffer, size_t size);
2412
2413kobj_attribute_write(register, register_bcache);
2414kobj_attribute_write(register_quiet, register_bcache);
2415kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
2416
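/* Is @dev already registered as a backing device (attached or not)? */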
2417static bool bch_is_open_backing(dev_t dev)
2418{
2419 struct cache_set *c, *tc;
2420 struct cached_dev *dc, *t;
2421
2422 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2423 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2424 if (dc->bdev->bd_dev == dev)
2425 return true;
2426 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2427 if (dc->bdev->bd_dev == dev)
2428 return true;
2429 return false;
2430}
2431
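/* Is @dev already registered as a cache device in any cache set? */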
2432static bool bch_is_open_cache(dev_t dev)
2433{
2434 struct cache_set *c, *tc;
2435
2436 list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2437 struct cache *ca = c->cache;
2438
2439 if (ca->bdev->bd_dev == dev)
2440 return true;
2441 }
2442
2443 return false;
2444}
2445
2446static bool bch_is_open(dev_t dev)
2447{
2448 return bch_is_open_cache(dev) || bch_is_open_backing(dev);
2449}
2450
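/* State handed to the asynchronous registration workers */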
2451struct async_reg_args {
2452 struct delayed_work reg_work;
2453 char *path;
2454 struct cache_sb *sb;
2455 struct cache_sb_disk *sb_disk;
2456 struct block_device *bdev;
2457};
2458
2459static void register_bdev_worker(struct work_struct *work)
2460{
	bool fail = false;
2462 struct async_reg_args *args =
2463 container_of(work, struct async_reg_args, reg_work.work);
2464 struct cached_dev *dc;
2465
2466 dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2467 if (!dc) {
2468 fail = true;
2469 put_page(virt_to_page(args->sb_disk));
2470 blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2471 goto out;
2472 }
2473
2474 mutex_lock(&bch_register_lock);
2475 if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
2476 fail = true;
2477 mutex_unlock(&bch_register_lock);
2478
2479out:
2480 if (fail)
		pr_info("error %s: failed to register backing device\n",
2482 args->path);
2483 kfree(args->sb);
2484 kfree(args->path);
2485 kfree(args);
2486 module_put(THIS_MODULE);
2487}
2488
2489static void register_cache_worker(struct work_struct *work)
2490{
	bool fail = false;
2492 struct async_reg_args *args =
2493 container_of(work, struct async_reg_args, reg_work.work);
2494 struct cache *ca;
2495
2496 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2497 if (!ca) {
2498 fail = true;
2499 put_page(virt_to_page(args->sb_disk));
2500 blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2501 goto out;
2502 }
2503
	/* blkdev_put() will be called in bch_cache_release() */
2505 if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
2506 fail = true;
2507
2508out:
2509 if (fail)
		pr_info("error %s: failed to register cache device\n",
2511 args->path);
2512 kfree(args->sb);
2513 kfree(args->path);
2514 kfree(args);
2515 module_put(THIS_MODULE);
2516}
2517
2518static void register_device_async(struct async_reg_args *args)
2519{
2520 if (SB_IS_BDEV(args->sb))
2521 INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
2522 else
2523 INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
2524
	/* queue the registration work with a small (10 jiffies) delay */
2526 queue_delayed_work(system_wq, &args->reg_work, 10);
2527}
2528
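/*
 * Handler for writes to the sysfs register/register_quiet files: open the
 * block device named in @buffer, read its superblock and register it as a
 * backing device or a cache device (synchronously or via a worker).
 */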
2529static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2530 const char *buffer, size_t size)
2531{
2532 const char *err;
2533 char *path = NULL;
2534 struct cache_sb *sb;
2535 struct cache_sb_disk *sb_disk;
2536 struct block_device *bdev;
2537 ssize_t ret;
2538 bool async_registration = false;
2539
2540#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
2541 async_registration = true;
2542#endif
2543
2544 ret = -EBUSY;
2545 err = "failed to reference bcache module";
2546 if (!try_module_get(THIS_MODULE))
2547 goto out;
2548
	/* make sure a concurrent bcache_reboot() setting bcache_is_reboot is seen */
2550 smp_mb();
2551 err = "bcache is in reboot";
2552 if (bcache_is_reboot)
2553 goto out_module_put;
2554
2555 ret = -ENOMEM;
2556 err = "cannot allocate memory";
2557 path = kstrndup(buffer, size, GFP_KERNEL);
2558 if (!path)
2559 goto out_module_put;
2560
2561 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2562 if (!sb)
2563 goto out_free_path;
2564
2565 ret = -EINVAL;
2566 err = "failed to open device";
2567 bdev = blkdev_get_by_path(strim(path),
2568 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2569 sb);
2570 if (IS_ERR(bdev)) {
2571 if (bdev == ERR_PTR(-EBUSY)) {
2572 dev_t dev;
2573
2574 mutex_lock(&bch_register_lock);
2575 if (lookup_bdev(strim(path), &dev) == 0 &&
2576 bch_is_open(dev))
2577 err = "device already registered";
2578 else
2579 err = "device busy";
2580 mutex_unlock(&bch_register_lock);
2581 if (attr == &ksysfs_register_quiet)
2582 goto done;
2583 }
2584 goto out_free_sb;
2585 }
2586
2587 err = "failed to set blocksize";
2588 if (set_blocksize(bdev, 4096))
2589 goto out_blkdev_put;
2590
2591 err = read_super(sb, bdev, &sb_disk);
2592 if (err)
2593 goto out_blkdev_put;
2594
2595 err = "failed to register device";
2596
2597 if (async_registration) {
		/* register asynchronously from a delayed worker */
2599 struct async_reg_args *args =
2600 kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
2601
2602 if (!args) {
2603 ret = -ENOMEM;
2604 err = "cannot allocate memory";
2605 goto out_put_sb_page;
2606 }
2607
2608 args->path = path;
2609 args->sb = sb;
2610 args->sb_disk = sb_disk;
2611 args->bdev = bdev;
2612 register_device_async(args);
2613
2614 goto async_done;
2615 }
2616
2617 if (SB_IS_BDEV(sb)) {
2618 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2619
		if (!dc) {
			ret = -ENOMEM;
			goto out_put_sb_page;
		}
2622
2623 mutex_lock(&bch_register_lock);
2624 ret = register_bdev(sb, sb_disk, bdev, dc);
2625 mutex_unlock(&bch_register_lock);
2626
2627 if (ret < 0)
2628 goto out_free_sb;
2629 } else {
2630 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2631
		if (!ca) {
			ret = -ENOMEM;
			goto out_put_sb_page;
		}
2634
		/* blkdev_put() will be called in bch_cache_release() */
2636 if (register_cache(sb, sb_disk, bdev, ca) != 0)
2637 goto out_free_sb;
2638 }
2639
2640done:
2641 kfree(sb);
2642 kfree(path);
2643 module_put(THIS_MODULE);
2644async_done:
2645 return size;
2646
2647out_put_sb_page:
2648 put_page(virt_to_page(sb_disk));
2649out_blkdev_put:
2650 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2651out_free_sb:
2652 kfree(sb);
2653out_free_path:
2654 kfree(path);
2655 path = NULL;
2656out_module_put:
2657 module_put(THIS_MODULE);
2658out:
	pr_info("error %s: %s\n", path ? path : "", err);
2660 return ret;
2661}
2662
2663
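/* A backing device still waiting for its cache set to be registered */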
2664struct pdev {
2665 struct list_head list;
2666 struct cached_dev *dc;
2667};
2668
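/*
 * Handler for writes to the sysfs pendings_cleanup file: stop backing
 * devices whose cache set has not been registered.
 */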
2669static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2670 struct kobj_attribute *attr,
2671 const char *buffer,
2672 size_t size)
2673{
2674 LIST_HEAD(pending_devs);
2675 ssize_t ret = size;
2676 struct cached_dev *dc, *tdc;
2677 struct pdev *pdev, *tpdev;
2678 struct cache_set *c, *tc;
2679
2680 mutex_lock(&bch_register_lock);
2681 list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
2682 pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
2683 if (!pdev)
2684 break;
2685 pdev->dc = dc;
2686 list_add(&pdev->list, &pending_devs);
2687 }
2688
2689 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2690 char *pdev_set_uuid = pdev->dc->sb.set_uuid;
2691 list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2692 char *set_uuid = c->set_uuid;
2693
2694 if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
2695 list_del(&pdev->list);
2696 kfree(pdev);
2697 break;
2698 }
2699 }
2700 }
2701 mutex_unlock(&bch_register_lock);
2702
2703 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2704 pr_info("delete pdev %p\n", pdev);
2705 list_del(&pdev->list);
2706 bcache_device_stop(&pdev->dc->disk);
2707 kfree(pdev);
2708 }
2709
2710 return ret;
2711}
2712
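/*
 * Reboot notifier: on shutdown, halt or power-off, stop every cache set and
 * backing device and wait up to 10 seconds for them to finish.
 */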
2713static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2714{
2715 if (bcache_is_reboot)
2716 return NOTIFY_DONE;
2717
2718 if (code == SYS_DOWN ||
2719 code == SYS_HALT ||
2720 code == SYS_POWER_OFF) {
2721 DEFINE_WAIT(wait);
2722 unsigned long start = jiffies;
2723 bool stopped = false;
2724
2725 struct cache_set *c, *tc;
2726 struct cached_dev *dc, *tdc;
2727
2728 mutex_lock(&bch_register_lock);
2729
2730 if (bcache_is_reboot)
2731 goto out;
2732
		/* new registrations are rejected from now on */
2734 bcache_is_reboot = true;
2735
		/*
		 * Make sure registration callers running on other CPUs see
		 * bcache_is_reboot == true before we continue.
		 */
2739 smp_mb();
2740
2741 if (list_empty(&bch_cache_sets) &&
2742 list_empty(&uncached_devices))
2743 goto out;
2744
2745 mutex_unlock(&bch_register_lock);
2746
2747 pr_info("Stopping all devices:\n");
2748
		/*
		 * bch_register_lock is deliberately not held around
		 * bch_cache_set_stop() and bcache_device_stop(): both stopping
		 * paths take bch_register_lock themselves, so holding it here
		 * could deadlock during reboot.
		 *
		 * This is safe because bcache_is_reboot is already true:
		 * register_bcache() rejects new registrations, and
		 * bcache_reboot() cannot be re-entered by another thread, so
		 * there is no race on taking or releasing bch_register_lock.
		 */
2763 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2764 bch_cache_set_stop(c);
2765
2766 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2767 bcache_device_stop(&dc->disk);
2768
		/*
		 * Give kthreads and kworkers an early chance to stop
		 * themselves before we start waiting below.
		 */
2774 schedule();
2775
		/* wait up to 10 seconds for all devices to be released */
2777 while (1) {
2778 long timeout = start + 10 * HZ - jiffies;
2779
2780 mutex_lock(&bch_register_lock);
2781 stopped = list_empty(&bch_cache_sets) &&
2782 list_empty(&uncached_devices);
2783
2784 if (timeout < 0 || stopped)
2785 break;
2786
2787 prepare_to_wait(&unregister_wait, &wait,
2788 TASK_UNINTERRUPTIBLE);
2789
2790 mutex_unlock(&bch_register_lock);
2791 schedule_timeout(timeout);
2792 }
2793
2794 finish_wait(&unregister_wait, &wait);
2795
2796 if (stopped)
2797 pr_info("All devices stopped\n");
2798 else
2799 pr_notice("Timeout waiting for devices to be closed\n");
2800out:
2801 mutex_unlock(&bch_register_lock);
2802 }
2803
2804 return NOTIFY_DONE;
2805}
2806
2807static struct notifier_block reboot = {
2808 .notifier_call = bcache_reboot,
2809 .priority = INT_MAX,
2810};
2811
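/* Tear down, in roughly reverse order, everything set up by bcache_init() */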
2812static void bcache_exit(void)
2813{
2814 bch_debug_exit();
2815 bch_request_exit();
2816 if (bcache_kobj)
2817 kobject_put(bcache_kobj);
2818 if (bcache_wq)
2819 destroy_workqueue(bcache_wq);
2820 if (bch_journal_wq)
2821 destroy_workqueue(bch_journal_wq);
2822 if (bch_flush_wq)
2823 destroy_workqueue(bch_flush_wq);
2824 bch_btree_exit();
2825
2826 if (bcache_major)
2827 unregister_blkdev(bcache_major, "bcache");
2828 unregister_reboot_notifier(&reboot);
2829 mutex_destroy(&bch_register_lock);
2830}
2831
/* Check and adjust the writeback cutoff module parameters */
2833static void check_module_parameters(void)
2834{
2835 if (bch_cutoff_writeback_sync == 0)
2836 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2837 else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2838 pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
2839 bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2840 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2841 }
2842
2843 if (bch_cutoff_writeback == 0)
2844 bch_cutoff_writeback = CUTOFF_WRITEBACK;
2845 else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2846 pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
2847 bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2848 bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2849 }
2850
2851 if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2852 pr_warn("set bch_cutoff_writeback (%u) to %u\n",
2853 bch_cutoff_writeback, bch_cutoff_writeback_sync);
2854 bch_cutoff_writeback = bch_cutoff_writeback_sync;
2855 }
2856}
2857
2858static int __init bcache_init(void)
2859{
2860 static const struct attribute *files[] = {
2861 &ksysfs_register.attr,
2862 &ksysfs_register_quiet.attr,
2863 &ksysfs_pendings_cleanup.attr,
2864 NULL
2865 };
2866
2867 check_module_parameters();
2868
2869 mutex_init(&bch_register_lock);
2870 init_waitqueue_head(&unregister_wait);
2871 register_reboot_notifier(&reboot);
2872
2873 bcache_major = register_blkdev(0, "bcache");
2874 if (bcache_major < 0) {
2875 unregister_reboot_notifier(&reboot);
2876 mutex_destroy(&bch_register_lock);
2877 return bcache_major;
2878 }
2879
2880 if (bch_btree_init())
2881 goto err;
2882
2883 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2884 if (!bcache_wq)
2885 goto err;
2886
	/*
	 * bch_flush_wq deliberately omits WQ_MEM_RECLAIM: the flush work used
	 * to run on system_wq, which does no memory reclaim either, and
	 * WQ_MEM_RECLAIM here has been seen to cost throughput and boot time.
	 * A dedicated queue is still used so flushes do not congest system_wq.
	 */
2896 bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
2897 if (!bch_flush_wq)
2898 goto err;
2899
2900 bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2901 if (!bch_journal_wq)
2902 goto err;
2903
2904 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2905 if (!bcache_kobj)
2906 goto err;
2907
2908 if (bch_request_init() ||
2909 sysfs_create_files(bcache_kobj, files))
2910 goto err;
2911
2912 bch_debug_init();
2913 closure_debug_init();
2914
2915 bcache_is_reboot = false;
2916
2917 return 0;
2918err:
2919 bcache_exit();
2920 return -ENOMEM;
2921}
2922
/* Module hooks */
2926module_exit(bcache_exit);
2927module_init(bcache_init);
2928
2929module_param(bch_cutoff_writeback, uint, 0);
2930MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2931
2932module_param(bch_cutoff_writeback_sync, uint, 0);
2933MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2934
2935MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2936MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2937MODULE_LICENSE("GPL");
2938