// SPDX-License-Identifier: GPL-2.0
/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"
#include "features.h"

#include <linux/blkdev.h>
#include <linux/pagemap.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/workqueue.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>
29
30unsigned int bch_cutoff_writeback;
31unsigned int bch_cutoff_writeback_sync;
32
33static const char bcache_magic[] = {
34 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
35 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
36};
37
38static const char invalid_uuid[] = {
39 0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
40 0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
41};
42
43static struct kobject *bcache_kobj;
44struct mutex bch_register_lock;
45bool bcache_is_reboot;
46LIST_HEAD(bch_cache_sets);
47static LIST_HEAD(uncached_devices);
48
49static int bcache_major;
50static DEFINE_IDA(bcache_device_idx);
51static wait_queue_head_t unregister_wait;
52struct workqueue_struct *bcache_wq;
53struct workqueue_struct *bch_flush_wq;
54struct workqueue_struct *bch_journal_wq;
55
56
#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)
/* limitation of partitions number on single bcache device */
#define BCACHE_MINORS 128
/* limitation of bcache devices number on single system */
#define BCACHE_DEVICE_IDX_MAX ((1U << MINORBITS)/BCACHE_MINORS)
62
63
64
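/*
 * Decode the bucket size from the on-disk superblock. Newer formats
 * either store it as a power of two (the "large bucket" feature) or
 * carry the high 16 bits in a separate field (the obsoleted variant).
 */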
65static unsigned int get_bucket_size(struct cache_sb *sb, struct cache_sb_disk *s)
66{
67 unsigned int bucket_size = le16_to_cpu(s->bucket_size);
68
69 if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
70 if (bch_has_feature_large_bucket(sb)) {
71 unsigned int max, order;
72
73 max = sizeof(unsigned int) * BITS_PER_BYTE - 1;
74 order = le16_to_cpu(s->bucket_size);
 /*
  * The on-disk value is the log2 of the bucket size in sectors;
  * warn (but carry on) if it would overflow bucket_size.
  */
79 if (order > max)
80 pr_err("Bucket size (1 << %u) overflows\n",
81 order);
82 bucket_size = 1 << order;
83 } else if (bch_has_feature_obso_large_bucket(sb)) {
84 bucket_size +=
85 le16_to_cpu(s->obso_bucket_size_hi) << 16;
86 }
87 }
88
89 return bucket_size;
90}
91
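/*
 * Validate the superblock fields shared by all cache-device versions;
 * called from read_super() once the common header has been decoded.
 */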
92static const char *read_super_common(struct cache_sb *sb, struct block_device *bdev,
93 struct cache_sb_disk *s)
94{
95 const char *err;
96 unsigned int i;
97
 sb->first_bucket = le16_to_cpu(s->first_bucket);
99 sb->nbuckets = le64_to_cpu(s->nbuckets);
100 sb->bucket_size = get_bucket_size(sb, s);
101
102 sb->nr_in_set = le16_to_cpu(s->nr_in_set);
103 sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);
104
105 err = "Too many journal buckets";
106 if (sb->keys > SB_JOURNAL_BUCKETS)
107 goto err;
108
109 err = "Too many buckets";
110 if (sb->nbuckets > LONG_MAX)
111 goto err;
112
113 err = "Not enough buckets";
114 if (sb->nbuckets < 1 << 7)
115 goto err;
116
117 err = "Bad block size (not power of 2)";
118 if (!is_power_of_2(sb->block_size))
119 goto err;
120
121 err = "Bad block size (larger than page size)";
122 if (sb->block_size > PAGE_SECTORS)
123 goto err;
124
125 err = "Bad bucket size (not power of 2)";
126 if (!is_power_of_2(sb->bucket_size))
127 goto err;
128
129 err = "Bad bucket size (smaller than page size)";
130 if (sb->bucket_size < PAGE_SECTORS)
131 goto err;
132
133 err = "Invalid superblock: device too small";
134 if (get_capacity(bdev->bd_disk) <
135 sb->bucket_size * sb->nbuckets)
136 goto err;
137
138 err = "Bad UUID";
139 if (bch_is_zero(sb->set_uuid, 16))
140 goto err;
141
142 err = "Bad cache device number in set";
143 if (!sb->nr_in_set ||
144 sb->nr_in_set <= sb->nr_this_dev ||
145 sb->nr_in_set > MAX_CACHES_PER_SET)
146 goto err;
147
148 err = "Journal buckets not sequential";
149 for (i = 0; i < sb->keys; i++)
150 if (sb->d[i] != sb->first_bucket + i)
151 goto err;
152
153 err = "Too many journal buckets";
154 if (sb->first_bucket + sb->keys > sb->nbuckets)
155 goto err;
156
157 err = "Invalid superblock: first bucket comes before end of super";
158 if (sb->first_bucket * sb->bucket_size < 16)
159 goto err;
160
161 err = NULL;
162err:
163 return err;
164}
165
166
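/*
 * Read the superblock from @bdev and sanity-check it. Returns an error
 * string on failure, NULL on success; on success *res points at the
 * on-disk superblock and the backing page stays pinned for the caller.
 */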
167static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
168 struct cache_sb_disk **res)
169{
170 const char *err;
171 struct cache_sb_disk *s;
172 struct page *page;
173 unsigned int i;
174
175 page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
176 SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
177 if (IS_ERR(page))
178 return "IO error";
179 s = page_address(page) + offset_in_page(SB_OFFSET);
180
181 sb->offset = le64_to_cpu(s->offset);
182 sb->version = le64_to_cpu(s->version);
183
184 memcpy(sb->magic, s->magic, 16);
185 memcpy(sb->uuid, s->uuid, 16);
186 memcpy(sb->set_uuid, s->set_uuid, 16);
187 memcpy(sb->label, s->label, SB_LABEL_SIZE);
188
189 sb->flags = le64_to_cpu(s->flags);
190 sb->seq = le64_to_cpu(s->seq);
191 sb->last_mount = le32_to_cpu(s->last_mount);
192 sb->keys = le16_to_cpu(s->keys);
193
194 for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
195 sb->d[i] = le64_to_cpu(s->d[i]);
196
197 pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u\n",
198 sb->version, sb->flags, sb->seq, sb->keys);
199
200 err = "Not a bcache superblock (bad offset)";
201 if (sb->offset != SB_SECTOR)
202 goto err;
203
204 err = "Not a bcache superblock (bad magic)";
205 if (memcmp(sb->magic, bcache_magic, 16))
206 goto err;
207
208 err = "Bad checksum";
209 if (s->csum != csum_set(s))
210 goto err;
211
212 err = "Bad UUID";
213 if (bch_is_zero(sb->uuid, 16))
214 goto err;
215
216 sb->block_size = le16_to_cpu(s->block_size);
217
218 err = "Superblock block size smaller than device block size";
219 if (sb->block_size << 9 < bdev_logical_block_size(bdev))
220 goto err;
221
222 switch (sb->version) {
223 case BCACHE_SB_VERSION_BDEV:
224 sb->data_offset = BDEV_DATA_START_DEFAULT;
225 break;
226 case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
227 case BCACHE_SB_VERSION_BDEV_WITH_FEATURES:
228 sb->data_offset = le64_to_cpu(s->data_offset);
229
230 err = "Bad data offset";
231 if (sb->data_offset < BDEV_DATA_START_DEFAULT)
232 goto err;
233
234 break;
235 case BCACHE_SB_VERSION_CDEV:
236 case BCACHE_SB_VERSION_CDEV_WITH_UUID:
237 err = read_super_common(sb, bdev, s);
238 if (err)
239 goto err;
240 break;
241 case BCACHE_SB_VERSION_CDEV_WITH_FEATURES:
 /*
  * Feature bits are needed in read_super_common(),
  * convert them firstly.
  */
246 sb->feature_compat = le64_to_cpu(s->feature_compat);
247 sb->feature_incompat = le64_to_cpu(s->feature_incompat);
248 sb->feature_ro_compat = le64_to_cpu(s->feature_ro_compat);
249
 /* Reject feature bits this kernel does not know about */
251 err = "Unsupported compatible feature found";
252 if (bch_has_unknown_compat_features(sb))
253 goto err;
254
255 err = "Unsupported read-only compatible feature found";
256 if (bch_has_unknown_ro_compat_features(sb))
257 goto err;
258
259 err = "Unsupported incompatible feature found";
260 if (bch_has_unknown_incompat_features(sb))
261 goto err;
262
263 err = read_super_common(sb, bdev, s);
264 if (err)
265 goto err;
266 break;
267 default:
268 err = "Unsupported superblock version";
269 goto err;
270 }
271
272 sb->last_mount = (u32)ktime_get_real_seconds();
273 *res = s;
274 return NULL;
275err:
276 put_page(page);
277 return err;
278}
279
280static void write_bdev_super_endio(struct bio *bio)
281{
282 struct cached_dev *dc = bio->bi_private;
283
284 if (bio->bi_status)
285 bch_count_backing_io_errors(dc, bio);
286
287 closure_put(&dc->sb_write);
288}
289
290static void __write_super(struct cache_sb *sb, struct cache_sb_disk *out,
291 struct bio *bio)
292{
293 unsigned int i;
294
295 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META;
296 bio->bi_iter.bi_sector = SB_SECTOR;
297 __bio_add_page(bio, virt_to_page(out), SB_SIZE,
298 offset_in_page(out));
299
300 out->offset = cpu_to_le64(sb->offset);
301
302 memcpy(out->uuid, sb->uuid, 16);
303 memcpy(out->set_uuid, sb->set_uuid, 16);
304 memcpy(out->label, sb->label, SB_LABEL_SIZE);
305
306 out->flags = cpu_to_le64(sb->flags);
307 out->seq = cpu_to_le64(sb->seq);
308
309 out->last_mount = cpu_to_le32(sb->last_mount);
310 out->first_bucket = cpu_to_le16(sb->first_bucket);
311 out->keys = cpu_to_le16(sb->keys);
312
313 for (i = 0; i < sb->keys; i++)
314 out->d[i] = cpu_to_le64(sb->d[i]);
315
316 if (sb->version >= BCACHE_SB_VERSION_CDEV_WITH_FEATURES) {
317 out->feature_compat = cpu_to_le64(sb->feature_compat);
318 out->feature_incompat = cpu_to_le64(sb->feature_incompat);
319 out->feature_ro_compat = cpu_to_le64(sb->feature_ro_compat);
320 }
321
322 out->version = cpu_to_le64(sb->version);
323 out->csum = csum_set(out);
324
325 pr_debug("ver %llu, flags %llu, seq %llu\n",
326 sb->version, sb->flags, sb->seq);
327
328 submit_bio(bio);
329}
330
331static void bch_write_bdev_super_unlock(struct closure *cl)
332{
333 struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);
334
335 up(&dc->sb_write_mutex);
336}
337
338void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
339{
340 struct closure *cl = &dc->sb_write;
341 struct bio *bio = &dc->sb_bio;
342
343 down(&dc->sb_write_mutex);
344 closure_init(cl, parent);
345
346 bio_init(bio, dc->sb_bv, 1);
347 bio_set_dev(bio, dc->bdev);
348 bio->bi_end_io = write_bdev_super_endio;
349 bio->bi_private = dc;
350
351 closure_get(cl);
352
353 __write_super(&dc->sb, dc->sb_disk, bio);
354
355 closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
356}
357
358static void write_super_endio(struct bio *bio)
359{
360 struct cache *ca = bio->bi_private;
361
 /* is_read = 0 */
363 bch_count_io_errors(ca, bio->bi_status, 0,
364 "writing superblock");
365 closure_put(&ca->set->sb_write);
366}
367
368static void bcache_write_super_unlock(struct closure *cl)
369{
370 struct cache_set *c = container_of(cl, struct cache_set, sb_write);
371
372 up(&c->sb_write_mutex);
373}
374
375void bcache_write_super(struct cache_set *c)
376{
377 struct closure *cl = &c->sb_write;
378 struct cache *ca = c->cache;
379 struct bio *bio = &ca->sb_bio;
380 unsigned int version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
381
382 down(&c->sb_write_mutex);
383 closure_init(cl, &c->cl);
384
385 ca->sb.seq++;
386
387 if (ca->sb.version < version)
388 ca->sb.version = version;
389
390 bio_init(bio, ca->sb_bv, 1);
391 bio_set_dev(bio, ca->bdev);
392 bio->bi_end_io = write_super_endio;
393 bio->bi_private = ca;
394
395 closure_get(cl);
396 __write_super(&ca->sb, ca->sb_disk, bio);
397
398 closure_return_with_destructor(cl, bcache_write_super_unlock);
399}
400
/* UUID io */
402
403static void uuid_endio(struct bio *bio)
404{
405 struct closure *cl = bio->bi_private;
406 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
407
408 cache_set_err_on(bio->bi_status, c, "accessing uuids");
409 bch_bbio_free(bio, c);
410 closure_put(cl);
411}
412
413static void uuid_io_unlock(struct closure *cl)
414{
415 struct cache_set *c = container_of(cl, struct cache_set, uuid_write);
416
417 up(&c->uuid_write_mutex);
418}
419
420static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
421 struct bkey *k, struct closure *parent)
422{
423 struct closure *cl = &c->uuid_write;
424 struct uuid_entry *u;
425 unsigned int i;
426 char buf[80];
427
428 BUG_ON(!parent);
429 down(&c->uuid_write_mutex);
430 closure_init(cl, parent);
431
432 for (i = 0; i < KEY_PTRS(k); i++) {
433 struct bio *bio = bch_bbio_alloc(c);
434
435 bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
436 bio->bi_iter.bi_size = KEY_SIZE(k) << 9;
437
438 bio->bi_end_io = uuid_endio;
439 bio->bi_private = cl;
440 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
441 bch_bio_map(bio, c->uuids);
442
443 bch_submit_bbio(bio, c, k, i);
444
445 if (op != REQ_OP_WRITE)
446 break;
447 }
448
449 bch_extent_to_text(buf, sizeof(buf), k);
450 pr_debug("%s UUIDs at %s\n", op == REQ_OP_WRITE ? "wrote" : "read", buf);
451
452 for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
453 if (!bch_is_zero(u->uuid, 16))
454 pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u\n",
455 u - c->uuids, u->uuid, u->label,
456 u->first_reg, u->last_reg, u->invalidated);
457
458 closure_return_with_destructor(cl, uuid_io_unlock);
459}
460
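/*
 * Load the uuid bucket referenced by the journal; for journals older
 * than BCACHE_JSET_VERSION_UUIDv1 the smaller on-disk entries are
 * converted to struct uuid_entry in place.
 */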
461static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
462{
463 struct bkey *k = &j->uuid_bucket;
464
465 if (__bch_btree_ptr_invalid(c, k))
466 return "bad uuid pointer";
467
468 bkey_copy(&c->uuid_bucket, k);
469 uuid_io(c, REQ_OP_READ, 0, k, cl);
470
471 if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
472 struct uuid_entry_v0 *u0 = (void *) c->uuids;
473 struct uuid_entry *u1 = (void *) c->uuids;
474 int i;
475
476 closure_sync(cl);
477
 /*
  * Since the new uuid entry is bigger than the old, we have to
  * convert starting at the highest memory address and work down
  * in order to do it in place
  */
484 for (i = c->nr_uuids - 1;
485 i >= 0;
486 --i) {
487 memcpy(u1[i].uuid, u0[i].uuid, 16);
488 memcpy(u1[i].label, u0[i].label, 32);
489
490 u1[i].first_reg = u0[i].first_reg;
491 u1[i].last_reg = u0[i].last_reg;
492 u1[i].invalidated = u0[i].invalidated;
493
494 u1[i].flags = 0;
495 u1[i].sectors = 0;
496 }
497 }
498
499 return NULL;
500}
501
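/*
 * Allocate a fresh bucket and write out the in-memory uuid entries;
 * the caller must hold bch_register_lock.
 */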
502static int __uuid_write(struct cache_set *c)
503{
504 BKEY_PADDED(key) k;
505 struct closure cl;
506 struct cache *ca = c->cache;
507 unsigned int size;
508
509 closure_init_stack(&cl);
510 lockdep_assert_held(&bch_register_lock);
511
512 if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, true))
513 return 1;
514
515 size = meta_bucket_pages(&ca->sb) * PAGE_SECTORS;
516 SET_KEY_SIZE(&k.key, size);
517 uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
518 closure_sync(&cl);
519
 /* Only one bucket used for uuid write */
521 atomic_long_add(ca->sb.bucket_size, &ca->meta_sectors_written);
522
523 bkey_copy(&c->uuid_bucket, &k.key);
524 bkey_put(c, &k.key);
525 return 0;
526}
527
528int bch_uuid_write(struct cache_set *c)
529{
530 int ret = __uuid_write(c);
531
532 if (!ret)
533 bch_journal_meta(c, NULL);
534
535 return ret;
536}
537
538static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
539{
540 struct uuid_entry *u;
541
542 for (u = c->uuids;
543 u < c->uuids + c->nr_uuids; u++)
544 if (!memcmp(u->uuid, uuid, 16))
545 return u;
546
547 return NULL;
548}
549
550static struct uuid_entry *uuid_find_empty(struct cache_set *c)
551{
552 static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
553
554 return uuid_find(c, zero_uuid);
555}

/*
 * Bucket priorities/gens:
 *
 * For each bucket we store on disk an 8 bit gen and a 16 bit priority.
 * The gen is used to detect stale pointers (see alloc.c); the priority
 * implements an LRU over the cache's buckets.
 *
 * Priorities and gens are packed into dedicated prio buckets, as many
 * as are needed to hold one entry per data bucket. The prio buckets
 * form a list: the journal points at the first one, each bucket points
 * at the next (p->next_bucket), and every bucket carries a magic
 * number and a checksum so prio_read() can validate the chain.
 */

584static void prio_endio(struct bio *bio)
585{
586 struct cache *ca = bio->bi_private;
587
588 cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
589 bch_bbio_free(bio, ca->set);
590 closure_put(&ca->prio);
591}
592
593static void prio_io(struct cache *ca, uint64_t bucket, int op,
594 unsigned long op_flags)
595{
596 struct closure *cl = &ca->prio;
597 struct bio *bio = bch_bbio_alloc(ca->set);
598
599 closure_init_stack(cl);
600
601 bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
602 bio_set_dev(bio, ca->bdev);
603 bio->bi_iter.bi_size = meta_bucket_bytes(&ca->sb);
604
605 bio->bi_end_io = prio_endio;
606 bio->bi_private = ca;
607 bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
608 bch_bio_map(bio, ca->disk_buckets);
609
610 closure_bio_submit(ca->set, bio, &ca->prio);
611 closure_sync(cl);
612}
613
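/*
 * Write the current bucket priorities and gens out to freshly
 * allocated prio buckets and journal the update. In the non-blocking
 * case (!wait) this fails with -ENOMEM if there are not enough free
 * buckets to do so.
 */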
614int bch_prio_write(struct cache *ca, bool wait)
615{
616 int i;
617 struct bucket *b;
618 struct closure cl;
619
620 pr_debug("free_prio=%zu, free_none=%zu, free_inc=%zu\n",
621 fifo_used(&ca->free[RESERVE_PRIO]),
622 fifo_used(&ca->free[RESERVE_NONE]),
623 fifo_used(&ca->free_inc));
624
 /*
  * Pre-check if there are enough free buckets. In the non-blocking
  * scenario it's better to fail early rather than starting to allocate
  * buckets and do a cleanup later in case of failure.
  */
630 if (!wait) {
631 size_t avail = fifo_used(&ca->free[RESERVE_PRIO]) +
632 fifo_used(&ca->free[RESERVE_NONE]);
633 if (prio_buckets(ca) > avail)
634 return -ENOMEM;
635 }
636
637 closure_init_stack(&cl);
638
639 lockdep_assert_held(&ca->set->bucket_lock);
640
641 ca->disk_buckets->seq++;
642
643 atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
644 &ca->meta_sectors_written);
645
646 for (i = prio_buckets(ca) - 1; i >= 0; --i) {
647 long bucket;
648 struct prio_set *p = ca->disk_buckets;
649 struct bucket_disk *d = p->data;
650 struct bucket_disk *end = d + prios_per_bucket(ca);
651
652 for (b = ca->buckets + i * prios_per_bucket(ca);
653 b < ca->buckets + ca->sb.nbuckets && d < end;
654 b++, d++) {
655 d->prio = cpu_to_le16(b->prio);
656 d->gen = b->gen;
657 }
658
659 p->next_bucket = ca->prio_buckets[i + 1];
660 p->magic = pset_magic(&ca->sb);
661 p->csum = bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8);
662
663 bucket = bch_bucket_alloc(ca, RESERVE_PRIO, wait);
664 BUG_ON(bucket == -1);
665
666 mutex_unlock(&ca->set->bucket_lock);
667 prio_io(ca, bucket, REQ_OP_WRITE, 0);
668 mutex_lock(&ca->set->bucket_lock);
669
670 ca->prio_buckets[i] = bucket;
671 atomic_dec_bug(&ca->buckets[bucket].pin);
672 }
673
674 mutex_unlock(&ca->set->bucket_lock);
675
676 bch_journal_meta(ca->set, &cl);
677 closure_sync(&cl);
678
679 mutex_lock(&ca->set->bucket_lock);

 /*
  * Don't want the old priorities to get garbage collected until after we
  * finish writing the new ones, and they're journalled
  */
685 for (i = 0; i < prio_buckets(ca); i++) {
686 if (ca->prio_last_buckets[i])
687 __bch_bucket_free(ca,
688 &ca->buckets[ca->prio_last_buckets[i]]);
689
690 ca->prio_last_buckets[i] = ca->prio_buckets[i];
691 }
692 return 0;
693}
694
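/*
 * Walk the chain of prio buckets starting at @bucket and restore each
 * bucket's priority and generation; returns -EIO on a checksum or
 * magic mismatch.
 */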
695static int prio_read(struct cache *ca, uint64_t bucket)
696{
697 struct prio_set *p = ca->disk_buckets;
698 struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
699 struct bucket *b;
700 unsigned int bucket_nr = 0;
701 int ret = -EIO;
702
703 for (b = ca->buckets;
704 b < ca->buckets + ca->sb.nbuckets;
705 b++, d++) {
706 if (d == end) {
707 ca->prio_buckets[bucket_nr] = bucket;
708 ca->prio_last_buckets[bucket_nr] = bucket;
709 bucket_nr++;
710
711 prio_io(ca, bucket, REQ_OP_READ, 0);
712
713 if (p->csum !=
714 bch_crc64(&p->magic, meta_bucket_bytes(&ca->sb) - 8)) {
715 pr_warn("bad csum reading priorities\n");
716 goto out;
717 }
718
719 if (p->magic != pset_magic(&ca->sb)) {
720 pr_warn("bad magic reading priorities\n");
721 goto out;
722 }
723
724 bucket = p->next_bucket;
725 d = p->data;
726 }
727
728 b->prio = le16_to_cpu(d->prio);
729 b->gen = b->last_gc = d->gen;
730 }
731
732 ret = 0;
733out:
734 return ret;
735}
736
/* Bcache device */
738
739static int open_dev(struct block_device *b, fmode_t mode)
740{
741 struct bcache_device *d = b->bd_disk->private_data;
742
743 if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
744 return -ENXIO;
745
746 closure_get(&d->cl);
747 return 0;
748}
749
750static void release_dev(struct gendisk *b, fmode_t mode)
751{
752 struct bcache_device *d = b->private_data;
753
754 closure_put(&d->cl);
755}
756
757static int ioctl_dev(struct block_device *b, fmode_t mode,
758 unsigned int cmd, unsigned long arg)
759{
760 struct bcache_device *d = b->bd_disk->private_data;
761
762 return d->ioctl(d, mode, cmd, arg);
763}
764
765static const struct block_device_operations bcache_cached_ops = {
766 .submit_bio = cached_dev_submit_bio,
767 .open = open_dev,
768 .release = release_dev,
769 .ioctl = ioctl_dev,
770 .owner = THIS_MODULE,
771};
772
773static const struct block_device_operations bcache_flash_ops = {
774 .submit_bio = flash_dev_submit_bio,
775 .open = open_dev,
776 .release = release_dev,
777 .ioctl = ioctl_dev,
778 .owner = THIS_MODULE,
779};
780
781void bcache_device_stop(struct bcache_device *d)
782{
783 if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
 /*
  * closure_fn set to
  * - cached device: cached_dev_flush()
  * - flash dev: flash_dev_flush()
  */
789 closure_queue(&d->cl);
790}
791
792static void bcache_device_unlink(struct bcache_device *d)
793{
794 lockdep_assert_held(&bch_register_lock);
795
796 if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
797 struct cache *ca = d->c->cache;
798
799 sysfs_remove_link(&d->c->kobj, d->name);
800 sysfs_remove_link(&d->kobj, "cache");
801
802 bd_unlink_disk_holder(ca->bdev, d->disk);
803 }
804}
805
806static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
807 const char *name)
808{
809 struct cache *ca = c->cache;
810 int ret;
811
812 bd_link_disk_holder(ca->bdev, d->disk);
813
814 snprintf(d->name, BCACHEDEVNAME_SIZE,
815 "%s%u", name, d->id);
816
817 ret = sysfs_create_link(&d->kobj, &c->kobj, "cache");
818 if (ret < 0)
819 pr_err("Couldn't create device -> cache set symlink\n");
820
821 ret = sysfs_create_link(&c->kobj, &d->kobj, d->name);
822 if (ret < 0)
823 pr_err("Couldn't create cache set -> device symlink\n");
824
825 clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
826}
827
828static void bcache_device_detach(struct bcache_device *d)
829{
830 lockdep_assert_held(&bch_register_lock);
831
832 atomic_dec(&d->c->attached_dev_nr);
833
834 if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
835 struct uuid_entry *u = d->c->uuids + d->id;
836
837 SET_UUID_FLASH_ONLY(u, 0);
838 memcpy(u->uuid, invalid_uuid, 16);
839 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
840 bch_uuid_write(d->c);
841 }
842
843 bcache_device_unlink(d);
844
845 d->c->devices[d->id] = NULL;
846 closure_put(&d->c->caching);
847 d->c = NULL;
848}
849
850static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
851 unsigned int id)
852{
853 d->id = id;
854 d->c = c;
855 c->devices[id] = d;
856
857 if (id >= c->devices_max_used)
858 c->devices_max_used = id + 1;
859
860 closure_get(&c->caching);
861}
862
863static inline int first_minor_to_idx(int first_minor)
864{
865 return (first_minor/BCACHE_MINORS);
866}
867
868static inline int idx_to_first_minor(int idx)
869{
870 return (idx * BCACHE_MINORS);
871}
872
873static void bcache_device_free(struct bcache_device *d)
874{
875 struct gendisk *disk = d->disk;
876
877 lockdep_assert_held(&bch_register_lock);
878
879 if (disk)
880 pr_info("%s stopped\n", disk->disk_name);
881 else
882 pr_err("bcache device (NULL gendisk) stopped\n");
883
884 if (d->c)
885 bcache_device_detach(d);
886
887 if (disk) {
888 bool disk_added = (disk->flags & GENHD_FL_UP) != 0;
889
890 if (disk_added)
891 del_gendisk(disk);
892
893 blk_cleanup_disk(disk);
894 ida_simple_remove(&bcache_device_idx,
895 first_minor_to_idx(disk->first_minor));
896 }
897
898 bioset_exit(&d->bio_split);
899 kvfree(d->full_dirty_stripes);
900 kvfree(d->stripe_sectors_dirty);
901
902 closure_debug_destroy(&d->cl);
903}
904
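/*
 * Common initialisation for cached devices and flash-only volumes:
 * dirty-stripe bookkeeping, a device index/minor, the bio split pool
 * and the gendisk with its queue limits.
 */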
905static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
906 sector_t sectors, struct block_device *cached_bdev,
907 const struct block_device_operations *ops)
908{
909 struct request_queue *q;
910 const size_t max_stripes = min_t(size_t, INT_MAX,
911 SIZE_MAX / sizeof(atomic_t));
912 uint64_t n;
913 int idx;
914
915 if (!d->stripe_size)
916 d->stripe_size = 1 << 31;
917
918 n = DIV_ROUND_UP_ULL(sectors, d->stripe_size);
919 if (!n || n > max_stripes) {
920 pr_err("nr_stripes too large or invalid: %llu (start sector beyond end of disk?)\n",
921 n);
922 return -ENOMEM;
923 }
924 d->nr_stripes = n;
925
926 n = d->nr_stripes * sizeof(atomic_t);
927 d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
928 if (!d->stripe_sectors_dirty)
929 return -ENOMEM;
930
931 n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
932 d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
933 if (!d->full_dirty_stripes)
934 return -ENOMEM;
935
936 idx = ida_simple_get(&bcache_device_idx, 0,
937 BCACHE_DEVICE_IDX_MAX, GFP_KERNEL);
938 if (idx < 0)
939 return idx;
940
941 if (bioset_init(&d->bio_split, 4, offsetof(struct bbio, bio),
942 BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
943 goto err;
944
945 d->disk = blk_alloc_disk(NUMA_NO_NODE);
946 if (!d->disk)
947 goto err;
948
949 set_capacity(d->disk, sectors);
950 snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", idx);
951
952 d->disk->major = bcache_major;
953 d->disk->first_minor = idx_to_first_minor(idx);
954 d->disk->minors = BCACHE_MINORS;
955 d->disk->fops = ops;
956 d->disk->private_data = d;
957
958 q = d->disk->queue;
959 q->limits.max_hw_sectors = UINT_MAX;
960 q->limits.max_sectors = UINT_MAX;
961 q->limits.max_segment_size = UINT_MAX;
962 q->limits.max_segments = BIO_MAX_VECS;
963 blk_queue_max_discard_sectors(q, UINT_MAX);
964 q->limits.discard_granularity = 512;
965 q->limits.io_min = block_size;
966 q->limits.logical_block_size = block_size;
967 q->limits.physical_block_size = block_size;
968
969 if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
970
971
972
973
974 pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
975 d->disk->disk_name, q->limits.logical_block_size,
976 PAGE_SIZE, bdev_logical_block_size(cached_bdev));
977
978
979 blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
980 }
981
982 blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
983 blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, d->disk->queue);
984 blk_queue_flag_set(QUEUE_FLAG_DISCARD, d->disk->queue);
985
986 blk_queue_write_cache(q, true, true);
987
988 return 0;
989
990err:
991 ida_simple_remove(&bcache_device_idx, idx);
992 return -ENOMEM;
993
994}
995
/* Cached device */
997
998static void calc_cached_dev_sectors(struct cache_set *c)
999{
1000 uint64_t sectors = 0;
1001 struct cached_dev *dc;
1002
1003 list_for_each_entry(dc, &c->cached_devs, list)
1004 sectors += bdev_sectors(dc->bdev);
1005
1006 c->cached_dev_sectors = sectors;
1007}
1008
1009#define BACKING_DEV_OFFLINE_TIMEOUT 5
1010static int cached_dev_status_update(void *arg)
1011{
1012 struct cached_dev *dc = arg;
1013 struct request_queue *q;
1014
 /*
  * dc->io_disable might be set elsewhere (e.g. via sysfs), so check
  * it here as a loop exit condition too.
  */
1020 while (!kthread_should_stop() && !dc->io_disable) {
1021 q = bdev_get_queue(dc->bdev);
1022 if (blk_queue_dying(q))
1023 dc->offline_seconds++;
1024 else
1025 dc->offline_seconds = 0;
1026
1027 if (dc->offline_seconds >= BACKING_DEV_OFFLINE_TIMEOUT) {
1028 pr_err("%s: device offline for %d seconds\n",
1029 dc->backing_dev_name,
1030 BACKING_DEV_OFFLINE_TIMEOUT);
1031 pr_err("%s: disable I/O request due to backing device offline\n",
1032 dc->disk.name);
1033 dc->io_disable = true;
 /* let others know earlier that io_disable is true */
1035 smp_mb();
1036 bcache_device_stop(&dc->disk);
1037 break;
1038 }
1039 schedule_timeout_interruptible(HZ);
1040 }
1041
1042 wait_for_kthread_stop();
1043 return 0;
1044}
1045
1046
1047int bch_cached_dev_run(struct cached_dev *dc)
1048{
1049 int ret = 0;
1050 struct bcache_device *d = &dc->disk;
1051 char *buf = kmemdup_nul(dc->sb.label, SB_LABEL_SIZE, GFP_KERNEL);
1052 char *env[] = {
1053 "DRIVER=bcache",
1054 kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
1055 kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf ? : ""),
1056 NULL,
1057 };
1058
1059 if (dc->io_disable) {
1060 pr_err("I/O disabled on cached dev %s\n",
1061 dc->backing_dev_name);
1062 ret = -EIO;
1063 goto out;
1064 }
1065
1066 if (atomic_xchg(&dc->running, 1)) {
1067 pr_info("cached dev %s is running already\n",
1068 dc->backing_dev_name);
1069 ret = -EBUSY;
1070 goto out;
1071 }
1072
1073 if (!d->c &&
1074 BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
1075 struct closure cl;
1076
1077 closure_init_stack(&cl);
1078
1079 SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
1080 bch_write_bdev_super(dc, &cl);
1081 closure_sync(&cl);
1082 }
1083
1084 add_disk(d->disk);
1085 bd_link_disk_holder(dc->bdev, dc->disk.disk);
 /*
  * won't show up in the uevent file, use udevadm monitor -e instead
  * only class / kset properties are persistent
  */
1090 kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
1091
1092 if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
1093 sysfs_create_link(&disk_to_dev(d->disk)->kobj,
1094 &d->kobj, "bcache")) {
1095 pr_err("Couldn't create bcache dev <-> disk sysfs symlinks\n");
1096 ret = -ENOMEM;
1097 goto out;
1098 }
1099
1100 dc->status_update_thread = kthread_run(cached_dev_status_update,
1101 dc, "bcache_status_update");
1102 if (IS_ERR(dc->status_update_thread)) {
1103 pr_warn("failed to create bcache_status_update kthread, continue to run without monitoring backing device status\n");
1104 }
1105
1106out:
1107 kfree(env[1]);
1108 kfree(env[2]);
1109 kfree(buf);
1110 return ret;
1111}

/*
 * If BCACHE_DEV_RATE_DW_RUNNING is set, the delayed work routine
 * dc->writeback_rate_update is still running. Wait until it quits
 * (BCACHE_DEV_RATE_DW_RUNNING is cleared), then cancel it. If the bit
 * is still set after time_out jiffies, give up waiting and cancel the
 * delayed work anyway.
 */
1120static void cancel_writeback_rate_update_dwork(struct cached_dev *dc)
1121{
1122 int time_out = WRITEBACK_RATE_UPDATE_SECS_MAX * HZ;
1123
1124 do {
1125 if (!test_bit(BCACHE_DEV_RATE_DW_RUNNING,
1126 &dc->disk.flags))
1127 break;
1128 time_out--;
1129 schedule_timeout_interruptible(1);
1130 } while (time_out > 0);
1131
1132 if (time_out == 0)
1133 pr_warn("give up waiting for dc->writeback_write_update to quit\n");
1134
1135 cancel_delayed_work_sync(&dc->writeback_rate_update);
1136}
1137
1138static void cached_dev_detach_finish(struct work_struct *w)
1139{
1140 struct cached_dev *dc = container_of(w, struct cached_dev, detach);
1141
1142 BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
1143 BUG_ON(refcount_read(&dc->count));
1144
1145
1146 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1147 cancel_writeback_rate_update_dwork(dc);
1148
1149 if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
1150 kthread_stop(dc->writeback_thread);
1151 dc->writeback_thread = NULL;
1152 }
1153
1154 mutex_lock(&bch_register_lock);
1155
1156 calc_cached_dev_sectors(dc->disk.c);
1157 bcache_device_detach(&dc->disk);
1158 list_move(&dc->list, &uncached_devices);
1159
1160 clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
1161 clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);
1162
1163 mutex_unlock(&bch_register_lock);
1164
1165 pr_info("Caching disabled for %s\n", dc->backing_dev_name);
1166
 /* Drop ref we took in cached_dev_detach() */
1168 closure_put(&dc->disk.cl);
1169}
1170
1171void bch_cached_dev_detach(struct cached_dev *dc)
1172{
1173 lockdep_assert_held(&bch_register_lock);
1174
1175 if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1176 return;
1177
1178 if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
1179 return;

 /*
  * Block the device from being closed and freed until we're finished
  * detaching
  */
1185 closure_get(&dc->disk.cl);
1186
1187 bch_writeback_queue(dc);
1188
1189 cached_dev_put(dc);
1190}
1191
1192int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
1193 uint8_t *set_uuid)
1194{
1195 uint32_t rtime = cpu_to_le32((u32)ktime_get_real_seconds());
1196 struct uuid_entry *u;
1197 struct cached_dev *exist_dc, *t;
1198 int ret = 0;
1199
1200 if ((set_uuid && memcmp(set_uuid, c->set_uuid, 16)) ||
1201 (!set_uuid && memcmp(dc->sb.set_uuid, c->set_uuid, 16)))
1202 return -ENOENT;
1203
1204 if (dc->disk.c) {
1205 pr_err("Can't attach %s: already attached\n",
1206 dc->backing_dev_name);
1207 return -EINVAL;
1208 }
1209
1210 if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
1211 pr_err("Can't attach %s: shutting down\n",
1212 dc->backing_dev_name);
1213 return -EINVAL;
1214 }
1215
1216 if (dc->sb.block_size < c->cache->sb.block_size) {
1217
1218 pr_err("Couldn't attach %s: block size less than set's block size\n",
1219 dc->backing_dev_name);
1220 return -EINVAL;
1221 }
1222
 /* Check whether already attached */
1224 list_for_each_entry_safe(exist_dc, t, &c->cached_devs, list) {
1225 if (!memcmp(dc->sb.uuid, exist_dc->sb.uuid, 16)) {
1226 pr_err("Tried to attach %s but duplicate UUID already attached\n",
1227 dc->backing_dev_name);
1228
1229 return -EINVAL;
1230 }
1231 }
1232
1233 u = uuid_find(c, dc->sb.uuid);
1234
1235 if (u &&
1236 (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
1237 BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
1238 memcpy(u->uuid, invalid_uuid, 16);
1239 u->invalidated = cpu_to_le32((u32)ktime_get_real_seconds());
1240 u = NULL;
1241 }
1242
1243 if (!u) {
1244 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1245 pr_err("Couldn't find uuid for %s in set\n",
1246 dc->backing_dev_name);
1247 return -ENOENT;
1248 }
1249
1250 u = uuid_find_empty(c);
1251 if (!u) {
1252 pr_err("Not caching %s, no room for UUID\n",
1253 dc->backing_dev_name);
1254 return -EINVAL;
1255 }
1256 }

 /*
  * Deadlocks since we're called via sysfs...
  * sysfs_remove_file(&dc->kobj, &sysfs_attach);
  */

1263 if (bch_is_zero(u->uuid, 16)) {
1264 struct closure cl;
1265
1266 closure_init_stack(&cl);
1267
1268 memcpy(u->uuid, dc->sb.uuid, 16);
1269 memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
1270 u->first_reg = u->last_reg = rtime;
1271 bch_uuid_write(c);
1272
1273 memcpy(dc->sb.set_uuid, c->set_uuid, 16);
1274 SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
1275
1276 bch_write_bdev_super(dc, &cl);
1277 closure_sync(&cl);
1278 } else {
1279 u->last_reg = rtime;
1280 bch_uuid_write(c);
1281 }
1282
1283 bcache_device_attach(&dc->disk, c, u - c->uuids);
1284 list_move(&dc->list, &c->cached_devs);
1285 calc_cached_dev_sectors(c);

 /*
  * dc->c must be set before dc->count != 0 - paired with the mb in
  * cached_dev_get()
  */
1291 smp_wmb();
1292 refcount_set(&dc->count, 1);
1293
 /* Block writeback thread, but spawn it */
1295 down_write(&dc->writeback_lock);
1296 if (bch_cached_dev_writeback_start(dc)) {
1297 up_write(&dc->writeback_lock);
1298 pr_err("Couldn't start writeback facilities for %s\n",
1299 dc->disk.disk->disk_name);
1300 return -ENOMEM;
1301 }
1302
1303 if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
1304 atomic_set(&dc->has_dirty, 1);
1305 bch_writeback_queue(dc);
1306 }
1307
1308 bch_sectors_dirty_init(&dc->disk);
1309
1310 ret = bch_cached_dev_run(dc);
1311 if (ret && (ret != -EBUSY)) {
1312 up_write(&dc->writeback_lock);
 /*
  * bch_register_lock is held, bcache_device_stop() is not
  * able to be directly called. The kthread and kworker
  * created previously in bch_cached_dev_writeback_start()
  * have to be stopped manually here.
  */
1319 kthread_stop(dc->writeback_thread);
1320 cancel_writeback_rate_update_dwork(dc);
1321 pr_err("Couldn't run cached device %s\n",
1322 dc->backing_dev_name);
1323 return ret;
1324 }
1325
1326 bcache_device_link(&dc->disk, c, "bdev");
1327 atomic_inc(&c->attached_dev_nr);
1328
1329 if (bch_has_feature_obso_large_bucket(&(c->cache->sb))) {
1330 pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
1331 pr_err("Please update to the latest bcache-tools to create the cache device\n");
1332 set_disk_ro(dc->disk.disk, 1);
1333 }
1334
1335
1336 up_write(&dc->writeback_lock);
1337
1338 pr_info("Caching %s as %s on set %pU\n",
1339 dc->backing_dev_name,
1340 dc->disk.disk->disk_name,
1341 dc->disk.c->set_uuid);
1342 return 0;
1343}
1344
1345
1346void bch_cached_dev_release(struct kobject *kobj)
1347{
1348 struct cached_dev *dc = container_of(kobj, struct cached_dev,
1349 disk.kobj);
1350 kfree(dc);
1351 module_put(THIS_MODULE);
1352}
1353
1354static void cached_dev_free(struct closure *cl)
1355{
1356 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1357
1358 if (test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags))
1359 cancel_writeback_rate_update_dwork(dc);
1360
1361 if (!IS_ERR_OR_NULL(dc->writeback_thread))
1362 kthread_stop(dc->writeback_thread);
1363 if (!IS_ERR_OR_NULL(dc->status_update_thread))
1364 kthread_stop(dc->status_update_thread);
1365
1366 mutex_lock(&bch_register_lock);
1367
1368 if (atomic_read(&dc->running))
1369 bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
1370 bcache_device_free(&dc->disk);
1371 list_del(&dc->list);
1372
1373 mutex_unlock(&bch_register_lock);
1374
1375 if (dc->sb_disk)
1376 put_page(virt_to_page(dc->sb_disk));
1377
1378 if (!IS_ERR_OR_NULL(dc->bdev))
1379 blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1380
1381 wake_up(&unregister_wait);
1382
1383 kobject_put(&dc->disk.kobj);
1384}
1385
1386static void cached_dev_flush(struct closure *cl)
1387{
1388 struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
1389 struct bcache_device *d = &dc->disk;
1390
1391 mutex_lock(&bch_register_lock);
1392 bcache_device_unlink(d);
1393 mutex_unlock(&bch_register_lock);
1394
1395 bch_cache_accounting_destroy(&dc->accounting);
1396 kobject_del(&d->kobj);
1397
1398 continue_at(cl, cached_dev_free, system_wq);
1399}
1400
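/*
 * Initialise the bcache_device embedded in a cached_dev along with its
 * request and writeback machinery; block_size is given in bytes.
 */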
1401static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
1402{
1403 int ret;
1404 struct io *io;
1405 struct request_queue *q = bdev_get_queue(dc->bdev);
1406
1407 __module_get(THIS_MODULE);
1408 INIT_LIST_HEAD(&dc->list);
1409 closure_init(&dc->disk.cl, NULL);
1410 set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
1411 kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
1412 INIT_WORK(&dc->detach, cached_dev_detach_finish);
1413 sema_init(&dc->sb_write_mutex, 1);
1414 INIT_LIST_HEAD(&dc->io_lru);
1415 spin_lock_init(&dc->io_lock);
1416 bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);
1417
1418 dc->sequential_cutoff = 4 << 20;
1419
1420 for (io = dc->io; io < dc->io + RECENT_IO; io++) {
1421 list_add(&io->lru, &dc->io_lru);
1422 hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
1423 }
1424
1425 dc->disk.stripe_size = q->limits.io_opt >> 9;
1426
1427 if (dc->disk.stripe_size)
1428 dc->partial_stripes_expensive =
1429 q->limits.raid_partial_stripes_expensive;
1430
1431 ret = bcache_device_init(&dc->disk, block_size,
1432 bdev_nr_sectors(dc->bdev) - dc->sb.data_offset,
1433 dc->bdev, &bcache_cached_ops);
1434 if (ret)
1435 return ret;
1436
1437 blk_queue_io_opt(dc->disk.disk->queue,
1438 max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
1439
1440 atomic_set(&dc->io_errors, 0);
1441 dc->io_disable = false;
1442 dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
1443
1444 dc->stop_when_cache_set_failed = BCH_CACHED_DEV_STOP_AUTO;
1445
1446 bch_cached_dev_request_init(dc);
1447 bch_cached_dev_writeback_init(dc);
1448 return 0;
1449}
1450
1451
1452
1453static int register_bdev(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
1454 struct block_device *bdev,
1455 struct cached_dev *dc)
1456{
1457 const char *err = "cannot allocate memory";
1458 struct cache_set *c;
1459 int ret = -ENOMEM;
1460
1461 bdevname(bdev, dc->backing_dev_name);
1462 memcpy(&dc->sb, sb, sizeof(struct cache_sb));
1463 dc->bdev = bdev;
1464 dc->bdev->bd_holder = dc;
1465 dc->sb_disk = sb_disk;
1466
1467 if (cached_dev_init(dc, sb->block_size << 9))
1468 goto err;
1469
1470 err = "error creating kobject";
1471 if (kobject_add(&dc->disk.kobj, bdev_kobj(bdev), "bcache"))
1472 goto err;
1473 if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
1474 goto err;
1475
1476 pr_info("registered backing device %s\n", dc->backing_dev_name);
1477
1478 list_add(&dc->list, &uncached_devices);
1479
1480 list_for_each_entry(c, &bch_cache_sets, list)
1481 bch_cached_dev_attach(dc, c, NULL);
1482
1483 if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
1484 BDEV_STATE(&dc->sb) == BDEV_STATE_STALE) {
1485 err = "failed to run cached device";
1486 ret = bch_cached_dev_run(dc);
1487 if (ret)
1488 goto err;
1489 }
1490
1491 return 0;
1492err:
1493 pr_notice("error %s: %s\n", dc->backing_dev_name, err);
1494 bcache_device_stop(&dc->disk);
1495 return ret;
1496}
1497
/* Flash only volumes */
1499
1500
1501void bch_flash_dev_release(struct kobject *kobj)
1502{
1503 struct bcache_device *d = container_of(kobj, struct bcache_device,
1504 kobj);
1505 kfree(d);
1506}
1507
1508static void flash_dev_free(struct closure *cl)
1509{
1510 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1511
1512 mutex_lock(&bch_register_lock);
1513 atomic_long_sub(bcache_dev_sectors_dirty(d),
1514 &d->c->flash_dev_dirty_sectors);
1515 bcache_device_free(d);
1516 mutex_unlock(&bch_register_lock);
1517 kobject_put(&d->kobj);
1518}
1519
1520static void flash_dev_flush(struct closure *cl)
1521{
1522 struct bcache_device *d = container_of(cl, struct bcache_device, cl);
1523
1524 mutex_lock(&bch_register_lock);
1525 bcache_device_unlink(d);
1526 mutex_unlock(&bch_register_lock);
1527 kobject_del(&d->kobj);
1528 continue_at(cl, flash_dev_free, system_wq);
1529}
1530
1531static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
1532{
1533 struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
1534 GFP_KERNEL);
1535 if (!d)
1536 return -ENOMEM;
1537
1538 closure_init(&d->cl, NULL);
1539 set_closure_fn(&d->cl, flash_dev_flush, system_wq);
1540
1541 kobject_init(&d->kobj, &bch_flash_dev_ktype);
1542
1543 if (bcache_device_init(d, block_bytes(c->cache), u->sectors,
1544 NULL, &bcache_flash_ops))
1545 goto err;
1546
1547 bcache_device_attach(d, c, u - c->uuids);
1548 bch_sectors_dirty_init(d);
1549 bch_flash_dev_request_init(d);
1550 add_disk(d->disk);
1551
1552 if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
1553 goto err;
1554
1555 bcache_device_link(d, c, "volume");
1556
1557 if (bch_has_feature_obso_large_bucket(&c->cache->sb)) {
1558 pr_err("The obsoleted large bucket layout is unsupported, set the bcache device into read-only\n");
1559 pr_err("Please update to the latest bcache-tools to create the cache device\n");
1560 set_disk_ro(d->disk, 1);
1561 }
1562
1563 return 0;
1564err:
1565 kobject_put(&d->kobj);
1566 return -ENOMEM;
1567}
1568
1569static int flash_devs_run(struct cache_set *c)
1570{
1571 int ret = 0;
1572 struct uuid_entry *u;
1573
1574 for (u = c->uuids;
1575 u < c->uuids + c->nr_uuids && !ret;
1576 u++)
1577 if (UUID_FLASH_ONLY(u))
1578 ret = flash_dev_run(c, u);
1579
1580 return ret;
1581}
1582
1583int bch_flash_dev_create(struct cache_set *c, uint64_t size)
1584{
1585 struct uuid_entry *u;
1586
1587 if (test_bit(CACHE_SET_STOPPING, &c->flags))
1588 return -EINTR;
1589
1590 if (!test_bit(CACHE_SET_RUNNING, &c->flags))
1591 return -EPERM;
1592
1593 u = uuid_find_empty(c);
1594 if (!u) {
1595 pr_err("Can't create volume, no room for UUID\n");
1596 return -EINVAL;
1597 }
1598
1599 get_random_bytes(u->uuid, 16);
1600 memset(u->label, 0, 32);
1601 u->first_reg = u->last_reg = cpu_to_le32((u32)ktime_get_real_seconds());
1602
1603 SET_UUID_FLASH_ONLY(u, 1);
1604 u->sectors = size >> 9;
1605
1606 bch_uuid_write(c);
1607
1608 return flash_dev_run(c, u);
1609}
1610
1611bool bch_cached_dev_error(struct cached_dev *dc)
1612{
1613 if (!dc || test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
1614 return false;
1615
1616 dc->io_disable = true;
1617
1618 smp_mb();
1619
1620 pr_err("stop %s: too many IO errors on backing device %s\n",
1621 dc->disk.disk->disk_name, dc->backing_dev_name);
1622
1623 bcache_device_stop(&dc->disk);
1624 return true;
1625}
1626
1627
1628
1629__printf(2, 3)
1630bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
1631{
1632 struct va_format vaf;
1633 va_list args;
1634
1635 if (c->on_error != ON_ERROR_PANIC &&
1636 test_bit(CACHE_SET_STOPPING, &c->flags))
1637 return false;
1638
1639 if (test_and_set_bit(CACHE_SET_IO_DISABLE, &c->flags))
1640 pr_info("CACHE_SET_IO_DISABLE already set\n");
1641
 /*
  * XXX: we can be called from atomic context (e.g. from bio
  * completion handlers via cache_set_err_on()), so don't sleep here.
  */

1647 va_start(args, fmt);
1648
1649 vaf.fmt = fmt;
1650 vaf.va = &args;
1651
1652 pr_err("error on %pU: %pV, disabling caching\n",
1653 c->set_uuid, &vaf);
1654
1655 va_end(args);
1656
1657 if (c->on_error == ON_ERROR_PANIC)
1658 panic("panic forced after error\n");
1659
1660 bch_cache_set_unregister(c);
1661 return true;
1662}
1663
1664
1665void bch_cache_set_release(struct kobject *kobj)
1666{
1667 struct cache_set *c = container_of(kobj, struct cache_set, kobj);
1668
1669 kfree(c);
1670 module_put(THIS_MODULE);
1671}
1672
1673static void cache_set_free(struct closure *cl)
1674{
1675 struct cache_set *c = container_of(cl, struct cache_set, cl);
1676 struct cache *ca;
1677
1678 debugfs_remove(c->debug);
1679
1680 bch_open_buckets_free(c);
1681 bch_btree_cache_free(c);
1682 bch_journal_free(c);
1683
1684 mutex_lock(&bch_register_lock);
1685 bch_bset_sort_state_free(&c->sort);
1686 free_pages((unsigned long) c->uuids, ilog2(meta_bucket_pages(&c->cache->sb)));
1687
1688 ca = c->cache;
1689 if (ca) {
1690 ca->set = NULL;
1691 c->cache = NULL;
1692 kobject_put(&ca->kobj);
1693 }
1694
1695
1696 if (c->moving_gc_wq)
1697 destroy_workqueue(c->moving_gc_wq);
1698 bioset_exit(&c->bio_split);
1699 mempool_exit(&c->fill_iter);
1700 mempool_exit(&c->bio_meta);
1701 mempool_exit(&c->search);
1702 kfree(c->devices);
1703
1704 list_del(&c->list);
1705 mutex_unlock(&bch_register_lock);
1706
1707 pr_info("Cache set %pU unregistered\n", c->set_uuid);
1708 wake_up(&unregister_wait);
1709
1710 closure_debug_destroy(&c->cl);
1711 kobject_put(&c->kobj);
1712}
1713
1714static void cache_set_flush(struct closure *cl)
1715{
1716 struct cache_set *c = container_of(cl, struct cache_set, caching);
1717 struct cache *ca = c->cache;
1718 struct btree *b;
1719
1720 bch_cache_accounting_destroy(&c->accounting);
1721
1722 kobject_put(&c->internal);
1723 kobject_del(&c->kobj);
1724
1725 if (!IS_ERR_OR_NULL(c->gc_thread))
1726 kthread_stop(c->gc_thread);
1727
1728 if (!IS_ERR_OR_NULL(c->root))
1729 list_add(&c->root->list, &c->btree_cache);
1730
 /*
  * Avoid flushing cached nodes if cache set is retiring
  * due to too many I/O errors detected.
  */
1735 if (!test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1736 list_for_each_entry(b, &c->btree_cache, list) {
1737 mutex_lock(&b->write_lock);
1738 if (btree_node_dirty(b))
1739 __bch_btree_node_write(b, NULL);
1740 mutex_unlock(&b->write_lock);
1741 }
1742
1743 if (ca->alloc_thread)
1744 kthread_stop(ca->alloc_thread);
1745
1746 if (c->journal.cur) {
1747 cancel_delayed_work_sync(&c->journal.work);
 /* flush last journal entry if needed */
1749 c->journal.work.work.func(&c->journal.work.work);
1750 }
1751
1752 closure_return(cl);
1753}

/*
 * This function is only called when CACHE_SET_IO_DISABLE is set, which means
 * the cache set is unregistering due to too many I/O errors. In this
 * condition, whether the bcache device is stopped depends on the
 * stop_when_cache_set_failed setting and on whether the broken cache has
 * dirty data:
 *
 * dc->stop_when_cache_set_failed    dc->has_dirty   stop bcache device
 *  BCH_CACHED_DEV_STOP_AUTO           0               NO
 *  BCH_CACHED_DEV_STOP_AUTO           1               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         0               YES
 *  BCH_CACHED_DEV_STOP_ALWAYS         1               YES
 *
 * That is, with stop_when_cache_set_failed set to "auto" via sysfs, the
 * bcache device is kept alive as long as the backing device is clean.
 */
1771static void conditional_stop_bcache_device(struct cache_set *c,
1772 struct bcache_device *d,
1773 struct cached_dev *dc)
1774{
1775 if (dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_ALWAYS) {
1776 pr_warn("stop_when_cache_set_failed of %s is \"always\", stop it for failed cache set %pU.\n",
1777 d->disk->disk_name, c->set_uuid);
1778 bcache_device_stop(d);
1779 } else if (atomic_read(&dc->has_dirty)) {
 /*
  * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
  * and dc->has_dirty is non-zero
  */
1784 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is dirty, stop it to avoid potential data corruption.\n",
1785 d->disk->disk_name);
 /*
  * There is a small window between the cache set being released
  * and the bcache device being stopped; during that window regular
  * I/O goes straight to the backing device, which in writeback
  * mode with dirty cache data could leave the backing device
  * inconsistent. Set io_disable first (with a barrier) so new I/O
  * is rejected before bcache_device_stop() runs.
  */
1797 dc->io_disable = true;
1798
1799 smp_mb();
1800 bcache_device_stop(d);
1801 } else {
 /*
  * dc->stop_when_cache_set_failed == BCH_CACHED_DEV_STOP_AUTO
  * and dc->has_dirty is zero
  */
1806 pr_warn("stop_when_cache_set_failed of %s is \"auto\" and cache is clean, keep it alive.\n",
1807 d->disk->disk_name);
1808 }
1809}
1810
1811static void __cache_set_unregister(struct closure *cl)
1812{
1813 struct cache_set *c = container_of(cl, struct cache_set, caching);
1814 struct cached_dev *dc;
1815 struct bcache_device *d;
1816 size_t i;
1817
1818 mutex_lock(&bch_register_lock);
1819
1820 for (i = 0; i < c->devices_max_used; i++) {
1821 d = c->devices[i];
1822 if (!d)
1823 continue;
1824
1825 if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
1826 test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
1827 dc = container_of(d, struct cached_dev, disk);
1828 bch_cached_dev_detach(dc);
1829 if (test_bit(CACHE_SET_IO_DISABLE, &c->flags))
1830 conditional_stop_bcache_device(c, d, dc);
1831 } else {
1832 bcache_device_stop(d);
1833 }
1834 }
1835
1836 mutex_unlock(&bch_register_lock);
1837
1838 continue_at(cl, cache_set_flush, system_wq);
1839}
1840
1841void bch_cache_set_stop(struct cache_set *c)
1842{
1843 if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
 /* closure_fn set to __cache_set_unregister() */
1845 closure_queue(&c->caching);
1846}
1847
1848void bch_cache_set_unregister(struct cache_set *c)
1849{
1850 set_bit(CACHE_SET_UNREGISTERING, &c->flags);
1851 bch_cache_set_stop(c);
1852}
1853
1854#define alloc_meta_bucket_pages(gfp, sb) \
1855 ((void *) __get_free_pages(__GFP_ZERO|__GFP_COMP|gfp, ilog2(meta_bucket_pages(sb))))
1856
1857struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
1858{
1859 int iter_size;
1860 struct cache *ca = container_of(sb, struct cache, sb);
1861 struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
1862
1863 if (!c)
1864 return NULL;
1865
1866 __module_get(THIS_MODULE);
1867 closure_init(&c->cl, NULL);
1868 set_closure_fn(&c->cl, cache_set_free, system_wq);
1869
1870 closure_init(&c->caching, &c->cl);
1871 set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
1872
1873
1874 closure_set_stopped(&c->cl);
1875 closure_put(&c->cl);
1876
1877 kobject_init(&c->kobj, &bch_cache_set_ktype);
1878 kobject_init(&c->internal, &bch_cache_set_internal_ktype);
1879
1880 bch_cache_accounting_init(&c->accounting, &c->cl);
1881
1882 memcpy(c->set_uuid, sb->set_uuid, 16);
1883
1884 c->cache = ca;
1885 c->cache->set = c;
1886 c->bucket_bits = ilog2(sb->bucket_size);
1887 c->block_bits = ilog2(sb->block_size);
1888 c->nr_uuids = meta_bucket_bytes(sb) / sizeof(struct uuid_entry);
1889 c->devices_max_used = 0;
1890 atomic_set(&c->attached_dev_nr, 0);
1891 c->btree_pages = meta_bucket_pages(sb);
1892 if (c->btree_pages > BTREE_MAX_PAGES)
1893 c->btree_pages = max_t(int, c->btree_pages / 4,
1894 BTREE_MAX_PAGES);
1895
1896 sema_init(&c->sb_write_mutex, 1);
1897 mutex_init(&c->bucket_lock);
1898 init_waitqueue_head(&c->btree_cache_wait);
1899 spin_lock_init(&c->btree_cannibalize_lock);
1900 init_waitqueue_head(&c->bucket_wait);
1901 init_waitqueue_head(&c->gc_wait);
1902 sema_init(&c->uuid_write_mutex, 1);
1903
1904 spin_lock_init(&c->btree_gc_time.lock);
1905 spin_lock_init(&c->btree_split_time.lock);
1906 spin_lock_init(&c->btree_read_time.lock);
1907
1908 bch_moving_init_cache_set(c);
1909
1910 INIT_LIST_HEAD(&c->list);
1911 INIT_LIST_HEAD(&c->cached_devs);
1912 INIT_LIST_HEAD(&c->btree_cache);
1913 INIT_LIST_HEAD(&c->btree_cache_freeable);
1914 INIT_LIST_HEAD(&c->btree_cache_freed);
1915 INIT_LIST_HEAD(&c->data_buckets);
1916
1917 iter_size = ((meta_bucket_pages(sb) * PAGE_SECTORS) / sb->block_size + 1) *
1918 sizeof(struct btree_iter_set);
1919
1920 c->devices = kcalloc(c->nr_uuids, sizeof(void *), GFP_KERNEL);
1921 if (!c->devices)
1922 goto err;
1923
1924 if (mempool_init_slab_pool(&c->search, 32, bch_search_cache))
1925 goto err;
1926
1927 if (mempool_init_kmalloc_pool(&c->bio_meta, 2,
1928 sizeof(struct bbio) +
1929 sizeof(struct bio_vec) * meta_bucket_pages(sb)))
1930 goto err;
1931
1932 if (mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size))
1933 goto err;
1934
1935 if (bioset_init(&c->bio_split, 4, offsetof(struct bbio, bio),
1936 BIOSET_NEED_RESCUER))
1937 goto err;
1938
1939 c->uuids = alloc_meta_bucket_pages(GFP_KERNEL, sb);
1940 if (!c->uuids)
1941 goto err;
1942
1943 c->moving_gc_wq = alloc_workqueue("bcache_gc", WQ_MEM_RECLAIM, 0);
1944 if (!c->moving_gc_wq)
1945 goto err;
1946
1947 if (bch_journal_alloc(c))
1948 goto err;
1949
1950 if (bch_btree_cache_alloc(c))
1951 goto err;
1952
1953 if (bch_open_buckets_alloc(c))
1954 goto err;
1955
1956 if (bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
1957 goto err;
1958
1959 c->congested_read_threshold_us = 2000;
1960 c->congested_write_threshold_us = 20000;
1961 c->error_limit = DEFAULT_IO_ERROR_LIMIT;
1962 c->idle_max_writeback_rate_enabled = 1;
1963 WARN_ON(test_and_clear_bit(CACHE_SET_IO_DISABLE, &c->flags));
1964
1965 return c;
1966err:
1967 bch_cache_set_unregister(c);
1968 return NULL;
1969}
1970
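/*
 * Bring a cache set online: either replay the existing journal and
 * btree (CACHE_SYNC set), or invalidate the old contents and create a
 * fresh root node, then start gc and attach any waiting backing
 * devices and flash-only volumes.
 */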
1971static int run_cache_set(struct cache_set *c)
1972{
1973 const char *err = "cannot allocate memory";
1974 struct cached_dev *dc, *t;
1975 struct cache *ca = c->cache;
1976 struct closure cl;
1977 LIST_HEAD(journal);
1978 struct journal_replay *l;
1979
1980 closure_init_stack(&cl);
1981
1982 c->nbuckets = ca->sb.nbuckets;
1983 set_gc_sectors(c);
1984
1985 if (CACHE_SYNC(&c->cache->sb)) {
1986 struct bkey *k;
1987 struct jset *j;
1988
1989 err = "cannot allocate memory for journal";
1990 if (bch_journal_read(c, &journal))
1991 goto err;
1992
1993 pr_debug("btree_journal_read() done\n");
1994
1995 err = "no journal entries found";
1996 if (list_empty(&journal))
1997 goto err;
1998
1999 j = &list_entry(journal.prev, struct journal_replay, list)->j;
2000
2001 err = "IO error reading priorities";
2002 if (prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]))
2003 goto err;
2004
 /*
  * If prio_read() fails it'll call cache_set_error and we'll
  * tear everything down right away, but if we perhaps checked
  * sooner we could avoid journal replay.
  */

2011 k = &j->btree_root;
2012
2013 err = "bad btree root";
2014 if (__bch_btree_ptr_invalid(c, k))
2015 goto err;
2016
2017 err = "error reading btree root";
2018 c->root = bch_btree_node_get(c, NULL, k,
2019 j->btree_level,
2020 true, NULL);
2021 if (IS_ERR_OR_NULL(c->root))
2022 goto err;
2023
2024 list_del_init(&c->root->list);
2025 rw_unlock(true, c->root);
2026
2027 err = uuid_read(c, j, &cl);
2028 if (err)
2029 goto err;
2030
2031 err = "error in recovery";
2032 if (bch_btree_check(c))
2033 goto err;
2034
2035 bch_journal_mark(c, &journal);
2036 bch_initial_gc_finish(c);
2037 pr_debug("btree_check() done\n");
2038
 /*
  * bcache_journal_next() can't happen sooner, or
  * btree_gc_finish() will give spurious errors about last_gc >
  * gc_gen - this is a hack but oh well.
  */
2044 bch_journal_next(&c->journal);
2045
2046 err = "error starting allocator thread";
2047 if (bch_cache_allocator_start(ca))
2048 goto err;
2049
 /*
  * First place it's safe to allocate: btree_check() and
  * btree_gc_finish() have to run before we have buckets to
  * allocate, and bch_bucket_alloc_set() might cause a journal
  * entry to be written so bcache_journal_next() has to be
  * called first.
  *
  * If the uuids were in the old format we have to rewrite them
  * before the next journal entry is written:
  */
2060 if (j->version < BCACHE_JSET_VERSION_UUID)
2061 __uuid_write(c);
2062
2063 err = "bcache: replay journal failed";
2064 if (bch_journal_replay(c, &journal))
2065 goto err;
2066 } else {
2067 unsigned int j;
2068
2069 pr_notice("invalidating existing data\n");
2070 ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
2071 2, SB_JOURNAL_BUCKETS);
2072
2073 for (j = 0; j < ca->sb.keys; j++)
2074 ca->sb.d[j] = ca->sb.first_bucket + j;
2075
2076 bch_initial_gc_finish(c);
2077
2078 err = "error starting allocator thread";
2079 if (bch_cache_allocator_start(ca))
2080 goto err;
2081
2082 mutex_lock(&c->bucket_lock);
2083 bch_prio_write(ca, true);
2084 mutex_unlock(&c->bucket_lock);
2085
2086 err = "cannot allocate new UUID bucket";
2087 if (__uuid_write(c))
2088 goto err;
2089
2090 err = "cannot allocate new btree root";
2091 c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
2092 if (IS_ERR_OR_NULL(c->root))
2093 goto err;
2094
2095 mutex_lock(&c->root->write_lock);
2096 bkey_copy_key(&c->root->key, &MAX_KEY);
2097 bch_btree_node_write(c->root, &cl);
2098 mutex_unlock(&c->root->write_lock);
2099
2100 bch_btree_set_root(c->root);
2101 rw_unlock(true, c->root);
2102
 /*
  * We don't want to write the first journal entry until
  * everything is set up - fortunately journal entries won't be
  * written until the SET_CACHE_SYNC() here :)
  */
2108 SET_CACHE_SYNC(&c->cache->sb, true);
2109
2110 bch_journal_next(&c->journal);
2111 bch_journal_meta(c, &cl);
2112 }
2113
2114 err = "error starting gc thread";
2115 if (bch_gc_thread_start(c))
2116 goto err;
2117
2118 closure_sync(&cl);
2119 c->cache->sb.last_mount = (u32)ktime_get_real_seconds();
2120 bcache_write_super(c);
2121
2122 if (bch_has_feature_obso_large_bucket(&c->cache->sb))
2123 pr_err("Detect obsoleted large bucket layout, all attached bcache device will be read-only\n");
2124
2125 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2126 bch_cached_dev_attach(dc, c, NULL);
2127
2128 flash_devs_run(c);
2129
2130 set_bit(CACHE_SET_RUNNING, &c->flags);
2131 return 0;
2132err:
2133 while (!list_empty(&journal)) {
2134 l = list_first_entry(&journal, struct journal_replay, list);
2135 list_del(&l->list);
2136 kfree(l);
2137 }
2138
2139 closure_sync(&cl);
2140
2141 bch_cache_set_error(c, "%s", err);
2142
2143 return -EIO;
2144}
2145
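/*
 * Add @ca to the cache set identified by its set_uuid, allocating the
 * cache set if it does not exist yet, then create the sysfs links and
 * run the set. Returns an error string or NULL on success.
 */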
2146static const char *register_cache_set(struct cache *ca)
2147{
2148 char buf[12];
2149 const char *err = "cannot allocate memory";
2150 struct cache_set *c;
2151
2152 list_for_each_entry(c, &bch_cache_sets, list)
2153 if (!memcmp(c->set_uuid, ca->sb.set_uuid, 16)) {
2154 if (c->cache)
2155 return "duplicate cache set member";
2156
2157 goto found;
2158 }
2159
2160 c = bch_cache_set_alloc(&ca->sb);
2161 if (!c)
2162 return err;
2163
2164 err = "error creating kobject";
2165 if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->set_uuid) ||
2166 kobject_add(&c->internal, &c->kobj, "internal"))
2167 goto err;
2168
2169 if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
2170 goto err;
2171
2172 bch_debug_init_cache_set(c);
2173
2174 list_add(&c->list, &bch_cache_sets);
2175found:
2176 sprintf(buf, "cache%i", ca->sb.nr_this_dev);
2177 if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
2178 sysfs_create_link(&c->kobj, &ca->kobj, buf))
2179 goto err;
2180
2181 kobject_get(&ca->kobj);
2182 ca->set = c;
2183 ca->set->cache = ca;
2184
2185 err = "failed to run cache set";
2186 if (run_cache_set(c) < 0)
2187 goto err;
2188
2189 return NULL;
2190err:
2191 bch_cache_set_unregister(c);
2192 return err;
2193}
2194
2195
/* Cache device */
2197
2198void bch_cache_release(struct kobject *kobj)
2199{
2200 struct cache *ca = container_of(kobj, struct cache, kobj);
2201 unsigned int i;
2202
2203 if (ca->set) {
2204 BUG_ON(ca->set->cache != ca);
2205 ca->set->cache = NULL;
2206 }
2207
2208 free_pages((unsigned long) ca->disk_buckets, ilog2(meta_bucket_pages(&ca->sb)));
2209 kfree(ca->prio_buckets);
2210 vfree(ca->buckets);
2211
2212 free_heap(&ca->heap);
2213 free_fifo(&ca->free_inc);
2214
2215 for (i = 0; i < RESERVE_NR; i++)
2216 free_fifo(&ca->free[i]);
2217
2218 if (ca->sb_disk)
2219 put_page(virt_to_page(ca->sb_disk));
2220
2221 if (!IS_ERR_OR_NULL(ca->bdev))
2222 blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2223
2224 kfree(ca);
2225 module_put(THIS_MODULE);
2226}
2227
2228static int cache_alloc(struct cache *ca)
2229{
2230 size_t free;
2231 size_t btree_buckets;
2232 struct bucket *b;
2233 int ret = -ENOMEM;
2234 const char *err = NULL;
2235
2236 __module_get(THIS_MODULE);
2237 kobject_init(&ca->kobj, &bch_cache_ktype);
2238
2239 bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);
2240
 /*
  * When ca->sb.njournal_buckets is not zero a journal exists, and
  * bch_journal_replay() may split btree nodes, so buckets of the
  * RESERVE_BTREE type are needed. In the worst case every journal
  * bucket holds valid entries and all keys need to be replayed, so
  * reserve as many RESERVE_BTREE buckets as there are journal buckets.
  */
2250 btree_buckets = ca->sb.njournal_buckets ?: 8;
2251 free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;
2252 if (!free) {
2253 ret = -EPERM;
2254 err = "ca->sb.nbuckets is too small";
2255 goto err_free;
2256 }
2257
2258 if (!init_fifo(&ca->free[RESERVE_BTREE], btree_buckets,
2259 GFP_KERNEL)) {
2260 err = "ca->free[RESERVE_BTREE] alloc failed";
2261 goto err_btree_alloc;
2262 }
2263
2264 if (!init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca),
2265 GFP_KERNEL)) {
2266 err = "ca->free[RESERVE_PRIO] alloc failed";
2267 goto err_prio_alloc;
2268 }
2269
2270 if (!init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL)) {
2271 err = "ca->free[RESERVE_MOVINGGC] alloc failed";
2272 goto err_movinggc_alloc;
2273 }
2274
2275 if (!init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL)) {
2276 err = "ca->free[RESERVE_NONE] alloc failed";
2277 goto err_none_alloc;
2278 }
2279
2280 if (!init_fifo(&ca->free_inc, free << 2, GFP_KERNEL)) {
2281 err = "ca->free_inc alloc failed";
2282 goto err_free_inc_alloc;
2283 }
2284
2285 if (!init_heap(&ca->heap, free << 3, GFP_KERNEL)) {
2286 err = "ca->heap alloc failed";
2287 goto err_heap_alloc;
2288 }
2289
2290 ca->buckets = vzalloc(array_size(sizeof(struct bucket),
2291 ca->sb.nbuckets));
2292 if (!ca->buckets) {
2293 err = "ca->buckets alloc failed";
2294 goto err_buckets_alloc;
2295 }
2296
2297 ca->prio_buckets = kzalloc(array3_size(sizeof(uint64_t),
2298 prio_buckets(ca), 2),
2299 GFP_KERNEL);
2300 if (!ca->prio_buckets) {
2301 err = "ca->prio_buckets alloc failed";
2302 goto err_prio_buckets_alloc;
2303 }
2304
2305 ca->disk_buckets = alloc_meta_bucket_pages(GFP_KERNEL, &ca->sb);
2306 if (!ca->disk_buckets) {
2307 err = "ca->disk_buckets alloc failed";
2308 goto err_disk_buckets_alloc;
2309 }
2310
2311 ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);
2312
2313 for_each_bucket(b, ca)
2314 atomic_set(&b->pin, 0);
2315 return 0;
2316
2317err_disk_buckets_alloc:
2318 kfree(ca->prio_buckets);
2319err_prio_buckets_alloc:
2320 vfree(ca->buckets);
2321err_buckets_alloc:
2322 free_heap(&ca->heap);
2323err_heap_alloc:
2324 free_fifo(&ca->free_inc);
2325err_free_inc_alloc:
2326 free_fifo(&ca->free[RESERVE_NONE]);
2327err_none_alloc:
2328 free_fifo(&ca->free[RESERVE_MOVINGGC]);
2329err_movinggc_alloc:
2330 free_fifo(&ca->free[RESERVE_PRIO]);
2331err_prio_alloc:
2332 free_fifo(&ca->free[RESERVE_BTREE]);
2333err_btree_alloc:
2334err_free:
2335 module_put(THIS_MODULE);
2336 if (err)
2337 pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2338 return ret;
2339}
2340
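/*
 * Take ownership of an opened cache device: copy in the superblock, do the
 * in-memory allocations via cache_alloc(), add the kobject under the bdev,
 * and finally attach the device to (or create) its cache set.
 */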
2341static int register_cache(struct cache_sb *sb, struct cache_sb_disk *sb_disk,
2342 struct block_device *bdev, struct cache *ca)
2343{
2344 const char *err = NULL;
2345 int ret = 0;
2346
2347 bdevname(bdev, ca->cache_dev_name);
2348 memcpy(&ca->sb, sb, sizeof(struct cache_sb));
2349 ca->bdev = bdev;
2350 ca->bdev->bd_holder = ca;
2351 ca->sb_disk = sb_disk;
2352
2353 if (blk_queue_discard(bdev_get_queue(bdev)))
2354 ca->discard = CACHE_DISCARD(&ca->sb);
2355
2356 ret = cache_alloc(ca);
2357 if (ret != 0) {
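		/*
		 * cache_alloc() failed, so kobject_put() below is never
		 * reached and bch_cache_release() will not drop the bdev
		 * reference for us; put it explicitly before bailing out.
		 */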
2364 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2365 if (ret == -ENOMEM)
2366 err = "cache_alloc(): -ENOMEM";
2367 else if (ret == -EPERM)
2368 err = "cache_alloc(): cache device is too small";
2369 else
2370 err = "cache_alloc(): unknown error";
2371 goto err;
2372 }
2373
2374 if (kobject_add(&ca->kobj, bdev_kobj(bdev), "bcache")) {
2375 err = "error calling kobject_add";
2376 ret = -ENOMEM;
2377 goto out;
2378 }
2379
2380 mutex_lock(&bch_register_lock);
2381 err = register_cache_set(ca);
2382 mutex_unlock(&bch_register_lock);
2383
2384 if (err) {
2385 ret = -ENODEV;
2386 goto out;
2387 }
2388
2389 pr_info("registered cache device %s\n", ca->cache_dev_name);
2390
2391out:
2392 kobject_put(&ca->kobj);
2393
2394err:
2395 if (err)
2396 pr_notice("error %s: %s\n", ca->cache_dev_name, err);
2397
2398 return ret;
2399}
2400
2401
2402
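/* Global interfaces/init */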
2403static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2404 const char *buffer, size_t size);
2405static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2406 struct kobj_attribute *attr,
2407 const char *buffer, size_t size);
2408
2409kobj_attribute_write(register, register_bcache);
2410kobj_attribute_write(register_quiet, register_bcache);
2411kobj_attribute_write(pendings_cleanup, bch_pending_bdevs_cleanup);
2412
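/*
 * Helpers called under bch_register_lock to tell whether a block device is
 * already registered as a backing or cache device.
 */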
2413static bool bch_is_open_backing(dev_t dev)
2414{
2415 struct cache_set *c, *tc;
2416 struct cached_dev *dc, *t;
2417
2418 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2419 list_for_each_entry_safe(dc, t, &c->cached_devs, list)
2420 if (dc->bdev->bd_dev == dev)
2421 return true;
2422 list_for_each_entry_safe(dc, t, &uncached_devices, list)
2423 if (dc->bdev->bd_dev == dev)
2424 return true;
2425 return false;
2426}
2427
2428static bool bch_is_open_cache(dev_t dev)
2429{
2430 struct cache_set *c, *tc;
2431
2432 list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2433 struct cache *ca = c->cache;
2434
2435 if (ca->bdev->bd_dev == dev)
2436 return true;
2437 }
2438
2439 return false;
2440}
2441
2442static bool bch_is_open(dev_t dev)
2443{
2444 return bch_is_open_cache(dev) || bch_is_open_backing(dev);
2445}
2446
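/*
 * State handed off to the asynchronous registration workers; the worker
 * frees the superblock, the path string and the args once registration
 * finishes or fails.
 */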
2447struct async_reg_args {
2448 struct delayed_work reg_work;
2449 char *path;
2450 struct cache_sb *sb;
2451 struct cache_sb_disk *sb_disk;
2452 struct block_device *bdev;
2453};
2454
2455static void register_bdev_worker(struct work_struct *work)
2456{
2457	bool fail = false;
2458 struct async_reg_args *args =
2459 container_of(work, struct async_reg_args, reg_work.work);
2460 struct cached_dev *dc;
2461
2462 dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2463 if (!dc) {
2464 fail = true;
2465 put_page(virt_to_page(args->sb_disk));
2466 blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2467 goto out;
2468 }
2469
2470 mutex_lock(&bch_register_lock);
2471 if (register_bdev(args->sb, args->sb_disk, args->bdev, dc) < 0)
2472 fail = true;
2473 mutex_unlock(&bch_register_lock);
2474
2475out:
2476 if (fail)
2477		pr_info("error %s: failed to register backing device\n",
2478 args->path);
2479 kfree(args->sb);
2480 kfree(args->path);
2481 kfree(args);
2482 module_put(THIS_MODULE);
2483}
2484
2485static void register_cache_worker(struct work_struct *work)
2486{
2487	bool fail = false;
2488 struct async_reg_args *args =
2489 container_of(work, struct async_reg_args, reg_work.work);
2490 struct cache *ca;
2491
2492 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2493 if (!ca) {
2494 fail = true;
2495 put_page(virt_to_page(args->sb_disk));
2496 blkdev_put(args->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2497 goto out;
2498 }
2499
2500
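	/*
	 * On failure register_cache() drops the bdev reference itself
	 * (either directly or via bch_cache_release()), so unlike the
	 * allocation-failure path above there is nothing to put here.
	 */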
2501 if (register_cache(args->sb, args->sb_disk, args->bdev, ca) != 0)
2502 fail = true;
2503
2504out:
2505 if (fail)
2506		pr_info("error %s: failed to register cache device\n",
2507 args->path);
2508 kfree(args->sb);
2509 kfree(args->path);
2510 kfree(args);
2511 module_put(THIS_MODULE);
2512}
2513
2514static void register_device_async(struct async_reg_args *args)
2515{
2516 if (SB_IS_BDEV(args->sb))
2517 INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
2518 else
2519 INIT_DELAYED_WORK(&args->reg_work, register_cache_worker);
2520
2521
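	/* A small delay (10 jiffies) is enough before the worker runs. */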
2522 queue_delayed_work(system_wq, &args->reg_work, 10);
2523}
2524
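/*
 * Sysfs store handler behind /sys/fs/bcache/register (and register_quiet):
 * the written string is the path of a block device carrying a bcache
 * superblock.  For example (device name purely illustrative):
 *
 *	echo /dev/sdb > /sys/fs/bcache/register
 */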
2525static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
2526 const char *buffer, size_t size)
2527{
2528 const char *err;
2529 char *path = NULL;
2530 struct cache_sb *sb;
2531 struct cache_sb_disk *sb_disk;
2532 struct block_device *bdev;
2533 ssize_t ret;
2534 bool async_registration = false;
2535
2536#ifdef CONFIG_BCACHE_ASYNC_REGISTRATION
2537 async_registration = true;
2538#endif
2539
2540 ret = -EBUSY;
2541 err = "failed to reference bcache module";
2542 if (!try_module_get(THIS_MODULE))
2543 goto out;
2544
2545
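	/*
	 * Pairs with the barrier in bcache_reboot(): make sure we observe
	 * the latest value of bcache_is_reboot before testing it below.
	 */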
2546 smp_mb();
2547 err = "bcache is in reboot";
2548 if (bcache_is_reboot)
2549 goto out_module_put;
2550
2551 ret = -ENOMEM;
2552 err = "cannot allocate memory";
2553 path = kstrndup(buffer, size, GFP_KERNEL);
2554 if (!path)
2555 goto out_module_put;
2556
2557 sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL);
2558 if (!sb)
2559 goto out_free_path;
2560
2561 ret = -EINVAL;
2562 err = "failed to open device";
2563 bdev = blkdev_get_by_path(strim(path),
2564 FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2565 sb);
2566 if (IS_ERR(bdev)) {
2567 if (bdev == ERR_PTR(-EBUSY)) {
2568 dev_t dev;
2569
2570 mutex_lock(&bch_register_lock);
2571 if (lookup_bdev(strim(path), &dev) == 0 &&
2572 bch_is_open(dev))
2573 err = "device already registered";
2574 else
2575 err = "device busy";
2576 mutex_unlock(&bch_register_lock);
2577 if (attr == &ksysfs_register_quiet)
2578 goto done;
2579 }
2580 goto out_free_sb;
2581 }
2582
2583 err = "failed to set blocksize";
2584 if (set_blocksize(bdev, 4096))
2585 goto out_blkdev_put;
2586
2587 err = read_super(sb, bdev, &sb_disk);
2588 if (err)
2589 goto out_blkdev_put;
2590
2591 err = "failed to register device";
2592
2593 if (async_registration) {
2594
2595 struct async_reg_args *args =
2596 kzalloc(sizeof(struct async_reg_args), GFP_KERNEL);
2597
2598 if (!args) {
2599 ret = -ENOMEM;
2600 err = "cannot allocate memory";
2601 goto out_put_sb_page;
2602 }
2603
2604 args->path = path;
2605 args->sb = sb;
2606 args->sb_disk = sb_disk;
2607 args->bdev = bdev;
2608 register_device_async(args);
2609
2610 goto async_done;
2611 }
2612
2613 if (SB_IS_BDEV(sb)) {
2614 struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
2615
2616 if (!dc)
2617 goto out_put_sb_page;
2618
2619 mutex_lock(&bch_register_lock);
2620 ret = register_bdev(sb, sb_disk, bdev, dc);
2621 mutex_unlock(&bch_register_lock);
2622
2623 if (ret < 0)
2624 goto out_free_sb;
2625 } else {
2626 struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2627
2628 if (!ca)
2629 goto out_put_sb_page;
2630
2631
2632 if (register_cache(sb, sb_disk, bdev, ca) != 0)
2633 goto out_free_sb;
2634 }
2635
2636done:
2637 kfree(sb);
2638 kfree(path);
2639 module_put(THIS_MODULE);
2640async_done:
2641 return size;
2642
2643out_put_sb_page:
2644 put_page(virt_to_page(sb_disk));
2645out_blkdev_put:
2646 blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
2647out_free_sb:
2648 kfree(sb);
2649out_free_path:
2650 kfree(path);
2651 path = NULL;
2652out_module_put:
2653 module_put(THIS_MODULE);
2654out:
2655	pr_info("error %s: %s\n", path ? path : "", err);
2656 return ret;
2657}
2658
2659
2660struct pdev {
2661 struct list_head list;
2662 struct cached_dev *dc;
2663};
2664
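/*
 * Sysfs store handler for /sys/fs/bcache/pendings_cleanup: stops backing
 * devices that are still waiting for a cache set which never appeared.
 * Any write triggers the cleanup, e.g. (value purely illustrative):
 *
 *	echo 1 > /sys/fs/bcache/pendings_cleanup
 */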
2665static ssize_t bch_pending_bdevs_cleanup(struct kobject *k,
2666 struct kobj_attribute *attr,
2667 const char *buffer,
2668 size_t size)
2669{
2670 LIST_HEAD(pending_devs);
2671 ssize_t ret = size;
2672 struct cached_dev *dc, *tdc;
2673 struct pdev *pdev, *tpdev;
2674 struct cache_set *c, *tc;
2675
2676 mutex_lock(&bch_register_lock);
2677 list_for_each_entry_safe(dc, tdc, &uncached_devices, list) {
2678 pdev = kmalloc(sizeof(struct pdev), GFP_KERNEL);
2679 if (!pdev)
2680 break;
2681 pdev->dc = dc;
2682 list_add(&pdev->list, &pending_devs);
2683 }
2684
2685 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2686		char *pdev_set_uuid = pdev->dc->sb.set_uuid;

2687 list_for_each_entry_safe(c, tc, &bch_cache_sets, list) {
2688 char *set_uuid = c->set_uuid;
2689
2690 if (!memcmp(pdev_set_uuid, set_uuid, 16)) {
2691 list_del(&pdev->list);
2692 kfree(pdev);
2693 break;
2694 }
2695 }
2696 }
2697 mutex_unlock(&bch_register_lock);
2698
2699 list_for_each_entry_safe(pdev, tpdev, &pending_devs, list) {
2700 pr_info("delete pdev %p\n", pdev);
2701 list_del(&pdev->list);
2702 bcache_device_stop(&pdev->dc->disk);
2703 kfree(pdev);
2704 }
2705
2706 return ret;
2707}
2708
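/*
 * Reboot notifier: on shutdown/halt/power-off, stop every cache set and
 * pending backing device, then wait (up to 10 seconds) for them to finish
 * tearing down before letting the system continue.
 */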
2709static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
2710{
2711 if (bcache_is_reboot)
2712 return NOTIFY_DONE;
2713
2714 if (code == SYS_DOWN ||
2715 code == SYS_HALT ||
2716 code == SYS_POWER_OFF) {
2717 DEFINE_WAIT(wait);
2718 unsigned long start = jiffies;
2719 bool stopped = false;
2720
2721 struct cache_set *c, *tc;
2722 struct cached_dev *dc, *tdc;
2723
2724 mutex_lock(&bch_register_lock);
2725
2726 if (bcache_is_reboot)
2727 goto out;
2728
2729
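		/* Reject any new registration from this point on. */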
2730 bcache_is_reboot = true;
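		/*
		 * Make sure a concurrent register_bcache() on another CPU
		 * observes bcache_is_reboot == true before we walk the
		 * lists below.
		 */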
2735 smp_mb();
2736
2737 if (list_empty(&bch_cache_sets) &&
2738 list_empty(&uncached_devices))
2739 goto out;
2740
2741 mutex_unlock(&bch_register_lock);
2742
2743 pr_info("Stopping all devices:\n");
2744
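		/*
		 * bch_register_lock is deliberately not held around
		 * bch_cache_set_stop()/bcache_device_stop(): the stop paths
		 * take it themselves, so holding it here could deadlock.
		 * This is safe because bcache_is_reboot is already true, so
		 * no new registration can race with this list walk.
		 */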
2759 list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
2760 bch_cache_set_stop(c);
2761
2762 list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
2763 bcache_device_stop(&dc->disk);
2764
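		/*
		 * Give the kthreads and kworkers spawned by the stop paths
		 * an early chance to run before we start waiting below.
		 */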
2770 schedule();
2771
2772
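		/*
		 * Open-coded wait: poll the registration lists until they
		 * are empty or the 10 second timeout expires, sleeping on
		 * unregister_wait in between.
		 */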
2773 while (1) {
2774 long timeout = start + 10 * HZ - jiffies;
2775
2776 mutex_lock(&bch_register_lock);
2777 stopped = list_empty(&bch_cache_sets) &&
2778 list_empty(&uncached_devices);
2779
2780 if (timeout < 0 || stopped)
2781 break;
2782
2783 prepare_to_wait(&unregister_wait, &wait,
2784 TASK_UNINTERRUPTIBLE);
2785
2786 mutex_unlock(&bch_register_lock);
2787 schedule_timeout(timeout);
2788 }
2789
2790 finish_wait(&unregister_wait, &wait);
2791
2792 if (stopped)
2793 pr_info("All devices stopped\n");
2794 else
2795 pr_notice("Timeout waiting for devices to be closed\n");
2796out:
2797 mutex_unlock(&bch_register_lock);
2798 }
2799
2800 return NOTIFY_DONE;
2801}
2802
2803static struct notifier_block reboot = {
2804 .notifier_call = bcache_reboot,
2805 .priority = INT_MAX,
2806};
2807
2808static void bcache_exit(void)
2809{
2810 bch_debug_exit();
2811 bch_request_exit();
2812 if (bcache_kobj)
2813 kobject_put(bcache_kobj);
2814 if (bcache_wq)
2815 destroy_workqueue(bcache_wq);
2816 if (bch_journal_wq)
2817 destroy_workqueue(bch_journal_wq);
2818 if (bch_flush_wq)
2819 destroy_workqueue(bch_flush_wq);
2820 bch_btree_exit();
2821
2822 if (bcache_major)
2823 unregister_blkdev(bcache_major, "bcache");
2824 unregister_reboot_notifier(&reboot);
2825 mutex_destroy(&bch_register_lock);
2826}
2827
2828
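/*
 * Apply defaults and clamp the writeback cutoff module parameters, and make
 * sure the soft cutoff never exceeds the sync (hard) cutoff.
 */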
2829static void check_module_parameters(void)
2830{
2831 if (bch_cutoff_writeback_sync == 0)
2832 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC;
2833 else if (bch_cutoff_writeback_sync > CUTOFF_WRITEBACK_SYNC_MAX) {
2834 pr_warn("set bch_cutoff_writeback_sync (%u) to max value %u\n",
2835 bch_cutoff_writeback_sync, CUTOFF_WRITEBACK_SYNC_MAX);
2836 bch_cutoff_writeback_sync = CUTOFF_WRITEBACK_SYNC_MAX;
2837 }
2838
2839 if (bch_cutoff_writeback == 0)
2840 bch_cutoff_writeback = CUTOFF_WRITEBACK;
2841 else if (bch_cutoff_writeback > CUTOFF_WRITEBACK_MAX) {
2842 pr_warn("set bch_cutoff_writeback (%u) to max value %u\n",
2843 bch_cutoff_writeback, CUTOFF_WRITEBACK_MAX);
2844 bch_cutoff_writeback = CUTOFF_WRITEBACK_MAX;
2845 }
2846
2847 if (bch_cutoff_writeback > bch_cutoff_writeback_sync) {
2848 pr_warn("set bch_cutoff_writeback (%u) to %u\n",
2849 bch_cutoff_writeback, bch_cutoff_writeback_sync);
2850 bch_cutoff_writeback = bch_cutoff_writeback_sync;
2851 }
2852}
2853
2854static int __init bcache_init(void)
2855{
2856 static const struct attribute *files[] = {
2857 &ksysfs_register.attr,
2858 &ksysfs_register_quiet.attr,
2859 &ksysfs_pendings_cleanup.attr,
2860 NULL
2861 };
2862
2863 check_module_parameters();
2864
2865 mutex_init(&bch_register_lock);
2866 init_waitqueue_head(&unregister_wait);
2867 register_reboot_notifier(&reboot);
2868
2869 bcache_major = register_blkdev(0, "bcache");
2870 if (bcache_major < 0) {
2871 unregister_reboot_notifier(&reboot);
2872 mutex_destroy(&bch_register_lock);
2873 return bcache_major;
2874 }
2875
2876 if (bch_btree_init())
2877 goto err;
2878
2879 bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
2880 if (!bcache_wq)
2881 goto err;
2882
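	/*
	 * Unlike bcache_wq and bch_journal_wq, bch_flush_wq is intentionally
	 * created without WQ_MEM_RECLAIM; it is a dedicated queue so this
	 * work does not have to share system_wq.
	 */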
2892 bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
2893 if (!bch_flush_wq)
2894 goto err;
2895
2896 bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
2897 if (!bch_journal_wq)
2898 goto err;
2899
2900 bcache_kobj = kobject_create_and_add("bcache", fs_kobj);
2901 if (!bcache_kobj)
2902 goto err;
2903
2904 if (bch_request_init() ||
2905 sysfs_create_files(bcache_kobj, files))
2906 goto err;
2907
2908 bch_debug_init();
2909 closure_debug_init();
2910
2911 bcache_is_reboot = false;
2912
2913 return 0;
2914err:
2915 bcache_exit();
2916 return -ENOMEM;
2917}
2918
2919
2920
2921
2922module_exit(bcache_exit);
2923module_init(bcache_init);
2924
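/*
 * Writeback cutoff thresholds; out-of-range values are clamped by
 * check_module_parameters() at load time.  Example (values illustrative):
 *
 *	modprobe bcache bch_cutoff_writeback=40 bch_cutoff_writeback_sync=70
 */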
2925module_param(bch_cutoff_writeback, uint, 0);
2926MODULE_PARM_DESC(bch_cutoff_writeback, "threshold to cutoff writeback");
2927
2928module_param(bch_cutoff_writeback_sync, uint, 0);
2929MODULE_PARM_DESC(bch_cutoff_writeback_sync, "hard threshold to cutoff writeback");
2930
2931MODULE_DESCRIPTION("Bcache: a Linux block layer cache");
2932MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");
2933MODULE_LICENSE("GPL");
2934