/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_minor);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

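/* Superblock */

/*
 * read_super() reads the bcache superblock from @bdev into @sb, converting
 * all on-disk fields from little endian, and validates it. On success it
 * returns NULL and hands back the page holding the raw superblock through
 * @res; on failure it returns a static error string.
 */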
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->block_size = le16_to_cpu(s->block_size);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;

	closure_put(&dc->sb_write);
}

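/*
 * __write_super() serializes the in-memory struct cache_sb back to its
 * little-endian on-disk layout in the page already attached to @bio, then
 * submits a synchronous REQ_META write to SB_SECTOR. Callers take a closure
 * reference that is dropped from the bio's endio handler.
 */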
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_rw = REQ_SYNC|REQ_META;
	bio->bi_iter.bi_size = SB_SIZE;
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(REQ_WRITE, bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

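/* UUID io */

/*
 * The uuid_entry array maps device ids to backing device / flash volume
 * UUIDs; it lives in its own bucket on the cache, pointed at by
 * c->uuid_bucket and journalled like other metadata.
 */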
static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

static void uuid_io(struct cache_set *c, unsigned long rw,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_rw = REQ_SYNC|REQ_META|rw;
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (!(rw & WRITE))
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

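		/*
		 * Since the new uuid_entry is bigger than the old
		 * uuid_entry_v0, convert in place starting from the last
		 * entry and working backwards, so nothing is overwritten
		 * before it has been read.
		 */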
		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_WRITE, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}
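
/*
 * Bucket priorities/gens:
 *
 * For each bucket we store an 8 bit gen and a 16 bit priority on disk, in a
 * chain of dedicated "prio" buckets; each prio bucket records the location
 * of the next one in the chain. bch_prio_write() allocates fresh buckets,
 * writes the current prios/gens into them and then journals the update;
 * prio_read() walks the chain at registration time to repopulate the
 * in-memory buckets.
 */
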
static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_rw = REQ_SYNC|REQ_META|rw;
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio);
	closure_sync(cl);
}

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

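	/*
	 * Fill one prio bucket's worth of prios/gens at a time, working from
	 * the last group of buckets down to the first. Each bucket written
	 * here records, in p->next_bucket, the bucket stored in the following
	 * prio_buckets[] slot, which is how prio_read() follows the chain at
	 * startup; the bucket lock is dropped around the actual IO.
	 */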
	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_WRITE);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

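	/*
	 * Don't free the old prio buckets until the new ones have been
	 * written out and journalled, so a crash in between still leaves a
	 * complete set of priorities to read back.
	 */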
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

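/* Bcache device */

/*
 * Generic block device glue shared by cached devices and flash-only
 * volumes: open/release/ioctl hooks plus attach/detach of a bcache_device
 * to a cache set.
 */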
static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open = open_dev,
	.release = release_dev,
	.ioctl = ioctl_dev,
	.owner = THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	if (d->bio_split)
		bioset_free(d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

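/*
 * bcache_device_init() sets up the pieces shared by cached devices and flash
 * volumes: the dirty-stripe tracking arrays, a bio set, a gendisk with a
 * dynamically allocated minor, and a make_request-based request queue whose
 * limits advertise the bcache block size.
 */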
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;
	int minor;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large");
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0)
		return minor;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->disk = alloc_disk(1))) {
		ida_simple_remove(&bcache_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major = bcache_major;
	d->disk->first_minor = minor;
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity = 512;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_write_cache(q, true, true);

	return 0;
}

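/* Cached device */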

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);

	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

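	/*
	 * Take a ref on the device so it can't be closed and freed until
	 * we're finished detaching; cached_dev_detach_finish() drops it.
	 */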
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

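	/*
	 * An all-zero uuid entry means this backing device has never been
	 * attached to this cache set before: record its uuid/label and write
	 * out both the uuid bucket and the backing device's superblock.
	 */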
	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
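	/*
	 * dc->c must be visible before dc->count is non-zero - this write
	 * barrier pairs with the read side in cached_dev_get().
	 */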
	atomic_set(&dc->count, 1);

	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
		return -ENOMEM;
	}

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	up_write(&dc->writeback_lock);

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
				 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

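/* Cached device - bcache superblock */

/*
 * register_bdev() takes ownership of a backing device whose superblock has
 * just been read, initializes the cached_dev, and tries to attach it to any
 * already-registered cache set it belongs to.
 */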
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

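/* Flash only volumes - thin block devices carved directly out of the cache,
 * with no backing device behind them.
 */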
void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

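/* Cache set */

/*
 * bch_cache_set_error() logs an error against the whole cache set and shuts
 * it down (or panics, if the error policy is set to panic); it returns false
 * only if the set is already stopping.
 */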
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	if (!c)
		closure_return(cl);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);

		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c) \
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

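		/*
		 * The newest journal entry tells us where the current btree
		 * root and uuid bucket live; everything else is recovered by
		 * walking the btree and replaying the journal.
		 */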
		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

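		/*
		 * This is the first point at which allocating is safe: the
		 * btree has been checked, initial GC has finished and the
		 * allocator threads are running. If the uuids were written
		 * in the old on-disk format, rewrite them now, before any
		 * new journal entries are written.
		 */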
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

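		/*
		 * The journal stays quiet until CACHE_SYNC is set here, so
		 * the first journal entry is only written once the brand new
		 * cache set is fully initialized.
		 */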
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);

	bch_cache_set_error(c, "%s", err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
		ca->sb.bucket_size == c->sb.bucket_size &&
		ca->sb.nr_in_set == c->sb.nr_in_set;
}

static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

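/* Cache device */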
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned i;

	if (ca->set) {
		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
		ca->set->cache[ca->sb.nr_this_dev] = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache_sb *sb, struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	return 0;
}

static int register_cache(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = NULL;
	int ret = 0;

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	ret = cache_alloc(sb, ca);
	if (ret != 0)
		goto err;

	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
		err = "error calling kobject_add";
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err) {
		ret = -ENODEV;
		goto out;
	}

	pr_info("registered cache device %s", bdevname(bdev, name));

out:
	kobject_put(&ca->kobj);

err:
	if (err)
		pr_notice("error opening %s: %s", bdevname(bdev, name), err);

	return ret;
}

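/* Global interfaces/init */

/*
 * register_bcache() is the handler behind /sys/fs/bcache/register and
 * register_quiet: userspace echoes a device path, we read its superblock and
 * register it as either a backing device or a cache device.
 */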
static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);

static bool bch_is_open_backing(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev) {
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			mutex_lock(&bch_register_lock);
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
			if (attr == &ksysfs_register_quiet)
				goto out;
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
		if (!dc)
			goto err_close;

		mutex_lock(&bch_register_lock);
		register_bdev(sb, sb_page, bdev, dc);
		mutex_unlock(&bch_register_lock);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		if (!ca)
			goto err_close;

		if (register_cache(sb, sb_page, bdev, ca) != 0)
			goto err_close;
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call = bcache_reboot,
	.priority = INT_MAX,
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		unregister_reboot_notifier(&reboot);
		return bcache_major;
	}

	if (!(bcache_wq = create_workqueue("bcache")) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_request_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);