/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_minor);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES (256 * 1024 / PAGE_SIZE)

/* Superblock */

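/*
 * read_super() reads the superblock out of SB_SECTOR (4k into the device)
 * via the buffer cache, decodes the little-endian on-disk fields into a
 * native-endian struct cache_sb, and sanity checks it. On success it returns
 * NULL and hands back a reference to the page holding the raw superblock, so
 * the caller can reuse that page for superblock writes; on failure it returns
 * a human-readable error string.
 */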
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->block_size = le16_to_cpu(s->block_size);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;

	/* XXX: bio->bi_error is ignored; backing sb write errors go unreported */
	closure_put(&dc->sb_write);
}

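/*
 * __write_super() encodes the in-memory superblock back into little-endian
 * on-disk format in the page already attached to @bio, then submits the bio
 * as a synchronous metadata write to SB_SECTOR. Callers point the bio at the
 * right device and wire up the completion before calling this.
 */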
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_rw = REQ_SYNC|REQ_META;
	bio->bi_iter.bi_size = SB_SIZE;
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(REQ_WRITE, bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

/* UUID io */

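/*
 * Each cache set keeps an array of struct uuid_entry (one per attached
 * backing device or flash-only volume) in a dedicated bucket, pointed to by
 * c->uuid_bucket. uuid_io() reads or writes that whole array: on writes it
 * submits one bio per pointer in the key so every cache device gets a copy;
 * on reads the first pointer is enough.
 */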
static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

static void uuid_io(struct cache_set *c, unsigned long rw,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_rw = REQ_SYNC|REQ_META|rw;
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (!(rw & WRITE))
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old one, we
		 * have to convert starting at the highest memory address and
		 * work down in order to do it in place.
		 */
		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_WRITE, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}
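/*
 * Bucket priorities/gens:
 *
 * For each bucket we store on disk its 8 bit gen and 16 bit priority. They
 * don't fit in the btree, so they get buckets of their own: each prio bucket
 * holds prios_per_bucket() packed {prio, gen} pairs plus a header carrying a
 * checksum, a magic number and a next_bucket pointer, forming a singly
 * linked chain that's entered from prio_bucket[] in the journal.
 */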
static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_rw = REQ_SYNC|REQ_META|rw;
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio);
	closure_sync(cl);
}

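/*
 * bch_prio_write() snapshots the in-memory prios/gens to freshly allocated
 * buckets, then journals the new bucket pointers via bch_journal_meta() so
 * prio_read() can find them again. Only after the journal write completes
 * are the buckets that held the previous generation of priorities freed.
 */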
void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	/*
	 * Write the prio buckets in reverse order, so that when bucket i is
	 * written the location of bucket i + 1 is already known and can be
	 * stored in next_bucket. (The next_bucket pointer in the last bucket
	 * of the chain is never followed.)
	 */
	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_WRITE);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	/*
	 * The old priority buckets must not be reused until the new ones
	 * (and the journal entry pointing at them) are on disk, so only
	 * free them now.
	 */
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}

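/*
 * prio_read() walks the on-disk chain of prio buckets starting from the
 * bucket recorded in the journal, restoring each in-memory bucket's prio and
 * gen, and remembering which buckets the chain lives in so the next
 * bch_prio_write() can free them once replacements are written.
 */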
static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	if (d->bio_split)
		bioset_free(d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

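/*
 * bcache_device_init() does the generic setup shared by backing devices and
 * flash-only volumes: the dirty-stripe accounting arrays, a bio set for
 * splitting, and the gendisk plus request queue with limits that let bcache
 * pass arbitrarily large bios straight through.
 */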
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;
	int minor;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large");
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0)
		return minor;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->disk = alloc_disk(1))) {
		ida_simple_remove(&bcache_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major = bcache_major;
	d->disk->first_minor = minor;
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity = 512;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_flush(q, REQ_FLUSH|REQ_FUA);

	return 0;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		/* Already running; don't leak the uevent strings */
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);

	/*
	 * The extra environment won't show up in the uevent file;
	 * use udevadm monitor -e to see it.
	 */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	/* Drop the ref we took in bch_cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching.
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

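/*
 * bch_cached_dev_attach() binds an opened backing device to a cache set:
 * it finds (or allocates) the device's uuid_entry in the set, starts the
 * writeback machinery, and brings the bcache device online. A device whose
 * superblock says it's stale or unattached gets a fresh uuid slot; a dirty
 * device must find its existing slot so its dirty data can be written back.
 */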
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

	/*
	 * A zeroed uuid slot means this is a first-time attach: record the
	 * device's uuid and label in the set, and bind the device's
	 * superblock to the set.
	 */
	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	atomic_set(&dc->count, 1);

	if (bch_cached_dev_writeback_start(dc))
		return -ENOMEM;

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

/* Cached device - bcache superblock */

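/*
 * register_bdev() takes ownership of an opened backing device: it stashes
 * the superblock, initializes the cached_dev, exposes it in sysfs, then
 * tries to attach it to every cache set that's already registered. A device
 * that isn't marked clean or dirty (state none or stale) is started right
 * away rather than waiting for its cache set to show up.
 */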
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Write out any remaining dirty btree nodes */
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)					\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

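/*
 * bch_cache_set_alloc() builds a cache_set from a superblock: it sets up the
 * closure hierarchy that controls teardown (cl is the outer lifetime,
 * caching is held by each attached device), copies the identifying
 * superblock fields, and allocates the mempools, biosets and workqueues the
 * rest of the code uses. On any failure it unregisters the half-built set,
 * which frees it.
 */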
struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = create_workqueue("bcache_gc")) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

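/*
 * run_cache_set() brings a complete cache set online. For a set that was
 * previously in sync it replays the journal: read the most recent journal
 * entry, read priorities and the uuid array from the buckets it points at,
 * check the btree, then replay outstanding keys. For a brand new (or never
 * synced) set it invalidates everything and writes out fresh metadata.
 * Either way it finishes by starting gc, attaching any waiting backing
 * devices and starting flash-only volumes.
 */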
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

		/* Open a fresh journal entry now that the initial gc pass is done */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		/*
		 * This is the first point at which it's safe to allocate
		 * buckets: btree_check() and the initial gc pass had to
		 * complete first, and the allocator threads are now running.
		 *
		 * If the uuids were in the old on-disk format, rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want the first journal entry written until
		 * everything is set up; conveniently, journal entries won't
		 * be written until CACHE_SYNC() is set:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);

	bch_cache_set_error(c, "%s", err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
	       ca->sb.bucket_size == c->sb.bucket_size &&
	       ca->sb.nr_in_set == c->sb.nr_in_set;
}

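/*
 * register_cache_set() adds an opened cache device to the set matching its
 * set_uuid, creating the set if it doesn't exist yet. The most recent
 * superblock in the set wins (highest seq). Once every device in the set
 * has been found, the whole set is started via run_cache_set().
 */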
static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

/* Cache device */

void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned i;

	if (ca->set) {
		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
		ca->set->cache[ca->sb.nr_this_dev] = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache_sb *sb, struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	return 0;
}

static void register_cache(struct cache_sb *sb, struct page *sb_page,
			   struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	if (cache_alloc(sb, ca) != 0)
		goto err;

	err = "error creating kobject";
	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
		goto err;

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err)
		goto err;

	pr_info("registered cache device %s", bdevname(bdev, name));
out:
	kobject_put(&ca->kobj);
	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	goto out;
}

/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);

static bool bch_is_open_backing(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev) {
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

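/*
 * register_bcache() is the sysfs entry point: userspace echoes a device path
 * into /sys/fs/bcache/register (or register_quiet). We open the device
 * exclusively, read and validate its superblock, then hand it off as either
 * a backing device or a cache device depending on SB_IS_BDEV().
 */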
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			mutex_lock(&bch_register_lock);
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
		if (!dc)
			goto err_close;

		mutex_lock(&bch_register_lock);
		register_bdev(sb, sb_page, bdev, dc);
		mutex_unlock(&bch_register_lock);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		if (!ca)
			goto err_close;

		register_cache(sb, sb_page, bdev, ca);
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	if (attr != &ksysfs_register_quiet)
		pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

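/*
 * On clean shutdown/reboot, stop every cache set and backing device so that
 * dirty state hits the disk before the block devices go away. We wait up to
 * two seconds for everything to finish stopping.
 */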
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/* Wait (with a timeout) for the devices to actually go away */
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				  list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX, /* before any real devices */
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		/* Don't leave a reboot notifier pointing into an unloaded module */
		unregister_reboot_notifier(&reboot);
		return bcache_major;
	}

	if (!(bcache_wq = create_workqueue("bcache")) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_request_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);