/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_minor);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

/* Superblock */

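/*
 * read_super() reads the superblock out of SB_SECTOR, converts each field
 * from little endian, and sanity checks it according to the superblock
 * version. On success it returns NULL and hands back a reference to the page
 * the superblock was read into; on failure it returns a human readable error
 * string.
 */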
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write);
}

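/*
 * __write_super() encodes the in-memory superblock back into little endian
 * on the page already attached to @bio, then submits it as a
 * REQ_SYNC|REQ_META write to SB_SECTOR. The caller owns the bio and the
 * closure reference taken around the write.
 */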
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_iter.bi_size = SB_SIZE;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, bio->bi_error, "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

/* UUIDs */

static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

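/*
 * uuid_io() reads or writes the uuid_entry array to/from the bucket(s)
 * pointed to by @k, serialized by uuid_write_mutex. Writes go to every
 * pointer in the key (one copy per cache device); a read only needs the
 * first copy, hence the early break below.
 */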
static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (op != REQ_OP_WRITE)
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, REQ_OP_READ, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entries are bigger than the old, we
		 * have to convert starting at the highest memory address
		 * and work down in order to do it in place
		 */

		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

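/*
 * __uuid_write() allocates a fresh bucket, writes the current uuid array
 * into it synchronously, and records the new location in c->uuid_bucket.
 * Callers must hold bch_register_lock; bch_uuid_write() below then journals
 * the update so the new bucket is found on the next startup.
 */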
static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}

/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *   16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other,
 * and it's actually the gens that must be written out at specific times - it's
 * no big deal if the priorities don't get written, if we lose them we just
 * reuse buckets in closest to optimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs
 * out of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */

static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

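/*
 * prio_io() synchronously reads or writes a single prio bucket: it points a
 * bbio at @bucket, maps in ca->disk_buckets, and waits for completion on the
 * on-stack-initialized ca->prio closure.
 */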
static void prio_io(struct cache *ca, uint64_t bucket, int op,
		    unsigned long op_flags)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio);
	closure_sync(cl);
}

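/*
 * bch_prio_write() snapshots every bucket's prio/gen into freshly allocated
 * prio buckets (chained via p->next_bucket), journals the new locations, and
 * only then frees the buckets holding the previous copy - so a crash at any
 * point still leaves one complete set of priorities on disk.
 */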
void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_OP_WRITE, 0);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	/*
	 * Don't want the old priorities to get garbage collected until after
	 * we finish writing the new ones, and they're journalled
	 */
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}

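/*
 * prio_read() is the inverse of bch_prio_write(): starting from the bucket
 * number recorded in the journal, it walks the chain of prio buckets and
 * restores the prio and gen of every bucket on this cache device.
 */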
static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, REQ_OP_READ, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	if (d->bio_split)
		bioset_free(d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

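/*
 * bcache_device_init() sets up the generic block device state shared by
 * cached devices and flash-only volumes: the dirty stripe accounting
 * arrays, a minor number, the gendisk, and a request queue with bcache's
 * queue limits.
 */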
static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;
	int minor;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
		       (unsigned)d->nr_stripes);
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0)
		return minor;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->disk = alloc_disk(1))) {
		ida_simple_remove(&bcache_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major = bcache_major;
	d->disk->first_minor = minor;
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity = 512;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_write_cache(q, true, true);

	return 0;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
	/* won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

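/*
 * bch_cached_dev_attach() binds an opened backing device to a cache set: it
 * checks that the set UUID and block size are compatible, finds (or
 * allocates) the device's uuid_entry, persists the new state in both
 * superblocks, and then starts writeback and exposes the bcache device.
 */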
int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */

	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	atomic_set(&dc->count, 1);

	/* Block writeback thread, but spawn it */
	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
		return -ENOMEM;
	}

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	/* Allow the writeback thread to proceed */
	up_write(&dc->writeback_lock);

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

/* Cached device - bcache superblock */

static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	/* XXX: we can be called from atomic context
	acquire_console_sem();
	*/

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	if (!c)
		closure_return(cl);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
		/* flush last journal entry if needed */
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)					\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continuation closure? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
						WQ_MEM_RECLAIM, 0)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

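/*
 * run_cache_set() brings a fully assembled cache set online. If the set was
 * shut down cleanly (CACHE_SYNC) it reads the journal, priorities, btree
 * root and uuids and replays the journal; otherwise it invalidates any
 * existing data and initializes a fresh btree root and uuid bucket before
 * marking the set CACHE_SYNC again.
 */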
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * bucket_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);

	bch_cache_set_error(c, "%s", err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
	       ca->sb.bucket_size == c->sb.bucket_size &&
	       ca->sb.nr_in_set == c->sb.nr_in_set;
}

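/*
 * register_cache_set() attaches @ca to the cache set with a matching set
 * UUID, allocating the set if this is the first member seen. The superblock
 * with the highest seq wins, and once every member in nr_in_set has arrived
 * the set is started via run_cache_set().
 */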
static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

/* Cache device */

void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned i;

	if (ca->set) {
		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
		ca->set->cache[ca->sb.nr_this_dev] = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	return 0;
}

static int register_cache(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = NULL;
	int ret = 0;

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	ret = cache_alloc(ca);
	if (ret != 0) {
		if (ret == -ENOMEM)
			err = "cache_alloc(): -ENOMEM";
		else
			err = "cache_alloc(): unknown error";
		goto err;
	}

	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
		err = "error calling kobject_add";
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err) {
		ret = -ENODEV;
		goto out;
	}

	pr_info("registered cache device %s", bdevname(bdev, name));

out:
	kobject_put(&ca->kobj);

err:
	if (err)
		pr_notice("error opening %s: %s", bdevname(bdev, name), err);

	return ret;
}


/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register,		register_bcache);
kobj_attribute_write(register_quiet,	register_bcache);

static bool bch_is_open_backing(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev) {
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev) {
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

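/*
 * register_bcache() is the sysfs entry point (/sys/fs/bcache/register): it
 * takes a device path, opens the block device exclusively, reads and
 * validates the superblock, and hands the device to register_bdev() or
 * register_cache() depending on whether it is a backing or cache device.
 */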
static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			mutex_lock(&bch_register_lock);
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
			if (attr == &ksysfs_register_quiet)
				goto out;
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
		if (!dc)
			goto err_close;

		mutex_lock(&bch_register_lock);
		register_bdev(sb, sb_page, bdev, dc);
		mutex_unlock(&bch_register_lock);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		if (!ca)
			goto err_close;

		if (register_cache(sb, sb_page, bdev, ca) != 0)
			goto err_close;
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

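/*
 * On shutdown/reboot, stop every cache set and backing device, then wait up
 * to about two seconds for them to finish closing before giving up, so that
 * devices get a chance to write out their state cleanly.
 */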
static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/* Poll until all devices are stopped or we time out */
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				  list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX,
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		unregister_reboot_notifier(&reboot);
		return bcache_major;
	}

	if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_request_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);