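/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
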
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

struct uuid_entry_v0 {
	uint8_t		uuid[16];
	uint8_t		label[32];
	uint32_t	first_reg;
	uint32_t	last_reg;
	uint32_t	invalidated;
	uint32_t	pad;
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major, bcache_minor;
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES	(256 * 1024 / PAGE_SIZE)

static void bio_split_pool_free(struct bio_split_pool *p)
{
	if (p->bio_split_hook)
		mempool_destroy(p->bio_split_hook);

	if (p->bio_split)
		bioset_free(p->bio_split);
}

static int bio_split_pool_init(struct bio_split_pool *p)
{
	p->bio_split = bioset_create(4, 0);
	if (!p->bio_split)
		return -ENOMEM;

	p->bio_split_hook = mempool_create_kmalloc_pool(4,
				sizeof(struct bio_split_hook));
	if (!p->bio_split_hook)
		return -ENOMEM;

	return 0;
}

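/* Superblock */
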
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->block_size = le16_to_cpu(s->block_size);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio, int error)
{
	struct cached_dev *dc = bio->bi_private;
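	/* XXX: error checking */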

	closure_put(&dc->sb_write.cl);
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_sector = SB_SECTOR;
	bio->bi_rw = REQ_SYNC|REQ_META;
	bio->bi_size = SB_SIZE;
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(REQ_WRITE, bio);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write.cl;
	struct bio *bio = &dc->sb_bio;

	closure_lock(&dc->sb_write, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return(cl);
}

static void write_super_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, error, "writing superblock");
	closure_put(&ca->set->sb_write.cl);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write.cl;
	struct cache *ca;
	unsigned i;

	closure_lock(&c->sb_write, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return(cl);
}

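/* UUID io */
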
static void uuid_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);

	cache_set_err_on(error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io(struct cache_set *c, unsigned long rw,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write.cl;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	closure_lock(&c->uuid_write, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_rw = REQ_SYNC|REQ_META|rw;
		bio->bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (!(rw & WRITE))
			break;
	}

	bch_bkey_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return(cl);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_ptr_invalid(c, 1, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);
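
		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */
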
		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_WRITE, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	__bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}
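
/*
 * Bucket priorities/gens:
 *
 * For each bucket, an 8 bit gen and a 16 bit priority are stored on disk;
 * the priorities implement an LRU for cached data. bch_prio_write() packs
 * every bucket's prio/gen pair into a chain of "prio buckets" (struct
 * prio_set), each of which records where the next one lives, and
 * prio_read() walks that chain at registration time to repopulate the
 * in-memory bucket array.
 */
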
static void prio_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_rw = REQ_SYNC|REQ_META|rw;
	bio->bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio, ca);
	closure_sync(cl);
}

#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu", \
	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets; b++)
		b->disk_gen = b->gen;

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(ca);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_WRITE);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	ca->need_save_prio = 0;
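
	/*
	 * Don't want the old priorities to get garbage collected until after we
	 * finish writing the new ones, and they're journalled
	 */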
	for (i = 0; i < prio_buckets(ca); i++)
		ca->prio_last_buckets[i] = ca->prio_buckets[i];
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(ca))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
	}
}

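/* Bcache device */
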
static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;

	if (atomic_read(&d->closing))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;

	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;

	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!atomic_xchg(&d->closing, 1))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	unsigned i;
	struct cache *ca;

	sysfs_remove_link(&d->c->kobj, d->name);
	sysfs_remove_link(&d->kobj, "cache");

	for_each_cache(ca, d->c, i)
		bd_unlink_disk_holder(ca->bdev, d->disk);
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (atomic_read(&d->detaching)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);

		atomic_set(&d->detaching, 0);
	}

	if (!d->flush_done)
		bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));

	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk)
		put_disk(d->disk);

	bio_split_pool_free(&d->bio_split_hook);
	if (d->unaligned_bvec)
		mempool_destroy(d->unaligned_bvec);
	if (d->bio_split)
		bioset_free(d->bio_split);
	if (is_vmalloc_addr(d->stripe_sectors_dirty))
		vfree(d->stripe_sectors_dirty);
	else
		kfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;

	if (!d->stripe_size_bits)
		d->stripe_size_bits = 31;

	d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
		d->stripe_size_bits;

	if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
		return -ENOMEM;

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
	    bio_split_pool_init(&d->bio_split_hook) ||
	    !(d->disk = alloc_disk(1)) ||
	    !(q = blk_alloc_queue(GFP_KERNEL)))
		return -ENOMEM;

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);

	d->disk->major		= bcache_major;
	d->disk->first_minor	= bcache_minor++;
	d->disk->fops		= &bcache_ops;
	d->disk->private_data	= d;

	blk_queue_make_request(q, NULL);
	d->disk->queue			= q;
	q->queuedata			= d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors	= UINT_MAX;
	q->limits.max_sectors		= UINT_MAX;
	q->limits.max_segment_size	= UINT_MAX;
	q->limits.max_segments		= BIO_MAX_PAGES;
	q->limits.max_discard_sectors	= UINT_MAX;
	q->limits.io_min		= block_size;
	q->limits.logical_block_size	= block_size;
	q->limits.physical_block_size	= block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_flush(q, REQ_FLUSH|REQ_FUA);

	return 0;
}

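/* Cached device */
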
static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1))
		return;

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;

		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
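	/* won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent */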
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;

	closure_init_stack(&cl);

	BUG_ON(!atomic_read(&dc->disk.detaching));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));
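
	/* Drop ref we took in cached_dev_detach() */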
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (atomic_read(&dc->disk.closing))
		return;

	if (atomic_xchg(&dc->disk.detaching, 1))
		return;

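	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */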
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
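		/* Will die */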
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

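	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */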
	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;

		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
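	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */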
	atomic_set(&dc->count, 1);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev)) {
		if (dc->bdev->bd_disk)
			blk_sync_queue(bdev_get_queue(dc->bdev));

		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	d->flush_done = 1;

	if (d->c)
		bcache_device_unlink(d);

	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	closure_init_unlocked(&dc->sb_write);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_merge = true;
	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	ret = bcache_device_init(&dc->disk, block_size,
			dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

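/* Cached device - bcache superblock */
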
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

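/* Flash only volumes */
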
void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	bcache_device_free(d);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	bcache_device_unlink(d);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

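/* Cache set */
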
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

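	/* XXX: we can be called from atomic context
	acquire_console_sem();
	*/
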
	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);

	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca)
			kobject_put(&ca->kobj);

	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));

	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

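	/* Should skip this if we're unregistering because of an error */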
	list_for_each_entry(b, &c->btree_cache, list)
		if (btree_node_dirty(b))
			bch_btree_node_write(b, NULL);

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);

	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);
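
	/* Maybe create continue_at_noreturn() and use it here? */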
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	c->sort_crit_factor = int_sqrt(c->btree_pages);

	mutex_init(&c->bucket_lock);
	mutex_init(&c->sort_lock);
	spin_lock_init(&c->sort_time_lock);
	closure_init_unlocked(&c->sb_write);
	closure_init_unlocked(&c->uuid_write);
	spin_lock_init(&c->btree_read_time_lock);
	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	unsigned i;
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	op.lock = SHRT_MAX;

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal, &op))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

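		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */
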
		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_ptr_invalid(c, j->btree_level + 1, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, k, j->btree_level, &op);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &op.cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c, &op))
			goto err;

		bch_journal_mark(c, &journal);
		bch_btree_gc_finish(c);
		pr_debug("btree_check() done");

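		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */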
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

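		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */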
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal, &op);
	} else {
		pr_notice("invalidating existing data");

		closure_lock(&c->gc, NULL);

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_btree_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err_unlock_gc;

		err = "cannot allocate new btree root";
		c->root = bch_btree_node_alloc(c, 0, &op.cl);
		if (IS_ERR_OR_NULL(c->root))
			goto err_unlock_gc;

		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &op.cl);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

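		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */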
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &op.cl);

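		/* Unlock the gc closure "lock" taken via closure_lock() above */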
		closure_set_stopped(&c->gc.cl);
		closure_put(&c->gc.cl);
	}

	closure_sync(&op.cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	return;
err_unlock_gc:
	closure_set_stopped(&c->gc.cl);
	closure_put(&c->gc.cl);
err:
	closure_sync(&op.cl);

	bch_cache_set_error(c, err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
	       ca->sb.bucket_size == c->sb.bucket_size &&
	       ca->sb.nr_in_set == c->sb.nr_in_set;
}

static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

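/* Cache device */
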
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);

	if (ca->set)
		ca->set->cache[ca->sb.nr_this_dev] = NULL;

	bch_cache_allocator_exit(ca);

	bio_split_pool_free(&ca->bio_split_hook);

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->unused);
	free_fifo(&ca->free_inc);
	free_fifo(&ca->free);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev)) {
		blk_sync_queue(bdev_get_queue(ca->bdev));
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache_sb *sb, struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	INIT_LIST_HEAD(&ca->discards);

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
	free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);

	if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
	    bio_split_pool_init(&ca->bio_split_hook))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	if (bch_cache_allocator_init(ca))
		goto err;

	return 0;
err:
	kobject_put(&ca->kobj);
	return -ENOMEM;
}

static void register_cache(struct cache_sb *sb, struct page *sb_page,
			   struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	if (cache_alloc(sb, ca) != 0)
		goto err;

	err = "error creating kobject";
	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
		goto err;

	err = register_cache_set(ca);
	if (err)
		goto err;

	pr_info("registered cache device %s", bdevname(bdev, name));
	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	kobject_put(&ca->kobj);
}
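
/* Global interfaces/init */
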
static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register, register_bcache);
kobj_attribute_write(register_quiet, register_bcache);

static bool bch_is_open_backing(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev)
{
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	mutex_lock(&bch_register_lock);

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);

		if (!dc)
			goto err_close;

		register_bdev(sb, sb_page, bdev, dc);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);

		if (!ca)
			goto err_close;

		register_cache(sb, sb_page, bdev, ca);
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	mutex_unlock(&bch_register_lock);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	if (attr != &ksysfs_register_quiet)
		pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

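		/* What's a condition variable? */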
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				  list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX,
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_writeback_exit();
	bch_request_exit();
	bch_btree_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0)
		return bcache_major;

	if (!(bcache_wq = create_workqueue("bcache")) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_btree_init() ||
	    bch_request_init() ||
	    bch_writeback_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);