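/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2012 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */
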
#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/idr.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major;
static DEFINE_IDA(bcache_minor);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES	(256 * 1024 / PAGE_SIZE)
#define BCACHE_MINORS	16

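/* Superblock */
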
static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio)
{
	struct cached_dev *dc = bio->bi_private;
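	/* XXX: bi_status is not checked here */
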
	closure_put(&dc->sb_write);
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_iter.bi_sector = SB_SECTOR;
	bio->bi_iter.bi_size = SB_SIZE;
	bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC|REQ_META);
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(bio);
}

static void bch_write_bdev_super_unlock(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, sb_write);

	up(&dc->sb_write_mutex);
}

void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write;
	struct bio *bio = &dc->sb_bio;

	down(&dc->sb_write_mutex);
	closure_init(cl, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return_with_destructor(cl, bch_write_bdev_super_unlock);
}

static void write_super_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, bio->bi_status, "writing superblock");
	closure_put(&ca->set->sb_write);
}

static void bcache_write_super_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, sb_write);

	up(&c->sb_write_mutex);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write;
	struct cache *ca;
	unsigned i;

	down(&c->sb_write_mutex);
	closure_init(cl, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return_with_destructor(cl, bcache_write_super_unlock);
}

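/* UUID io */
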
static void uuid_endio(struct bio *bio)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	cache_set_err_on(bio->bi_status, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io_unlock(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write);

	up(&c->uuid_write_mutex);
}

static void uuid_io(struct cache_set *c, int op, unsigned long op_flags,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	down(&c->uuid_write_mutex);
	closure_init(cl, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_opf = REQ_SYNC | REQ_META | op_flags;
		bio->bi_iter.bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (op != REQ_OP_WRITE)
			break;
	}

	bch_extent_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", op == REQ_OP_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return_with_destructor(cl, uuid_io_unlock);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_btree_ptr_invalid(c, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, REQ_OP_READ, 0, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

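		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */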
		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, true))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_OP_WRITE, 0, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}

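/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *   8 bit gen
 *  16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * an LRU scheme, to keep dirty and cached buckets sorted by how long they've
 * been used.
 *
 * On disk the priorities are stored in a packed array, in as many buckets as
 * are needed to fit them all; the buckets form a list, each bucket pointing at
 * the next one, with the journal header pointing at the first.
 */
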
static void prio_endio(struct bio *bio)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(bio->bi_status, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, int op,
		    unsigned long op_flags)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_iter.bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_iter.bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bio_set_op_attrs(bio, op, REQ_SYNC|REQ_META|op_flags);
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio);
	closure_sync(cl);
}

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(&ca->sb);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, RESERVE_PRIO, true);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_OP_WRITE, 0);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

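	/*
	 * Don't want the old priorities to get garbage collected until after
	 * we finish writing the new ones, and they're journalled
	 */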
	for (i = 0; i < prio_buckets(ca); i++) {
		if (ca->prio_last_buckets[i])
			__bch_bucket_free(ca,
				&ca->buckets[ca->prio_last_buckets[i]]);

		ca->prio_last_buckets[i] = ca->prio_buckets[i];
	}
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, REQ_OP_READ, 0);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(&ca->sb))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->last_gc = d->gen;
	}
}

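/* Bcache device */
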
static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (test_bit(BCACHE_DEV_CLOSING, &d->flags))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!test_and_set_bit(BCACHE_DEV_CLOSING, &d->flags))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (d->c && !test_and_set_bit(BCACHE_DEV_UNLINK_DONE, &d->flags)) {
		unsigned i;
		struct cache *ca;

		sysfs_remove_link(&d->c->kobj, d->name);
		sysfs_remove_link(&d->kobj, "cache");

		for_each_cache(ca, d->c, i)
			bd_unlink_disk_holder(ca->bdev, d->disk);
	}
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");

	clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags);
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_DETACHING, &d->flags)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk) {
		ida_simple_remove(&bcache_minor, d->disk->first_minor);
		put_disk(d->disk);
	}

	if (d->bio_split)
		bioset_free(d->bio_split);
	kvfree(d->full_dirty_stripes);
	kvfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;
	int minor;

	if (!d->stripe_size)
		d->stripe_size = 1 << 31;

	d->nr_stripes = DIV_ROUND_UP_ULL(sectors, d->stripe_size);

	if (!d->nr_stripes ||
	    d->nr_stripes > INT_MAX ||
	    d->nr_stripes > SIZE_MAX / sizeof(atomic_t)) {
		pr_err("nr_stripes too large or invalid: %u (start sector beyond end of disk?)",
			(unsigned)d->nr_stripes);
		return -ENOMEM;
	}

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = kvzalloc(n, GFP_KERNEL);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	n = BITS_TO_LONGS(d->nr_stripes) * sizeof(unsigned long);
	d->full_dirty_stripes = kvzalloc(n, GFP_KERNEL);
	if (!d->full_dirty_stripes)
		return -ENOMEM;

	minor = ida_simple_get(&bcache_minor, 0, MINORMASK + 1, GFP_KERNEL);
	if (minor < 0)
		return minor;

	minor *= BCACHE_MINORS;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio),
					   BIOSET_NEED_BVECS |
					   BIOSET_NEED_RESCUER)) ||
	    !(d->disk = alloc_disk(BCACHE_MINORS))) {
		ida_simple_remove(&bcache_minor, minor);
		return -ENOMEM;
	}

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", minor);

	d->disk->major		= bcache_major;
	d->disk->first_minor	= minor;
	d->disk->fops		= &bcache_ops;
	d->disk->private_data	= d;

	q = blk_alloc_queue(GFP_KERNEL);
	if (!q)
		return -ENOMEM;

	blk_queue_make_request(q, NULL);
	d->disk->queue			= q;
	q->queuedata			= d;
	q->backing_dev_info->congested_data = d;
	q->limits.max_hw_sectors	= UINT_MAX;
	q->limits.max_sectors		= UINT_MAX;
	q->limits.max_segment_size	= UINT_MAX;
	q->limits.max_segments		= BIO_MAX_PAGES;
	blk_queue_max_discard_sectors(q, UINT_MAX);
	q->limits.discard_granularity	= 512;
	q->limits.io_min		= block_size;
	q->limits.logical_block_size	= block_size;
	q->limits.physical_block_size	= block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	clear_bit(QUEUE_FLAG_ADD_RANDOM, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_write_cache(q, true, true);

	return 0;
}

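/* Cached device */
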
static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1)) {
		kfree(env[1]);
		kfree(env[2]);
		return;
	}

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
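	/*
	 * won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent
	 */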
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	clear_bit(BCACHE_DEV_DETACHING, &dc->disk.flags);
	clear_bit(BCACHE_DEV_UNLINK_DONE, &dc->disk.flags);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

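	/* Drop the ref we took in bch_cached_dev_detach() */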
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (test_bit(BCACHE_DEV_CLOSING, &dc->disk.flags))
		return;

	if (test_and_set_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
		return;

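	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */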
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

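	/*
	 * An all-zero uuid means we claimed a fresh slot: initialize it for
	 * this backing device and persist both the uuid bucket and the
	 * backing device's superblock.
	 */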
	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
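	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */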
	atomic_set(&dc->count, 1);

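	/* Block writeback thread, but spawn it */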
	down_write(&dc->writeback_lock);
	if (bch_cached_dev_writeback_start(dc)) {
		up_write(&dc->writeback_lock);
		return -ENOMEM;
	}

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

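	/* Allow the writeback thread to proceed */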
	up_write(&dc->writeback_lock);

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);
	if (!IS_ERR_OR_NULL(dc->writeback_thread))
		kthread_stop(dc->writeback_thread);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev))
		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	sema_init(&dc->sb_write_mutex, 1);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	dc->disk.stripe_size = q->limits.io_opt >> 9;

	if (dc->disk.stripe_size)
		dc->partial_stripes_expensive =
			q->limits.raid_partial_stripes_expensive;

	ret = bcache_device_init(&dc->disk, block_size,
			 dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info->ra_pages =
		max(dc->disk.disk->queue->backing_dev_info->ra_pages,
		    q->backing_dev_info->ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

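/* Cached device - bcache superblock */
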
static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

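/* Flash only volumes */
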
void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	mutex_lock(&bch_register_lock);
	bcache_device_free(d);
	mutex_unlock(&bch_register_lock);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	mutex_lock(&bch_register_lock);
	bcache_device_unlink(d);
	mutex_unlock(&bch_register_lock);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	if (!test_bit(CACHE_SET_RUNNING, &c->flags))
		return -EPERM;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

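/* Cache set */
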
__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (c->on_error != ON_ERROR_PANIC &&
	    test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	if (c->on_error == ON_ERROR_PANIC)
		panic("panic forced after error\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca) {
			ca->set = NULL;
			c->cache[ca->sb.nr_this_dev] = NULL;
			kobject_put(&ca->kobj);
		}

	bch_bset_sort_state_free(&c->sort);
	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));

	if (c->moving_gc_wq)
		destroy_workqueue(c->moving_gc_wq);
	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cache *ca;
	struct btree *b;
	unsigned i;

	if (!c)
		closure_return(cl);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (c->gc_thread)
		kthread_stop(c->gc_thread);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

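	/* Write any dirty btree nodes before tearing down */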
	list_for_each_entry(b, &c->btree_cache, list) {
		mutex_lock(&b->write_lock);
		if (btree_node_dirty(b))
			__bch_btree_node_write(b, NULL);
		mutex_unlock(&b->write_lock);
	}

	for_each_cache(ca, c, i)
		if (ca->alloc_thread)
			kthread_stop(ca->alloc_thread);

	if (c->journal.cur) {
		cancel_delayed_work_sync(&c->journal.work);
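		/* Flush the last journal entry if needed */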
		c->journal.work.work.func(&c->journal.work.work);
	}

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc;
	size_t i;

	mutex_lock(&bch_register_lock);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i]) {
			if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
			    test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
				dc = container_of(c->devices[i],
						  struct cached_dev, disk);
				bch_cached_dev_detach(dc);
			} else {
				bcache_device_stop(c->devices[i]);
			}
		}

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size	= sb->block_size;
	c->sb.bucket_size	= sb->bucket_size;
	c->sb.nr_in_set		= sb->nr_in_set;
	c->sb.last_mount	= sb->last_mount;
	c->bucket_bits		= ilog2(sb->bucket_size);
	c->block_bits		= ilog2(sb->block_size);
	c->nr_uuids		= bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages		= bucket_pages(c);
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	sema_init(&c->sb_write_mutex, 1);
	mutex_init(&c->bucket_lock);
	init_waitqueue_head(&c->btree_cache_wait);
	init_waitqueue_head(&c->bucket_wait);
	init_waitqueue_head(&c->gc_wait);
	sema_init(&c->uuid_write_mutex, 1);

	spin_lock_init(&c->btree_gc_time.lock);
	spin_lock_init(&c->btree_split_time.lock);
	spin_lock_init(&c->btree_read_time.lock);

	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio),
					   BIOSET_NEED_BVECS |
					   BIOSET_NEED_RESCUER)) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->moving_gc_wq = alloc_workqueue("bcache_gc",
						WQ_MEM_RECLAIM, 0)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c) ||
	    bch_bset_sort_state_init(&c->sort, ilog2(c->btree_pages)))
		goto err;

	c->congested_read_threshold_us	= 2000;
	c->congested_write_threshold_us	= 20000;
	c->error_limit	= 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}

static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	struct closure cl;
	unsigned i;

	closure_init_stack(&cl);

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;
	set_gc_sectors(c);

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

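		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */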
		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_btree_ptr_invalid(c, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, NULL, k, j->btree_level, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c))
			goto err;

		bch_journal_mark(c, &journal);
		bch_initial_gc_finish(c);
		pr_debug("btree_check() done");

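		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */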
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

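		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */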
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal);
	} else {
		pr_notice("invalidating existing data");

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_initial_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err;

		err = "cannot allocate new btree root";
		c->root = __bch_btree_node_alloc(c, NULL, 0, true, NULL);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		mutex_lock(&c->root->write_lock);
		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &cl);
		mutex_unlock(&c->root->write_lock);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

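		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */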
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &cl);
	}

	err = "error starting gc thread";
	if (bch_gc_thread_start(c))
		goto err;

	closure_sync(&cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	set_bit(CACHE_SET_RUNNING, &c->flags);
	return;
err:
	closure_sync(&cl);

	bch_cache_set_error(c, "%s", err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
	       ca->sb.bucket_size == c->sb.bucket_size &&
	       ca->sb.nr_in_set == c->sb.nr_in_set;
}

static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version		= ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags		= ca->sb.flags;
		c->sb.seq		= ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	kobject_get(&ca->kobj);
	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

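/* Cache device */
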
void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);
	unsigned i;

	if (ca->set) {
		BUG_ON(ca->set->cache[ca->sb.nr_this_dev] != ca);
		ca->set->cache[ca->sb.nr_this_dev] = NULL;
	}

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->free_inc);

	for (i = 0; i < RESERVE_NR; i++)
		free_fifo(&ca->free[i]);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev))
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	bio_init(&ca->journal.bio, ca->journal.bio.bi_inline_vecs, 8);

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 10;

	if (!init_fifo(&ca->free[RESERVE_BTREE], 8, GFP_KERNEL) ||
	    !init_fifo_exact(&ca->free[RESERVE_PRIO], prio_buckets(ca), GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_MOVINGGC], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free[RESERVE_NONE], free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	return 0;
}

static int register_cache(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = NULL;
	int ret = 0;

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	ret = cache_alloc(ca);
	if (ret != 0) {
		if (ret == -ENOMEM)
			err = "cache_alloc(): -ENOMEM";
		else
			err = "cache_alloc(): unknown error";
		goto err;
	}

	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache")) {
		err = "error calling kobject_add";
		ret = -ENOMEM;
		goto out;
	}

	mutex_lock(&bch_register_lock);
	err = register_cache_set(ca);
	mutex_unlock(&bch_register_lock);

	if (err) {
		ret = -ENODEV;
		goto out;
	}

	pr_info("registered cache device %s", bdevname(bdev, name));

out:
	kobject_put(&ca->kobj);

err:
	if (err)
		pr_notice("error opening %s: %s", bdevname(bdev, name), err);

	return ret;
}

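/* Global interfaces/init */
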
static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register,		register_bcache);
kobj_attribute_write(register_quiet,	register_bcache);

static bool bch_is_open_backing(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev)
{
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			mutex_lock(&bch_register_lock);
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
			mutex_unlock(&bch_register_lock);
			if (attr == &ksysfs_register_quiet)
				goto out;
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
		if (!dc)
			goto err_close;

		mutex_lock(&bch_register_lock);
		register_bdev(sb, sb_page, bdev, dc);
		mutex_unlock(&bch_register_lock);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		if (!ca)
			goto err_close;

		if (register_cache(sb, sb_page, bdev, ca) != 0)
			goto err_close;
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

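		/* Poll until all devices have shut down or we time out */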
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				  list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX,
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_request_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	if (bcache_major)
		unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0) {
		unregister_reboot_notifier(&reboot);
		return bcache_major;
	}

	if (!(bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0)) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_request_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);