#ifndef _BCACHE_H
#define _BCACHE_H

#define pr_fmt(fmt) "bcache: %s() " fmt, __func__

#include <linux/bcache.h>
#include <linux/bio.h>
#include <linux/kobject.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/rwsem.h>
#include <linux/refcount.h>
#include <linux/types.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>

#include "bset.h"
#include "util.h"
#include "closure.h"

struct bucket {
	atomic_t	pin;
	uint16_t	prio;
	uint8_t		gen;
	uint8_t		last_gc; /* Most out of date gen in the btree */
	uint16_t	gc_mark; /* Bitfield used by GC. See below for field */
};

/*
 * I'd use bitfields for these, but I don't trust the compiler not to screw me
 * as badly as doing the bit manipulation ourselves.
 */
BITMASK(GC_MARK, struct bucket, gc_mark, 0, 2);
#define GC_MARK_RECLAIMABLE	1
#define GC_MARK_DIRTY		2
#define GC_MARK_METADATA	3
#define GC_SECTORS_USED_SIZE	13
#define MAX_GC_SECTORS_USED	(~(~0ULL << GC_SECTORS_USED_SIZE))
BITMASK(GC_SECTORS_USED, struct bucket, gc_mark, 2, GC_SECTORS_USED_SIZE);
BITMASK(GC_MOVE, struct bucket, gc_mark, 15, 1);
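
/*
 * Together these cover all 16 bits of gc_mark: bits 0-1 hold the GC_MARK
 * state, bits 2-14 the sector count (hence MAX_GC_SECTORS_USED), and
 * bit 15 the GC_MOVE flag.
 */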

#include "journal.h"
#include "stats.h"
struct search;
struct btree;
struct keybuf;

struct keybuf_key {
	struct rb_node		node;
	BKEY_PADDED(key);
	void			*private;
};
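
/*
 * A keybuf buffers up keys found by scanning a range of the btree
 * [start, end): refills resume from last_scanned, pending keys sit in an
 * rb tree (so overlapping keys can be detected without a full rescan),
 * and entries are carved out of a fixed freelist of KEYBUF_NR
 * keybuf_keys. Used for writeback_keys and moving_gc_keys below.
 */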
struct keybuf {
	struct bkey		last_scanned;
	spinlock_t		lock;

	/*
	 * Beginning and end of range in rb tree - so that we can skip taking
	 * lock and checking the rb tree when we need to check for overlapping
	 * keys.
	 */
	struct bkey		start;
	struct bkey		end;

	struct rb_root		keys;

#define KEYBUF_NR		500
	DECLARE_ARRAY_ALLOCATOR(struct keybuf_key, freelist, KEYBUF_NR);
};

struct bcache_device {
	struct closure		cl;

	struct kobject		kobj;

	struct cache_set	*c;
	unsigned int		id;
#define BCACHEDEVNAME_SIZE	12
	char			name[BCACHEDEVNAME_SIZE];

	struct gendisk		*disk;

	unsigned long		flags;
#define BCACHE_DEV_CLOSING		0
#define BCACHE_DEV_DETACHING		1
#define BCACHE_DEV_UNLINK_DONE		2
#define BCACHE_DEV_WB_RUNNING		3
#define BCACHE_DEV_RATE_DW_RUNNING	4
	int			nr_stripes;
	unsigned int		stripe_size;
	atomic_t		*stripe_sectors_dirty;
	unsigned long		*full_dirty_stripes;

	struct bio_set		bio_split;

	unsigned int		data_csum:1;

	int (*cache_miss)(struct btree *b, struct search *s,
			  struct bio *bio, unsigned int sectors);
	int (*ioctl)(struct bcache_device *d, fmode_t mode,
		     unsigned int cmd, unsigned long arg);
};
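
/*
 * Each struct io remembers one recent I/O stream: 'last' is the sector
 * just past the previous request, 'sequential' is how many sectors the
 * stream has done back to back, and 'jiffies' lets stale entries age out
 * of the io_lru list. cached_dev keeps RECENT_IO of these, hashed by
 * sector, to decide when sequential_cutoff should bypass the cache.
 */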
struct io {
	struct hlist_node	hash;
	struct list_head	lru;

	unsigned long		jiffies;
	unsigned int		sequential;
	sector_t		last;
};
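
/*
 * Policy for stop_when_cache_set_failed: with BCH_CACHED_DEV_STOP_AUTO
 * the bcache device is only stopped if the broken cache set still holds
 * dirty data for it (running on would expose stale data); with
 * BCH_CACHED_DEV_STOP_ALWAYS it is stopped unconditionally.
 */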
enum stop_on_failure {
	BCH_CACHED_DEV_STOP_AUTO = 0,
	BCH_CACHED_DEV_STOP_ALWAYS,
	BCH_CACHED_DEV_STOP_MODE_MAX,
};

struct cached_dev {
	struct list_head	list;
	struct bcache_device	disk;
	struct block_device	*bdev;

	struct cache_sb		sb;
	struct cache_sb_disk	*sb_disk;
	struct bio		sb_bio;
	struct bio_vec		sb_bv[1];
	struct closure		sb_write;
	struct semaphore	sb_write_mutex;

	/* Refcount on the cache set. Always nonzero when we're caching. */
	refcount_t		count;
	struct work_struct	detach;

	/*
	 * Device might not be running if it's dirty and the cache set hasn't
	 * showed up yet.
	 */
	atomic_t		running;

	/*
	 * Writes take a shared lock from start to finish; scanning for dirty
	 * data to refill the rb tree requires an exclusive lock.
	 */
	struct rw_semaphore	writeback_lock;

	/*
	 * Nonzero, and writeback has a refcount (d->count), iff there is dirty
	 * data in the cache. Protected by writeback_lock; must either hold
	 * writeback_lock or be the only thread that does writeback.
	 */
	atomic_t		has_dirty;

#define BCH_CACHE_READA_ALL		0
#define BCH_CACHE_READA_META_ONLY	1
	unsigned int		cache_readahead_policy;
	struct bch_ratelimit	writeback_rate;
	struct delayed_work	writeback_rate_update;

	/* Limit number of writeback bios in flight */
	struct semaphore	in_flight;
	struct task_struct	*writeback_thread;
	struct workqueue_struct	*writeback_write_wq;

	struct keybuf		writeback_keys;

	struct task_struct	*status_update_thread;

	/*
	 * Order the write-half of writeback operations strongly in dispatch
	 * order. (Maintain LBA order when writing to the backing device.)
	 */
	struct closure_waitlist writeback_ordering_wait;
	atomic_t		writeback_sequence_next;

	/* For tracking sequential IO */
#define RECENT_IO_BITS	7
#define RECENT_IO	(1 << RECENT_IO_BITS)
	struct io		io[RECENT_IO];
	struct hlist_head	io_hash[RECENT_IO + 1];
	struct list_head	io_lru;
	spinlock_t		io_lock;

	struct cache_accounting	accounting;

	/* The rest of this all shows up in sysfs */
	unsigned int		sequential_cutoff;

	unsigned int		io_disable:1;
	unsigned int		verify:1;
	unsigned int		bypass_torture_test:1;

	unsigned int		partial_stripes_expensive:1;
	unsigned int		writeback_metadata:1;
	unsigned int		writeback_running:1;
	unsigned int		writeback_consider_fragment:1;
	unsigned char		writeback_percent;
	unsigned int		writeback_delay;

	uint64_t		writeback_rate_target;
	int64_t			writeback_rate_proportional;
	int64_t			writeback_rate_integral;
	int64_t			writeback_rate_integral_scaled;
	int32_t			writeback_rate_change;

	unsigned int		writeback_rate_update_seconds;
	unsigned int		writeback_rate_i_term_inverse;
	unsigned int		writeback_rate_p_term_inverse;
	unsigned int		writeback_rate_fp_term_low;
	unsigned int		writeback_rate_fp_term_mid;
	unsigned int		writeback_rate_fp_term_high;
	unsigned int		writeback_rate_minimum;
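
	/*
	 * The writeback_rate_* fields above implement a PI controller: the
	 * error between the amount of dirty data and writeback_rate_target is
	 * scaled down by the _p_term_inverse and _i_term_inverse gains, the
	 * _fp_term_* gains boost the rate as fragmentation grows (when
	 * writeback_consider_fragment is set), and the result never drops
	 * below writeback_rate_minimum.
	 */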

	enum stop_on_failure	stop_when_cache_set_failed;
#define DEFAULT_CACHED_DEV_ERROR_LIMIT	64
	atomic_t		io_errors;
	unsigned int		error_limit;
	unsigned int		offline_seconds;

	char			backing_dev_name[BDEVNAME_SIZE];
};
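
/*
 * Each reserve has its own freelist in struct cache (see
 * DECLARE_FIFO(long, free)[RESERVE_NR] below), so that e.g. btree node
 * allocations can always get a bucket even when normal data writes
 * (RESERVE_NONE) have drained the allocator.
 */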
enum alloc_reserve {
	RESERVE_BTREE,
	RESERVE_PRIO,
	RESERVE_MOVINGGC,
	RESERVE_NONE,
	RESERVE_NR,
};

struct cache {
	struct cache_set	*set;
	struct cache_sb		sb;
	struct cache_sb_disk	*sb_disk;
	struct bio		sb_bio;
	struct bio_vec		sb_bv[1];

	struct kobject		kobj;
	struct block_device	*bdev;

	struct task_struct	*alloc_thread;

	struct closure		prio;
	struct prio_set		*disk_buckets;
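
	/*
	 * When allocating new buckets, prio_write() gets first dibs - since we
	 * may not be able to allocate at all without writing priorities and
	 * gens: prio_buckets[] contains the buckets allocated for the next
	 * prio write, prio_last_buckets[] the last ones written (so gc can
	 * mark them as metadata).
	 */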
	uint64_t		*prio_buckets;
	uint64_t		*prio_last_buckets;

	/*
	 * free: buckets that are ready to be used
	 *
	 * free_inc: buckets that currently have cached data in them; they
	 * can't be reused until their new gen is written to disk. After
	 * prio_write() finishes writing the new gens/prios they're moved to
	 * the free lists (and possibly discarded in the process).
	 */
	DECLARE_FIFO(long, free)[RESERVE_NR];
	DECLARE_FIFO(long, free_inc);

	size_t			fifo_last_bucket;

	/* Allocation stuff: */
	struct bucket		*buckets;

	DECLARE_HEAP(struct bucket *, heap);

	/*
	 * If nonzero, we know we aren't going to find any buckets to
	 * invalidate until a gc finishes - otherwise we could pointlessly burn
	 * a ton of cpu.
	 */
	unsigned int		invalidate_needs_gc;

	bool			discard;

	struct journal_device	journal;

	/* The rest of this all shows up in sysfs */
#define IO_ERROR_SHIFT		20
	atomic_t		io_errors;
	atomic_t		io_count;

	atomic_long_t		meta_sectors_written;
	atomic_long_t		btree_sectors_written;
	atomic_long_t		sectors_written;

	char			cache_dev_name[BDEVNAME_SIZE];
};

struct gc_stat {
	size_t			nodes;
	size_t			nodes_pre;
	size_t			key_bytes;

	size_t			nkeys;
	uint64_t		data;	/* sectors */
	unsigned int		in_use;	/* percent */
};

/*
 * Flag bits for how the cache set is shutting down and what phase it's at:
 *
 * CACHE_SET_UNREGISTERING means we're not just shutting down, we're detaching
 * all the backing devices first (their cached data gets invalidated, and they
 * won't automatically reattach).
 *
 * CACHE_SET_STOPPING always gets set first when we're closing down a cache
 * set; we'll continue to run normally for a while with it set (i.e. flushing
 * dirty data).
 *
 * CACHE_SET_RUNNING means all cache devices have been registered and journal
 * replay is complete.
 *
 * CACHE_SET_IO_DISABLE is set when the cache set has seen too many I/O
 * errors; all external and internal I/O is denied while it is set.
 */
#define CACHE_SET_UNREGISTERING		0
#define CACHE_SET_STOPPING		1
#define CACHE_SET_RUNNING		2
#define CACHE_SET_IO_DISABLE		3

struct cache_set {
	struct closure		cl;

	struct list_head	list;
	struct kobject		kobj;
	struct kobject		internal;
	struct dentry		*debug;
	struct cache_accounting accounting;

	unsigned long		flags;
	atomic_t		idle_counter;
	atomic_t		at_max_writeback_rate;

	struct cache		*cache;

	struct bcache_device	**devices;
	unsigned int		devices_max_used;
	atomic_t		attached_dev_nr;
	struct list_head	cached_devs;
	uint64_t		cached_dev_sectors;
	atomic_long_t		flash_dev_dirty_sectors;
	struct closure		caching;

	struct closure		sb_write;
	struct semaphore	sb_write_mutex;

	mempool_t		search;
	mempool_t		bio_meta;
	struct bio_set		bio_split;

	/* For the btree cache */
	struct shrinker		shrink;

	/* For the btree cache and anything allocation related */
	struct mutex		bucket_lock;

	/* log2(bucket_size), in sectors */
	unsigned short		bucket_bits;

	/* log2(block_size), in sectors */
	unsigned short		block_bits;

	/*
	 * Default number of pages for a new btree node - may be less than a
	 * full bucket
	 */
	unsigned int		btree_pages;

	/*
	 * Lists of struct btrees; lru is the list for structs that have memory
	 * allocated for actual btree node, freed is for structs that do not.
	 * We never free a struct btree except on shutdown - we just put it on
	 * the btree_cache_freed list and reuse it later; memory usage is
	 * dominated by the btree node buffers (which can be freed), and the
	 * number of struct btrees allocated is effectively bounded.
	 *
	 * btree_cache_freeable is a small cache of nodes whose buffers are
	 * still allocated - high order page allocations can be expensive, and
	 * deleting and allocating btree nodes in quick succession is common.
	 */
	struct list_head	btree_cache;
	struct list_head	btree_cache_freeable;
	struct list_head	btree_cache_freed;

	/* Number of elements in btree_cache + btree_cache_freeable lists */
	unsigned int		btree_cache_used;

	/*
	 * If we need to allocate memory for a new btree node and that
	 * allocation fails, we can cannibalize another node in the btree cache
	 * to satisfy the allocation - lock to guarantee only one thread does
	 * this at a time:
	 */
	wait_queue_head_t	btree_cache_wait;
	struct task_struct	*btree_cache_alloc_lock;
	spinlock_t		btree_cannibalize_lock;

	/*
	 * When we free a btree node we increment the gen of the bucket it was
	 * in - but we can't rewrite the prios and gens until we've finished
	 * whatever we were doing, otherwise after a crash the btree node would
	 * be freed but (say for a split) the pointers to the new nodes might
	 * not be inserted into the btree yet. This is a refcount that blocks
	 * prio_write() until the new keys are written.
	 */
	atomic_t		prio_blocked;
	wait_queue_head_t	bucket_wait;

	/*
	 * For any bio we don't skip we subtract the number of sectors from
	 * rescale; when it hits 0 we rescale all the bucket priorities.
	 */
	atomic_t		rescale;

	/* Used by GC to tell whether any front-side I/O is in flight */
	atomic_t		search_inflight;

	/*
	 * When we invalidate buckets, we use both the priority and the amount
	 * of good data to determine which buckets to reuse first - so to
	 * weight those together consistently we keep track of the smallest
	 * nonzero priority of any bucket.
	 */
	uint16_t		min_prio;

	/*
	 * max(gen - last_gc) for all buckets. When it gets too big we have to
	 * gc to keep gens from wrapping around.
	 */
	uint8_t			need_gc;
	struct gc_stat		gc_stats;
	size_t			nbuckets;
	size_t			avail_nbuckets;

	struct task_struct	*gc_thread;
	/* Where in the btree gc currently is */
	struct bkey		gc_done;

	/*
	 * gc_after_writeback: BCH_ENABLE_AUTO_GC means a gc should be
	 * triggered once writeback has drained all the dirty data;
	 * BCH_DO_AUTO_GC means such a gc is pending or running.
	 */
#define BCH_ENABLE_AUTO_GC	1
#define BCH_DO_AUTO_GC		2
	uint8_t			gc_after_writeback;

	/*
	 * The allocation code needs gc_mark in struct bucket to be correct,
	 * but it's not while a gc is in progress. Protected by bucket_lock.
	 */
	int			gc_mark_valid;

	/* Counts how many sectors bio_insert has added to the cache */
	atomic_t		sectors_to_gc;
	wait_queue_head_t	gc_wait;

	struct keybuf		moving_gc_keys;
	/* Number of moving GC bios in flight */
	struct semaphore	moving_in_flight;

	struct workqueue_struct	*moving_gc_wq;

	struct btree		*root;

#ifdef CONFIG_BCACHE_DEBUG
	struct btree		*verify_data;
	struct bset		*verify_ondisk;
	struct mutex		verify_lock;
#endif

	uint8_t			set_uuid[16];
	unsigned int		nr_uuids;
	struct uuid_entry	*uuids;
	BKEY_PADDED(uuid_bucket);
	struct closure		uuid_write;
	struct semaphore	uuid_write_mutex;

	/*
	 * A btree node on disk could have too many bsets for an iterator to
	 * fit on the stack - so the pool allocates iterators with room for
	 * (sb.bucket_size / sb.block_size) sets, which can be more than the
	 * static MAX_BSETS.
	 */
	mempool_t		fill_iter;

	struct bset_sort_state	sort;

	/* List of buckets we're currently writing data to */
	struct list_head	data_buckets;
	spinlock_t		data_bucket_lock;

	struct journal		journal;

#define CONGESTED_MAX		1024
	unsigned int		congested_last_us;
	atomic_t		congested;

	/* The rest of this all shows up in sysfs */
	unsigned int		congested_read_threshold_us;
	unsigned int		congested_write_threshold_us;

	struct time_stats	btree_gc_time;
	struct time_stats	btree_split_time;
	struct time_stats	btree_read_time;

	atomic_long_t		cache_read_races;
	atomic_long_t		writeback_keys_done;
	atomic_long_t		writeback_keys_failed;

	atomic_long_t		reclaim;
	atomic_long_t		reclaimed_journal_buckets;
	atomic_long_t		flush_write;

	enum {
		ON_ERROR_UNREGISTER,
		ON_ERROR_PANIC,
	}			on_error;
#define DEFAULT_IO_ERROR_LIMIT	8
	unsigned int		error_limit;
	unsigned int		error_decay;

	unsigned short		journal_delay_ms;
	bool			expensive_debug_checks;
	unsigned int		verify:1;
	unsigned int		key_merging_disabled:1;
	unsigned int		gc_always_rewrite:1;
	unsigned int		shrinker_disabled:1;
	unsigned int		copy_gc_enabled:1;
	unsigned int		idle_max_writeback_rate_enabled:1;

#define BUCKET_HASH_BITS	12
	struct hlist_head	bucket_hash[1 << BUCKET_HASH_BITS];
};

struct bbio {
	unsigned int		submit_time_us;
	union {
		struct bkey	key;
		uint64_t	_pad[3];
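		/*
		 * pad = 3 (two words of bkey header plus one pointer) is
		 * enough because a bbio only ever carries a single pointer -
		 * the one it is doing I/O to or from.
		 */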
	};
	struct bio		bio;
};

#define BTREE_PRIO		USHRT_MAX
#define INITIAL_PRIO		32768U

#define btree_bytes(c)		((c)->btree_pages * PAGE_SIZE)
#define btree_blocks(b)							\
	((unsigned int) (KEY_SIZE(&b->key) >> (b)->c->block_bits))

#define btree_default_blocks(c)						\
	((unsigned int) ((PAGE_SECTORS * (c)->btree_pages) >> (c)->block_bits))

#define bucket_bytes(ca)	((ca)->sb.bucket_size << 9)
#define block_bytes(ca)		((ca)->sb.block_size << 9)
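
/*
 * Superblock block/bucket sizes are stored in 512-byte sectors; << 9
 * converts them to bytes.
 */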

static inline unsigned int meta_bucket_pages(struct cache_sb *sb)
{
	unsigned int n, max_pages;

	max_pages = min_t(unsigned int,
			  __rounddown_pow_of_two(USHRT_MAX) / PAGE_SECTORS,
			  MAX_ORDER_NR_PAGES);

	n = sb->bucket_size / PAGE_SECTORS;
	if (n > max_pages)
		n = max_pages;

	return n;
}
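
/*
 * Metadata is laid out in units of at most one bucket, but the allocation
 * is clamped: to MAX_ORDER_NR_PAGES (the biggest contiguous allocation the
 * page allocator supports) and to the largest power-of-two number of
 * sectors that still fits the 16-bit on-disk size fields. For buckets
 * bigger than that, only the first meta_bucket_pages() pages hold metadata.
 */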

static inline unsigned int meta_bucket_bytes(struct cache_sb *sb)
{
	return meta_bucket_pages(sb) << PAGE_SHIFT;
}

#define prios_per_bucket(ca)						\
	((meta_bucket_bytes(&(ca)->sb) - sizeof(struct prio_set)) /	\
	 sizeof(struct bucket_disk))

#define prio_buckets(ca)						\
	DIV_ROUND_UP((size_t) (ca)->sb.nbuckets, prios_per_bucket(ca))
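
/*
 * Bucket gens/priorities are written out in struct prio_set buckets: each
 * holds a header plus as many struct bucket_disk entries as fit in
 * meta_bucket_bytes(), so prio_buckets(ca) of them cover all nbuckets.
 */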

static inline size_t sector_to_bucket(struct cache_set *c, sector_t s)
{
	return s >> c->bucket_bits;
}

static inline sector_t bucket_to_sector(struct cache_set *c, size_t b)
{
	return ((sector_t) b) << c->bucket_bits;
}

static inline sector_t bucket_remainder(struct cache_set *c, sector_t s)
{
	return s & (c->cache->sb.bucket_size - 1);
}
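
/*
 * Bucket sizes are powers of two, so the translations above are just
 * shifts and masks: e.g. with 1024-sector (512k) buckets bucket_bits is
 * 10, so sector_to_bucket(c, 5000) == 4 and
 * bucket_remainder(c, 5000) == 904.
 */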

static inline size_t PTR_BUCKET_NR(struct cache_set *c,
				   const struct bkey *k,
				   unsigned int ptr)
{
	return sector_to_bucket(c, PTR_OFFSET(k, ptr));
}

static inline struct bucket *PTR_BUCKET(struct cache_set *c,
					const struct bkey *k,
					unsigned int ptr)
{
	return c->cache->buckets + PTR_BUCKET_NR(c, k, ptr);
}

static inline uint8_t gen_after(uint8_t a, uint8_t b)
{
	uint8_t r = a - b;

	return r > 128U ? 0 : r;
}
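
/*
 * Gens are 8 bit and wrap around: gen_after(a, b) is how far a is ahead of
 * b, with anything more than half the ring treated as "not after". E.g.
 * gen_after(2, 250) == 8 (a has wrapped past b), but gen_after(250, 2) == 0
 * since 248 > 128.
 */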

static inline uint8_t ptr_stale(struct cache_set *c, const struct bkey *k,
				unsigned int i)
{
	return gen_after(PTR_BUCKET(c, k, i)->gen, PTR_GEN(k, i));
}

static inline bool ptr_available(struct cache_set *c, const struct bkey *k,
				 unsigned int i)
{
	return (PTR_DEV(k, i) < MAX_CACHES_PER_SET) && c->cache;
}
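
/*
 * A pointer records the bucket's gen at the time it was created; once the
 * bucket has been invalidated and its gen incremented, ptr_stale() goes
 * nonzero and readers know the cached data is gone without the btree ever
 * being updated.
 */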
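
/*
 * Used for various on-disk data structures - cache_sb, prio_set, bset,
 * jset: the checksum is always the first 8 bytes of these structs, and
 * csum_set() checksums everything after it up to the end of the last key.
 */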
#define csum_set(i)							\
	bch_crc64(((void *) (i)) + sizeof(uint64_t),			\
		  ((void *) bset_bkey_last(i)) -			\
		  (((void *) (i)) + sizeof(uint64_t)))

/* Error handling macros */

#define btree_bug(b, ...)						\
do {									\
	if (bch_cache_set_error((b)->c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

#define cache_bug(c, ...)						\
do {									\
	if (bch_cache_set_error(c, __VA_ARGS__))			\
		dump_stack();						\
} while (0)

#define btree_bug_on(cond, b, ...)					\
do {									\
	if (cond)							\
		btree_bug(b, __VA_ARGS__);				\
} while (0)

#define cache_bug_on(cond, c, ...)					\
do {									\
	if (cond)							\
		cache_bug(c, __VA_ARGS__);				\
} while (0)

#define cache_set_err_on(cond, c, ...)					\
do {									\
	if (cond)							\
		bch_cache_set_error(c, __VA_ARGS__);			\
} while (0)

/* Looping macros */

#define for_each_bucket(b, ca)						\
	for (b = (ca)->buckets + (ca)->sb.first_bucket;			\
	     b < (ca)->buckets + (ca)->sb.nbuckets; b++)

static inline void cached_dev_put(struct cached_dev *dc)
{
	if (refcount_dec_and_test(&dc->count))
		schedule_work(&dc->detach);
}

static inline bool cached_dev_get(struct cached_dev *dc)
{
	if (!refcount_inc_not_zero(&dc->count))
		return false;

	/* Paired with the mb in cached_dev_attach */
	smp_mb__after_atomic();
	return true;
}
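
/*
 * bucket_gc_gen() returns the difference between the bucket's current gen
 * and the oldest gen of any pointer into that bucket in the btree
 * (last_gc); BUCKET_GC_GEN_MAX below caps it, forcing a gc well before the
 * 8-bit gens can wrap past the 128 window that ptr_stale() relies on.
 */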
static inline uint8_t bucket_gc_gen(struct bucket *b)
{
	return b->gen - b->last_gc;
}

#define BUCKET_GC_GEN_MAX	96U

#define kobj_attribute_write(n, fn)					\
	static struct kobj_attribute ksysfs_##n = __ATTR(n, 0200, NULL, fn)

#define kobj_attribute_rw(n, show, store)				\
	static struct kobj_attribute ksysfs_##n =			\
		__ATTR(n, 0600, show, store)

static inline void wake_up_allocators(struct cache_set *c)
{
	struct cache *ca = c->cache;

	wake_up_process(ca->alloc_thread);
}
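
/*
 * Grabs a closure ref for each bio submitted; if the cache set has I/O
 * disabled (too many errors), the bio is failed immediately with
 * BLK_STS_IOERR instead of being submitted.
 */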
static inline void closure_bio_submit(struct cache_set *c,
				      struct bio *bio,
				      struct closure *cl)
{
	closure_get(cl);
	if (unlikely(test_bit(CACHE_SET_IO_DISABLE, &c->flags))) {
		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return;
	}
	submit_bio_noacct(bio);
}
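
/*
 * Prevent a kthread from exiting on its own: kthreads that may finish
 * early (e.g. when CACHE_SET_IO_DISABLE gets set) park here until
 * kthread_stop() is called, so the stopping side never calls
 * kthread_stop() on a task that has already exited.
 */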
static inline void wait_for_kthread_stop(void)
{
	while (!kthread_should_stop()) {
		set_current_state(TASK_INTERRUPTIBLE);
		schedule();
	}
}

/* Forward declarations */

void bch_count_backing_io_errors(struct cached_dev *dc, struct bio *bio);
void bch_count_io_errors(struct cache *ca, blk_status_t error,
			 int is_read, const char *m);
void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
			      blk_status_t error, const char *m);
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
		    blk_status_t error, const char *m);
void bch_bbio_free(struct bio *bio, struct cache_set *c);
struct bio *bch_bbio_alloc(struct cache_set *c);

void __bch_submit_bbio(struct bio *bio, struct cache_set *c);
void bch_submit_bbio(struct bio *bio, struct cache_set *c,
		     struct bkey *k, unsigned int ptr);

uint8_t bch_inc_gen(struct cache *ca, struct bucket *b);
void bch_rescale_priorities(struct cache_set *c, int sectors);

bool bch_can_invalidate_bucket(struct cache *ca, struct bucket *b);
void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *b);

void __bch_bucket_free(struct cache *ca, struct bucket *b);
void bch_bucket_free(struct cache_set *c, struct bkey *k);

long bch_bucket_alloc(struct cache *ca, unsigned int reserve, bool wait);
int __bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			   struct bkey *k, bool wait);
int bch_bucket_alloc_set(struct cache_set *c, unsigned int reserve,
			 struct bkey *k, bool wait);
bool bch_alloc_sectors(struct cache_set *c, struct bkey *k,
		       unsigned int sectors, unsigned int write_point,
		       unsigned int write_prio, bool wait);
bool bch_cached_dev_error(struct cached_dev *dc);

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...);

int bch_prio_write(struct cache *ca, bool wait);
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);

extern struct workqueue_struct *bcache_wq;
extern struct workqueue_struct *bch_journal_wq;
extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;

extern struct kobj_type bch_cached_dev_ktype;
extern struct kobj_type bch_flash_dev_ktype;
extern struct kobj_type bch_cache_set_ktype;
extern struct kobj_type bch_cache_set_internal_ktype;
extern struct kobj_type bch_cache_ktype;

void bch_cached_dev_release(struct kobject *kobj);
void bch_flash_dev_release(struct kobject *kobj);
void bch_cache_set_release(struct kobject *kobj);
void bch_cache_release(struct kobject *kobj);

int bch_uuid_write(struct cache_set *c);
void bcache_write_super(struct cache_set *c);

int bch_flash_dev_create(struct cache_set *c, uint64_t size);

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c,
			  uint8_t *set_uuid);
void bch_cached_dev_detach(struct cached_dev *dc);
int bch_cached_dev_run(struct cached_dev *dc);
void bcache_device_stop(struct bcache_device *d);

void bch_cache_set_unregister(struct cache_set *c);
void bch_cache_set_stop(struct cache_set *c);

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb);
void bch_btree_cache_free(struct cache_set *c);
int bch_btree_cache_alloc(struct cache_set *c);
void bch_moving_init_cache_set(struct cache_set *c);
int bch_open_buckets_alloc(struct cache_set *c);
void bch_open_buckets_free(struct cache_set *c);

int bch_cache_allocator_start(struct cache *ca);

void bch_debug_exit(void);
void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
void bch_btree_exit(void);
int bch_btree_init(void);

#endif /* _BCACHE_H */