/*
 * Copyright (C) 2009-2011 Red Hat, Inc.
 *
 * Author: Mikulas Patocka <mpatocka@redhat.com>
 *
 * This file is released under the GPL.
 */

#include <linux/dm-bufio.h>

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/jiffies.h>
#include <linux/vmalloc.h>
#include <linux/shrinker.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/stacktrace.h>

#define DM_MSG_PREFIX "bufio"
23
/*
 * Memory management policy:
 *	By default, limit the cache to DM_BUFIO_MEMORY_PERCENT of main memory
 *	or DM_BUFIO_VMALLOC_PERCENT of vmalloc memory (whichever is lower).
 *	Each client defaults to a minimum of DM_BUFIO_MIN_BUFFERS buffers.
 *	Start background writeback when the number of dirty buffers exceeds
 *	DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers.
 */
32#define DM_BUFIO_MIN_BUFFERS 8
33
34#define DM_BUFIO_MEMORY_PERCENT 2
35#define DM_BUFIO_VMALLOC_PERCENT 25
36#define DM_BUFIO_WRITEBACK_RATIO 3
37#define DM_BUFIO_LOW_WATERMARK_RATIO 16
38
/*
 * Check buffer ages in this interval (seconds).
 */
42#define DM_BUFIO_WORK_TIMER_SECS 30
43
/*
 * Free buffers when they are older than this (seconds).
 */
47#define DM_BUFIO_DEFAULT_AGE_SECS 300
48
/*
 * The number of bytes of cached data to try to keep around even under
 * memory pressure.
 */
52#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
53
/*
 * Align buffer writes to this boundary.
 * Tests show that SSDs have the highest IOPS when using 4k writes.
 */
58#define DM_BUFIO_WRITE_ALIGN 4096
59
/*
 * dm_buffer->list_mode
 */
63#define LIST_CLEAN 0
64#define LIST_DIRTY 1
65#define LIST_SIZE 2
66
/*
 * Linking of buffers:
 *	All buffers are linked to buffer_tree with their node field.
 *
 *	Clean buffers that are not being written (B_WRITING not set)
 *	are linked to lru[LIST_CLEAN] with their lru_list field.
 *
 *	Dirty and clean buffers that are being written are linked to
 *	lru[LIST_DIRTY] with their lru_list field.  When the write
 *	finishes, the buffer cannot be relinked immediately (because we
 *	are in an interrupt context and relinking requires process
 *	context), so some clean-not-writing buffers can be held on the
 *	dirty list too.  They are moved back to the clean list later,
 *	from process context.
 */
82struct dm_bufio_client {
83 struct mutex lock;
84
85 struct list_head lru[LIST_SIZE];
86 unsigned long n_buffers[LIST_SIZE];
87
88 struct block_device *bdev;
89 unsigned block_size;
90 s8 sectors_per_block_bits;
91 void (*alloc_callback)(struct dm_buffer *);
92 void (*write_callback)(struct dm_buffer *);
93
94 struct kmem_cache *slab_buffer;
95 struct kmem_cache *slab_cache;
96 struct dm_io_client *dm_io;
97
98 struct list_head reserved_buffers;
99 unsigned need_reserved_buffers;
100
101 unsigned minimum_buffers;
102
103 struct rb_root buffer_tree;
104 wait_queue_head_t free_buffer_wait;
105
106 sector_t start;
107
108 int async_write_error;
109
110 struct list_head client_list;
111
112 struct shrinker shrinker;
113 struct work_struct shrink_work;
114 atomic_long_t need_shrink;
115};
116
/*
 * Buffer state bits (dm_buffer->state).
 */
120#define B_READING 0
121#define B_WRITING 1
122#define B_DIRTY 2
123
124
/*
 * Describes how the block was allocated:
 * kmem_cache_alloc(), __get_free_pages() or vmalloc().
 * See the comment at alloc_buffer_data() below.
 */
129enum data_mode {
130 DATA_MODE_SLAB = 0,
131 DATA_MODE_GET_FREE_PAGES = 1,
132 DATA_MODE_VMALLOC = 2,
133 DATA_MODE_LIMIT = 3
134};
135
136struct dm_buffer {
137 struct rb_node node;
138 struct list_head lru_list;
139 struct list_head global_list;
140 sector_t block;
141 void *data;
142 unsigned char data_mode;
143 unsigned char list_mode;
144 blk_status_t read_error;
145 blk_status_t write_error;
146 unsigned accessed;
147 unsigned hold_count;
148 unsigned long state;
149 unsigned long last_accessed;
150 unsigned dirty_start;
151 unsigned dirty_end;
152 unsigned write_start;
153 unsigned write_end;
154 struct dm_bufio_client *c;
155 struct list_head write_list;
156 void (*end_io)(struct dm_buffer *, blk_status_t);
157#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
158#define MAX_STACK 10
159 unsigned int stack_len;
160 unsigned long stack_entries[MAX_STACK];
161#endif
162};
163
164
165
166#define dm_bufio_in_request() (!!current->bio_list)
167
168static void dm_bufio_lock(struct dm_bufio_client *c)
169{
170 mutex_lock_nested(&c->lock, dm_bufio_in_request());
171}
172
173static int dm_bufio_trylock(struct dm_bufio_client *c)
174{
175 return mutex_trylock(&c->lock);
176}
177
178static void dm_bufio_unlock(struct dm_bufio_client *c)
179{
180 mutex_unlock(&c->lock);
181}
182
/*
 * Default cache size, computed in dm_bufio_init() from available memory.
 */
188static unsigned long dm_bufio_default_cache_size;
189
/*
 * Total cache size set by the user (via the max_cache_size_bytes module
 * parameter).
 */
193static unsigned long dm_bufio_cache_size;
194
/*
 * A copy of dm_bufio_cache_size because dm_bufio_cache_size can change
 * at any time.  If it disagrees, the user has changed the cache size.
 */
199static unsigned long dm_bufio_cache_size_latch;
200
201static DEFINE_SPINLOCK(global_spinlock);
202
203static LIST_HEAD(global_queue);
204
205static unsigned long global_num = 0;
206
/*
 * Buffers are freed after this timeout.
 */
210static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
211static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
212
213static unsigned long dm_bufio_peak_allocated;
214static unsigned long dm_bufio_allocated_kmem_cache;
215static unsigned long dm_bufio_allocated_get_free_pages;
216static unsigned long dm_bufio_allocated_vmalloc;
217static unsigned long dm_bufio_current_allocated;
218
/*
 * The current number of clients.
 */
224static int dm_bufio_client_count;
225
/*
 * The list of all clients.
 */
229static LIST_HEAD(dm_bufio_all_clients);
230
/*
 * This mutex protects the dm_bufio_all_clients list and the cache size
 * variables.
 */
234static DEFINE_MUTEX(dm_bufio_clients_lock);
235
236static struct workqueue_struct *dm_bufio_wq;
237static struct delayed_work dm_bufio_cleanup_old_work;
238static struct work_struct dm_bufio_replacement_work;
239
240
241#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
242static void buffer_record_stack(struct dm_buffer *b)
243{
244 b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
245}
246#endif
247
/*
 * A red/black tree acts as an index for all the buffers.
 */
251static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
252{
253 struct rb_node *n = c->buffer_tree.rb_node;
254 struct dm_buffer *b;
255
256 while (n) {
257 b = container_of(n, struct dm_buffer, node);
258
259 if (b->block == block)
260 return b;
261
262 n = block < b->block ? n->rb_left : n->rb_right;
263 }
264
265 return NULL;
266}
267
268static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
269{
270 struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
271 struct dm_buffer *found;
272
273 while (*new) {
274 found = container_of(*new, struct dm_buffer, node);
275
276 if (found->block == b->block) {
277 BUG_ON(found != b);
278 return;
279 }
280
281 parent = *new;
282 new = b->block < found->block ?
283 &found->node.rb_left : &found->node.rb_right;
284 }
285
286 rb_link_node(&b->node, parent, new);
287 rb_insert_color(&b->node, &c->buffer_tree);
288}
289
290static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
291{
292 rb_erase(&b->node, &c->buffer_tree);
293}
294
295
296
297static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
298{
299 unsigned char data_mode;
300 long diff;
301
302 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
303 &dm_bufio_allocated_kmem_cache,
304 &dm_bufio_allocated_get_free_pages,
305 &dm_bufio_allocated_vmalloc,
306 };
307
308 data_mode = b->data_mode;
309 diff = (long)b->c->block_size;
310 if (unlink)
311 diff = -diff;
312
313 spin_lock(&global_spinlock);
314
315 *class_ptr[data_mode] += diff;
316
317 dm_bufio_current_allocated += diff;
318
319 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
320 dm_bufio_peak_allocated = dm_bufio_current_allocated;
321
322 b->accessed = 1;
323
324 if (!unlink) {
325 list_add(&b->global_list, &global_queue);
326 global_num++;
327 if (dm_bufio_current_allocated > dm_bufio_cache_size)
328 queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
329 } else {
330 list_del(&b->global_list);
331 global_num--;
332 }
333
334 spin_unlock(&global_spinlock);
335}
336
/*
 * Latch the user-visible cache size; fall back to the computed default if
 * the user has not set one.  Called with dm_bufio_clients_lock held.
 */
340static void __cache_size_refresh(void)
341{
342 BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
343 BUG_ON(dm_bufio_client_count < 0);
344
345 dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
346
347
348
349
350 if (!dm_bufio_cache_size_latch) {
351 (void)cmpxchg(&dm_bufio_cache_size, 0,
352 dm_bufio_default_cache_size);
353 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
354 }
355}
356
/*
 * Allocating buffer data.
 *
 * Small buffers are allocated with kmem_cache, to use space optimally.
 *
 * For large buffers, we choose between get_free_pages and vmalloc.
 * Each has advantages and disadvantages.
 *
 * __get_free_pages can randomly fail if the memory is fragmented.
 * __vmalloc won't randomly fail, but vmalloc space is limited (it may be
 * as low as 128M) so using it for caching is not appropriate.
 *
 * If the allocation may fail, we use __get_free_pages.  Memory fragmentation
 * won't have a fatal effect here, it just causes flushes of some other
 * buffers and more I/O will be performed.
 *
 * If the allocation shouldn't fail, we use __vmalloc.  This is only for the
 * initial reserve allocation, so there's no risk of wasting all vmalloc
 * space.
 */
378static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
379 unsigned char *data_mode)
380{
381 if (unlikely(c->slab_cache != NULL)) {
382 *data_mode = DATA_MODE_SLAB;
383 return kmem_cache_alloc(c->slab_cache, gfp_mask);
384 }
385
386 if (c->block_size <= KMALLOC_MAX_SIZE &&
387 gfp_mask & __GFP_NORETRY) {
388 *data_mode = DATA_MODE_GET_FREE_PAGES;
389 return (void *)__get_free_pages(gfp_mask,
390 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
391 }
392
393 *data_mode = DATA_MODE_VMALLOC;

	/*
	 * __vmalloc allocates the data pages and auxiliary structures with
	 * the gfp_flags that were specified, but pagetables are always
	 * allocated with GFP_KERNEL, no matter what was passed in gfp_mask.
	 *
	 * Consequently, we must set the per-process flag PF_MEMALLOC_NOIO so
	 * that all allocations done by this process (including pagetables)
	 * are done as if GFP_NOIO was specified.
	 */
404 if (gfp_mask & __GFP_NORETRY) {
405 unsigned noio_flag = memalloc_noio_save();
406 void *ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
407
408 memalloc_noio_restore(noio_flag);
409 return ptr;
410 }
411
412 return __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
413}
414
/*
 * Free buffer's data.
 */
418static void free_buffer_data(struct dm_bufio_client *c,
419 void *data, unsigned char data_mode)
420{
421 switch (data_mode) {
422 case DATA_MODE_SLAB:
423 kmem_cache_free(c->slab_cache, data);
424 break;
425
426 case DATA_MODE_GET_FREE_PAGES:
427 free_pages((unsigned long)data,
428 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
429 break;
430
431 case DATA_MODE_VMALLOC:
432 vfree(data);
433 break;
434
435 default:
436 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
437 data_mode);
438 BUG();
439 }
440}
441
/*
 * Allocate buffer and its data.
 */
445static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
446{
447 struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
448
449 if (!b)
450 return NULL;
451
452 b->c = c;
453
454 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
455 if (!b->data) {
456 kmem_cache_free(c->slab_buffer, b);
457 return NULL;
458 }
459
460#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
461 b->stack_len = 0;
462#endif
463 return b;
464}
465
/*
 * Free buffer and its data.
 */
469static void free_buffer(struct dm_buffer *b)
470{
471 struct dm_bufio_client *c = b->c;
472
473 free_buffer_data(c, b->data, b->data_mode);
474 kmem_cache_free(c->slab_buffer, b);
475}
476
/*
 * Link buffer to the buffer tree and the clean or dirty queue.
 */
480static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
481{
482 struct dm_bufio_client *c = b->c;
483
484 c->n_buffers[dirty]++;
485 b->block = block;
486 b->list_mode = dirty;
487 list_add(&b->lru_list, &c->lru[dirty]);
488 __insert(b->c, b);
489 b->last_accessed = jiffies;
490
491 adjust_total_allocated(b, false);
492}
493
/*
 * Unlink buffer from the buffer tree and the dirty or clean queue.
 */
497static void __unlink_buffer(struct dm_buffer *b)
498{
499 struct dm_bufio_client *c = b->c;
500
501 BUG_ON(!c->n_buffers[b->list_mode]);
502
503 c->n_buffers[b->list_mode]--;
504 __remove(b->c, b);
505 list_del(&b->lru_list);
506
507 adjust_total_allocated(b, true);
508}
509
/*
 * Place the buffer at the head of the dirty or clean LRU queue.
 */
513static void __relink_lru(struct dm_buffer *b, int dirty)
514{
515 struct dm_bufio_client *c = b->c;
516
517 b->accessed = 1;
518
519 BUG_ON(!c->n_buffers[b->list_mode]);
520
521 c->n_buffers[b->list_mode]--;
522 c->n_buffers[dirty]++;
523 b->list_mode = dirty;
524 list_move(&b->lru_list, &c->lru[dirty]);
525 b->last_accessed = jiffies;
526}
527
/*
 * Submit I/O on the buffer.
 *
 * The bio interface is faster, but it has limitations: the vector list is
 * bounded and the memory must be direct-mapped, not vmalloced.  Buffers
 * allocated with vmalloc (and bios that could not be allocated or filled)
 * are therefore submitted through dm-io instead.
 *
 * In both cases the buffer's end_io callback is invoked on completion;
 * it clears B_READING or B_WRITING and wakes up waiters.
 */
550static void dmio_complete(unsigned long error, void *context)
551{
552 struct dm_buffer *b = context;
553
554 b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
555}
556
557static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
558 unsigned n_sectors, unsigned offset)
559{
560 int r;
561 struct dm_io_request io_req = {
562 .bi_op = rw,
563 .bi_op_flags = 0,
564 .notify.fn = dmio_complete,
565 .notify.context = b,
566 .client = b->c->dm_io,
567 };
568 struct dm_io_region region = {
569 .bdev = b->c->bdev,
570 .sector = sector,
571 .count = n_sectors,
572 };
573
574 if (b->data_mode != DATA_MODE_VMALLOC) {
575 io_req.mem.type = DM_IO_KMEM;
576 io_req.mem.ptr.addr = (char *)b->data + offset;
577 } else {
578 io_req.mem.type = DM_IO_VMA;
579 io_req.mem.ptr.vma = (char *)b->data + offset;
580 }
581
	r = dm_io(&io_req, 1, &region, NULL);
583 if (unlikely(r))
584 b->end_io(b, errno_to_blk_status(r));
585}
586
587static void bio_complete(struct bio *bio)
588{
589 struct dm_buffer *b = bio->bi_private;
590 blk_status_t status = bio->bi_status;
591 bio_put(bio);
592 b->end_io(b, status);
593}
594
595static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
596 unsigned n_sectors, unsigned offset)
597{
598 struct bio *bio;
599 char *ptr;
600 unsigned vec_size, len;
601
602 vec_size = b->c->block_size >> PAGE_SHIFT;
603 if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
604 vec_size += 2;
605
606 bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size);
607 if (!bio) {
608dmio:
609 use_dmio(b, rw, sector, n_sectors, offset);
610 return;
611 }
612
613 bio->bi_iter.bi_sector = sector;
614 bio_set_dev(bio, b->c->bdev);
615 bio_set_op_attrs(bio, rw, 0);
616 bio->bi_end_io = bio_complete;
617 bio->bi_private = b;
618
619 ptr = (char *)b->data + offset;
620 len = n_sectors << SECTOR_SHIFT;
621
622 do {
623 unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
624 if (!bio_add_page(bio, virt_to_page(ptr), this_step,
625 offset_in_page(ptr))) {
626 bio_put(bio);
627 goto dmio;
628 }
629
630 len -= this_step;
631 ptr += this_step;
632 } while (len > 0);
633
634 submit_bio(bio);
635}
636
637static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
638{
639 unsigned n_sectors;
640 sector_t sector;
641 unsigned offset, end;
642
643 b->end_io = end_io;
644
645 if (likely(b->c->sectors_per_block_bits >= 0))
646 sector = b->block << b->c->sectors_per_block_bits;
647 else
648 sector = b->block * (b->c->block_size >> SECTOR_SHIFT);
649 sector += b->c->start;
650
651 if (rw != REQ_OP_WRITE) {
652 n_sectors = b->c->block_size >> SECTOR_SHIFT;
653 offset = 0;
654 } else {
655 if (b->c->write_callback)
656 b->c->write_callback(b);
657 offset = b->write_start;
658 end = b->write_end;
659 offset &= -DM_BUFIO_WRITE_ALIGN;
660 end += DM_BUFIO_WRITE_ALIGN - 1;
661 end &= -DM_BUFIO_WRITE_ALIGN;
662 if (unlikely(end > b->c->block_size))
663 end = b->c->block_size;
664
665 sector += offset >> SECTOR_SHIFT;
666 n_sectors = (end - offset) >> SECTOR_SHIFT;
667 }
668
669 if (b->data_mode != DATA_MODE_VMALLOC)
670 use_bio(b, rw, sector, n_sectors, offset);
671 else
672 use_dmio(b, rw, sector, n_sectors, offset);
673}
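
/*
 * Worked example of the write-alignment rounding in submit_io() above
 * (illustrative only; assumes DM_BUFIO_WRITE_ALIGN == 4096): for a dirty
 * byte range [5000, 6000) within the block,
 *
 *	offset = 5000 & -4096          = 4096
 *	end    = (6000 + 4095) & -4096 = 8192
 *
 * so n_sectors = (8192 - 4096) >> SECTOR_SHIFT = 8 and the write covers the
 * smallest 4k-aligned region that contains the dirty bytes.
 */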
674
/*
 * Writing dirty buffers.
 *
 * The endio routine for write: record the error, clear the B_WRITING bit
 * and wake anyone who was waiting on it.
 */
685static void write_endio(struct dm_buffer *b, blk_status_t status)
686{
687 b->write_error = status;
688 if (unlikely(status)) {
689 struct dm_bufio_client *c = b->c;
690
691 (void)cmpxchg(&c->async_write_error, 0,
692 blk_status_to_errno(status));
693 }
694
695 BUG_ON(!test_bit(B_WRITING, &b->state));
696
697 smp_mb__before_atomic();
698 clear_bit(B_WRITING, &b->state);
699 smp_mb__after_atomic();
700
701 wake_up_bit(&b->state, B_WRITING);
702}
703
/*
 * Initiate a write on a dirty buffer, but don't wait for it.
 *
 * - If the buffer is not dirty, exit.
 * - If there is some previous write going on, wait for it to finish (we
 *   can't have two writes on the same buffer simultaneously).
 * - Submit our write and don't wait on it.  We set B_WRITING to indicate
 *   that there is a write in progress.
 */
713static void __write_dirty_buffer(struct dm_buffer *b,
714 struct list_head *write_list)
715{
716 if (!test_bit(B_DIRTY, &b->state))
717 return;
718
719 clear_bit(B_DIRTY, &b->state);
720 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
721
722 b->write_start = b->dirty_start;
723 b->write_end = b->dirty_end;
724
725 if (!write_list)
726 submit_io(b, REQ_OP_WRITE, write_endio);
727 else
728 list_add_tail(&b->write_list, write_list);
729}
730
731static void __flush_write_list(struct list_head *write_list)
732{
733 struct blk_plug plug;
734 blk_start_plug(&plug);
735 while (!list_empty(write_list)) {
736 struct dm_buffer *b =
737 list_entry(write_list->next, struct dm_buffer, write_list);
738 list_del(&b->write_list);
739 submit_io(b, REQ_OP_WRITE, write_endio);
740 cond_resched();
741 }
742 blk_finish_plug(&plug);
743}
744
/*
 * Wait until any activity on the buffer finishes.  Possibly write the
 * buffer if it is dirty.  When this function finishes, there is no I/O
 * running on the buffer and the buffer is not dirty.
 */
750static void __make_buffer_clean(struct dm_buffer *b)
751{
752 BUG_ON(b->hold_count);
753
754 if (!b->state)
755 return;
756
757 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
758 __write_dirty_buffer(b, NULL);
759 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
760}
761
/*
 * Find some buffer that is not held by anybody, clean it, unlink it and
 * return it.
 */
766static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
767{
768 struct dm_buffer *b;
769
770 list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
771 BUG_ON(test_bit(B_WRITING, &b->state));
772 BUG_ON(test_bit(B_DIRTY, &b->state));
773
774 if (!b->hold_count) {
775 __make_buffer_clean(b);
776 __unlink_buffer(b);
777 return b;
778 }
779 cond_resched();
780 }
781
782 list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
783 BUG_ON(test_bit(B_READING, &b->state));
784
785 if (!b->hold_count) {
786 __make_buffer_clean(b);
787 __unlink_buffer(b);
788 return b;
789 }
790 cond_resched();
791 }
792
793 return NULL;
794}
795
/*
 * Wait until some other thread frees a buffer or releases its hold count
 * on one.
 *
 * This function is entered with c->lock held, drops it and reacquires it
 * before exiting.
 */
803static void __wait_for_free_buffer(struct dm_bufio_client *c)
804{
805 DECLARE_WAITQUEUE(wait, current);
806
807 add_wait_queue(&c->free_buffer_wait, &wait);
808 set_current_state(TASK_UNINTERRUPTIBLE);
809 dm_bufio_unlock(c);
810
811 io_schedule();
812
813 remove_wait_queue(&c->free_buffer_wait, &wait);
814
815 dm_bufio_lock(c);
816}
817
818enum new_flag {
819 NF_FRESH = 0,
820 NF_READ = 1,
821 NF_GET = 2,
822 NF_PREFETCH = 3
823};
824
/*
 * Allocate a new buffer.  If the allocation is not possible, wait until
 * some other thread frees a buffer.
 *
 * May drop the lock and regain it.
 */
831static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
832{
833 struct dm_buffer *b;
834 bool tried_noio_alloc = false;
835
	/*
	 * dm-bufio is resistant to allocation failures (it just keeps
	 * one buffer reserved in case all the allocations fail).
	 * So set flags to not try too hard:
	 *	GFP_NOWAIT: don't wait; if we need to sleep we'll release
	 *		    our mutex and wait ourselves.
	 *	__GFP_NORETRY: don't retry; rather return failure
	 *	__GFP_NOMEMALLOC: don't use emergency reserves
	 *	__GFP_NOWARN: don't print a warning in case of failure
	 *
	 * For debugging, if we set the cache size to 1, no new buffers will
	 * be allocated.
	 */
849 while (1) {
850 if (dm_bufio_cache_size_latch != 1) {
851 b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
852 if (b)
853 return b;
854 }
855
856 if (nf == NF_PREFETCH)
857 return NULL;
858
859 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
860 dm_bufio_unlock(c);
861 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
862 dm_bufio_lock(c);
863 if (b)
864 return b;
865 tried_noio_alloc = true;
866 }
867
868 if (!list_empty(&c->reserved_buffers)) {
869 b = list_entry(c->reserved_buffers.next,
870 struct dm_buffer, lru_list);
871 list_del(&b->lru_list);
872 c->need_reserved_buffers++;
873
874 return b;
875 }
876
877 b = __get_unclaimed_buffer(c);
878 if (b)
879 return b;
880
881 __wait_for_free_buffer(c);
882 }
883}
884
885static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
886{
887 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
888
889 if (!b)
890 return NULL;
891
892 if (c->alloc_callback)
893 c->alloc_callback(b);
894
895 return b;
896}
897
/*
 * Free a buffer and wake other threads waiting for free buffers.
 */
901static void __free_buffer_wake(struct dm_buffer *b)
902{
903 struct dm_bufio_client *c = b->c;
904
905 if (!c->need_reserved_buffers)
906 free_buffer(b);
907 else {
908 list_add(&b->lru_list, &c->reserved_buffers);
909 c->need_reserved_buffers--;
910 }
911
912 wake_up(&c->free_buffer_wait);
913}
914
915static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
916 struct list_head *write_list)
917{
918 struct dm_buffer *b, *tmp;
919
920 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
921 BUG_ON(test_bit(B_READING, &b->state));
922
923 if (!test_bit(B_DIRTY, &b->state) &&
924 !test_bit(B_WRITING, &b->state)) {
925 __relink_lru(b, LIST_CLEAN);
926 continue;
927 }
928
929 if (no_wait && test_bit(B_WRITING, &b->state))
930 return;
931
932 __write_dirty_buffer(b, write_list);
933 cond_resched();
934 }
935}
936
/*
 * Check if we're over the dirty watermark: if the number of dirty buffers
 * exceeds DM_BUFIO_WRITEBACK_RATIO times the number of clean buffers
 * (e.g. 301 dirty vs. 100 clean), start writing them out asynchronously.
 */
942static void __check_watermark(struct dm_bufio_client *c,
943 struct list_head *write_list)
944{
945 if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
946 __write_dirty_buffers_async(c, 1, write_list);
947}
948
/*
 * Getting a buffer: look it up in the tree, or allocate a new one and
 * link it.
 */
953static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
954 enum new_flag nf, int *need_submit,
955 struct list_head *write_list)
956{
957 struct dm_buffer *b, *new_b = NULL;
958
959 *need_submit = 0;
960
961 b = __find(c, block);
962 if (b)
963 goto found_buffer;
964
965 if (nf == NF_GET)
966 return NULL;
967
968 new_b = __alloc_buffer_wait(c, nf);
969 if (!new_b)
970 return NULL;
971
	/*
	 * __alloc_buffer_wait may have dropped the bufio lock, so someone
	 * else could have created the buffer in the meantime.  Recheck.
	 */
976 b = __find(c, block);
977 if (b) {
978 __free_buffer_wake(new_b);
979 goto found_buffer;
980 }
981
982 __check_watermark(c, write_list);
983
984 b = new_b;
985 b->hold_count = 1;
986 b->read_error = 0;
987 b->write_error = 0;
988 __link_buffer(b, block, LIST_CLEAN);
989
990 if (nf == NF_FRESH) {
991 b->state = 0;
992 return b;
993 }
994
995 b->state = 1 << B_READING;
996 *need_submit = 1;
997
998 return b;
999
1000found_buffer:
1001 if (nf == NF_PREFETCH)
1002 return NULL;
1003
	/*
	 * Note: it is essential that we don't wait for the buffer to be
	 * read if dm_bufio_get is used.  Both dm_bufio_get and
	 * dm_bufio_prefetch may be called from contexts that must not
	 * block on I/O.
	 */
1010 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1011 return NULL;
1012
1013 b->hold_count++;
1014 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1015 test_bit(B_WRITING, &b->state));
1016 return b;
1017}
1018
/*
 * The endio routine for reading: record the error, clear the B_READING bit
 * and wake up anyone waiting on the buffer.
 */
1023static void read_endio(struct dm_buffer *b, blk_status_t status)
1024{
1025 b->read_error = status;
1026
1027 BUG_ON(!test_bit(B_READING, &b->state));
1028
1029 smp_mb__before_atomic();
1030 clear_bit(B_READING, &b->state);
1031 smp_mb__after_atomic();
1032
1033 wake_up_bit(&b->state, B_READING);
1034}
1035
/*
 * A common routine for dm_bufio_get, dm_bufio_read and dm_bufio_new.
 * Their operation is very similar, except that dm_bufio_new does not read
 * the buffer from disk (it only returns an uninitialized buffer) and
 * dm_bufio_get never blocks.
 */
1042static void *new_read(struct dm_bufio_client *c, sector_t block,
1043 enum new_flag nf, struct dm_buffer **bp)
1044{
1045 int need_submit;
1046 struct dm_buffer *b;
1047
1048 LIST_HEAD(write_list);
1049
1050 dm_bufio_lock(c);
1051 b = __bufio_new(c, block, nf, &need_submit, &write_list);
1052#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1053 if (b && b->hold_count == 1)
1054 buffer_record_stack(b);
1055#endif
1056 dm_bufio_unlock(c);
1057
1058 __flush_write_list(&write_list);
1059
1060 if (!b)
1061 return NULL;
1062
1063 if (need_submit)
1064 submit_io(b, REQ_OP_READ, read_endio);
1065
1066 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1067
1068 if (b->read_error) {
1069 int error = blk_status_to_errno(b->read_error);
1070
1071 dm_bufio_release(b);
1072
1073 return ERR_PTR(error);
1074 }
1075
1076 *bp = b;
1077
1078 return b->data;
1079}
1080
1081void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1082 struct dm_buffer **bp)
1083{
1084 return new_read(c, block, NF_GET, bp);
1085}
1086EXPORT_SYMBOL_GPL(dm_bufio_get);
1087
1088void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1089 struct dm_buffer **bp)
1090{
1091 BUG_ON(dm_bufio_in_request());
1092
1093 return new_read(c, block, NF_READ, bp);
1094}
1095EXPORT_SYMBOL_GPL(dm_bufio_read);
1096
1097void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1098 struct dm_buffer **bp)
1099{
1100 BUG_ON(dm_bufio_in_request());
1101
1102 return new_read(c, block, NF_FRESH, bp);
1103}
1104EXPORT_SYMBOL_GPL(dm_bufio_new);
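
/*
 * Illustrative read-path usage from a hypothetical caller (the names
 * "client", "block_nr" and "buf" below are examples, not part of this file):
 *
 *	struct dm_buffer *buf;
 *	void *data = dm_bufio_read(client, block_nr, &buf);
 *
 *	if (IS_ERR(data))
 *		return PTR_ERR(data);
 *	... use dm_bufio_get_block_size(client) bytes at data ...
 *	dm_bufio_release(buf);
 *
 * dm_bufio_get() is the non-blocking variant: it returns NULL if the block
 * is not already cached (or is still being read).  dm_bufio_new() returns
 * the buffer without reading it from disk.
 */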
1105
1106void dm_bufio_prefetch(struct dm_bufio_client *c,
1107 sector_t block, unsigned n_blocks)
1108{
1109 struct blk_plug plug;
1110
1111 LIST_HEAD(write_list);
1112
1113 BUG_ON(dm_bufio_in_request());
1114
1115 blk_start_plug(&plug);
1116 dm_bufio_lock(c);
1117
1118 for (; n_blocks--; block++) {
1119 int need_submit;
1120 struct dm_buffer *b;
1121 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1122 &write_list);
1123 if (unlikely(!list_empty(&write_list))) {
1124 dm_bufio_unlock(c);
1125 blk_finish_plug(&plug);
1126 __flush_write_list(&write_list);
1127 blk_start_plug(&plug);
1128 dm_bufio_lock(c);
1129 }
1130 if (unlikely(b != NULL)) {
1131 dm_bufio_unlock(c);
1132
1133 if (need_submit)
1134 submit_io(b, REQ_OP_READ, read_endio);
1135 dm_bufio_release(b);
1136
1137 cond_resched();
1138
1139 if (!n_blocks)
1140 goto flush_plug;
1141 dm_bufio_lock(c);
1142 }
1143 }
1144
1145 dm_bufio_unlock(c);
1146
1147flush_plug:
1148 blk_finish_plug(&plug);
1149}
1150EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1151
1152void dm_bufio_release(struct dm_buffer *b)
1153{
1154 struct dm_bufio_client *c = b->c;
1155
1156 dm_bufio_lock(c);
1157
1158 BUG_ON(!b->hold_count);
1159
1160 b->hold_count--;
1161 if (!b->hold_count) {
1162 wake_up(&c->free_buffer_wait);

		/*
		 * If there were errors on the buffer and it is not being
		 * read, written or dirty, free it.  There is no point in
		 * caching an invalid buffer.
		 */
1169 if ((b->read_error || b->write_error) &&
1170 !test_bit(B_READING, &b->state) &&
1171 !test_bit(B_WRITING, &b->state) &&
1172 !test_bit(B_DIRTY, &b->state)) {
1173 __unlink_buffer(b);
1174 __free_buffer_wake(b);
1175 }
1176 }
1177
1178 dm_bufio_unlock(c);
1179}
1180EXPORT_SYMBOL_GPL(dm_bufio_release);
1181
1182void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1183 unsigned start, unsigned end)
1184{
1185 struct dm_bufio_client *c = b->c;
1186
1187 BUG_ON(start >= end);
1188 BUG_ON(end > b->c->block_size);
1189
1190 dm_bufio_lock(c);
1191
1192 BUG_ON(test_bit(B_READING, &b->state));
1193
1194 if (!test_and_set_bit(B_DIRTY, &b->state)) {
1195 b->dirty_start = start;
1196 b->dirty_end = end;
1197 __relink_lru(b, LIST_DIRTY);
1198 } else {
1199 if (start < b->dirty_start)
1200 b->dirty_start = start;
1201 if (end > b->dirty_end)
1202 b->dirty_end = end;
1203 }
1204
1205 dm_bufio_unlock(c);
1206}
1207EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1208
1209void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1210{
1211 dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1212}
1213EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1214
1215void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1216{
1217 LIST_HEAD(write_list);
1218
1219 BUG_ON(dm_bufio_in_request());
1220
1221 dm_bufio_lock(c);
1222 __write_dirty_buffers_async(c, 0, &write_list);
1223 dm_bufio_unlock(c);
1224 __flush_write_list(&write_list);
1225}
1226EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1227
/*
 * For performance, it is essential that the buffers are written
 * asynchronously and simultaneously (so that the block layer can merge
 * the writes) and then waited upon.
 *
 * Finally, we flush the hardware disk cache.
 */
1235int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1236{
1237 int a, f;
1238 unsigned long buffers_processed = 0;
1239 struct dm_buffer *b, *tmp;
1240
1241 LIST_HEAD(write_list);
1242
1243 dm_bufio_lock(c);
1244 __write_dirty_buffers_async(c, 0, &write_list);
1245 dm_bufio_unlock(c);
1246 __flush_write_list(&write_list);
1247 dm_bufio_lock(c);
1248
1249again:
1250 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1251 int dropped_lock = 0;
1252
1253 if (buffers_processed < c->n_buffers[LIST_DIRTY])
1254 buffers_processed++;
1255
1256 BUG_ON(test_bit(B_READING, &b->state));
1257
1258 if (test_bit(B_WRITING, &b->state)) {
1259 if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1260 dropped_lock = 1;
1261 b->hold_count++;
1262 dm_bufio_unlock(c);
1263 wait_on_bit_io(&b->state, B_WRITING,
1264 TASK_UNINTERRUPTIBLE);
1265 dm_bufio_lock(c);
1266 b->hold_count--;
1267 } else
1268 wait_on_bit_io(&b->state, B_WRITING,
1269 TASK_UNINTERRUPTIBLE);
1270 }
1271
1272 if (!test_bit(B_DIRTY, &b->state) &&
1273 !test_bit(B_WRITING, &b->state))
1274 __relink_lru(b, LIST_CLEAN);
1275
1276 cond_resched();
1277
		/*
		 * If we dropped the lock, the list is no longer consistent,
		 * so we must restart the search.
		 *
		 * In the most common case, the buffer just moved to the
		 * clean list, so we won't loop scanning the same buffer
		 * again and again.
		 *
		 * This may livelock if there is another thread simultaneously
		 * dirtying buffers, so we count the number of buffers walked
		 * and if it exceeds the total number of buffers, it means
		 * that someone is doing some writes simultaneously with us.
		 * In this case, stop and drop the lock.
		 */
1292 if (dropped_lock)
1293 goto again;
1294 }
1295 wake_up(&c->free_buffer_wait);
1296 dm_bufio_unlock(c);
1297
1298 a = xchg(&c->async_write_error, 0);
1299 f = dm_bufio_issue_flush(c);
1300 if (a)
1301 return a;
1302
1303 return f;
1304}
1305EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
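
/*
 * Illustrative write-back sequence (sketch only; "client", "block_nr" and
 * "buf" are hypothetical caller names):
 *
 *	void *data = dm_bufio_read(client, block_nr, &buf);
 *	... modify data ...
 *	dm_bufio_mark_buffer_dirty(buf);
 *	dm_bufio_release(buf);
 *
 *	at a commit point:
 *	int r = dm_bufio_write_dirty_buffers(client);
 *
 * Errors from earlier asynchronous write-back are accumulated in
 * c->async_write_error and returned here, so checking this single return
 * value is sufficient.
 */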
1306
/*
 * Use dm-io to send an empty flush request to the device.
 */
1310int dm_bufio_issue_flush(struct dm_bufio_client *c)
1311{
1312 struct dm_io_request io_req = {
1313 .bi_op = REQ_OP_WRITE,
1314 .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1315 .mem.type = DM_IO_KMEM,
1316 .mem.ptr.addr = NULL,
1317 .client = c->dm_io,
1318 };
1319 struct dm_io_region io_reg = {
1320 .bdev = c->bdev,
1321 .sector = 0,
1322 .count = 0,
1323 };
1324
1325 BUG_ON(dm_bufio_in_request());
1326
1327 return dm_io(&io_req, 1, &io_reg, NULL);
1328}
1329EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1330
/*
 * We first delete any other buffer that may be at the new location.
 *
 * Then, we write the buffer to its original location if it was dirty.
 *
 * Then, if we are the only one who is holding the buffer, we relink the
 * buffer in the buffer tree for the new location.
 *
 * If there was someone else holding the buffer, we write it to the new
 * location but do not relink it, because that other user needs to have
 * the buffer at the same place.
 */
1343void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1344{
1345 struct dm_bufio_client *c = b->c;
1346 struct dm_buffer *new;
1347
1348 BUG_ON(dm_bufio_in_request());
1349
1350 dm_bufio_lock(c);
1351
1352retry:
1353 new = __find(c, new_block);
1354 if (new) {
1355 if (new->hold_count) {
1356 __wait_for_free_buffer(c);
1357 goto retry;
1358 }
1359
		/*
		 * Wait for any outstanding I/O on the old buffer at the
		 * destination before freeing it.
		 */
1364 __make_buffer_clean(new);
1365 __unlink_buffer(new);
1366 __free_buffer_wake(new);
1367 }
1368
1369 BUG_ON(!b->hold_count);
1370 BUG_ON(test_bit(B_READING, &b->state));
1371
1372 __write_dirty_buffer(b, NULL);
1373 if (b->hold_count == 1) {
1374 wait_on_bit_io(&b->state, B_WRITING,
1375 TASK_UNINTERRUPTIBLE);
1376 set_bit(B_DIRTY, &b->state);
1377 b->dirty_start = 0;
1378 b->dirty_end = c->block_size;
1379 __unlink_buffer(b);
1380 __link_buffer(b, new_block, LIST_DIRTY);
1381 } else {
1382 sector_t old_block;
1383 wait_on_bit_lock_io(&b->state, B_WRITING,
1384 TASK_UNINTERRUPTIBLE);
1385
		/*
		 * The buffer is also held by someone else, so we cannot
		 * simply relink it.  Temporarily link it to new_block, write
		 * its contents there, then link it back to the old block so
		 * the other holder's view is unchanged.
		 */
1392 old_block = b->block;
1393 __unlink_buffer(b);
1394 __link_buffer(b, new_block, b->list_mode);
1395 submit_io(b, REQ_OP_WRITE, write_endio);
1396 wait_on_bit_io(&b->state, B_WRITING,
1397 TASK_UNINTERRUPTIBLE);
1398 __unlink_buffer(b);
1399 __link_buffer(b, old_block, b->list_mode);
1400 }
1401
1402 dm_bufio_unlock(c);
1403 dm_bufio_release(b);
1404}
1405EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1406
/*
 * Free the buffer for the given block.
 *
 * This is just a hint: if the buffer is held or has I/O or dirty state
 * pending, this function does nothing.
 */
1413void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1414{
1415 struct dm_buffer *b;
1416
1417 dm_bufio_lock(c);
1418
1419 b = __find(c, block);
1420 if (b && likely(!b->hold_count) && likely(!b->state)) {
1421 __unlink_buffer(b);
1422 __free_buffer_wake(b);
1423 }
1424
1425 dm_bufio_unlock(c);
1426}
1427EXPORT_SYMBOL_GPL(dm_bufio_forget);
1428
1429void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1430{
1431 c->minimum_buffers = n;
1432}
1433EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1434
1435unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1436{
1437 return c->block_size;
1438}
1439EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1440
1441sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1442{
1443 sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
1444 if (s >= c->start)
1445 s -= c->start;
1446 else
1447 s = 0;
1448 if (likely(c->sectors_per_block_bits >= 0))
1449 s >>= c->sectors_per_block_bits;
1450 else
1451 sector_div(s, c->block_size >> SECTOR_SHIFT);
1452 return s;
1453}
1454EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1455
1456struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c)
1457{
1458 return c->dm_io;
1459}
1460EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client);
1461
1462sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1463{
1464 return b->block;
1465}
1466EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1467
1468void *dm_bufio_get_block_data(struct dm_buffer *b)
1469{
1470 return b->data;
1471}
1472EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1473
1474void *dm_bufio_get_aux_data(struct dm_buffer *b)
1475{
1476 return b + 1;
1477}
1478EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1479
1480struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1481{
1482 return b->c;
1483}
1484EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1485
1486static void drop_buffers(struct dm_bufio_client *c)
1487{
1488 struct dm_buffer *b;
1489 int i;
1490 bool warned = false;
1491
1492 BUG_ON(dm_bufio_in_request());
1493
1494
1495
1496
1497 dm_bufio_write_dirty_buffers_async(c);
1498
1499 dm_bufio_lock(c);
1500
1501 while ((b = __get_unclaimed_buffer(c)))
1502 __free_buffer_wake(b);
1503
1504 for (i = 0; i < LIST_SIZE; i++)
1505 list_for_each_entry(b, &c->lru[i], lru_list) {
1506 WARN_ON(!warned);
1507 warned = true;
1508 DMERR("leaked buffer %llx, hold count %u, list %d",
1509 (unsigned long long)b->block, b->hold_count, i);
1510#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1511 stack_trace_print(b->stack_entries, b->stack_len, 1);
1512
1513 b->hold_count = 0;
1514#endif
1515 }
1516
1517#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1518 while ((b = __get_unclaimed_buffer(c)))
1519 __free_buffer_wake(b);
1520#endif
1521
1522 for (i = 0; i < LIST_SIZE; i++)
1523 BUG_ON(!list_empty(&c->lru[i]));
1524
1525 dm_bufio_unlock(c);
1526}
1527
/*
 * Try to evict a buffer.  A buffer cannot be evicted while it is held.
 *
 * If __GFP_FS is not in the gfp mask, we must not wait for I/O (we may be
 * called from the memory-reclaim path), so buffers that are being read,
 * being written or still dirty are skipped as well.
 */
1536static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1537{
1538 if (!(gfp & __GFP_FS)) {
1539 if (test_bit(B_READING, &b->state) ||
1540 test_bit(B_WRITING, &b->state) ||
1541 test_bit(B_DIRTY, &b->state))
1542 return false;
1543 }
1544
1545 if (b->hold_count)
1546 return false;
1547
1548 __make_buffer_clean(b);
1549 __unlink_buffer(b);
1550 __free_buffer_wake(b);
1551
1552 return true;
1553}
1554
1555static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1556{
1557 unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1558 if (likely(c->sectors_per_block_bits >= 0))
1559 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1560 else
1561 retain_bytes /= c->block_size;
1562 return retain_bytes;
1563}
1564
1565static void __scan(struct dm_bufio_client *c)
1566{
1567 int l;
1568 struct dm_buffer *b, *tmp;
1569 unsigned long freed = 0;
1570 unsigned long count = c->n_buffers[LIST_CLEAN] +
1571 c->n_buffers[LIST_DIRTY];
1572 unsigned long retain_target = get_retain_buffers(c);
1573
1574 for (l = 0; l < LIST_SIZE; l++) {
1575 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1576 if (count - freed <= retain_target)
1577 atomic_long_set(&c->need_shrink, 0);
1578 if (!atomic_long_read(&c->need_shrink))
1579 return;
1580 if (__try_evict_buffer(b, GFP_KERNEL)) {
1581 atomic_long_dec(&c->need_shrink);
1582 freed++;
1583 }
1584 cond_resched();
1585 }
1586 }
1587}
1588
1589static void shrink_work(struct work_struct *w)
1590{
1591 struct dm_bufio_client *c = container_of(w, struct dm_bufio_client, shrink_work);
1592
1593 dm_bufio_lock(c);
1594 __scan(c);
1595 dm_bufio_unlock(c);
1596}
1597
1598static unsigned long dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1599{
1600 struct dm_bufio_client *c;
1601
1602 c = container_of(shrink, struct dm_bufio_client, shrinker);
1603 atomic_long_add(sc->nr_to_scan, &c->need_shrink);
1604 queue_work(dm_bufio_wq, &c->shrink_work);
1605
1606 return sc->nr_to_scan;
1607}
1608
1609static unsigned long dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1610{
1611 struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1612 unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1613 READ_ONCE(c->n_buffers[LIST_DIRTY]);
1614 unsigned long retain_target = get_retain_buffers(c);
1615 unsigned long queued_for_cleanup = atomic_long_read(&c->need_shrink);
1616
1617 if (unlikely(count < retain_target))
1618 count = 0;
1619 else
1620 count -= retain_target;
1621
1622 if (unlikely(count < queued_for_cleanup))
1623 count = 0;
1624 else
1625 count -= queued_for_cleanup;
1626
1627 return count;
1628}
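
/*
 * Example of the shrinker accounting above (illustrative numbers): with 1000
 * cached buffers, a retain target of 64 and 100 evictions already queued in
 * need_shrink, dm_bufio_shrink_count() reports 1000 - 64 - 100 = 836
 * reclaimable objects.  dm_bufio_shrink_scan() itself only queues work;
 * the actual eviction happens in shrink_work() under the client mutex.
 */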
1629
/*
 * Create the buffering interface.
 */
1633struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1634 unsigned reserved_buffers, unsigned aux_size,
1635 void (*alloc_callback)(struct dm_buffer *),
1636 void (*write_callback)(struct dm_buffer *))
1637{
1638 int r;
1639 struct dm_bufio_client *c;
1640 unsigned i;
1641 char slab_name[27];
1642
1643 if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1644 DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1645 r = -EINVAL;
1646 goto bad_client;
1647 }
1648
1649 c = kzalloc(sizeof(*c), GFP_KERNEL);
1650 if (!c) {
1651 r = -ENOMEM;
1652 goto bad_client;
1653 }
1654 c->buffer_tree = RB_ROOT;
1655
1656 c->bdev = bdev;
1657 c->block_size = block_size;
1658 if (is_power_of_2(block_size))
1659 c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1660 else
1661 c->sectors_per_block_bits = -1;
1662
1663 c->alloc_callback = alloc_callback;
1664 c->write_callback = write_callback;
1665
1666 for (i = 0; i < LIST_SIZE; i++) {
1667 INIT_LIST_HEAD(&c->lru[i]);
1668 c->n_buffers[i] = 0;
1669 }
1670
1671 mutex_init(&c->lock);
1672 INIT_LIST_HEAD(&c->reserved_buffers);
1673 c->need_reserved_buffers = reserved_buffers;
1674
1675 dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1676
1677 init_waitqueue_head(&c->free_buffer_wait);
1678 c->async_write_error = 0;
1679
1680 c->dm_io = dm_io_client_create();
1681 if (IS_ERR(c->dm_io)) {
1682 r = PTR_ERR(c->dm_io);
1683 goto bad_dm_io;
1684 }
1685
1686 if (block_size <= KMALLOC_MAX_SIZE &&
1687 (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1688 unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1689 snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1690 c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1691 SLAB_RECLAIM_ACCOUNT, NULL);
1692 if (!c->slab_cache) {
1693 r = -ENOMEM;
1694 goto bad;
1695 }
1696 }
1697 if (aux_size)
1698 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1699 else
1700 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1701 c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1702 0, SLAB_RECLAIM_ACCOUNT, NULL);
1703 if (!c->slab_buffer) {
1704 r = -ENOMEM;
1705 goto bad;
1706 }
1707
1708 while (c->need_reserved_buffers) {
1709 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1710
1711 if (!b) {
1712 r = -ENOMEM;
1713 goto bad;
1714 }
1715 __free_buffer_wake(b);
1716 }
1717
1718 INIT_WORK(&c->shrink_work, shrink_work);
1719 atomic_long_set(&c->need_shrink, 0);
1720
1721 c->shrinker.count_objects = dm_bufio_shrink_count;
1722 c->shrinker.scan_objects = dm_bufio_shrink_scan;
1723 c->shrinker.seeks = 1;
1724 c->shrinker.batch = 0;
1725 r = register_shrinker(&c->shrinker);
1726 if (r)
1727 goto bad;
1728
1729 mutex_lock(&dm_bufio_clients_lock);
1730 dm_bufio_client_count++;
1731 list_add(&c->client_list, &dm_bufio_all_clients);
1732 __cache_size_refresh();
1733 mutex_unlock(&dm_bufio_clients_lock);
1734
1735 return c;
1736
1737bad:
1738 while (!list_empty(&c->reserved_buffers)) {
1739 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1740 struct dm_buffer, lru_list);
1741 list_del(&b->lru_list);
1742 free_buffer(b);
1743 }
1744 kmem_cache_destroy(c->slab_cache);
1745 kmem_cache_destroy(c->slab_buffer);
1746 dm_io_client_destroy(c->dm_io);
1747bad_dm_io:
1748 mutex_destroy(&c->lock);
1749 kfree(c);
1750bad_client:
1751 return ERR_PTR(r);
1752}
1753EXPORT_SYMBOL_GPL(dm_bufio_client_create);
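
/*
 * Sketch of a typical client setup by a dm target (illustrative; the
 * caller's variable names are hypothetical):
 *
 *	struct dm_bufio_client *c;
 *
 *	c = dm_bufio_client_create(dev->bdev, 4096, 1, 0, NULL, NULL);
 *	if (IS_ERR(c))
 *		return PTR_ERR(c);
 *
 * The arguments are the block size in bytes (which must be a multiple of
 * 512), the number of reserved buffers, the per-buffer aux_size and the
 * optional alloc/write callbacks.  The client is torn down with
 * dm_bufio_client_destroy() after all buffers have been released.
 */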
1754
/*
 * Free the buffering interface.
 * It is required that there are no references on any buffers.
 */
1759void dm_bufio_client_destroy(struct dm_bufio_client *c)
1760{
1761 unsigned i;
1762
1763 drop_buffers(c);
1764
1765 unregister_shrinker(&c->shrinker);
1766 flush_work(&c->shrink_work);
1767
1768 mutex_lock(&dm_bufio_clients_lock);
1769
1770 list_del(&c->client_list);
1771 dm_bufio_client_count--;
1772 __cache_size_refresh();
1773
1774 mutex_unlock(&dm_bufio_clients_lock);
1775
1776 BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1777 BUG_ON(c->need_reserved_buffers);
1778
1779 while (!list_empty(&c->reserved_buffers)) {
1780 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1781 struct dm_buffer, lru_list);
1782 list_del(&b->lru_list);
1783 free_buffer(b);
1784 }
1785
1786 for (i = 0; i < LIST_SIZE; i++)
1787 if (c->n_buffers[i])
1788 DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1789
1790 for (i = 0; i < LIST_SIZE; i++)
1791 BUG_ON(c->n_buffers[i]);
1792
1793 kmem_cache_destroy(c->slab_cache);
1794 kmem_cache_destroy(c->slab_buffer);
1795 dm_io_client_destroy(c->dm_io);
1796 mutex_destroy(&c->lock);
1797 kfree(c);
1798}
1799EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1800
1801void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1802{
1803 c->start = start;
1804}
1805EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1806
1807static unsigned get_max_age_hz(void)
1808{
1809 unsigned max_age = READ_ONCE(dm_bufio_max_age);
1810
1811 if (max_age > UINT_MAX / HZ)
1812 max_age = UINT_MAX / HZ;
1813
1814 return max_age * HZ;
1815}
1816
1817static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1818{
1819 return time_after_eq(jiffies, b->last_accessed + age_hz);
1820}
1821
1822static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1823{
1824 struct dm_buffer *b, *tmp;
1825 unsigned long retain_target = get_retain_buffers(c);
1826 unsigned long count;
1827 LIST_HEAD(write_list);
1828
1829 dm_bufio_lock(c);
1830
1831 __check_watermark(c, &write_list);
1832 if (unlikely(!list_empty(&write_list))) {
1833 dm_bufio_unlock(c);
1834 __flush_write_list(&write_list);
1835 dm_bufio_lock(c);
1836 }
1837
1838 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1839 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1840 if (count <= retain_target)
1841 break;
1842
1843 if (!older_than(b, age_hz))
1844 break;
1845
1846 if (__try_evict_buffer(b, 0))
1847 count--;
1848
1849 cond_resched();
1850 }
1851
1852 dm_bufio_unlock(c);
1853}
1854
1855static void do_global_cleanup(struct work_struct *w)
1856{
1857 struct dm_bufio_client *locked_client = NULL;
1858 struct dm_bufio_client *current_client;
1859 struct dm_buffer *b;
1860 unsigned spinlock_hold_count;
1861 unsigned long threshold = dm_bufio_cache_size -
1862 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1863 unsigned long loops = global_num * 2;
1864
1865 mutex_lock(&dm_bufio_clients_lock);
1866
1867 while (1) {
1868 cond_resched();
1869
1870 spin_lock(&global_spinlock);
1871 if (unlikely(dm_bufio_current_allocated <= threshold))
1872 break;
1873
1874 spinlock_hold_count = 0;
1875get_next:
1876 if (!loops--)
1877 break;
1878 if (unlikely(list_empty(&global_queue)))
1879 break;
1880 b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1881
1882 if (b->accessed) {
1883 b->accessed = 0;
1884 list_move(&b->global_list, &global_queue);
1885 if (likely(++spinlock_hold_count < 16))
1886 goto get_next;
1887 spin_unlock(&global_spinlock);
1888 continue;
1889 }
1890
1891 current_client = b->c;
1892 if (unlikely(current_client != locked_client)) {
1893 if (locked_client)
1894 dm_bufio_unlock(locked_client);
1895
1896 if (!dm_bufio_trylock(current_client)) {
1897 spin_unlock(&global_spinlock);
1898 dm_bufio_lock(current_client);
1899 locked_client = current_client;
1900 continue;
1901 }
1902
1903 locked_client = current_client;
1904 }
1905
1906 spin_unlock(&global_spinlock);
1907
1908 if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
1909 spin_lock(&global_spinlock);
1910 list_move(&b->global_list, &global_queue);
1911 spin_unlock(&global_spinlock);
1912 }
1913 }
1914
1915 spin_unlock(&global_spinlock);
1916
1917 if (locked_client)
1918 dm_bufio_unlock(locked_client);
1919
1920 mutex_unlock(&dm_bufio_clients_lock);
1921}
1922
1923static void cleanup_old_buffers(void)
1924{
1925 unsigned long max_age_hz = get_max_age_hz();
1926 struct dm_bufio_client *c;
1927
1928 mutex_lock(&dm_bufio_clients_lock);
1929
1930 __cache_size_refresh();
1931
1932 list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1933 __evict_old_buffers(c, max_age_hz);
1934
1935 mutex_unlock(&dm_bufio_clients_lock);
1936}
1937
1938static void work_fn(struct work_struct *w)
1939{
1940 cleanup_old_buffers();
1941
1942 queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1943 DM_BUFIO_WORK_TIMER_SECS * HZ);
1944}
1945
/*
 * Module setup.
 *
 * This is called only once for the whole module.
 * It can be called at any time, even before any dm-bufio client is created.
 */
1954static int __init dm_bufio_init(void)
1955{
1956 __u64 mem;
1957
1958 dm_bufio_allocated_kmem_cache = 0;
1959 dm_bufio_allocated_get_free_pages = 0;
1960 dm_bufio_allocated_vmalloc = 0;
1961 dm_bufio_current_allocated = 0;
1962
1963 mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
1964 DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
1965
1966 if (mem > ULONG_MAX)
1967 mem = ULONG_MAX;
1968
1969#ifdef CONFIG_MMU
1970 if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
1971 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
1972#endif
1973
1974 dm_bufio_default_cache_size = mem;
1975
1976 mutex_lock(&dm_bufio_clients_lock);
1977 __cache_size_refresh();
1978 mutex_unlock(&dm_bufio_clients_lock);
1979
1980 dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
1981 if (!dm_bufio_wq)
1982 return -ENOMEM;
1983
1984 INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
1985 INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
1986 queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1987 DM_BUFIO_WORK_TIMER_SECS * HZ);
1988
1989 return 0;
1990}
1991
/*
 * This is called once when unloading the dm_bufio module.
 */
1995static void __exit dm_bufio_exit(void)
1996{
1997 int bug = 0;
1998
1999 cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2000 flush_workqueue(dm_bufio_wq);
2001 destroy_workqueue(dm_bufio_wq);
2002
2003 if (dm_bufio_client_count) {
2004 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2005 __func__, dm_bufio_client_count);
2006 bug = 1;
2007 }
2008
2009 if (dm_bufio_current_allocated) {
2010 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2011 __func__, dm_bufio_current_allocated);
2012 bug = 1;
2013 }
2014
2015 if (dm_bufio_allocated_get_free_pages) {
2016 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2017 __func__, dm_bufio_allocated_get_free_pages);
2018 bug = 1;
2019 }
2020
2021 if (dm_bufio_allocated_vmalloc) {
2022 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2023 __func__, dm_bufio_allocated_vmalloc);
2024 bug = 1;
2025 }
2026
2027 BUG_ON(bug);
2028}
2029
2030module_init(dm_bufio_init)
2031module_exit(dm_bufio_exit)
2032
2033module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2034MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2035
2036module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2037MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2038
2039module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2040MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2041
2042module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2043MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2044
2045module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2046MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2047
2048module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2049MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2050
2051module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2052MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2053
2054module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2055MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2056
2057MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2058MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2059MODULE_LICENSE("GPL");
2060