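/*
 * dm-bufio: a buffered I/O cache for device-mapper metadata clients.
 *
 * Buffers are kept per client in an rbtree indexed by block number and in
 * clean/dirty LRU lists; they are written back on demand, by a shrinker
 * under memory pressure, and by a periodic cleanup work item.
 */
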
9#include <linux/dm-bufio.h>
10
11#include <linux/device-mapper.h>
12#include <linux/dm-io.h>
13#include <linux/slab.h>
14#include <linux/sched/mm.h>
15#include <linux/jiffies.h>
16#include <linux/vmalloc.h>
17#include <linux/shrinker.h>
18#include <linux/module.h>
19#include <linux/rbtree.h>
20#include <linux/stacktrace.h>
21
22#define DM_MSG_PREFIX "bufio"
23
24
25
26
27
28
29
30
31
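/*
 * Memory-management tunables: DM_BUFIO_MIN_BUFFERS is the default per-client
 * minimum_buffers value; the global cache defaults to DM_BUFIO_MEMORY_PERCENT
 * of low memory, capped at DM_BUFIO_VMALLOC_PERCENT of vmalloc space;
 * writeback starts once dirty buffers outnumber clean ones by
 * DM_BUFIO_WRITEBACK_RATIO; global eviction stops once usage drops
 * 1/DM_BUFIO_LOW_WATERMARK_RATIO below the cache size limit.
 */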
32#define DM_BUFIO_MIN_BUFFERS 8
33
34#define DM_BUFIO_MEMORY_PERCENT 2
35#define DM_BUFIO_VMALLOC_PERCENT 25
36#define DM_BUFIO_WRITEBACK_RATIO 3
37#define DM_BUFIO_LOW_WATERMARK_RATIO 16
38
39
40
41
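/* Interval, in seconds, at which the delayed cleanup work runs. */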
42#define DM_BUFIO_WORK_TIMER_SECS 30
43
44
45
46
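/* Default for max_age_seconds: buffers unused for longer than this may be evicted. */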
47#define DM_BUFIO_DEFAULT_AGE_SECS 300
48
49
50
51
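/* Default for retain_bytes: try to keep at least this much data cached. */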
52#define DM_BUFIO_DEFAULT_RETAIN_BYTES (256 * 1024)
53
54
55
56
57
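/* Partial-buffer writes are widened to this alignment (see submit_io()). */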
58#define DM_BUFIO_WRITE_ALIGN 4096
59
60
61
62
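/* Per-client LRU lists: one for clean buffers, one for dirty buffers. */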
63#define LIST_CLEAN 0
64#define LIST_DIRTY 1
65#define LIST_SIZE 2
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
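/*
 * A buffer cache client: the per-client mutex, the clean/dirty LRU lists,
 * the rbtree indexing buffers by block, the reserved-buffer pool used when
 * allocations fail, and the shrinker that trims the cache under pressure.
 */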
82struct dm_bufio_client {
83 struct mutex lock;
84
85 struct list_head lru[LIST_SIZE];
86 unsigned long n_buffers[LIST_SIZE];
87
88 struct block_device *bdev;
89 unsigned block_size;
90 s8 sectors_per_block_bits;
91 void (*alloc_callback)(struct dm_buffer *);
92 void (*write_callback)(struct dm_buffer *);
93
94 struct kmem_cache *slab_buffer;
95 struct kmem_cache *slab_cache;
96 struct dm_io_client *dm_io;
97
98 struct list_head reserved_buffers;
99 unsigned need_reserved_buffers;
100
101 unsigned minimum_buffers;
102
103 struct rb_root buffer_tree;
104 wait_queue_head_t free_buffer_wait;
105
106 sector_t start;
107
108 int async_write_error;
109
110 struct list_head client_list;
111 struct shrinker shrinker;
112};
113
114
115
116
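/* Buffer state bits (b->state), used with the wait_on_bit infrastructure. */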
117#define B_READING 0
118#define B_WRITING 1
119#define B_DIRTY 2
120
121
122
123
124
125
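/* How the buffer payload was allocated; also indexes the allocation counters. */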
126enum data_mode {
127 DATA_MODE_SLAB = 0,
128 DATA_MODE_GET_FREE_PAGES = 1,
129 DATA_MODE_VMALLOC = 2,
130 DATA_MODE_LIMIT = 3
131};
132
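/*
 * One cached block: rbtree and LRU linkage, the data pointer and its
 * allocation mode, the dirty/write byte ranges and the I/O state bits.
 */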
133struct dm_buffer {
134 struct rb_node node;
135 struct list_head lru_list;
136 struct list_head global_list;
137 sector_t block;
138 void *data;
139 unsigned char data_mode;
140 unsigned char list_mode;
141 blk_status_t read_error;
142 blk_status_t write_error;
143 unsigned accessed;
144 unsigned hold_count;
145 unsigned long state;
146 unsigned long last_accessed;
147 unsigned dirty_start;
148 unsigned dirty_end;
149 unsigned write_start;
150 unsigned write_end;
151 struct dm_bufio_client *c;
152 struct list_head write_list;
153 void (*end_io)(struct dm_buffer *, blk_status_t);
154#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
155#define MAX_STACK 10
156 unsigned int stack_len;
157 unsigned long stack_entries[MAX_STACK];
158#endif
159};
160
161
162
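/*
 * dm_bufio_in_request() tells whether we are called from the bio submission
 * path; dm_bufio_lock() then takes the mutex with lockdep subclass 1 so that
 * such nested acquisition is tracked separately.
 */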
163#define dm_bufio_in_request() (!!current->bio_list)
164
165static void dm_bufio_lock(struct dm_bufio_client *c)
166{
167 mutex_lock_nested(&c->lock, dm_bufio_in_request());
168}
169
170static int dm_bufio_trylock(struct dm_bufio_client *c)
171{
172 return mutex_trylock(&c->lock);
173}
174
175static void dm_bufio_unlock(struct dm_bufio_client *c)
176{
177 mutex_unlock(&c->lock);
178}
179
180
181
182
183
184
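/*
 * Global cache sizing: the default computed at module init, the writable
 * max_cache_size_bytes module parameter, and a copy latched under
 * dm_bufio_clients_lock by __cache_size_refresh().
 */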
185static unsigned long dm_bufio_default_cache_size;
186
187
188
189
190static unsigned long dm_bufio_cache_size;
191
192
193
194
195
196static unsigned long dm_bufio_cache_size_latch;
197
198static DEFINE_SPINLOCK(global_spinlock);
199
200static LIST_HEAD(global_queue);
201
202static unsigned long global_num = 0;
203
204
205
206
207static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
208static unsigned long dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
209
210static unsigned long dm_bufio_peak_allocated;
211static unsigned long dm_bufio_allocated_kmem_cache;
212static unsigned long dm_bufio_allocated_get_free_pages;
213static unsigned long dm_bufio_allocated_vmalloc;
214static unsigned long dm_bufio_current_allocated;
215
216
217
218
219
220
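/* Client registry: count and list of all clients, protected by dm_bufio_clients_lock. */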
221static int dm_bufio_client_count;
222
223
224
225
226static LIST_HEAD(dm_bufio_all_clients);
227
228
229
230
231static DEFINE_MUTEX(dm_bufio_clients_lock);
232
233static struct workqueue_struct *dm_bufio_wq;
234static struct delayed_work dm_bufio_cleanup_old_work;
235static struct work_struct dm_bufio_replacement_work;
236
237
238#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
239static void buffer_record_stack(struct dm_buffer *b)
240{
241 b->stack_len = stack_trace_save(b->stack_entries, MAX_STACK, 2);
242}
243#endif
244
245
246
247
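/*
 * Buffers are indexed by block number in a per-client red-black tree:
 * __find() does an exact lookup, __find_next() returns the first buffer at
 * or after the given block. Callers hold the client lock.
 */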
248static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
249{
250 struct rb_node *n = c->buffer_tree.rb_node;
251 struct dm_buffer *b;
252
253 while (n) {
254 b = container_of(n, struct dm_buffer, node);
255
256 if (b->block == block)
257 return b;
258
259 n = block < b->block ? n->rb_left : n->rb_right;
260 }
261
262 return NULL;
263}
264
265static struct dm_buffer *__find_next(struct dm_bufio_client *c, sector_t block)
266{
267 struct rb_node *n = c->buffer_tree.rb_node;
268 struct dm_buffer *b;
269 struct dm_buffer *best = NULL;
270
271 while (n) {
272 b = container_of(n, struct dm_buffer, node);
273
274 if (b->block == block)
275 return b;
276
277 if (block <= b->block) {
278 n = n->rb_left;
279 best = b;
280 } else {
281 n = n->rb_right;
282 }
283 }
284
285 return best;
286}
287
288static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
289{
290 struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
291 struct dm_buffer *found;
292
293 while (*new) {
294 found = container_of(*new, struct dm_buffer, node);
295
296 if (found->block == b->block) {
297 BUG_ON(found != b);
298 return;
299 }
300
301 parent = *new;
302 new = b->block < found->block ?
303 &found->node.rb_left : &found->node.rb_right;
304 }
305
306 rb_link_node(&b->node, parent, new);
307 rb_insert_color(&b->node, &c->buffer_tree);
308}
309
310static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
311{
312 rb_erase(&b->node, &c->buffer_tree);
313}
314
315
316
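/*
 * Account a buffer being linked (unlink == false) or unlinked (unlink ==
 * true): update the per-mode and total allocation counters, maintain the
 * global LRU queue, and kick the replacement work when over the limit.
 */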
317static void adjust_total_allocated(struct dm_buffer *b, bool unlink)
318{
319 unsigned char data_mode;
320 long diff;
321
322 static unsigned long * const class_ptr[DATA_MODE_LIMIT] = {
323 &dm_bufio_allocated_kmem_cache,
324 &dm_bufio_allocated_get_free_pages,
325 &dm_bufio_allocated_vmalloc,
326 };
327
328 data_mode = b->data_mode;
329 diff = (long)b->c->block_size;
330 if (unlink)
331 diff = -diff;
332
333 spin_lock(&global_spinlock);
334
335 *class_ptr[data_mode] += diff;
336
337 dm_bufio_current_allocated += diff;
338
339 if (dm_bufio_current_allocated > dm_bufio_peak_allocated)
340 dm_bufio_peak_allocated = dm_bufio_current_allocated;
341
342 b->accessed = 1;
343
344 if (!unlink) {
345 list_add(&b->global_list, &global_queue);
346 global_num++;
347 if (dm_bufio_current_allocated > dm_bufio_cache_size)
348 queue_work(dm_bufio_wq, &dm_bufio_replacement_work);
349 } else {
350 list_del(&b->global_list);
351 global_num--;
352 }
353
354 spin_unlock(&global_spinlock);
355}
356
357
358
359
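/*
 * Latch dm_bufio_cache_size; if the user has not set it, install the default
 * computed at module init. Called with dm_bufio_clients_lock held.
 */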
360static void __cache_size_refresh(void)
361{
362 BUG_ON(!mutex_is_locked(&dm_bufio_clients_lock));
363 BUG_ON(dm_bufio_client_count < 0);
364
365 dm_bufio_cache_size_latch = READ_ONCE(dm_bufio_cache_size);
366
367
368
369
370 if (!dm_bufio_cache_size_latch) {
371 (void)cmpxchg(&dm_bufio_cache_size, 0,
372 dm_bufio_default_cache_size);
373 dm_bufio_cache_size_latch = dm_bufio_default_cache_size;
374 }
375}
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
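/*
 * Allocate buffer data. Blocks with a per-client slab cache come from that
 * cache; otherwise opportunistic (__GFP_NORETRY) allocations that fit the
 * kmalloc limit use __get_free_pages(), and everything else falls back to
 * vmalloc. *data_mode records which method was used so the data can be
 * freed accordingly.
 */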
398static void *alloc_buffer_data(struct dm_bufio_client *c, gfp_t gfp_mask,
399 unsigned char *data_mode)
400{
401 if (unlikely(c->slab_cache != NULL)) {
402 *data_mode = DATA_MODE_SLAB;
403 return kmem_cache_alloc(c->slab_cache, gfp_mask);
404 }
405
406 if (c->block_size <= KMALLOC_MAX_SIZE &&
407 gfp_mask & __GFP_NORETRY) {
408 *data_mode = DATA_MODE_GET_FREE_PAGES;
409 return (void *)__get_free_pages(gfp_mask,
410 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
411 }
412
413 *data_mode = DATA_MODE_VMALLOC;
414
415
416
417
418
419
420
421
422
423
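	/*
	 * __vmalloc() performs some internal allocations with GFP_KERNEL,
	 * which could recurse into the I/O path and deadlock; __GFP_NORETRY
	 * callers therefore wrap it in memalloc_noio_save()/restore().
	 */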
424 if (gfp_mask & __GFP_NORETRY) {
425 unsigned noio_flag = memalloc_noio_save();
426 void *ptr = __vmalloc(c->block_size, gfp_mask);
427
428 memalloc_noio_restore(noio_flag);
429 return ptr;
430 }
431
432 return __vmalloc(c->block_size, gfp_mask);
433}
434
435
436
437
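/* Free buffer data allocated by alloc_buffer_data(), according to its data_mode. */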
438static void free_buffer_data(struct dm_bufio_client *c,
439 void *data, unsigned char data_mode)
440{
441 switch (data_mode) {
442 case DATA_MODE_SLAB:
443 kmem_cache_free(c->slab_cache, data);
444 break;
445
446 case DATA_MODE_GET_FREE_PAGES:
447 free_pages((unsigned long)data,
448 c->sectors_per_block_bits - (PAGE_SHIFT - SECTOR_SHIFT));
449 break;
450
451 case DATA_MODE_VMALLOC:
452 vfree(data);
453 break;
454
455 default:
456 DMCRIT("dm_bufio_free_buffer_data: bad data mode: %d",
457 data_mode);
458 BUG();
459 }
460}
461
462
463
464
465static struct dm_buffer *alloc_buffer(struct dm_bufio_client *c, gfp_t gfp_mask)
466{
467 struct dm_buffer *b = kmem_cache_alloc(c->slab_buffer, gfp_mask);
468
469 if (!b)
470 return NULL;
471
472 b->c = c;
473
474 b->data = alloc_buffer_data(c, gfp_mask, &b->data_mode);
475 if (!b->data) {
476 kmem_cache_free(c->slab_buffer, b);
477 return NULL;
478 }
479
480#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
481 b->stack_len = 0;
482#endif
483 return b;
484}
485
486
487
488
489static void free_buffer(struct dm_buffer *b)
490{
491 struct dm_bufio_client *c = b->c;
492
493 free_buffer_data(c, b->data, b->data_mode);
494 kmem_cache_free(c->slab_buffer, b);
495}
496
497
498
499
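/* Link a buffer into the rbtree and the given LRU list; caller holds the client lock. */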
500static void __link_buffer(struct dm_buffer *b, sector_t block, int dirty)
501{
502 struct dm_bufio_client *c = b->c;
503
504 c->n_buffers[dirty]++;
505 b->block = block;
506 b->list_mode = dirty;
507 list_add(&b->lru_list, &c->lru[dirty]);
508 __insert(b->c, b);
509 b->last_accessed = jiffies;
510
511 adjust_total_allocated(b, false);
512}
513
514
515
516
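/* Remove a buffer from the rbtree and its LRU list and drop it from the accounting. */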
517static void __unlink_buffer(struct dm_buffer *b)
518{
519 struct dm_bufio_client *c = b->c;
520
521 BUG_ON(!c->n_buffers[b->list_mode]);
522
523 c->n_buffers[b->list_mode]--;
524 __remove(b->c, b);
525 list_del(&b->lru_list);
526
527 adjust_total_allocated(b, true);
528}
529
530
531
532
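/* Move a buffer to the head of the given LRU list and refresh its access time. */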
533static void __relink_lru(struct dm_buffer *b, int dirty)
534{
535 struct dm_bufio_client *c = b->c;
536
537 b->accessed = 1;
538
539 BUG_ON(!c->n_buffers[b->list_mode]);
540
541 c->n_buffers[b->list_mode]--;
542 c->n_buffers[dirty]++;
543 b->list_mode = dirty;
544 list_move(&b->lru_list, &c->lru[dirty]);
545 b->last_accessed = jiffies;
546}
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
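/*
 * I/O submission. Buffers backed by slab or get_free_pages memory are
 * submitted with an ordinary bio; vmalloc-backed buffers (and bio
 * allocation failures) go through dm-io, which handles virtually
 * contiguous memory.
 */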
570static void dmio_complete(unsigned long error, void *context)
571{
572 struct dm_buffer *b = context;
573
574 b->end_io(b, unlikely(error != 0) ? BLK_STS_IOERR : 0);
575}
576
577static void use_dmio(struct dm_buffer *b, int rw, sector_t sector,
578 unsigned n_sectors, unsigned offset)
579{
580 int r;
581 struct dm_io_request io_req = {
582 .bi_op = rw,
583 .bi_op_flags = 0,
584 .notify.fn = dmio_complete,
585 .notify.context = b,
586 .client = b->c->dm_io,
587 };
588 struct dm_io_region region = {
589 .bdev = b->c->bdev,
590 .sector = sector,
591 .count = n_sectors,
592 };
593
594 if (b->data_mode != DATA_MODE_VMALLOC) {
595 io_req.mem.type = DM_IO_KMEM;
596 io_req.mem.ptr.addr = (char *)b->data + offset;
597 } else {
598 io_req.mem.type = DM_IO_VMA;
599 io_req.mem.ptr.vma = (char *)b->data + offset;
600 }
601
	r = dm_io(&io_req, 1, &region, NULL);
603 if (unlikely(r))
604 b->end_io(b, errno_to_blk_status(r));
605}
606
607static void bio_complete(struct bio *bio)
608{
609 struct dm_buffer *b = bio->bi_private;
610 blk_status_t status = bio->bi_status;
611 bio_put(bio);
612 b->end_io(b, status);
613}
614
615static void use_bio(struct dm_buffer *b, int rw, sector_t sector,
616 unsigned n_sectors, unsigned offset)
617{
618 struct bio *bio;
619 char *ptr;
620 unsigned vec_size, len;
621
622 vec_size = b->c->block_size >> PAGE_SHIFT;
623 if (unlikely(b->c->sectors_per_block_bits < PAGE_SHIFT - SECTOR_SHIFT))
624 vec_size += 2;
625
626 bio = bio_kmalloc(GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN, vec_size);
627 if (!bio) {
628dmio:
629 use_dmio(b, rw, sector, n_sectors, offset);
630 return;
631 }
632
633 bio->bi_iter.bi_sector = sector;
634 bio_set_dev(bio, b->c->bdev);
635 bio_set_op_attrs(bio, rw, 0);
636 bio->bi_end_io = bio_complete;
637 bio->bi_private = b;
638
639 ptr = (char *)b->data + offset;
640 len = n_sectors << SECTOR_SHIFT;
641
642 do {
643 unsigned this_step = min((unsigned)(PAGE_SIZE - offset_in_page(ptr)), len);
644 if (!bio_add_page(bio, virt_to_page(ptr), this_step,
645 offset_in_page(ptr))) {
646 bio_put(bio);
647 goto dmio;
648 }
649
650 len -= this_step;
651 ptr += this_step;
652 } while (len > 0);
653
654 submit_bio(bio);
655}
656
657static inline sector_t block_to_sector(struct dm_bufio_client *c, sector_t block)
658{
659 sector_t sector;
660
661 if (likely(c->sectors_per_block_bits >= 0))
662 sector = block << c->sectors_per_block_bits;
663 else
664 sector = block * (c->block_size >> SECTOR_SHIFT);
665 sector += c->start;
666
667 return sector;
668}
669
670static void submit_io(struct dm_buffer *b, int rw, void (*end_io)(struct dm_buffer *, blk_status_t))
671{
672 unsigned n_sectors;
673 sector_t sector;
674 unsigned offset, end;
675
676 b->end_io = end_io;
677
678 sector = block_to_sector(b->c, b->block);
679
680 if (rw != REQ_OP_WRITE) {
681 n_sectors = b->c->block_size >> SECTOR_SHIFT;
682 offset = 0;
683 } else {
684 if (b->c->write_callback)
685 b->c->write_callback(b);
686 offset = b->write_start;
687 end = b->write_end;
688 offset &= -DM_BUFIO_WRITE_ALIGN;
689 end += DM_BUFIO_WRITE_ALIGN - 1;
690 end &= -DM_BUFIO_WRITE_ALIGN;
691 if (unlikely(end > b->c->block_size))
692 end = b->c->block_size;
693
694 sector += offset >> SECTOR_SHIFT;
695 n_sectors = (end - offset) >> SECTOR_SHIFT;
696 }
697
698 if (b->data_mode != DATA_MODE_VMALLOC)
699 use_bio(b, rw, sector, n_sectors, offset);
700 else
701 use_dmio(b, rw, sector, n_sectors, offset);
702}
703
704
705
706
707
708
709
710
711
712
713
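/* Write completion: record the error, clear B_WRITING and wake waiters. */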
714static void write_endio(struct dm_buffer *b, blk_status_t status)
715{
716 b->write_error = status;
717 if (unlikely(status)) {
718 struct dm_bufio_client *c = b->c;
719
720 (void)cmpxchg(&c->async_write_error, 0,
721 blk_status_to_errno(status));
722 }
723
724 BUG_ON(!test_bit(B_WRITING, &b->state));
725
726 smp_mb__before_atomic();
727 clear_bit(B_WRITING, &b->state);
728 smp_mb__after_atomic();
729
730 wake_up_bit(&b->state, B_WRITING);
731}
732
733
734
735
736
737
738
739
740
741
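/*
 * Start writeback of a dirty buffer without waiting for completion. If
 * write_list is supplied the I/O is only queued there and submitted later
 * by __flush_write_list(); otherwise it is submitted immediately.
 */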
742static void __write_dirty_buffer(struct dm_buffer *b,
743 struct list_head *write_list)
744{
745 if (!test_bit(B_DIRTY, &b->state))
746 return;
747
748 clear_bit(B_DIRTY, &b->state);
749 wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
750
751 b->write_start = b->dirty_start;
752 b->write_end = b->dirty_end;
753
754 if (!write_list)
755 submit_io(b, REQ_OP_WRITE, write_endio);
756 else
757 list_add_tail(&b->write_list, write_list);
758}
759
760static void __flush_write_list(struct list_head *write_list)
761{
762 struct blk_plug plug;
763 blk_start_plug(&plug);
764 while (!list_empty(write_list)) {
765 struct dm_buffer *b =
766 list_entry(write_list->next, struct dm_buffer, write_list);
767 list_del(&b->write_list);
768 submit_io(b, REQ_OP_WRITE, write_endio);
769 cond_resched();
770 }
771 blk_finish_plug(&plug);
772}
773
774
775
776
777
778
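/* Wait until the buffer is neither being read nor written, writing it out first if it is dirty. */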
779static void __make_buffer_clean(struct dm_buffer *b)
780{
781 BUG_ON(b->hold_count);
782
783 if (!b->state)
784 return;
785
786 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
787 __write_dirty_buffer(b, NULL);
788 wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
789}
790
791
792
793
794
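/*
 * Find a buffer that nobody holds and that can therefore be reclaimed,
 * preferring clean buffers over dirty ones, and unlink it from the cache.
 */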
795static struct dm_buffer *__get_unclaimed_buffer(struct dm_bufio_client *c)
796{
797 struct dm_buffer *b;
798
799 list_for_each_entry_reverse(b, &c->lru[LIST_CLEAN], lru_list) {
800 BUG_ON(test_bit(B_WRITING, &b->state));
801 BUG_ON(test_bit(B_DIRTY, &b->state));
802
803 if (!b->hold_count) {
804 __make_buffer_clean(b);
805 __unlink_buffer(b);
806 return b;
807 }
808 cond_resched();
809 }
810
811 list_for_each_entry_reverse(b, &c->lru[LIST_DIRTY], lru_list) {
812 BUG_ON(test_bit(B_READING, &b->state));
813
814 if (!b->hold_count) {
815 __make_buffer_clean(b);
816 __unlink_buffer(b);
817 return b;
818 }
819 cond_resched();
820 }
821
822 return NULL;
823}
824
825
826
827
828
829
830
831
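/* Drop the client lock and sleep until some other holder releases a buffer. */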
832static void __wait_for_free_buffer(struct dm_bufio_client *c)
833{
834 DECLARE_WAITQUEUE(wait, current);
835
836 add_wait_queue(&c->free_buffer_wait, &wait);
837 set_current_state(TASK_UNINTERRUPTIBLE);
838 dm_bufio_unlock(c);
839
840 io_schedule();
841
842 remove_wait_queue(&c->free_buffer_wait, &wait);
843
844 dm_bufio_lock(c);
845}
846
847enum new_flag {
848 NF_FRESH = 0,
849 NF_READ = 1,
850 NF_GET = 2,
851 NF_PREFETCH = 3
852};
853
854
855
856
857
858
859
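/*
 * Allocate a buffer, sleeping until one becomes available if necessary.
 * NF_PREFETCH callers never sleep; they simply return NULL on failure.
 */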
860static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
861{
862 struct dm_buffer *b;
863 bool tried_noio_alloc = false;
864
865
866
867
868
869
870
871
872
873
874
875
876
877
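	/*
	 * Try a cheap GFP_NOWAIT allocation first, then (once) a GFP_NOIO
	 * allocation with the client lock dropped, then fall back to the
	 * reserve pool or to reclaiming an unclaimed buffer. A latched cache
	 * size of 1 skips both allocation attempts so that only reserved or
	 * reclaimed buffers are used.
	 */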
878 while (1) {
879 if (dm_bufio_cache_size_latch != 1) {
880 b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
881 if (b)
882 return b;
883 }
884
885 if (nf == NF_PREFETCH)
886 return NULL;
887
888 if (dm_bufio_cache_size_latch != 1 && !tried_noio_alloc) {
889 dm_bufio_unlock(c);
890 b = alloc_buffer(c, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
891 dm_bufio_lock(c);
892 if (b)
893 return b;
894 tried_noio_alloc = true;
895 }
896
897 if (!list_empty(&c->reserved_buffers)) {
898 b = list_entry(c->reserved_buffers.next,
899 struct dm_buffer, lru_list);
900 list_del(&b->lru_list);
901 c->need_reserved_buffers++;
902
903 return b;
904 }
905
906 b = __get_unclaimed_buffer(c);
907 if (b)
908 return b;
909
910 __wait_for_free_buffer(c);
911 }
912}
913
914static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
915{
916 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
917
918 if (!b)
919 return NULL;
920
921 if (c->alloc_callback)
922 c->alloc_callback(b);
923
924 return b;
925}
926
927
928
929
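/*
 * Free a buffer, or return it to the reserve pool if the pool is short,
 * and wake up anyone waiting for a free buffer.
 */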
930static void __free_buffer_wake(struct dm_buffer *b)
931{
932 struct dm_bufio_client *c = b->c;
933
934 if (!c->need_reserved_buffers)
935 free_buffer(b);
936 else {
937 list_add(&b->lru_list, &c->reserved_buffers);
938 c->need_reserved_buffers--;
939 }
940
941 wake_up(&c->free_buffer_wait);
942}
943
944static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
945 struct list_head *write_list)
946{
947 struct dm_buffer *b, *tmp;
948
949 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
950 BUG_ON(test_bit(B_READING, &b->state));
951
952 if (!test_bit(B_DIRTY, &b->state) &&
953 !test_bit(B_WRITING, &b->state)) {
954 __relink_lru(b, LIST_CLEAN);
955 continue;
956 }
957
958 if (no_wait && test_bit(B_WRITING, &b->state))
959 return;
960
961 __write_dirty_buffer(b, write_list);
962 cond_resched();
963 }
964}
965
966
967
968
969
970
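/*
 * Start background writeback when dirty buffers outnumber clean ones by
 * more than DM_BUFIO_WRITEBACK_RATIO to one.
 */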
971static void __check_watermark(struct dm_bufio_client *c,
972 struct list_head *write_list)
973{
974 if (c->n_buffers[LIST_DIRTY] > c->n_buffers[LIST_CLEAN] * DM_BUFIO_WRITEBACK_RATIO)
975 __write_dirty_buffers_async(c, 1, write_list);
976}
977
978
979
980
981
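/*
 * Look up a buffer, allocating a new one if it is not cached. NF_GET never
 * allocates, NF_FRESH returns the new buffer without reading it, and the
 * other modes set *need_submit so the caller issues the read. Called with
 * the client lock held.
 */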
982static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
983 enum new_flag nf, int *need_submit,
984 struct list_head *write_list)
985{
986 struct dm_buffer *b, *new_b = NULL;
987
988 *need_submit = 0;
989
990 b = __find(c, block);
991 if (b)
992 goto found_buffer;
993
994 if (nf == NF_GET)
995 return NULL;
996
997 new_b = __alloc_buffer_wait(c, nf);
998 if (!new_b)
999 return NULL;
1000
1001
1002
1003
1004
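	/*
	 * The client lock may have been dropped while allocating, so another
	 * thread could have inserted the same block meanwhile; check again.
	 */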
1005 b = __find(c, block);
1006 if (b) {
1007 __free_buffer_wake(new_b);
1008 goto found_buffer;
1009 }
1010
1011 __check_watermark(c, write_list);
1012
1013 b = new_b;
1014 b->hold_count = 1;
1015 b->read_error = 0;
1016 b->write_error = 0;
1017 __link_buffer(b, block, LIST_CLEAN);
1018
1019 if (nf == NF_FRESH) {
1020 b->state = 0;
1021 return b;
1022 }
1023
1024 b->state = 1 << B_READING;
1025 *need_submit = 1;
1026
1027 return b;
1028
1029found_buffer:
1030 if (nf == NF_PREFETCH)
1031 return NULL;
1032
1033
1034
1035
1036
1037
1038
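	/*
	 * dm_bufio_get() must not sleep, so if the buffer is still being
	 * read, report it as not cached rather than waiting for the I/O.
	 */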
1039 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
1040 return NULL;
1041
1042 b->hold_count++;
1043 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
1044 test_bit(B_WRITING, &b->state));
1045 return b;
1046}
1047
1048
1049
1050
1051
1052static void read_endio(struct dm_buffer *b, blk_status_t status)
1053{
1054 b->read_error = status;
1055
1056 BUG_ON(!test_bit(B_READING, &b->state));
1057
1058 smp_mb__before_atomic();
1059 clear_bit(B_READING, &b->state);
1060 smp_mb__after_atomic();
1061
1062 wake_up_bit(&b->state, B_READING);
1063}
1064
1065
1066
1067
1068
1069
1070
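/*
 * Common implementation of dm_bufio_get/read/new: look up or allocate the
 * buffer, flush any writes queued while finding it, issue the read if
 * needed and wait for it to complete.
 */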
1071static void *new_read(struct dm_bufio_client *c, sector_t block,
1072 enum new_flag nf, struct dm_buffer **bp)
1073{
1074 int need_submit;
1075 struct dm_buffer *b;
1076
1077 LIST_HEAD(write_list);
1078
1079 dm_bufio_lock(c);
1080 b = __bufio_new(c, block, nf, &need_submit, &write_list);
1081#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1082 if (b && b->hold_count == 1)
1083 buffer_record_stack(b);
1084#endif
1085 dm_bufio_unlock(c);
1086
1087 __flush_write_list(&write_list);
1088
1089 if (!b)
1090 return NULL;
1091
1092 if (need_submit)
1093 submit_io(b, REQ_OP_READ, read_endio);
1094
1095 wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
1096
1097 if (b->read_error) {
1098 int error = blk_status_to_errno(b->read_error);
1099
1100 dm_bufio_release(b);
1101
1102 return ERR_PTR(error);
1103 }
1104
1105 *bp = b;
1106
1107 return b->data;
1108}
1109
1110void *dm_bufio_get(struct dm_bufio_client *c, sector_t block,
1111 struct dm_buffer **bp)
1112{
1113 return new_read(c, block, NF_GET, bp);
1114}
1115EXPORT_SYMBOL_GPL(dm_bufio_get);
1116
1117void *dm_bufio_read(struct dm_bufio_client *c, sector_t block,
1118 struct dm_buffer **bp)
1119{
1120 BUG_ON(dm_bufio_in_request());
1121
1122 return new_read(c, block, NF_READ, bp);
1123}
1124EXPORT_SYMBOL_GPL(dm_bufio_read);
1125
1126void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1127 struct dm_buffer **bp)
1128{
1129 BUG_ON(dm_bufio_in_request());
1130
1131 return new_read(c, block, NF_FRESH, bp);
1132}
1133EXPORT_SYMBOL_GPL(dm_bufio_new);
1134
1135void dm_bufio_prefetch(struct dm_bufio_client *c,
1136 sector_t block, unsigned n_blocks)
1137{
1138 struct blk_plug plug;
1139
1140 LIST_HEAD(write_list);
1141
1142 BUG_ON(dm_bufio_in_request());
1143
1144 blk_start_plug(&plug);
1145 dm_bufio_lock(c);
1146
1147 for (; n_blocks--; block++) {
1148 int need_submit;
1149 struct dm_buffer *b;
1150 b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
1151 &write_list);
1152 if (unlikely(!list_empty(&write_list))) {
1153 dm_bufio_unlock(c);
1154 blk_finish_plug(&plug);
1155 __flush_write_list(&write_list);
1156 blk_start_plug(&plug);
1157 dm_bufio_lock(c);
1158 }
1159 if (unlikely(b != NULL)) {
1160 dm_bufio_unlock(c);
1161
1162 if (need_submit)
1163 submit_io(b, REQ_OP_READ, read_endio);
1164 dm_bufio_release(b);
1165
1166 cond_resched();
1167
1168 if (!n_blocks)
1169 goto flush_plug;
1170 dm_bufio_lock(c);
1171 }
1172 }
1173
1174 dm_bufio_unlock(c);
1175
1176flush_plug:
1177 blk_finish_plug(&plug);
1178}
1179EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1180
1181void dm_bufio_release(struct dm_buffer *b)
1182{
1183 struct dm_bufio_client *c = b->c;
1184
1185 dm_bufio_lock(c);
1186
1187 BUG_ON(!b->hold_count);
1188
1189 b->hold_count--;
1190 if (!b->hold_count) {
1191 wake_up(&c->free_buffer_wait);
1192
1193
1194
1195
1196
1197
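		/*
		 * A buffer that saw a read or write error is dropped from
		 * the cache once it is idle (not held, no I/O in flight and
		 * not dirty).
		 */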
1198 if ((b->read_error || b->write_error) &&
1199 !test_bit(B_READING, &b->state) &&
1200 !test_bit(B_WRITING, &b->state) &&
1201 !test_bit(B_DIRTY, &b->state)) {
1202 __unlink_buffer(b);
1203 __free_buffer_wake(b);
1204 }
1205 }
1206
1207 dm_bufio_unlock(c);
1208}
1209EXPORT_SYMBOL_GPL(dm_bufio_release);
1210
1211void dm_bufio_mark_partial_buffer_dirty(struct dm_buffer *b,
1212 unsigned start, unsigned end)
1213{
1214 struct dm_bufio_client *c = b->c;
1215
1216 BUG_ON(start >= end);
1217 BUG_ON(end > b->c->block_size);
1218
1219 dm_bufio_lock(c);
1220
1221 BUG_ON(test_bit(B_READING, &b->state));
1222
1223 if (!test_and_set_bit(B_DIRTY, &b->state)) {
1224 b->dirty_start = start;
1225 b->dirty_end = end;
1226 __relink_lru(b, LIST_DIRTY);
1227 } else {
1228 if (start < b->dirty_start)
1229 b->dirty_start = start;
1230 if (end > b->dirty_end)
1231 b->dirty_end = end;
1232 }
1233
1234 dm_bufio_unlock(c);
1235}
1236EXPORT_SYMBOL_GPL(dm_bufio_mark_partial_buffer_dirty);
1237
1238void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1239{
1240 dm_bufio_mark_partial_buffer_dirty(b, 0, b->c->block_size);
1241}
1242EXPORT_SYMBOL_GPL(dm_bufio_mark_buffer_dirty);
1243
1244void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
1245{
1246 LIST_HEAD(write_list);
1247
1248 BUG_ON(dm_bufio_in_request());
1249
1250 dm_bufio_lock(c);
1251 __write_dirty_buffers_async(c, 0, &write_list);
1252 dm_bufio_unlock(c);
1253 __flush_write_list(&write_list);
1254}
1255EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
1256
1257
1258
1259
1260
1261
1262
1263
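/*
 * Write all dirty buffers and wait for them to complete. Writes are first
 * submitted asynchronously, then each buffer is waited on; the list walk is
 * restarted whenever the lock had to be dropped while waiting.
 */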
1264int dm_bufio_write_dirty_buffers(struct dm_bufio_client *c)
1265{
1266 int a, f;
1267 unsigned long buffers_processed = 0;
1268 struct dm_buffer *b, *tmp;
1269
1270 LIST_HEAD(write_list);
1271
1272 dm_bufio_lock(c);
1273 __write_dirty_buffers_async(c, 0, &write_list);
1274 dm_bufio_unlock(c);
1275 __flush_write_list(&write_list);
1276 dm_bufio_lock(c);
1277
1278again:
1279 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
1280 int dropped_lock = 0;
1281
1282 if (buffers_processed < c->n_buffers[LIST_DIRTY])
1283 buffers_processed++;
1284
1285 BUG_ON(test_bit(B_READING, &b->state));
1286
1287 if (test_bit(B_WRITING, &b->state)) {
1288 if (buffers_processed < c->n_buffers[LIST_DIRTY]) {
1289 dropped_lock = 1;
1290 b->hold_count++;
1291 dm_bufio_unlock(c);
1292 wait_on_bit_io(&b->state, B_WRITING,
1293 TASK_UNINTERRUPTIBLE);
1294 dm_bufio_lock(c);
1295 b->hold_count--;
1296 } else
1297 wait_on_bit_io(&b->state, B_WRITING,
1298 TASK_UNINTERRUPTIBLE);
1299 }
1300
1301 if (!test_bit(B_DIRTY, &b->state) &&
1302 !test_bit(B_WRITING, &b->state))
1303 __relink_lru(b, LIST_CLEAN);
1304
1305 cond_resched();
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
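		/*
		 * If the lock was dropped, the dirty list may have changed
		 * under us, so restart the walk from the tail. Once
		 * buffers_processed reaches the dirty count we stop dropping
		 * the lock, which bounds the rescanning.
		 */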
1321 if (dropped_lock)
1322 goto again;
1323 }
1324 wake_up(&c->free_buffer_wait);
1325 dm_bufio_unlock(c);
1326
1327 a = xchg(&c->async_write_error, 0);
1328 f = dm_bufio_issue_flush(c);
1329 if (a)
1330 return a;
1331
1332 return f;
1333}
1334EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers);
1335
1336
1337
1338
1339int dm_bufio_issue_flush(struct dm_bufio_client *c)
1340{
1341 struct dm_io_request io_req = {
1342 .bi_op = REQ_OP_WRITE,
1343 .bi_op_flags = REQ_PREFLUSH | REQ_SYNC,
1344 .mem.type = DM_IO_KMEM,
1345 .mem.ptr.addr = NULL,
1346 .client = c->dm_io,
1347 };
1348 struct dm_io_region io_reg = {
1349 .bdev = c->bdev,
1350 .sector = 0,
1351 .count = 0,
1352 };
1353
1354 BUG_ON(dm_bufio_in_request());
1355
1356 return dm_io(&io_req, 1, &io_reg, NULL);
1357}
1358EXPORT_SYMBOL_GPL(dm_bufio_issue_flush);
1359
1360
1361
1362
1363int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t count)
1364{
1365 struct dm_io_request io_req = {
1366 .bi_op = REQ_OP_DISCARD,
1367 .bi_op_flags = REQ_SYNC,
1368 .mem.type = DM_IO_KMEM,
1369 .mem.ptr.addr = NULL,
1370 .client = c->dm_io,
1371 };
1372 struct dm_io_region io_reg = {
1373 .bdev = c->bdev,
1374 .sector = block_to_sector(c, block),
1375 .count = block_to_sector(c, count),
1376 };
1377
1378 BUG_ON(dm_bufio_in_request());
1379
1380 return dm_io(&io_req, 1, &io_reg, NULL);
1381}
1382EXPORT_SYMBOL_GPL(dm_bufio_issue_discard);
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
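/*
 * Move a buffer to a new block number and release it. Any existing buffer
 * at the destination is dropped first. If the caller is the only holder the
 * buffer is simply relinked as dirty at the new block; otherwise it is
 * written out at the new location and then relinked to its old block.
 */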
1396void dm_bufio_release_move(struct dm_buffer *b, sector_t new_block)
1397{
1398 struct dm_bufio_client *c = b->c;
1399 struct dm_buffer *new;
1400
1401 BUG_ON(dm_bufio_in_request());
1402
1403 dm_bufio_lock(c);
1404
1405retry:
1406 new = __find(c, new_block);
1407 if (new) {
1408 if (new->hold_count) {
1409 __wait_for_free_buffer(c);
1410 goto retry;
1411 }
1412
1413
1414
1415
1416
1417 __make_buffer_clean(new);
1418 __unlink_buffer(new);
1419 __free_buffer_wake(new);
1420 }
1421
1422 BUG_ON(!b->hold_count);
1423 BUG_ON(test_bit(B_READING, &b->state));
1424
1425 __write_dirty_buffer(b, NULL);
1426 if (b->hold_count == 1) {
1427 wait_on_bit_io(&b->state, B_WRITING,
1428 TASK_UNINTERRUPTIBLE);
1429 set_bit(B_DIRTY, &b->state);
1430 b->dirty_start = 0;
1431 b->dirty_end = c->block_size;
1432 __unlink_buffer(b);
1433 __link_buffer(b, new_block, LIST_DIRTY);
1434 } else {
1435 sector_t old_block;
1436 wait_on_bit_lock_io(&b->state, B_WRITING,
1437 TASK_UNINTERRUPTIBLE);
1438
1439
1440
1441
1442
1443
1444
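		/*
		 * Relink to the new block so that write_callback and
		 * submit_io operate on the new location, then relink back
		 * afterwards because the remaining holders still reference
		 * the buffer under its old block number.
		 */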
1445 old_block = b->block;
1446 __unlink_buffer(b);
1447 __link_buffer(b, new_block, b->list_mode);
1448 submit_io(b, REQ_OP_WRITE, write_endio);
1449 wait_on_bit_io(&b->state, B_WRITING,
1450 TASK_UNINTERRUPTIBLE);
1451 __unlink_buffer(b);
1452 __link_buffer(b, old_block, b->list_mode);
1453 }
1454
1455 dm_bufio_unlock(c);
1456 dm_bufio_release(b);
1457}
1458EXPORT_SYMBOL_GPL(dm_bufio_release_move);
1459
1460static void forget_buffer_locked(struct dm_buffer *b)
1461{
1462 if (likely(!b->hold_count) && likely(!b->state)) {
1463 __unlink_buffer(b);
1464 __free_buffer_wake(b);
1465 }
1466}
1467
1468
1469
1470
1471
1472
1473
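/*
 * Drop a block from the cache. This is only a hint: a buffer that is held,
 * dirty or under I/O is left alone.
 */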
1474void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
1475{
1476 struct dm_buffer *b;
1477
1478 dm_bufio_lock(c);
1479
1480 b = __find(c, block);
1481 if (b)
1482 forget_buffer_locked(b);
1483
1484 dm_bufio_unlock(c);
1485}
1486EXPORT_SYMBOL_GPL(dm_bufio_forget);
1487
1488void dm_bufio_forget_buffers(struct dm_bufio_client *c, sector_t block, sector_t n_blocks)
1489{
1490 struct dm_buffer *b;
1491 sector_t end_block = block + n_blocks;
1492
1493 while (block < end_block) {
1494 dm_bufio_lock(c);
1495
1496 b = __find_next(c, block);
1497 if (b) {
1498 block = b->block + 1;
1499 forget_buffer_locked(b);
1500 }
1501
1502 dm_bufio_unlock(c);
1503
1504 if (!b)
1505 break;
1506 }
1507
1508}
1509EXPORT_SYMBOL_GPL(dm_bufio_forget_buffers);
1510
1511void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
1512{
1513 c->minimum_buffers = n;
1514}
1515EXPORT_SYMBOL_GPL(dm_bufio_set_minimum_buffers);
1516
1517unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
1518{
1519 return c->block_size;
1520}
1521EXPORT_SYMBOL_GPL(dm_bufio_get_block_size);
1522
1523sector_t dm_bufio_get_device_size(struct dm_bufio_client *c)
1524{
1525 sector_t s = i_size_read(c->bdev->bd_inode) >> SECTOR_SHIFT;
1526 if (likely(c->sectors_per_block_bits >= 0))
1527 s >>= c->sectors_per_block_bits;
1528 else
1529 sector_div(s, c->block_size >> SECTOR_SHIFT);
1530 return s;
1531}
1532EXPORT_SYMBOL_GPL(dm_bufio_get_device_size);
1533
1534sector_t dm_bufio_get_block_number(struct dm_buffer *b)
1535{
1536 return b->block;
1537}
1538EXPORT_SYMBOL_GPL(dm_bufio_get_block_number);
1539
1540void *dm_bufio_get_block_data(struct dm_buffer *b)
1541{
1542 return b->data;
1543}
1544EXPORT_SYMBOL_GPL(dm_bufio_get_block_data);
1545
1546void *dm_bufio_get_aux_data(struct dm_buffer *b)
1547{
1548 return b + 1;
1549}
1550EXPORT_SYMBOL_GPL(dm_bufio_get_aux_data);
1551
1552struct dm_bufio_client *dm_bufio_get_client(struct dm_buffer *b)
1553{
1554 return b->c;
1555}
1556EXPORT_SYMBOL_GPL(dm_bufio_get_client);
1557
1558static void drop_buffers(struct dm_bufio_client *c)
1559{
1560 struct dm_buffer *b;
1561 int i;
1562 bool warned = false;
1563
1564 BUG_ON(dm_bufio_in_request());
1565
1566
1567
1568
1569 dm_bufio_write_dirty_buffers_async(c);
1570
1571 dm_bufio_lock(c);
1572
1573 while ((b = __get_unclaimed_buffer(c)))
1574 __free_buffer_wake(b);
1575
1576 for (i = 0; i < LIST_SIZE; i++)
1577 list_for_each_entry(b, &c->lru[i], lru_list) {
1578 WARN_ON(!warned);
1579 warned = true;
1580 DMERR("leaked buffer %llx, hold count %u, list %d",
1581 (unsigned long long)b->block, b->hold_count, i);
1582#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1583 stack_trace_print(b->stack_entries, b->stack_len, 1);
1584
1585 b->hold_count = 0;
1586#endif
1587 }
1588
1589#ifdef CONFIG_DM_DEBUG_BLOCK_STACK_TRACING
1590 while ((b = __get_unclaimed_buffer(c)))
1591 __free_buffer_wake(b);
1592#endif
1593
1594 for (i = 0; i < LIST_SIZE; i++)
1595 BUG_ON(!list_empty(&c->lru[i]));
1596
1597 dm_bufio_unlock(c);
1598}
1599
1600
1601
1602
1603
1604
1605
1606
1607
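/*
 * Try to evict one buffer. Held buffers are never evicted, and without
 * __GFP_FS we must not wait for I/O, so buffers that are dirty or have I/O
 * in flight are skipped as well.
 */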
1608static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
1609{
1610 if (!(gfp & __GFP_FS)) {
1611 if (test_bit(B_READING, &b->state) ||
1612 test_bit(B_WRITING, &b->state) ||
1613 test_bit(B_DIRTY, &b->state))
1614 return false;
1615 }
1616
1617 if (b->hold_count)
1618 return false;
1619
1620 __make_buffer_clean(b);
1621 __unlink_buffer(b);
1622 __free_buffer_wake(b);
1623
1624 return true;
1625}
1626
1627static unsigned long get_retain_buffers(struct dm_bufio_client *c)
1628{
1629 unsigned long retain_bytes = READ_ONCE(dm_bufio_retain_bytes);
1630 if (likely(c->sectors_per_block_bits >= 0))
1631 retain_bytes >>= c->sectors_per_block_bits + SECTOR_SHIFT;
1632 else
1633 retain_bytes /= c->block_size;
1634 return retain_bytes;
1635}
1636
1637static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
1638 gfp_t gfp_mask)
1639{
1640 int l;
1641 struct dm_buffer *b, *tmp;
1642 unsigned long freed = 0;
1643 unsigned long count = c->n_buffers[LIST_CLEAN] +
1644 c->n_buffers[LIST_DIRTY];
1645 unsigned long retain_target = get_retain_buffers(c);
1646
1647 for (l = 0; l < LIST_SIZE; l++) {
1648 list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
1649 if (__try_evict_buffer(b, gfp_mask))
1650 freed++;
1651 if (!--nr_to_scan || ((count - freed) <= retain_target))
1652 return freed;
1653 cond_resched();
1654 }
1655 }
1656 return freed;
1657}
1658
1659static unsigned long
1660dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
1661{
1662 struct dm_bufio_client *c;
1663 unsigned long freed;
1664
1665 c = container_of(shrink, struct dm_bufio_client, shrinker);
1666 if (sc->gfp_mask & __GFP_FS)
1667 dm_bufio_lock(c);
1668 else if (!dm_bufio_trylock(c))
1669 return SHRINK_STOP;
1670
1671 freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
1672 dm_bufio_unlock(c);
1673 return freed;
1674}
1675
1676static unsigned long
1677dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
1678{
1679 struct dm_bufio_client *c = container_of(shrink, struct dm_bufio_client, shrinker);
1680 unsigned long count = READ_ONCE(c->n_buffers[LIST_CLEAN]) +
1681 READ_ONCE(c->n_buffers[LIST_DIRTY]);
1682 unsigned long retain_target = get_retain_buffers(c);
1683
1684 return (count < retain_target) ? 0 : (count - retain_target);
1685}
1686
1687
1688
1689
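/*
 * Create a client: set up the slab caches for buffer data and descriptors,
 * the dm-io client, the reserved buffers and the shrinker, then register
 * the client globally.
 */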
1690struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsigned block_size,
1691 unsigned reserved_buffers, unsigned aux_size,
1692 void (*alloc_callback)(struct dm_buffer *),
1693 void (*write_callback)(struct dm_buffer *))
1694{
1695 int r;
1696 struct dm_bufio_client *c;
1697 unsigned i;
1698 char slab_name[27];
1699
1700 if (!block_size || block_size & ((1 << SECTOR_SHIFT) - 1)) {
1701 DMERR("%s: block size not specified or is not multiple of 512b", __func__);
1702 r = -EINVAL;
1703 goto bad_client;
1704 }
1705
1706 c = kzalloc(sizeof(*c), GFP_KERNEL);
1707 if (!c) {
1708 r = -ENOMEM;
1709 goto bad_client;
1710 }
1711 c->buffer_tree = RB_ROOT;
1712
1713 c->bdev = bdev;
1714 c->block_size = block_size;
1715 if (is_power_of_2(block_size))
1716 c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
1717 else
1718 c->sectors_per_block_bits = -1;
1719
1720 c->alloc_callback = alloc_callback;
1721 c->write_callback = write_callback;
1722
1723 for (i = 0; i < LIST_SIZE; i++) {
1724 INIT_LIST_HEAD(&c->lru[i]);
1725 c->n_buffers[i] = 0;
1726 }
1727
1728 mutex_init(&c->lock);
1729 INIT_LIST_HEAD(&c->reserved_buffers);
1730 c->need_reserved_buffers = reserved_buffers;
1731
1732 dm_bufio_set_minimum_buffers(c, DM_BUFIO_MIN_BUFFERS);
1733
1734 init_waitqueue_head(&c->free_buffer_wait);
1735 c->async_write_error = 0;
1736
1737 c->dm_io = dm_io_client_create();
1738 if (IS_ERR(c->dm_io)) {
1739 r = PTR_ERR(c->dm_io);
1740 goto bad_dm_io;
1741 }
1742
1743 if (block_size <= KMALLOC_MAX_SIZE &&
1744 (block_size < PAGE_SIZE || !is_power_of_2(block_size))) {
1745 unsigned align = min(1U << __ffs(block_size), (unsigned)PAGE_SIZE);
1746 snprintf(slab_name, sizeof slab_name, "dm_bufio_cache-%u", block_size);
1747 c->slab_cache = kmem_cache_create(slab_name, block_size, align,
1748 SLAB_RECLAIM_ACCOUNT, NULL);
1749 if (!c->slab_cache) {
1750 r = -ENOMEM;
1751 goto bad;
1752 }
1753 }
1754 if (aux_size)
1755 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer-%u", aux_size);
1756 else
1757 snprintf(slab_name, sizeof slab_name, "dm_bufio_buffer");
1758 c->slab_buffer = kmem_cache_create(slab_name, sizeof(struct dm_buffer) + aux_size,
1759 0, SLAB_RECLAIM_ACCOUNT, NULL);
1760 if (!c->slab_buffer) {
1761 r = -ENOMEM;
1762 goto bad;
1763 }
1764
1765 while (c->need_reserved_buffers) {
1766 struct dm_buffer *b = alloc_buffer(c, GFP_KERNEL);
1767
1768 if (!b) {
1769 r = -ENOMEM;
1770 goto bad;
1771 }
1772 __free_buffer_wake(b);
1773 }
1774
1775 c->shrinker.count_objects = dm_bufio_shrink_count;
1776 c->shrinker.scan_objects = dm_bufio_shrink_scan;
1777 c->shrinker.seeks = 1;
1778 c->shrinker.batch = 0;
1779 r = register_shrinker(&c->shrinker);
1780 if (r)
1781 goto bad;
1782
1783 mutex_lock(&dm_bufio_clients_lock);
1784 dm_bufio_client_count++;
1785 list_add(&c->client_list, &dm_bufio_all_clients);
1786 __cache_size_refresh();
1787 mutex_unlock(&dm_bufio_clients_lock);
1788
1789 return c;
1790
1791bad:
1792 while (!list_empty(&c->reserved_buffers)) {
1793 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1794 struct dm_buffer, lru_list);
1795 list_del(&b->lru_list);
1796 free_buffer(b);
1797 }
1798 kmem_cache_destroy(c->slab_cache);
1799 kmem_cache_destroy(c->slab_buffer);
1800 dm_io_client_destroy(c->dm_io);
1801bad_dm_io:
1802 mutex_destroy(&c->lock);
1803 kfree(c);
1804bad_client:
1805 return ERR_PTR(r);
1806}
1807EXPORT_SYMBOL_GPL(dm_bufio_client_create);
1808
1809
1810
1811
1812
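/* Tear down a client. All buffers must have been released by the caller. */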
1813void dm_bufio_client_destroy(struct dm_bufio_client *c)
1814{
1815 unsigned i;
1816
1817 drop_buffers(c);
1818
1819 unregister_shrinker(&c->shrinker);
1820
1821 mutex_lock(&dm_bufio_clients_lock);
1822
1823 list_del(&c->client_list);
1824 dm_bufio_client_count--;
1825 __cache_size_refresh();
1826
1827 mutex_unlock(&dm_bufio_clients_lock);
1828
1829 BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
1830 BUG_ON(c->need_reserved_buffers);
1831
1832 while (!list_empty(&c->reserved_buffers)) {
1833 struct dm_buffer *b = list_entry(c->reserved_buffers.next,
1834 struct dm_buffer, lru_list);
1835 list_del(&b->lru_list);
1836 free_buffer(b);
1837 }
1838
1839 for (i = 0; i < LIST_SIZE; i++)
1840 if (c->n_buffers[i])
1841 DMERR("leaked buffer count %d: %ld", i, c->n_buffers[i]);
1842
1843 for (i = 0; i < LIST_SIZE; i++)
1844 BUG_ON(c->n_buffers[i]);
1845
1846 kmem_cache_destroy(c->slab_cache);
1847 kmem_cache_destroy(c->slab_buffer);
1848 dm_io_client_destroy(c->dm_io);
1849 mutex_destroy(&c->lock);
1850 kfree(c);
1851}
1852EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
1853
1854void dm_bufio_set_sector_offset(struct dm_bufio_client *c, sector_t start)
1855{
1856 c->start = start;
1857}
1858EXPORT_SYMBOL_GPL(dm_bufio_set_sector_offset);
1859
1860static unsigned get_max_age_hz(void)
1861{
1862 unsigned max_age = READ_ONCE(dm_bufio_max_age);
1863
1864 if (max_age > UINT_MAX / HZ)
1865 max_age = UINT_MAX / HZ;
1866
1867 return max_age * HZ;
1868}
1869
1870static bool older_than(struct dm_buffer *b, unsigned long age_hz)
1871{
1872 return time_after_eq(jiffies, b->last_accessed + age_hz);
1873}
1874
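/*
 * Evict clean buffers that have not been accessed for age_hz jiffies,
 * keeping at least the retain target cached.
 */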
1875static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
1876{
1877 struct dm_buffer *b, *tmp;
1878 unsigned long retain_target = get_retain_buffers(c);
1879 unsigned long count;
1880 LIST_HEAD(write_list);
1881
1882 dm_bufio_lock(c);
1883
1884 __check_watermark(c, &write_list);
1885 if (unlikely(!list_empty(&write_list))) {
1886 dm_bufio_unlock(c);
1887 __flush_write_list(&write_list);
1888 dm_bufio_lock(c);
1889 }
1890
1891 count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
1892 list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
1893 if (count <= retain_target)
1894 break;
1895
1896 if (!older_than(b, age_hz))
1897 break;
1898
1899 if (__try_evict_buffer(b, 0))
1900 count--;
1901
1902 cond_resched();
1903 }
1904
1905 dm_bufio_unlock(c);
1906}
1907
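/*
 * Work item run when total allocation exceeds the cache size limit: walk
 * the global LRU from the tail (with a second chance for recently accessed
 * buffers) and evict until usage drops below the low watermark, taking each
 * owning client's lock as needed.
 */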
1908static void do_global_cleanup(struct work_struct *w)
1909{
1910 struct dm_bufio_client *locked_client = NULL;
1911 struct dm_bufio_client *current_client;
1912 struct dm_buffer *b;
1913 unsigned spinlock_hold_count;
1914 unsigned long threshold = dm_bufio_cache_size -
1915 dm_bufio_cache_size / DM_BUFIO_LOW_WATERMARK_RATIO;
1916 unsigned long loops = global_num * 2;
1917
1918 mutex_lock(&dm_bufio_clients_lock);
1919
1920 while (1) {
1921 cond_resched();
1922
1923 spin_lock(&global_spinlock);
1924 if (unlikely(dm_bufio_current_allocated <= threshold))
1925 break;
1926
1927 spinlock_hold_count = 0;
1928get_next:
1929 if (!loops--)
1930 break;
1931 if (unlikely(list_empty(&global_queue)))
1932 break;
1933 b = list_entry(global_queue.prev, struct dm_buffer, global_list);
1934
1935 if (b->accessed) {
1936 b->accessed = 0;
1937 list_move(&b->global_list, &global_queue);
1938 if (likely(++spinlock_hold_count < 16))
1939 goto get_next;
1940 spin_unlock(&global_spinlock);
1941 continue;
1942 }
1943
1944 current_client = b->c;
1945 if (unlikely(current_client != locked_client)) {
1946 if (locked_client)
1947 dm_bufio_unlock(locked_client);
1948
1949 if (!dm_bufio_trylock(current_client)) {
1950 spin_unlock(&global_spinlock);
1951 dm_bufio_lock(current_client);
1952 locked_client = current_client;
1953 continue;
1954 }
1955
1956 locked_client = current_client;
1957 }
1958
1959 spin_unlock(&global_spinlock);
1960
1961 if (unlikely(!__try_evict_buffer(b, GFP_KERNEL))) {
1962 spin_lock(&global_spinlock);
1963 list_move(&b->global_list, &global_queue);
1964 spin_unlock(&global_spinlock);
1965 }
1966 }
1967
1968 spin_unlock(&global_spinlock);
1969
1970 if (locked_client)
1971 dm_bufio_unlock(locked_client);
1972
1973 mutex_unlock(&dm_bufio_clients_lock);
1974}
1975
1976static void cleanup_old_buffers(void)
1977{
1978 unsigned long max_age_hz = get_max_age_hz();
1979 struct dm_bufio_client *c;
1980
1981 mutex_lock(&dm_bufio_clients_lock);
1982
1983 __cache_size_refresh();
1984
1985 list_for_each_entry(c, &dm_bufio_all_clients, client_list)
1986 __evict_old_buffers(c, max_age_hz);
1987
1988 mutex_unlock(&dm_bufio_clients_lock);
1989}
1990
1991static void work_fn(struct work_struct *w)
1992{
1993 cleanup_old_buffers();
1994
1995 queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
1996 DM_BUFIO_WORK_TIMER_SECS * HZ);
1997}
1998
1999
2000
2001
2002
2003
2004
2005
2006
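/*
 * Module init: compute the default cache size (DM_BUFIO_MEMORY_PERCENT of
 * low memory, capped at DM_BUFIO_VMALLOC_PERCENT of vmalloc space) and
 * start the workqueue with the periodic cleanup and global-eviction work.
 */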
2007static int __init dm_bufio_init(void)
2008{
2009 __u64 mem;
2010
2011 dm_bufio_allocated_kmem_cache = 0;
2012 dm_bufio_allocated_get_free_pages = 0;
2013 dm_bufio_allocated_vmalloc = 0;
2014 dm_bufio_current_allocated = 0;
2015
2016 mem = (__u64)mult_frac(totalram_pages() - totalhigh_pages(),
2017 DM_BUFIO_MEMORY_PERCENT, 100) << PAGE_SHIFT;
2018
2019 if (mem > ULONG_MAX)
2020 mem = ULONG_MAX;
2021
2022#ifdef CONFIG_MMU
2023 if (mem > mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100))
2024 mem = mult_frac(VMALLOC_TOTAL, DM_BUFIO_VMALLOC_PERCENT, 100);
2025#endif
2026
2027 dm_bufio_default_cache_size = mem;
2028
2029 mutex_lock(&dm_bufio_clients_lock);
2030 __cache_size_refresh();
2031 mutex_unlock(&dm_bufio_clients_lock);
2032
2033 dm_bufio_wq = alloc_workqueue("dm_bufio_cache", WQ_MEM_RECLAIM, 0);
2034 if (!dm_bufio_wq)
2035 return -ENOMEM;
2036
2037 INIT_DELAYED_WORK(&dm_bufio_cleanup_old_work, work_fn);
2038 INIT_WORK(&dm_bufio_replacement_work, do_global_cleanup);
2039 queue_delayed_work(dm_bufio_wq, &dm_bufio_cleanup_old_work,
2040 DM_BUFIO_WORK_TIMER_SECS * HZ);
2041
2042 return 0;
2043}
2044
2045
2046
2047
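/* Module exit: stop the workqueue and complain about any leaked allocations. */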
2048static void __exit dm_bufio_exit(void)
2049{
2050 int bug = 0;
2051
2052 cancel_delayed_work_sync(&dm_bufio_cleanup_old_work);
2053 flush_workqueue(dm_bufio_wq);
2054 destroy_workqueue(dm_bufio_wq);
2055
2056 if (dm_bufio_client_count) {
2057 DMCRIT("%s: dm_bufio_client_count leaked: %d",
2058 __func__, dm_bufio_client_count);
2059 bug = 1;
2060 }
2061
2062 if (dm_bufio_current_allocated) {
2063 DMCRIT("%s: dm_bufio_current_allocated leaked: %lu",
2064 __func__, dm_bufio_current_allocated);
2065 bug = 1;
2066 }
2067
2068 if (dm_bufio_allocated_get_free_pages) {
2069 DMCRIT("%s: dm_bufio_allocated_get_free_pages leaked: %lu",
2070 __func__, dm_bufio_allocated_get_free_pages);
2071 bug = 1;
2072 }
2073
2074 if (dm_bufio_allocated_vmalloc) {
2075 DMCRIT("%s: dm_bufio_vmalloc leaked: %lu",
2076 __func__, dm_bufio_allocated_vmalloc);
2077 bug = 1;
2078 }
2079
2080 BUG_ON(bug);
2081}
2082
2083module_init(dm_bufio_init)
2084module_exit(dm_bufio_exit)
2085
2086module_param_named(max_cache_size_bytes, dm_bufio_cache_size, ulong, S_IRUGO | S_IWUSR);
2087MODULE_PARM_DESC(max_cache_size_bytes, "Size of metadata cache");
2088
2089module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
2090MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
2091
2092module_param_named(retain_bytes, dm_bufio_retain_bytes, ulong, S_IRUGO | S_IWUSR);
2093MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
2094
2095module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
2096MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
2097
2098module_param_named(allocated_kmem_cache_bytes, dm_bufio_allocated_kmem_cache, ulong, S_IRUGO);
2099MODULE_PARM_DESC(allocated_kmem_cache_bytes, "Memory allocated with kmem_cache_alloc");
2100
2101module_param_named(allocated_get_free_pages_bytes, dm_bufio_allocated_get_free_pages, ulong, S_IRUGO);
2102MODULE_PARM_DESC(allocated_get_free_pages_bytes, "Memory allocated with get_free_pages");
2103
2104module_param_named(allocated_vmalloc_bytes, dm_bufio_allocated_vmalloc, ulong, S_IRUGO);
2105MODULE_PARM_DESC(allocated_vmalloc_bytes, "Memory allocated with vmalloc");
2106
2107module_param_named(current_allocated_bytes, dm_bufio_current_allocated, ulong, S_IRUGO);
2108MODULE_PARM_DESC(current_allocated_bytes, "Memory currently used by the cache");
2109
2110MODULE_AUTHOR("Mikulas Patocka <dm-devel@redhat.com>");
2111MODULE_DESCRIPTION(DM_NAME " buffered I/O library");
2112MODULE_LICENSE("GPL");
2113