8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/blkpg.h>
16#include <linux/bio.h>
17#include <linux/mempool.h>
18#include <linux/dax.h>
19#include <linux/slab.h>
20#include <linux/idr.h>
21#include <linux/hdreg.h>
22#include <linux/delay.h>
23#include <linux/wait.h>
24#include <linux/pr.h>
25
26#define DM_MSG_PREFIX "core"
27
28
29
30
31
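/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */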
32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33#define DM_COOKIE_LENGTH 24
34
35static const char *_name = DM_NAME;
36
37static unsigned int major = 0;
38static unsigned int _major = 0;
39
40static DEFINE_IDR(_minor_idr);
41
42static DEFINE_SPINLOCK(_minor_lock);
43
44static void do_deferred_remove(struct work_struct *w);
45
46static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
47
48static struct workqueue_struct *deferred_remove_workqueue;
49
50atomic_t dm_global_event_nr = ATOMIC_INIT(0);
51DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
52
53void dm_issue_global_event(void)
54{
55 atomic_inc(&dm_global_event_nr);
56 wake_up(&dm_global_eventq);
57}
58
59
60
61
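/*
 * One of these is allocated per original bio; it tracks the clone
 * count, error status and accounting state for that bio.
 */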
62struct dm_io {
63 struct mapped_device *md;
64 int error;
65 atomic_t io_count;
66 struct bio *bio;
67 unsigned long start_time;
68 spinlock_t endio_lock;
69 struct dm_stats_aux stats_aux;
70};
71
72#define MINOR_ALLOCED ((void *)-1)
73
74
75
76
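/*
 * Bits for the md->flags field.
 */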
77#define DMF_BLOCK_IO_FOR_SUSPEND 0
78#define DMF_SUSPENDED 1
79#define DMF_FROZEN 2
80#define DMF_FREEING 3
81#define DMF_DELETING 4
82#define DMF_NOFLUSH_SUSPENDING 5
83#define DMF_MERGE_IS_OPTIONAL 6
84#define DMF_DEFERRED_REMOVE 7
85#define DMF_SUSPENDED_INTERNALLY 8
86
87#define DM_NUMA_NODE NUMA_NO_NODE
88static int dm_numa_node = DM_NUMA_NODE;
89
90
91
92
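/*
 * For mempools.
 */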
93struct dm_md_mempools {
94 mempool_t *io_pool;
95 mempool_t *rq_pool;
96 struct bio_set *bs;
97};
98
99struct table_device {
100 struct list_head list;
101 atomic_t count;
102 struct dm_dev dm_dev;
103};
104
105static struct kmem_cache *_io_cache;
106static struct kmem_cache *_rq_tio_cache;
107static struct kmem_cache *_rq_cache;
108
109
110
111
112#define RESERVED_BIO_BASED_IOS 16
113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
114
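/*
 * Clamp a user-settable module parameter to [min, max], writing the
 * clamped value back so later readers see a sane setting.
 */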
115static int __dm_get_module_param_int(int *module_param, int min, int max)
116{
117 int param = ACCESS_ONCE(*module_param);
118 int modified_param = 0;
119 bool modified = true;
120
121 if (param < min)
122 modified_param = min;
123 else if (param > max)
124 modified_param = max;
125 else
126 modified = false;
127
128 if (modified) {
129 (void)cmpxchg(module_param, param, modified_param);
130 param = modified_param;
131 }
132
133 return param;
134}
135
136unsigned __dm_get_module_param(unsigned *module_param,
137 unsigned def, unsigned max)
138{
139 unsigned param = ACCESS_ONCE(*module_param);
140 unsigned modified_param = 0;
141
142 if (!param)
143 modified_param = def;
144 else if (param > max)
145 modified_param = max;
146
147 if (modified_param) {
148 (void)cmpxchg(module_param, param, modified_param);
149 param = modified_param;
150 }
151
152 return param;
153}
154
155unsigned dm_get_reserved_bio_based_ios(void)
156{
157 return __dm_get_module_param(&reserved_bio_based_ios,
158 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
159}
160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
161
162static unsigned dm_get_numa_node(void)
163{
164 return __dm_get_module_param_int(&dm_numa_node,
165 DM_NUMA_NODE, num_online_nodes() - 1);
166}
167
168static int __init local_init(void)
169{
170 int r = -ENOMEM;
171
172
173 _io_cache = KMEM_CACHE(dm_io, 0);
174 if (!_io_cache)
175 return r;
176
177 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
178 if (!_rq_tio_cache)
179 goto out_free_io_cache;
180
181 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
182 __alignof__(struct request), 0, NULL);
183 if (!_rq_cache)
184 goto out_free_rq_tio_cache;
185
186 r = dm_uevent_init();
187 if (r)
188 goto out_free_rq_cache;
189
190 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
191 if (!deferred_remove_workqueue) {
192 r = -ENOMEM;
193 goto out_uevent_exit;
194 }
195
196 _major = major;
197 r = register_blkdev(_major, _name);
198 if (r < 0)
199 goto out_free_workqueue;
200
201 if (!_major)
202 _major = r;
203
204 return 0;
205
206out_free_workqueue:
207 destroy_workqueue(deferred_remove_workqueue);
208out_uevent_exit:
209 dm_uevent_exit();
210out_free_rq_cache:
211 kmem_cache_destroy(_rq_cache);
212out_free_rq_tio_cache:
213 kmem_cache_destroy(_rq_tio_cache);
214out_free_io_cache:
215 kmem_cache_destroy(_io_cache);
216
217 return r;
218}
219
220static void local_exit(void)
221{
222 flush_scheduled_work();
223 destroy_workqueue(deferred_remove_workqueue);
224
225 kmem_cache_destroy(_rq_cache);
226 kmem_cache_destroy(_rq_tio_cache);
227 kmem_cache_destroy(_io_cache);
228 unregister_blkdev(_major, _name);
229 dm_uevent_exit();
230
231 _major = 0;
232
233 DMINFO("cleaned up");
234}
235
236static int (*_inits[])(void) __initdata = {
237 local_init,
238 dm_target_init,
239 dm_linear_init,
240 dm_stripe_init,
241 dm_io_init,
242 dm_kcopyd_init,
243 dm_interface_init,
244 dm_statistics_init,
245};
246
247static void (*_exits[])(void) = {
248 local_exit,
249 dm_target_exit,
250 dm_linear_exit,
251 dm_stripe_exit,
252 dm_io_exit,
253 dm_kcopyd_exit,
254 dm_interface_exit,
255 dm_statistics_exit,
256};
257
258static int __init dm_init(void)
259{
260 const int count = ARRAY_SIZE(_inits);
261
262 int r, i;
263
264 for (i = 0; i < count; i++) {
265 r = _inits[i]();
266 if (r)
267 goto bad;
268 }
269
270 return 0;
271
272 bad:
273 while (i--)
274 _exits[i]();
275
276 return r;
277}
278
279static void __exit dm_exit(void)
280{
281 int i = ARRAY_SIZE(_exits);
282
283 while (i--)
284 _exits[i]();
285
286
287
288
289 idr_destroy(&_minor_idr);
290}
291
292
293
294
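/*
 * Block device functions.
 */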
295int dm_deleting_md(struct mapped_device *md)
296{
297 return test_bit(DMF_DELETING, &md->flags);
298}
299
300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
301{
302 struct mapped_device *md;
303
304 spin_lock(&_minor_lock);
305
306 md = bdev->bd_disk->private_data;
307 if (!md)
308 goto out;
309
310 if (test_bit(DMF_FREEING, &md->flags) ||
311 dm_deleting_md(md)) {
312 md = NULL;
313 goto out;
314 }
315
316 dm_get(md);
317 atomic_inc(&md->open_count);
318out:
319 spin_unlock(&_minor_lock);
320
321 return md ? 0 : -ENXIO;
322}
323
324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
325{
326 struct mapped_device *md;
327
328 spin_lock(&_minor_lock);
329
330 md = disk->private_data;
331 if (WARN_ON(!md))
332 goto out;
333
334 if (atomic_dec_and_test(&md->open_count) &&
335 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
336 queue_work(deferred_remove_workqueue, &deferred_remove_work);
337
338 dm_put(md);
339out:
340 spin_unlock(&_minor_lock);
341}
342
343int dm_open_count(struct mapped_device *md)
344{
345 return atomic_read(&md->open_count);
346}
347
348
349
350
351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
352{
353 int r = 0;
354
355 spin_lock(&_minor_lock);
356
357 if (dm_open_count(md)) {
358 r = -EBUSY;
359 if (mark_deferred)
360 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
361 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
362 r = -EEXIST;
363 else
364 set_bit(DMF_DELETING, &md->flags);
365
366 spin_unlock(&_minor_lock);
367
368 return r;
369}
370
371int dm_cancel_deferred_remove(struct mapped_device *md)
372{
373 int r = 0;
374
375 spin_lock(&_minor_lock);
376
377 if (test_bit(DMF_DELETING, &md->flags))
378 r = -EBUSY;
379 else
380 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
381
382 spin_unlock(&_minor_lock);
383
384 return r;
385}
386
387static void do_deferred_remove(struct work_struct *w)
388{
389 dm_deferred_remove();
390}
391
392sector_t dm_get_size(struct mapped_device *md)
393{
394 return get_capacity(md->disk);
395}
396
397struct request_queue *dm_get_md_queue(struct mapped_device *md)
398{
399 return md->queue;
400}
401
402struct dm_stats *dm_get_stats(struct mapped_device *md)
403{
404 return &md->stats;
405}
406
407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
408{
409 struct mapped_device *md = bdev->bd_disk->private_data;
410
411 return dm_get_geometry(md, geo);
412}
413
414static char *_dm_claim_ptr = "I belong to device-mapper";
415
416static int dm_get_bdev_for_ioctl(struct mapped_device *md,
417 struct block_device **bdev,
418 fmode_t *mode)
419{
420 struct dm_target *tgt;
421 struct dm_table *map;
422 int srcu_idx, r;
423
424retry:
425 r = -ENOTTY;
426 map = dm_get_live_table(md, &srcu_idx);
427 if (!map || !dm_table_get_size(map))
428 goto out;
429
430
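 /* We only support devices that have a single target */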
431 if (dm_table_get_num_targets(map) != 1)
432 goto out;
433
434 tgt = dm_table_get_target(map, 0);
435 if (!tgt->type->prepare_ioctl)
436 goto out;
437
438 if (dm_suspended_md(md)) {
439 r = -EAGAIN;
440 goto out;
441 }
442
443 r = tgt->type->prepare_ioctl(tgt, bdev, mode);
444 if (r < 0)
445 goto out;
446
447 bdgrab(*bdev);
448 r = blkdev_get(*bdev, *mode, _dm_claim_ptr);
449 if (r < 0)
450 goto out;
451
452 dm_put_live_table(md, srcu_idx);
453 return r;
454
455out:
456 dm_put_live_table(md, srcu_idx);
457 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
458 msleep(10);
459 goto retry;
460 }
461 return r;
462}
463
464static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
465 unsigned int cmd, unsigned long arg)
466{
467 struct mapped_device *md = bdev->bd_disk->private_data;
468 int r;
469
470 r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
471 if (r < 0)
472 return r;
473
474 if (r > 0) {
475
476
477
478
479
480 r = scsi_verify_blk_ioctl(NULL, cmd);
481 if (r)
482 goto out;
483 }
484
485 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
486out:
487 blkdev_put(bdev, mode);
488 return r;
489}
490
491static struct dm_io *alloc_io(struct mapped_device *md)
492{
493 return mempool_alloc(md->io_pool, GFP_NOIO);
494}
495
496static void free_io(struct mapped_device *md, struct dm_io *io)
497{
498 mempool_free(io, md->io_pool);
499}
500
501static void free_tio(struct dm_target_io *tio)
502{
503 bio_put(&tio->clone);
504}
505
506int md_in_flight(struct mapped_device *md)
507{
508 return atomic_read(&md->pending[READ]) +
509 atomic_read(&md->pending[WRITE]);
510}
511
512static void start_io_acct(struct dm_io *io)
513{
514 struct mapped_device *md = io->md;
515 struct bio *bio = io->bio;
516 int cpu;
517 int rw = bio_data_dir(bio);
518
519 io->start_time = jiffies;
520
521 cpu = part_stat_lock();
522 part_round_stats(cpu, &dm_disk(md)->part0);
523 part_stat_unlock();
524 atomic_set(&dm_disk(md)->part0.in_flight[rw],
525 atomic_inc_return(&md->pending[rw]));
526
527 if (unlikely(dm_stats_used(&md->stats)))
528 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
529 bio_sectors(bio), false, 0, &io->stats_aux);
530}
531
532static void end_io_acct(struct dm_io *io)
533{
534 struct mapped_device *md = io->md;
535 struct bio *bio = io->bio;
536 unsigned long duration = jiffies - io->start_time;
537 int pending, cpu;
538 int rw = bio_data_dir(bio);
539
540 cpu = part_stat_lock();
541 part_round_stats(cpu, &dm_disk(md)->part0);
542 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
543 part_stat_unlock();
544
545 if (unlikely(dm_stats_used(&md->stats)))
546 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
547 bio_sectors(bio), true, duration, &io->stats_aux);
548
549
550
551
552
553 pending = atomic_dec_return(&md->pending[rw]);
554 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
555 pending += atomic_read(&md->pending[rw^0x1]);
556
557
558 if (!pending)
559 wake_up(&md->wait);
560}
561
562
563
564
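/*
 * Add the bio to the list of deferred io.
 */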
565static void queue_io(struct mapped_device *md, struct bio *bio)
566{
567 unsigned long flags;
568
569 spin_lock_irqsave(&md->deferred_lock, flags);
570 bio_list_add(&md->deferred, bio);
571 spin_unlock_irqrestore(&md->deferred_lock, flags);
572 queue_work(md->wq, &md->work);
573}
574
575
576
577
578
579
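/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */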
580struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
581{
582 *srcu_idx = srcu_read_lock(&md->io_barrier);
583
584 return srcu_dereference(md->map, &md->io_barrier);
585}
586
587void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
588{
589 srcu_read_unlock(&md->io_barrier, srcu_idx);
590}
591
592void dm_sync_table(struct mapped_device *md)
593{
594 synchronize_srcu(&md->io_barrier);
595 synchronize_rcu_expedited();
596}
597
598
599
600
601
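/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */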
602static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
603{
604 rcu_read_lock();
605 return rcu_dereference(md->map);
606}
607
608static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
609{
610 rcu_read_unlock();
611}
612
613
614
615
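/*
 * Open a table device so we can use it as a map destination.
 */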
616static int open_table_device(struct table_device *td, dev_t dev,
617 struct mapped_device *md)
618{
619 struct block_device *bdev;
620
621 int r;
622
623 BUG_ON(td->dm_dev.bdev);
624
625 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
626 if (IS_ERR(bdev))
627 return PTR_ERR(bdev);
628
629 r = bd_link_disk_holder(bdev, dm_disk(md));
630 if (r) {
631 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
632 return r;
633 }
634
635 td->dm_dev.bdev = bdev;
636 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
637 return 0;
638}
639
640
641
642
643static void close_table_device(struct table_device *td, struct mapped_device *md)
644{
645 if (!td->dm_dev.bdev)
646 return;
647
648 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
649 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
650 put_dax(td->dm_dev.dax_dev);
651 td->dm_dev.bdev = NULL;
652 td->dm_dev.dax_dev = NULL;
653}
654
655static struct table_device *find_table_device(struct list_head *l, dev_t dev,
656 fmode_t mode) {
657 struct table_device *td;
658
659 list_for_each_entry(td, l, list)
660 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
661 return td;
662
663 return NULL;
664}
665
666int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
667 struct dm_dev **result) {
668 int r;
669 struct table_device *td;
670
671 mutex_lock(&md->table_devices_lock);
672 td = find_table_device(&md->table_devices, dev, mode);
673 if (!td) {
674 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
675 if (!td) {
676 mutex_unlock(&md->table_devices_lock);
677 return -ENOMEM;
678 }
679
680 td->dm_dev.mode = mode;
681 td->dm_dev.bdev = NULL;
682
683 if ((r = open_table_device(td, dev, md))) {
684 mutex_unlock(&md->table_devices_lock);
685 kfree(td);
686 return r;
687 }
688
689 format_dev_t(td->dm_dev.name, dev);
690
691 atomic_set(&td->count, 0);
692 list_add(&td->list, &md->table_devices);
693 }
694 atomic_inc(&td->count);
695 mutex_unlock(&md->table_devices_lock);
696
697 *result = &td->dm_dev;
698 return 0;
699}
700EXPORT_SYMBOL_GPL(dm_get_table_device);
701
702void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
703{
704 struct table_device *td = container_of(d, struct table_device, dm_dev);
705
706 mutex_lock(&md->table_devices_lock);
707 if (atomic_dec_and_test(&td->count)) {
708 close_table_device(td, md);
709 list_del(&td->list);
710 kfree(td);
711 }
712 mutex_unlock(&md->table_devices_lock);
713}
714EXPORT_SYMBOL(dm_put_table_device);
715
716static void free_table_devices(struct list_head *devices)
717{
718 struct list_head *tmp, *next;
719
720 list_for_each_safe(tmp, next, devices) {
721 struct table_device *td = list_entry(tmp, struct table_device, list);
722
723 DMWARN("dm_destroy: %s still exists with %d references",
724 td->dm_dev.name, atomic_read(&td->count));
725 kfree(td);
726 }
727}
728
729
730
731
732int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
733{
734 *geo = md->geometry;
735
736 return 0;
737}
738
739
740
741
742int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
743{
744 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
745
746 if (geo->start > sz) {
747 DMWARN("Start sector is beyond the geometry limits.");
748 return -EINVAL;
749 }
750
751 md->geometry = *geo;
752
753 return 0;
754}
755
756
757
758
759
760
761
762
763
764
765static int __noflush_suspending(struct mapped_device *md)
766{
767 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
768}
769
770
771
772
773
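/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */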
774static void dec_pending(struct dm_io *io, int error)
775{
776 unsigned long flags;
777 int io_error;
778 struct bio *bio;
779 struct mapped_device *md = io->md;
780
781
782 if (unlikely(error)) {
783 spin_lock_irqsave(&io->endio_lock, flags);
784 if (!(io->error > 0 && __noflush_suspending(md)))
785 io->error = error;
786 spin_unlock_irqrestore(&io->endio_lock, flags);
787 }
788
789 if (atomic_dec_and_test(&io->io_count)) {
790 if (io->error == DM_ENDIO_REQUEUE) {
791
792
793
794 spin_lock_irqsave(&md->deferred_lock, flags);
795 if (__noflush_suspending(md))
796 bio_list_add_head(&md->deferred, io->bio);
797 else
798
799 io->error = -EIO;
800 spin_unlock_irqrestore(&md->deferred_lock, flags);
801 }
802
803 io_error = io->error;
804 bio = io->bio;
805 end_io_acct(io);
806 free_io(md, io);
807
808 if (io_error == DM_ENDIO_REQUEUE)
809 return;
810
811 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
812
813
814
815
816 bio->bi_rw &= ~REQ_FLUSH;
817 queue_io(md, bio);
818 } else {
819
820 trace_block_bio_complete(md->queue, bio, io_error);
821 bio_endio(bio, io_error);
822 }
823 }
824}
825
826void disable_write_same(struct mapped_device *md)
827{
828 struct queue_limits *limits = dm_get_queue_limits(md);
829
830
831 limits->max_write_same_sectors = 0;
832}
833
834static void clone_endio(struct bio *bio, int error)
835{
836 int r = error;
837 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
838 struct dm_io *io = tio->io;
839 struct mapped_device *md = tio->io->md;
840 dm_endio_fn endio = tio->ti->type->end_io;
841
842 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
843 error = -EIO;
844
845 if (endio) {
846 r = endio(tio->ti, bio, error);
847 if (r < 0 || r == DM_ENDIO_REQUEUE)
848
849
850
851
852 error = r;
853 else if (r == DM_ENDIO_INCOMPLETE)
854
855 return;
856 else if (r) {
857 DMWARN("unimplemented target endio return value: %d", r);
858 BUG();
859 }
860 }
861
862 if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
863 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
864 disable_write_same(md);
865
866 free_tio(tio);
867 dec_pending(io, error);
868}
869
870
871
872
873
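/*
 * Return maximum size of I/O possible at the supplied sector up to the
 * next target boundary.
 */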
874static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
875{
876 sector_t target_offset = dm_target_offset(ti, sector);
877
878 return ti->len - target_offset;
879}
880
881static sector_t max_io_len(sector_t sector, struct dm_target *ti)
882{
883 sector_t len = max_io_len_target_boundary(sector, ti);
884 sector_t offset, max_len;
885
886
887
888
889 if (ti->max_io_len) {
890 offset = dm_target_offset(ti, sector);
891 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
892 max_len = sector_div(offset, ti->max_io_len);
893 else
894 max_len = offset & (ti->max_io_len - 1);
895 max_len = ti->max_io_len - max_len;
896
897 if (len > max_len)
898 len = max_len;
899 }
900
901 return len;
902}
903
904int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
905{
906 if (len > UINT_MAX) {
907 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
908 (unsigned long long)len, UINT_MAX);
909 ti->error = "Maximum size of target IO is too large";
910 return -EINVAL;
911 }
912
913 ti->max_io_len = (uint32_t) len;
914
915 return 0;
916}
917EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
918
919static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
920 sector_t sector, int *srcu_idx)
921{
922 struct dm_table *map;
923 struct dm_target *ti;
924
925 map = dm_get_live_table(md, srcu_idx);
926 if (!map)
927 return NULL;
928
929 ti = dm_table_find_target(map, sector);
930 if (!dm_target_is_valid(ti))
931 return NULL;
932
933 return ti;
934}
935
936static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
937 long nr_pages, void **kaddr, pfn_t *pfn)
938{
939 struct mapped_device *md = dax_get_private(dax_dev);
940 sector_t sector = pgoff * PAGE_SECTORS;
941 struct dm_target *ti;
942 long len, ret = -EIO;
943 int srcu_idx;
944
945 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
946
947 if (!ti)
948 goto out;
949 if (!ti->type->direct_access)
950 goto out;
951 len = max_io_len(sector, ti) / PAGE_SECTORS;
952 if (len < 1)
953 goto out;
954 nr_pages = min(len, nr_pages);
955 if (ti->type->direct_access)
956 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
957
958 out:
959 dm_put_live_table(md, srcu_idx);
960
961 return ret;
962}
963
964
965
966
967
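/*
 * Flush bios that the map method left queued on current->bio_list,
 * handing any bio allocated from a per-device bio_set to that set's
 * rescue workqueue so a blocked target cannot deadlock on its own bios.
 */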
968struct dm_offload {
969 struct blk_plug plug;
970 struct blk_plug_cb cb;
971};
972
973static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
974{
975 struct dm_offload *o = container_of(cb, struct dm_offload, cb);
976 struct bio_list list;
977 struct bio *bio;
978 int i;
979
980 INIT_LIST_HEAD(&o->cb.list);
981
982 if (unlikely(!current->bio_list))
983 return;
984
985 for (i = 0; i < 2; i++) {
986 list = current->bio_list[i];
 bio_list_init(&current->bio_list[i]);
988
989 while ((bio = bio_list_pop(&list))) {
990 struct bio_set *bs = bio->bi_pool;
991 if (unlikely(!bs) || bs == fs_bio_set) {
 bio_list_add(&current->bio_list[i], bio);
993 continue;
994 }
995
996 spin_lock(&bs->rescue_lock);
997 bio_list_add(&bs->rescue_list, bio);
998 queue_work(bs->rescue_workqueue, &bs->rescue_work);
999 spin_unlock(&bs->rescue_lock);
1000 }
1001 }
1002}
1003
1004static void dm_offload_start(struct dm_offload *o)
1005{
1006 blk_start_plug(&o->plug);
1007 o->cb.callback = flush_current_bio_list;
 list_add(&o->cb.list, &current->plug->cb_list);
1009}
1010
1011static void dm_offload_end(struct dm_offload *o)
1012{
1013 list_del(&o->cb.list);
1014 blk_finish_plug(&o->plug);
1015}
1016
1017static void __map_bio(struct dm_target_io *tio)
1018{
1019 int r;
1020 sector_t sector;
1021 struct dm_offload o;
1022 struct bio *clone = &tio->clone;
1023 struct dm_target *ti = tio->ti;
1024
1025 clone->bi_end_io = clone_endio;
1026
1027
1028
1029
1030
1031
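 /*
  * Map the clone.  If r == 0 we don't need to do anything,
  * the target has assumed ownership of this io.
  */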
1032 atomic_inc(&tio->io->io_count);
1033 sector = clone->bi_sector;
1034
1035 dm_offload_start(&o);
1036 r = ti->type->map(ti, clone);
1037 dm_offload_end(&o);
1038
1039 if (r == DM_MAPIO_REMAPPED) {
1040
1041
1042 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1043 tio->io->bio->bi_bdev->bd_dev, sector);
1044
1045 generic_make_request(clone);
1046 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1047
1048 dec_pending(tio->io, r);
1049 free_tio(tio);
1050 } else if (r != DM_MAPIO_SUBMITTED) {
1051 DMWARN("unimplemented target map return value: %d", r);
1052 BUG();
1053 }
1054}
1055
1056struct clone_info {
1057 struct mapped_device *md;
1058 struct dm_table *map;
1059 struct bio *bio;
1060 struct dm_io *io;
1061 sector_t sector;
1062 sector_t sector_count;
1063 unsigned short idx;
1064};
1065
1066static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1067{
1068 bio->bi_sector = sector;
1069 bio->bi_size = to_bytes(len);
1070}
1071
1072static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1073{
1074 bio->bi_idx = idx;
1075 bio->bi_vcnt = idx + bv_count;
1076 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1077}
1078
1079static int clone_bio_integrity(struct bio *bio, struct bio *clone,
1080 unsigned short idx, unsigned len, unsigned offset,
1081 bool trim)
1082{
1083 int r;
1084
1085 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1086 if (r < 0)
1087 return r;
1088
1089 if (trim)
1090 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1091
1092 return 0;
1093}
1094
1095
1096
1097
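/*
 * Creates a little bio that just does part of a bvec.
 */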
1098static int clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1099 sector_t sector, unsigned short idx,
1100 unsigned offset, unsigned len)
1101{
1102 struct bio *clone = &tio->clone;
1103 struct bio_vec *bv = bio->bi_io_vec + idx;
1104
1105 *clone->bi_io_vec = *bv;
1106
1107 bio_setup_sector(clone, sector, len);
1108
1109 clone->bi_bdev = bio->bi_bdev;
1110 clone->bi_rw = bio->bi_rw;
1111 clone->bi_vcnt = 1;
1112 clone->bi_io_vec->bv_offset = offset;
1113 clone->bi_io_vec->bv_len = clone->bi_size;
1114 clone->bi_flags |= 1 << BIO_CLONED;
1115
1116 if (bio_integrity(bio)) {
1117 int r = clone_bio_integrity(bio, clone, idx, len, offset, true);
1118 if (r < 0)
1119 return r;
1120 }
1121
1122 return 0;
1123}
1124
1125
1126
1127
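/*
 * Creates a bio that consists of a range of complete bvecs.
 */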
1128static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1129 sector_t sector, unsigned short idx,
1130 unsigned short bv_count, unsigned len)
1131{
1132 struct bio *clone = &tio->clone;
1133
1134 __bio_clone(clone, bio);
1135 bio_setup_sector(clone, sector, len);
1136 bio_setup_bv(clone, idx, bv_count);
1137
1138 if (bio_integrity(bio)) {
1139 int r;
1140 bool trim = false;
1141
1142 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1143 trim = true;
1144 r = clone_bio_integrity(bio, clone, idx, len, 0, trim);
1145 if (r < 0)
1146 return r;
1147 }
1148
1149 return 0;
1150}
1151
1152static struct dm_target_io *alloc_tio(struct clone_info *ci,
1153 struct dm_target *ti, int nr_iovecs,
1154 unsigned target_bio_nr)
1155{
1156 struct dm_target_io *tio;
1157 struct bio *clone;
1158
1159 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1160 tio = container_of(clone, struct dm_target_io, clone);
1161
1162 tio->io = ci->io;
1163 tio->ti = ti;
1164 tio->target_bio_nr = target_bio_nr;
1165
1166 return tio;
1167}
1168
1169static void __clone_and_map_simple_bio(struct clone_info *ci,
1170 struct dm_target *ti,
1171 unsigned target_bio_nr, sector_t len)
1172{
1173 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
1174 struct bio *clone = &tio->clone;
1175
1176
1177
1178
1179
1180
1181 __bio_clone(clone, ci->bio);
1182 if (len)
1183 bio_setup_sector(clone, ci->sector, len);
1184
1185 __map_bio(tio);
1186}
1187
1188static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1189 unsigned num_bios, sector_t len)
1190{
1191 unsigned target_bio_nr;
1192
1193 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1194 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1195}
1196
1197static int __send_empty_flush(struct clone_info *ci)
1198{
1199 unsigned target_nr = 0;
1200 struct dm_target *ti;
1201
1202 BUG_ON(bio_has_data(ci->bio));
1203 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1204 __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
1205
1206 return 0;
1207}
1208
1209static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1210 sector_t sector, int nr_iovecs,
1211 unsigned short idx, unsigned short bv_count,
1212 unsigned offset, unsigned len,
1213 bool split_bvec)
1214{
1215 struct bio *bio = ci->bio;
1216 struct dm_target_io *tio;
1217 unsigned target_bio_nr;
1218 unsigned num_target_bios = 1;
1219 int r = 0;
1220
1221
1222
1223
1224 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1225 num_target_bios = ti->num_write_bios(ti, bio);
1226
1227 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1228 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1229 if (split_bvec)
1230 r = clone_split_bio(tio, bio, sector, idx, offset, len);
1231 else
1232 r = clone_bio(tio, bio, sector, idx, bv_count, len);
1233 if (r < 0) {
1234 free_tio(tio);
1235 break;
1236 }
1237 __map_bio(tio);
1238 }
1239
1240 return r;
1241}
1242
1243typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1244
1245static unsigned get_num_discard_bios(struct dm_target *ti)
1246{
1247 return ti->num_discard_bios;
1248}
1249
1250static unsigned get_num_write_same_bios(struct dm_target *ti)
1251{
1252 return ti->num_write_same_bios;
1253}
1254
1255typedef bool (*is_split_required_fn)(struct dm_target *ti);
1256
1257static bool is_split_required_for_discard(struct dm_target *ti)
1258{
1259 return ti->split_discard_bios;
1260}
1261
1262static int __send_changing_extent_only(struct clone_info *ci,
1263 get_num_bios_fn get_num_bios,
1264 is_split_required_fn is_split_required)
1265{
1266 struct dm_target *ti;
1267 sector_t len;
1268 unsigned num_bios;
1269
1270 do {
1271 ti = dm_table_find_target(ci->map, ci->sector);
1272 if (!dm_target_is_valid(ti))
1273 return -EIO;
1274
1275
1276
1277
1278
1279
1280
1281 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1282 if (!num_bios)
1283 return -EOPNOTSUPP;
1284
1285 if (is_split_required && !is_split_required(ti))
1286 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1287 else
1288 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1289
1290 __send_duplicate_bios(ci, ti, num_bios, len);
1291
1292 ci->sector += len;
1293 } while (ci->sector_count -= len);
1294
1295 return 0;
1296}
1297
1298static int __send_discard(struct clone_info *ci)
1299{
1300 return __send_changing_extent_only(ci, get_num_discard_bios,
1301 is_split_required_for_discard);
1302}
1303
1304static int __send_write_same(struct clone_info *ci)
1305{
1306 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1307}
1308
1309
1310
1311
1312static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1313{
1314 struct bio *bio = ci->bio;
1315 sector_t bv_len, total_len = 0;
1316
1317 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1318 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1319
1320 if (bv_len > max)
1321 break;
1322
1323 max -= bv_len;
1324 total_len += bv_len;
1325 }
1326
1327 return total_len;
1328}
1329
1330static int __split_bvec_across_targets(struct clone_info *ci,
1331 struct dm_target *ti, sector_t max)
1332{
1333 struct bio *bio = ci->bio;
1334 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1335 sector_t remaining = to_sector(bv->bv_len);
1336 unsigned offset = 0;
1337 sector_t len;
1338 int r;
1339
1340 do {
1341 if (offset) {
1342 ti = dm_table_find_target(ci->map, ci->sector);
1343 if (!dm_target_is_valid(ti))
1344 return -EIO;
1345
1346 max = max_io_len(ci->sector, ti);
1347 }
1348
1349 len = min(remaining, max);
1350
1351 r = __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1352 bv->bv_offset + offset, len, true);
1353 if (r < 0)
1354 return r;
1355
1356 ci->sector += len;
1357 ci->sector_count -= len;
1358 offset += to_bytes(len);
1359 } while (remaining -= len);
1360
1361 ci->idx++;
1362
1363 return 0;
1364}
1365
1366
1367
1368
1369static int __split_and_process_non_flush(struct clone_info *ci)
1370{
1371 struct bio *bio = ci->bio;
1372 struct dm_target *ti;
1373 sector_t len, max;
1374 int idx;
1375 int r;
1376
1377 if (unlikely(bio->bi_rw & REQ_DISCARD))
1378 return __send_discard(ci);
1379 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1380 return __send_write_same(ci);
1381
1382 ti = dm_table_find_target(ci->map, ci->sector);
1383 if (!dm_target_is_valid(ti))
1384 return -EIO;
1385
1386 max = max_io_len(ci->sector, ti);
1387
1388
1389
1390
1391
1392 if (ci->sector_count <= max) {
1393 r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1394 ci->idx, bio->bi_vcnt - ci->idx, 0,
1395 ci->sector_count, false);
1396 if (r < 0)
1397 return r;
1398
1399 ci->sector_count = 0;
1400 return 0;
1401 }
1402
1403
1404
1405
1406
1407 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1408 len = __len_within_target(ci, max, &idx);
1409
1410 r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1411 ci->idx, idx - ci->idx, 0, len, false);
1412 if (r < 0)
1413 return r;
1414
1415 ci->sector += len;
1416 ci->sector_count -= len;
1417 ci->idx = idx;
1418
1419 return 0;
1420 }
1421
1422
1423
1424
1425 return __split_bvec_across_targets(ci, ti, max);
1426}
1427
1428
1429
1430
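/*
 * Entry point to split a bio into clones and submit them to the targets.
 */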
1431static void __split_and_process_bio(struct mapped_device *md,
1432 struct dm_table *map, struct bio *bio)
1433{
1434 struct clone_info ci;
1435 int error = 0;
1436
1437 if (unlikely(!map)) {
1438 bio_io_error(bio);
1439 return;
1440 }
1441
1442 ci.map = map;
1443 ci.md = md;
1444 ci.io = alloc_io(md);
1445 ci.io->error = 0;
1446 atomic_set(&ci.io->io_count, 1);
1447 ci.io->bio = bio;
1448 ci.io->md = md;
1449 spin_lock_init(&ci.io->endio_lock);
1450 ci.sector = bio->bi_sector;
1451 ci.idx = bio->bi_idx;
1452
1453 start_io_acct(ci.io);
1454
1455 if (bio->bi_rw & REQ_FLUSH) {
1456 ci.bio = &ci.md->flush_bio;
1457 ci.sector_count = 0;
1458 error = __send_empty_flush(&ci);
1459
1460 } else {
1461 ci.bio = bio;
1462 ci.sector_count = bio_sectors(bio);
1463 while (ci.sector_count && !error)
1464 error = __split_and_process_non_flush(&ci);
1465 }
1466
1467
1468 dec_pending(ci.io, error);
1469}
1470
1471
1472
1473
1474static int dm_merge_bvec(struct request_queue *q,
1475 struct bvec_merge_data *bvm,
1476 struct bio_vec *biovec)
1477{
1478 struct mapped_device *md = q->queuedata;
1479 struct dm_table *map = dm_get_live_table_fast(md);
1480 struct dm_target *ti;
1481 sector_t max_sectors;
1482 int max_size = 0;
1483
1484 if (unlikely(!map))
1485 goto out;
1486
1487 ti = dm_table_find_target(map, bvm->bi_sector);
1488 if (!dm_target_is_valid(ti))
1489 goto out;
1490
1491
1492
1493
1494 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1495 (sector_t) BIO_MAX_SECTORS);
1496 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1497 if (max_size < 0)
1498 max_size = 0;
1499
1500
1501
1502
1503
1504
1505 if (max_size && ti->type->merge)
1506 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1507
1508
1509
1510
1511
1512
1513
1514 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1515 max_size = 0;
1516
1517out:
1518 dm_put_live_table_fast(md);
1519
1520
1521
1522 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1523 max_size = biovec->bv_len;
1524
1525 return max_size;
1526}
1527
1528
1529
1530
1531
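/*
 * The request function: split the bio into clones and map them to the
 * targets, deferring the bio if I/O is blocked for a suspend.
 */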
1532static void dm_make_request(struct request_queue *q, struct bio *bio)
1533{
1534 int rw = bio_data_dir(bio);
1535 struct mapped_device *md = q->queuedata;
1536 int cpu;
1537 int srcu_idx;
1538 struct dm_table *map;
1539
1540 map = dm_get_live_table(md, &srcu_idx);
1541
1542 cpu = part_stat_lock();
1543 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1544 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1545 part_stat_unlock();
1546
1547
1548 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1549 dm_put_live_table(md, srcu_idx);
1550
1551 if (bio_rw(bio) != READA)
1552 queue_io(md, bio);
1553 else
1554 bio_io_error(bio);
1555 return;
1556 }
1557
1558 __split_and_process_bio(md, map, bio);
1559 dm_put_live_table(md, srcu_idx);
1560 return;
1561}
1562
1563static int dm_any_congested(void *congested_data, int bdi_bits)
1564{
1565 int r = bdi_bits;
1566 struct mapped_device *md = congested_data;
1567 struct dm_table *map;
1568
1569 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1570 if (dm_request_based(md)) {
1571
1572
1573
1574
1575 r = md->queue->backing_dev_info.state & bdi_bits;
1576 } else {
1577 map = dm_get_live_table_fast(md);
1578 if (map)
1579 r = dm_table_any_congested(map, bdi_bits);
1580 dm_put_live_table_fast(md);
1581 }
1582 }
1583
1584 return r;
1585}
1586
1587
1588
1589
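/*
 * An IDR is used to keep track of allocated minor numbers.
 */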
1590static void free_minor(int minor)
1591{
1592 spin_lock(&_minor_lock);
1593 idr_remove(&_minor_idr, minor);
1594 spin_unlock(&_minor_lock);
1595}
1596
1597
1598
1599
1600static int specific_minor(int minor)
1601{
1602 int r;
1603
1604 if (minor >= (1 << MINORBITS))
1605 return -EINVAL;
1606
1607 idr_preload(GFP_KERNEL);
1608 spin_lock(&_minor_lock);
1609
1610 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1611
1612 spin_unlock(&_minor_lock);
1613 idr_preload_end();
1614 if (r < 0)
1615 return r == -ENOSPC ? -EBUSY : r;
1616 return 0;
1617}
1618
1619static int next_free_minor(int *minor)
1620{
1621 int r;
1622
1623 idr_preload(GFP_KERNEL);
1624 spin_lock(&_minor_lock);
1625
1626 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1627
1628 spin_unlock(&_minor_lock);
1629 idr_preload_end();
1630 if (r < 0)
1631 return r;
1632 *minor = r;
1633 return 0;
1634}
1635
1636static const struct block_device_operations dm_blk_dops;
1637static const struct dax_operations dm_dax_ops;
1638
1639static void dm_wq_work(struct work_struct *work);
1640
1641void dm_init_md_queue(struct mapped_device *md)
1642{
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1653
1654
1655
1656
1657
1658 md->queue->queuedata = md;
1659 md->queue->backing_dev_info.congested_data = md;
1660}
1661
1662void dm_init_normal_md_queue(struct mapped_device *md)
1663{
1664 md->use_blk_mq = false;
1665 dm_init_md_queue(md);
1666
1667
1668
1669
1670 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1671 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1672}
1673
1674static void cleanup_mapped_device(struct mapped_device *md)
1675{
1676 if (md->wq)
1677 destroy_workqueue(md->wq);
1678 if (md->kworker_task)
1679 kthread_stop(md->kworker_task);
1680 mempool_destroy(md->io_pool);
1681 mempool_destroy(md->rq_pool);
1682 if (md->bs)
1683 bioset_free(md->bs);
1684
1685 if (md->dax_dev) {
1686 kill_dax(md->dax_dev);
1687 put_dax(md->dax_dev);
1688 md->dax_dev = NULL;
1689 }
1690
1691 if (md->disk) {
1692 spin_lock(&_minor_lock);
1693 md->disk->private_data = NULL;
1694 spin_unlock(&_minor_lock);
1695 if (blk_get_integrity(md->disk))
1696 blk_integrity_unregister(md->disk);
1697 del_gendisk(md->disk);
1698 put_disk(md->disk);
1699 }
1700
1701 if (md->queue)
1702 blk_cleanup_queue(md->queue);
1703
1704 cleanup_srcu_struct(&md->io_barrier);
1705
1706 if (md->bdev) {
1707 bdput(md->bdev);
1708 md->bdev = NULL;
1709 }
1710
1711 dm_mq_cleanup_mapped_device(md);
1712}
1713
1714
1715
1716
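/*
 * Allocate and initialise a blank device with a given minor.
 */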
1717static struct mapped_device *alloc_dev(int minor)
1718{
1719 int r, numa_node_id = dm_get_numa_node();
1720 struct dax_device *dax_dev;
1721 struct mapped_device *md;
1722 void *old_md;
1723
1724 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1725 if (!md) {
1726 DMWARN("unable to allocate device, out of memory.");
1727 return NULL;
1728 }
1729
1730 if (!try_module_get(THIS_MODULE))
1731 goto bad_module_get;
1732
1733
1734 if (minor == DM_ANY_MINOR)
1735 r = next_free_minor(&minor);
1736 else
1737 r = specific_minor(minor);
1738 if (r < 0)
1739 goto bad_minor;
1740
1741 r = init_srcu_struct(&md->io_barrier);
1742 if (r < 0)
1743 goto bad_io_barrier;
1744
1745 md->numa_node_id = numa_node_id;
1746 md->use_blk_mq = dm_use_blk_mq_default();
1747 md->init_tio_pdu = false;
1748 md->type = DM_TYPE_NONE;
1749 mutex_init(&md->suspend_lock);
1750 mutex_init(&md->type_lock);
1751 mutex_init(&md->table_devices_lock);
1752 spin_lock_init(&md->deferred_lock);
1753 atomic_set(&md->holders, 1);
1754 atomic_set(&md->open_count, 0);
1755 atomic_set(&md->event_nr, 0);
1756 atomic_set(&md->uevent_seq, 0);
1757 INIT_LIST_HEAD(&md->uevent_list);
1758 INIT_LIST_HEAD(&md->table_devices);
1759 spin_lock_init(&md->uevent_lock);
1760
1761 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1762 if (!md->queue)
1763 goto bad;
1764
1765 dm_init_md_queue(md);
1766
1767 md->disk = alloc_disk_node(1, numa_node_id);
1768 if (!md->disk)
1769 goto bad;
1770
1771 atomic_set(&md->pending[0], 0);
1772 atomic_set(&md->pending[1], 0);
1773 init_waitqueue_head(&md->wait);
1774 INIT_WORK(&md->work, dm_wq_work);
1775 init_waitqueue_head(&md->eventq);
1776 init_completion(&md->kobj_holder.completion);
1777 md->kworker_task = NULL;
1778
1779 md->disk->major = _major;
1780 md->disk->first_minor = minor;
1781 md->disk->fops = &dm_blk_dops;
1782 md->disk->queue = md->queue;
1783 md->disk->private_data = md;
1784 sprintf(md->disk->disk_name, "dm-%d", minor);
1785
1786 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1787 if (!dax_dev)
1788 goto bad;
1789 md->dax_dev = dax_dev;
1790
1791 add_disk_no_queue_reg(md->disk);
1792 format_dev_t(md->name, MKDEV(_major, minor));
1793
1794 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1795 if (!md->wq)
1796 goto bad;
1797
1798 md->bdev = bdget_disk(md->disk, 0);
1799 if (!md->bdev)
1800 goto bad;
1801
1802 bio_init(&md->flush_bio);
1803 md->flush_bio.bi_bdev = md->bdev;
1804 md->flush_bio.bi_rw = WRITE_FLUSH;
1805
1806 dm_stats_init(&md->stats);
1807
1808
1809 spin_lock(&_minor_lock);
1810 old_md = idr_replace(&_minor_idr, md, minor);
1811 spin_unlock(&_minor_lock);
1812
1813 BUG_ON(old_md != MINOR_ALLOCED);
1814
1815 return md;
1816
1817bad:
1818 cleanup_mapped_device(md);
1819bad_io_barrier:
1820 free_minor(minor);
1821bad_minor:
1822 module_put(THIS_MODULE);
1823bad_module_get:
1824 kvfree(md);
1825 return NULL;
1826}
1827
1828static void unlock_fs(struct mapped_device *md);
1829
1830static void free_dev(struct mapped_device *md)
1831{
1832 int minor = MINOR(disk_devt(md->disk));
1833
1834 unlock_fs(md);
1835
1836 cleanup_mapped_device(md);
1837
1838 free_table_devices(&md->table_devices);
1839 dm_stats_cleanup(&md->stats);
1840 free_minor(minor);
1841
1842 module_put(THIS_MODULE);
1843 kvfree(md);
1844}
1845
1846static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1847{
1848 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1849
1850 if (md->bs) {
1851
1852 if (dm_table_bio_based(t)) {
1853
1854
1855
1856
1857 bioset_free(md->bs);
1858 md->bs = p->bs;
1859 p->bs = NULL;
1860 }
1861
1862
1863
1864
1865
1866
1867
1868
1869 goto out;
1870 }
1871
1872 BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
1873
1874 md->io_pool = p->io_pool;
1875 p->io_pool = NULL;
1876 md->rq_pool = p->rq_pool;
1877 p->rq_pool = NULL;
1878 md->bs = p->bs;
1879 p->bs = NULL;
1880
1881out:
1882
1883 dm_table_free_md_mempools(t);
1884}
1885
1886
1887
1888
1889static void event_callback(void *context)
1890{
1891 unsigned long flags;
1892 LIST_HEAD(uevents);
1893 struct mapped_device *md = (struct mapped_device *) context;
1894
1895 spin_lock_irqsave(&md->uevent_lock, flags);
1896 list_splice_init(&md->uevent_list, &uevents);
1897 spin_unlock_irqrestore(&md->uevent_lock, flags);
1898
1899 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1900
1901 atomic_inc(&md->event_nr);
1902 wake_up(&md->eventq);
1903 dm_issue_global_event();
1904}
1905
1906
1907
1908
1909static void __set_size(struct mapped_device *md, sector_t size)
1910{
1911 lockdep_assert_held(&md->suspend_lock);
1912
1913 set_capacity(md->disk, size);
1914
1915 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1916}
1917
1918
1919
1920
1921
1922
1923
1924
1925int dm_queue_merge_is_compulsory(struct request_queue *q)
1926{
1927 struct mapped_device *dev_md;
1928
1929 if (!q->merge_bvec_fn)
1930 return 0;
1931
1932 if (q->make_request_fn == dm_make_request) {
1933 dev_md = q->queuedata;
1934 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
1935 return 0;
1936 }
1937
1938 return 1;
1939}
1940
1941static int dm_device_merge_is_compulsory(struct dm_target *ti,
1942 struct dm_dev *dev, sector_t start,
1943 sector_t len, void *data)
1944{
1945 struct block_device *bdev = dev->bdev;
1946 struct request_queue *q = bdev_get_queue(bdev);
1947
1948 return dm_queue_merge_is_compulsory(q);
1949}
1950
1951
1952
1953
1954
1955static int dm_table_merge_is_optional(struct dm_table *table)
1956{
1957 unsigned i = 0;
1958 struct dm_target *ti;
1959
1960 while (i < dm_table_get_num_targets(table)) {
1961 ti = dm_table_get_target(table, i++);
1962
1963 if (ti->type->iterate_devices &&
1964 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
1965 return 0;
1966 }
1967
1968 return 1;
1969}
1970
1971
1972
1973
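/*
 * Returns old map, which caller must destroy.
 */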
1974static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1975 struct queue_limits *limits)
1976{
1977 struct dm_table *old_map;
1978 struct request_queue *q = md->queue;
1979 sector_t size;
1980 int merge_is_optional;
1981
1982 lockdep_assert_held(&md->suspend_lock);
1983
1984 size = dm_table_get_size(t);
1985
1986
1987
1988
1989 if (size != dm_get_size(md))
1990 memset(&md->geometry, 0, sizeof(md->geometry));
1991
1992 __set_size(md, size);
1993
1994 dm_table_event_callback(t, event_callback, md);
1995
1996
1997
1998
1999
2000
2001
2002
2003 if (dm_table_request_based(t)) {
2004 dm_stop_queue(q);
2005
2006
2007
2008
2009
2010 md->immutable_target = dm_table_get_immutable_target(t);
2011 }
2012
2013 __bind_mempools(md, t);
2014
2015 merge_is_optional = dm_table_merge_is_optional(t);
2016
2017 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2018 rcu_assign_pointer(md->map, (void *)t);
2019 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2020
2021 dm_table_set_restrictions(t, q, limits);
2022 if (merge_is_optional)
2023 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2024 else
2025 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2026 if (old_map)
2027 dm_sync_table(md);
2028
2029 return old_map;
2030}
2031
2032
2033
2034
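/*
 * Returns unbound table for the caller to free.
 */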
2035static struct dm_table *__unbind(struct mapped_device *md)
2036{
2037 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2038
2039 if (!map)
2040 return NULL;
2041
2042 dm_table_event_callback(map, NULL, NULL);
2043 RCU_INIT_POINTER(md->map, NULL);
2044 dm_sync_table(md);
2045
2046 return map;
2047}
2048
2049
2050
2051
2052int dm_create(int minor, struct mapped_device **result)
2053{
2054 struct mapped_device *md;
2055
2056 md = alloc_dev(minor);
2057 if (!md)
2058 return -ENXIO;
2059
2060 dm_sysfs_init(md);
2061
2062 *result = md;
2063 return 0;
2064}
2065
2066
2067
2068
2069
2070void dm_lock_md_type(struct mapped_device *md)
2071{
2072 mutex_lock(&md->type_lock);
2073}
2074
2075void dm_unlock_md_type(struct mapped_device *md)
2076{
2077 mutex_unlock(&md->type_lock);
2078}
2079
2080void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2081{
2082 BUG_ON(!mutex_is_locked(&md->type_lock));
2083 md->type = type;
2084}
2085
2086enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2087{
2088 return md->type;
2089}
2090
2091struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2092{
2093 return md->immutable_target_type;
2094}
2095
2096
2097
2098
2099
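/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */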
2100struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2101{
2102 BUG_ON(!atomic_read(&md->holders));
2103 return &md->queue->limits;
2104}
2105EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2106
2107
2108
2109
2110int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2111{
2112 int r;
2113 struct queue_limits limits;
2114 struct queue_limits_aux limits_aux;
2115 enum dm_queue_mode type = dm_get_md_type(md);
2116
2117 switch (type) {
2118 case DM_TYPE_REQUEST_BASED:
2119 r = dm_old_init_request_queue(md);
2120 if (r) {
2121 DMERR("Cannot initialize queue for request-based mapped device");
2122 return r;
2123 }
2124 break;
2125 case DM_TYPE_MQ_REQUEST_BASED:
2126 r = dm_mq_init_request_queue(md, t);
2127 if (r) {
2128 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2129 return r;
2130 }
2131 break;
2132 case DM_TYPE_BIO_BASED:
2133 case DM_TYPE_DAX_BIO_BASED:
2134 dm_init_normal_md_queue(md);
2135 blk_queue_make_request(md->queue, dm_make_request);
2136 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2137
2138 if (type == DM_TYPE_DAX_BIO_BASED)
2139 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2140 break;
2141 case DM_TYPE_NONE:
2142 WARN_ON_ONCE(true);
2143 break;
2144 }
2145
2146 limits.limits_aux = &limits_aux;
2147 r = dm_calculate_queue_limits(t, &limits);
2148 if (r) {
2149 DMERR("Cannot calculate initial queue limits");
2150 return r;
2151 }
2152 dm_table_set_restrictions(t, md->queue, &limits);
2153 blk_register_queue(md->disk);
2154
2155 return 0;
2156}
2157
2158struct mapped_device *dm_get_md(dev_t dev)
2159{
2160 struct mapped_device *md;
2161 unsigned minor = MINOR(dev);
2162
2163 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2164 return NULL;
2165
2166 spin_lock(&_minor_lock);
2167
2168 md = idr_find(&_minor_idr, minor);
2169 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2170 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2171 md = NULL;
2172 goto out;
2173 }
2174 dm_get(md);
2175out:
2176 spin_unlock(&_minor_lock);
2177
2178 return md;
2179}
2180EXPORT_SYMBOL_GPL(dm_get_md);
2181
2182void *dm_get_mdptr(struct mapped_device *md)
2183{
2184 return md->interface_ptr;
2185}
2186
2187void dm_set_mdptr(struct mapped_device *md, void *ptr)
2188{
2189 md->interface_ptr = ptr;
2190}
2191
2192void dm_get(struct mapped_device *md)
2193{
2194 atomic_inc(&md->holders);
2195 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2196}
2197
2198int dm_hold(struct mapped_device *md)
2199{
2200 spin_lock(&_minor_lock);
2201 if (test_bit(DMF_FREEING, &md->flags)) {
2202 spin_unlock(&_minor_lock);
2203 return -EBUSY;
2204 }
2205 dm_get(md);
2206 spin_unlock(&_minor_lock);
2207 return 0;
2208}
2209EXPORT_SYMBOL_GPL(dm_hold);
2210
2211const char *dm_device_name(struct mapped_device *md)
2212{
2213 return md->name;
2214}
2215EXPORT_SYMBOL_GPL(dm_device_name);
2216
2217static void __dm_destroy(struct mapped_device *md, bool wait)
2218{
2219 struct request_queue *q = dm_get_md_queue(md);
2220 struct dm_table *map;
2221 int srcu_idx;
2222
2223 might_sleep();
2224
2225 spin_lock(&_minor_lock);
2226 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2227 set_bit(DMF_FREEING, &md->flags);
2228 spin_unlock(&_minor_lock);
2229
2230 blk_set_queue_dying(q);
2231
2232 if (dm_request_based(md) && md->kworker_task)
2233 flush_kthread_worker(&md->kworker);
2234
2235
2236
2237
2238
2239 mutex_lock(&md->suspend_lock);
2240 map = dm_get_live_table(md, &srcu_idx);
2241 if (!dm_suspended_md(md)) {
2242 dm_table_presuspend_targets(map);
2243 dm_table_postsuspend_targets(map);
2244 }
2245
2246 dm_put_live_table(md, srcu_idx);
2247 mutex_unlock(&md->suspend_lock);
2248
2249
2250
2251
2252
2253
2254
2255 if (wait)
2256 while (atomic_read(&md->holders))
2257 msleep(1);
2258 else if (atomic_read(&md->holders))
2259 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2260 dm_device_name(md), atomic_read(&md->holders));
2261
2262 dm_sysfs_exit(md);
2263 dm_table_destroy(__unbind(md));
2264 free_dev(md);
2265}
2266
2267void dm_destroy(struct mapped_device *md)
2268{
2269 __dm_destroy(md, true);
2270}
2271
2272void dm_destroy_immediate(struct mapped_device *md)
2273{
2274 __dm_destroy(md, false);
2275}
2276
2277void dm_put(struct mapped_device *md)
2278{
2279 atomic_dec(&md->holders);
2280}
2281EXPORT_SYMBOL_GPL(dm_put);
2282
2283static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2284{
2285 int r = 0;
2286 DEFINE_WAIT(wait);
2287
2288 while (1) {
2289 prepare_to_wait(&md->wait, &wait, task_state);
2290
2291 if (!md_in_flight(md))
2292 break;
2293
2294 if (signal_pending_state(task_state, current)) {
2295 r = -EINTR;
2296 break;
2297 }
2298
2299 io_schedule();
2300 }
2301 finish_wait(&md->wait, &wait);
2302
2303 return r;
2304}
2305
2306
2307
2308
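/*
 * Process the deferred bios.
 */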
2309static void dm_wq_work(struct work_struct *work)
2310{
2311 struct mapped_device *md = container_of(work, struct mapped_device,
2312 work);
2313 struct bio *c;
2314 int srcu_idx;
2315 struct dm_table *map;
2316
2317 map = dm_get_live_table(md, &srcu_idx);
2318
2319 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2320 spin_lock_irq(&md->deferred_lock);
2321 c = bio_list_pop(&md->deferred);
2322 spin_unlock_irq(&md->deferred_lock);
2323
2324 if (!c)
2325 break;
2326
2327 if (dm_request_based(md))
2328 generic_make_request(c);
2329 else
2330 __split_and_process_bio(md, map, c);
2331 }
2332
2333 dm_put_live_table(md, srcu_idx);
2334}
2335
2336static void dm_queue_flush(struct mapped_device *md)
2337{
2338 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2339 smp_mb__after_clear_bit();
2340 queue_work(md->wq, &md->work);
2341}
2342
2343
2344
2345
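/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */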
2346struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2347{
2348 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2349 struct queue_limits limits;
2350 struct queue_limits_aux limits_aux;
2351 int r;
2352
2353 mutex_lock(&md->suspend_lock);
2354
2355
2356 if (!dm_suspended_md(md))
2357 goto out;
2358
2359
2360
2361
2362
2363 limits.limits_aux = &limits_aux;
2364
2365
2366
2367
2368
2369
2370
2371 if (dm_table_has_no_data_devices(table)) {
2372 live_map = dm_get_live_table_fast(md);
2373 if (live_map)
2374 limits = md->queue->limits;
2375 dm_put_live_table_fast(md);
2376 }
2377
2378 if (!live_map) {
2379 r = dm_calculate_queue_limits(table, &limits);
2380 if (r) {
2381 map = ERR_PTR(r);
2382 goto out;
2383 }
2384 }
2385
2386 map = __bind(md, table, &limits);
2387 dm_issue_global_event();
2388
2389out:
2390 mutex_unlock(&md->suspend_lock);
2391 return map;
2392}
2393
2394
2395
2396
2397
2398static int lock_fs(struct mapped_device *md)
2399{
2400 int r;
2401
2402 WARN_ON(md->frozen_sb);
2403
2404 md->frozen_sb = freeze_bdev(md->bdev);
2405 if (IS_ERR(md->frozen_sb)) {
2406 r = PTR_ERR(md->frozen_sb);
2407 md->frozen_sb = NULL;
2408 return r;
2409 }
2410
2411 set_bit(DMF_FROZEN, &md->flags);
2412
2413 return 0;
2414}
2415
2416static void unlock_fs(struct mapped_device *md)
2417{
2418 if (!test_bit(DMF_FROZEN, &md->flags))
2419 return;
2420
2421 thaw_bdev(md->bdev, md->frozen_sb);
2422 md->frozen_sb = NULL;
2423 clear_bit(DMF_FROZEN, &md->flags);
2424}
2425
2426
2427
2428
2429
2430
2431
2432
2433
2434
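/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 */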
2435static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2436 unsigned suspend_flags, long task_state,
2437 int dmf_suspended_flag)
2438{
2439 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2440 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2441 int r;
2442
2443 lockdep_assert_held(&md->suspend_lock);
2444
2445
2446
2447
2448
2449 if (noflush)
2450 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2451 else
2452 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2453
2454
2455
2456
2457
2458 dm_table_presuspend_targets(map);
2459
2460
2461
2462
2463
2464
2465
2466 if (!noflush && do_lockfs) {
2467 r = lock_fs(md);
2468 if (r) {
2469 dm_table_presuspend_undo_targets(map);
2470 return r;
2471 }
2472 }
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2487 if (map)
2488 synchronize_srcu(&md->io_barrier);
2489
2490
2491
2492
2493
2494 if (dm_request_based(md)) {
2495 dm_stop_queue(md->queue);
2496 if (md->kworker_task)
2497 flush_kthread_worker(&md->kworker);
2498 }
2499
2500 flush_workqueue(md->wq);
2501
2502
2503
2504
2505
2506
2507 r = dm_wait_for_completion(md, task_state);
2508 if (!r)
2509 set_bit(dmf_suspended_flag, &md->flags);
2510
2511 if (noflush)
2512 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2513 if (map)
2514 synchronize_srcu(&md->io_barrier);
2515
2516
2517 if (r < 0) {
2518 dm_queue_flush(md);
2519
2520 if (dm_request_based(md))
2521 dm_start_queue(md->queue);
2522
2523 unlock_fs(md);
2524 dm_table_presuspend_undo_targets(map);
2525
2526 }
2527
2528 return r;
2529}
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
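/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */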
2547int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2548{
2549 struct dm_table *map = NULL;
2550 int r = 0;
2551
2552retry:
2553 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2554
2555 if (dm_suspended_md(md)) {
2556 r = -EINVAL;
2557 goto out_unlock;
2558 }
2559
2560 if (dm_suspended_internally_md(md)) {
2561
2562 mutex_unlock(&md->suspend_lock);
2563 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2564 if (r)
2565 return r;
2566 goto retry;
2567 }
2568
2569 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2570
2571 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2572 if (r)
2573 goto out_unlock;
2574
2575 dm_table_postsuspend_targets(map);
2576
2577out_unlock:
2578 mutex_unlock(&md->suspend_lock);
2579 return r;
2580}
2581
2582static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2583{
2584 if (map) {
2585 int r = dm_table_resume_targets(map);
2586 if (r)
2587 return r;
2588 }
2589
2590 dm_queue_flush(md);
2591
2592
2593
2594
2595
2596
2597 if (dm_request_based(md))
2598 dm_start_queue(md->queue);
2599
2600 unlock_fs(md);
2601
2602 return 0;
2603}
2604
2605int dm_resume(struct mapped_device *md)
2606{
2607 int r;
2608 struct dm_table *map = NULL;
2609
2610retry:
2611 r = -EINVAL;
2612 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2613
2614 if (!dm_suspended_md(md))
2615 goto out;
2616
2617 if (dm_suspended_internally_md(md)) {
2618
2619 mutex_unlock(&md->suspend_lock);
2620 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2621 if (r)
2622 return r;
2623 goto retry;
2624 }
2625
2626 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2627 if (!map || !dm_table_get_size(map))
2628 goto out;
2629
2630 r = __dm_resume(md, map);
2631 if (r)
2632 goto out;
2633
2634 clear_bit(DMF_SUSPENDED, &md->flags);
2635out:
2636 mutex_unlock(&md->suspend_lock);
2637
2638 return r;
2639}
2640
/*
 * Internal suspend/resume works like userspace-driven suspend: it waits
 * until all in-flight bios finish and prevents new bios from being issued
 * to the target drivers.  It may only be used from within the kernel, and
 * it nests via md->internal_suspend_count.
 */
2647static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2648{
2649 struct dm_table *map = NULL;
2650
2651 lockdep_assert_held(&md->suspend_lock);
2652
2653 if (md->internal_suspend_count++)
2654 return;
2655
2656 if (dm_suspended_md(md)) {
2657 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2658 return;
2659 }
2660
2661 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2662
        /*
         * Use TASK_UNINTERRUPTIBLE: only noflush internal suspend is
         * supported, so the wait cannot be interrupted and the return
         * value of __dm_suspend() can safely be ignored.
         */
2669 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2670 DMF_SUSPENDED_INTERNALLY);
2671
2672 dm_table_postsuspend_targets(map);
2673}
2674
2675static void __dm_internal_resume(struct mapped_device *md)
2676{
2677 BUG_ON(!md->internal_suspend_count);
2678
2679 if (--md->internal_suspend_count)
2680 return;
2681
2682 if (dm_suspended_md(md))
2683 goto done;
2684
        /*
         * NOTE: existing callers don't need dm_table_resume_targets() to be
         * re-run here (it may fail), so pass a NULL map to __dm_resume().
         */
2689 (void) __dm_resume(md, NULL);
2690
2691done:
2692 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2693 smp_mb__after_atomic();
2694 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2695}
2696
2697void dm_internal_suspend_noflush(struct mapped_device *md)
2698{
2699 mutex_lock(&md->suspend_lock);
2700 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2701 mutex_unlock(&md->suspend_lock);
2702}
2703EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2704
2705void dm_internal_resume(struct mapped_device *md)
2706{
2707 mutex_lock(&md->suspend_lock);
2708 __dm_internal_resume(md);
2709 mutex_unlock(&md->suspend_lock);
2710}
2711EXPORT_SYMBOL_GPL(dm_internal_resume);
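
/*
 * Illustrative sketch, not part of the original driver: how a kernel-side
 * user (e.g. a target that needs to quiesce its mapped device briefly)
 * pairs the exported internal suspend/resume helpers.  The function name
 * is hypothetical.
 */
static void __maybe_unused dm_example_internal_quiesce(struct mapped_device *md)
{
        /* Quiesce without flushing queued I/O. */
        dm_internal_suspend_noflush(md);

        /* ... inspect or update in-core state while no bios are in flight ... */

        dm_internal_resume(md);
}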
2712
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock across the
 * pair, which prevents interaction with userspace-driven suspend.
 */
2717
2718void dm_internal_suspend_fast(struct mapped_device *md)
2719{
2720 mutex_lock(&md->suspend_lock);
2721 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2722 return;
2723
2724 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2725 synchronize_srcu(&md->io_barrier);
2726 flush_workqueue(md->wq);
2727 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2728}
2729EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2730
2731void dm_internal_resume_fast(struct mapped_device *md)
2732{
2733 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2734 goto done;
2735
2736 dm_queue_flush(md);
2737
2738done:
2739 mutex_unlock(&md->suspend_lock);
2740}
2741EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
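
/*
 * Illustrative sketch, not part of the original driver: the fast variants
 * are used as a matched pair; note that dm_internal_suspend_fast() returns
 * with md->suspend_lock held and dm_internal_resume_fast() releases it.
 * The function name is hypothetical.
 */
static void __maybe_unused dm_example_fast_quiesce(struct mapped_device *md)
{
        dm_internal_suspend_fast(md);   /* returns with md->suspend_lock held */

        /* ... do brief work against the quiesced device ... */

        dm_internal_resume_fast(md);    /* drops md->suspend_lock */
}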
2742
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
2746int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2747 unsigned cookie)
2748{
2749 char udev_cookie[DM_COOKIE_LENGTH];
2750 char *envp[] = { udev_cookie, NULL };
2751
2752 if (!cookie)
2753 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2754 else {
2755 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2756 DM_COOKIE_ENV_VAR_NAME, cookie);
2757 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2758 action, envp);
2759 }
2760}
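
/*
 * Illustrative sketch, not part of the original driver: sending a change
 * event with a udev cookie, as the ioctl layer does after a table reload.
 * KOBJ_CHANGE comes from <linux/kobject.h>; the function name is
 * hypothetical.
 */
static int __maybe_unused dm_example_notify_change(struct mapped_device *md,
                                                   unsigned cookie)
{
        /* A non-zero cookie is exported to udev as DM_COOKIE=<cookie>. */
        return dm_kobject_uevent(md, KOBJ_CHANGE, cookie);
}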
2761
2762uint32_t dm_next_uevent_seq(struct mapped_device *md)
2763{
2764 return atomic_add_return(1, &md->uevent_seq);
2765}
2766
2767uint32_t dm_get_event_nr(struct mapped_device *md)
2768{
2769 return atomic_read(&md->event_nr);
2770}
2771
2772int dm_wait_event(struct mapped_device *md, int event_nr)
2773{
2774 return wait_event_interruptible(md->eventq,
2775 (event_nr != atomic_read(&md->event_nr)));
2776}
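
/*
 * Illustrative sketch, not part of the original driver: the usual pattern
 * for waiting on the next device event -- sample the current event number,
 * then sleep until it changes.  The function name is hypothetical.
 */
static int __maybe_unused dm_example_wait_for_next_event(struct mapped_device *md)
{
        uint32_t event_nr = dm_get_event_nr(md);

        /* Returns 0 once md->event_nr moves past event_nr, or -ERESTARTSYS. */
        return dm_wait_event(md, event_nr);
}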
2777
2778void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2779{
2780 unsigned long flags;
2781
2782 spin_lock_irqsave(&md->uevent_lock, flags);
2783 list_add(elist, &md->uevent_list);
2784 spin_unlock_irqrestore(&md->uevent_lock, flags);
2785}
2786
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
2791struct gendisk *dm_disk(struct mapped_device *md)
2792{
2793 return md->disk;
2794}
2795EXPORT_SYMBOL_GPL(dm_disk);
2796
2797struct kobject *dm_kobject(struct mapped_device *md)
2798{
2799 return &md->kobj_holder.kobj;
2800}
2801
2802struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2803{
2804 struct mapped_device *md;
2805
2806 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2807
2808 spin_lock(&_minor_lock);
2809 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2810 md = NULL;
2811 goto out;
2812 }
2813 dm_get(md);
2814out:
2815 spin_unlock(&_minor_lock);
2816
2817 return md;
2818}
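
/*
 * Illustrative sketch, not part of the original driver: a sysfs show
 * routine (in the style of dm-sysfs.c) that resolves the kobject back to
 * its mapped_device, takes a reference, and drops it when done.  The
 * function name and its simplified signature are hypothetical.
 */
static ssize_t __maybe_unused dm_example_name_show(struct kobject *kobj, char *buf)
{
        struct mapped_device *md = dm_get_from_kobject(kobj);
        ssize_t r;

        if (!md)
                return -EINVAL;

        r = sprintf(buf, "%s\n", dm_device_name(md));

        dm_put(md);
        return r;
}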
2819
2820int dm_suspended_md(struct mapped_device *md)
2821{
2822 return test_bit(DMF_SUSPENDED, &md->flags);
2823}
2824
2825int dm_suspended_internally_md(struct mapped_device *md)
2826{
2827 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2828}
2829
2830int dm_test_deferred_remove_flag(struct mapped_device *md)
2831{
2832 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2833}
2834
2835int dm_suspended(struct dm_target *ti)
2836{
2837 return dm_suspended_md(dm_table_get_md(ti->table));
2838}
2839EXPORT_SYMBOL_GPL(dm_suspended);
2840
2841int dm_noflush_suspending(struct dm_target *ti)
2842{
2843 return __noflush_suspending(dm_table_get_md(ti->table));
2844}
2845EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2846
2847struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2848 unsigned integrity, unsigned per_io_data_size)
2849{
2850 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2851 struct kmem_cache *cachep = NULL;
2852 unsigned int pool_size = 0;
2853 unsigned int front_pad;
2854
2855 if (!pools)
2856 return NULL;
2857
2858 switch (type) {
2859 case DM_TYPE_BIO_BASED:
2860 case DM_TYPE_DAX_BIO_BASED:
2861 cachep = _io_cache;
2862 pool_size = dm_get_reserved_bio_based_ios();
2863 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2864 break;
2865 case DM_TYPE_REQUEST_BASED:
2866 cachep = _rq_tio_cache;
2867 pool_size = dm_get_reserved_rq_based_ios();
2868 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
2869 if (!pools->rq_pool)
2870 goto out;
                /* fall through to set up the remaining rq-based pools */
2872 case DM_TYPE_MQ_REQUEST_BASED:
2873 if (!pool_size)
2874 pool_size = dm_get_reserved_rq_based_ios();
2875 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
                /* per_io_data_size is used for the blk-mq pdu when the queue is set up */
2877 break;
2878 default:
2879 BUG();
2880 }
2881
2882 if (cachep) {
2883 pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
2884 if (!pools->io_pool)
2885 goto out;
2886 }
2887
        pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
2889 if (!pools->bs)
2890 goto out;
2891
2892 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2893 goto out;
2894
2895 return pools;
2896
2897out:
2898 dm_free_md_mempools(pools);
2899
2900 return NULL;
2901}
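
/*
 * Illustrative sketch, not part of the original driver: allocating and
 * releasing bio-based mempools, as the table-loading code does.  It assumes
 * the declarations in the driver's internal dm.h header are in scope; the
 * function name is hypothetical and the pools are freed immediately instead
 * of being handed to the device.
 */
static void __maybe_unused dm_example_pools_lifetime(struct mapped_device *md,
                                                     unsigned per_io_data_size)
{
        /* Bio-based pools sized by reserved_bio_based_ios; no integrity profile. */
        struct dm_md_mempools *pools =
                dm_alloc_md_mempools(md, DM_TYPE_BIO_BASED, 0, per_io_data_size);

        if (!pools)
                return;

        /* Normally the pools would be attached to the md when the table is bound. */

        dm_free_md_mempools(pools);
}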
2902
2903void dm_free_md_mempools(struct dm_md_mempools *pools)
2904{
2905 if (!pools)
2906 return;
2907
2908 mempool_destroy(pools->io_pool);
2909 mempool_destroy(pools->rq_pool);
2910
2911 if (pools->bs)
2912 bioset_free(pools->bs);
2913
2914 kfree(pools);
2915}
2916
2917struct dm_pr {
2918 u64 old_key;
2919 u64 new_key;
2920 u32 flags;
2921 bool fail_early;
2922};
2923
2924static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2925 void *data)
2926{
2927 struct mapped_device *md = bdev->bd_disk->private_data;
2928 struct dm_table *table;
2929 struct dm_target *ti;
2930 int ret = -ENOTTY, srcu_idx;
2931
2932 table = dm_get_live_table(md, &srcu_idx);
2933 if (!table || !dm_table_get_size(table))
2934 goto out;
2935
        /* We only support devices that have a single target */
2937 if (dm_table_get_num_targets(table) != 1)
2938 goto out;
2939 ti = dm_table_get_target(table, 0);
2940
2941 ret = -EINVAL;
2942 if (!ti->type->iterate_devices)
2943 goto out;
2944
2945 ret = ti->type->iterate_devices(ti, fn, data);
2946out:
2947 dm_put_live_table(md, srcu_idx);
2948 return ret;
2949}
2950
/*
 * For register / unregister we need to manually call out to every path.
 */
2954static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2955 sector_t start, sector_t len, void *data)
2956{
2957 struct dm_pr *pr = data;
2958 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2959
2960 if (!ops || !ops->pr_register)
2961 return -EOPNOTSUPP;
2962 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2963}
2964
2965static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2966 u32 flags)
2967{
2968 struct dm_pr pr = {
2969 .old_key = old_key,
2970 .new_key = new_key,
2971 .flags = flags,
2972 .fail_early = true,
2973 };
2974 int ret;
2975
2976 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2977 if (ret && new_key) {
                /* unregister all paths if we failed to register any path */
2979 pr.old_key = new_key;
2980 pr.new_key = 0;
2981 pr.flags = 0;
2982 pr.fail_early = false;
2983 dm_call_pr(bdev, __dm_pr_register, &pr);
2984 }
2985
2986 return ret;
2987}
2988
2989static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2990 u32 flags)
2991{
2992 struct mapped_device *md = bdev->bd_disk->private_data;
2993 const struct pr_ops *ops;
2994 fmode_t mode;
2995 int r;
2996
2997 r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
2998 if (r < 0)
2999 return r;
3000
3001 ops = bdev->bd_disk->fops->pr_ops;
3002 if (ops && ops->pr_reserve)
3003 r = ops->pr_reserve(bdev, key, type, flags);
3004 else
3005 r = -EOPNOTSUPP;
3006
3007 blkdev_put(bdev, mode);
3008 return r;
3009}
3010
3011static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3012{
3013 struct mapped_device *md = bdev->bd_disk->private_data;
3014 const struct pr_ops *ops;
3015 fmode_t mode;
3016 int r;
3017
3018 r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3019 if (r < 0)
3020 return r;
3021
3022 ops = bdev->bd_disk->fops->pr_ops;
3023 if (ops && ops->pr_release)
3024 r = ops->pr_release(bdev, key, type);
3025 else
3026 r = -EOPNOTSUPP;
3027
3028 blkdev_put(bdev, mode);
3029 return r;
3030}
3031
3032static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3033 enum pr_type type, bool abort)
3034{
3035 struct mapped_device *md = bdev->bd_disk->private_data;
3036 const struct pr_ops *ops;
3037 fmode_t mode;
3038 int r;
3039
3040 r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3041 if (r < 0)
3042 return r;
3043
3044 ops = bdev->bd_disk->fops->pr_ops;
3045 if (ops && ops->pr_preempt)
3046 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3047 else
3048 r = -EOPNOTSUPP;
3049
3050 blkdev_put(bdev, mode);
3051 return r;
3052}
3053
3054static int dm_pr_clear(struct block_device *bdev, u64 key)
3055{
3056 struct mapped_device *md = bdev->bd_disk->private_data;
3057 const struct pr_ops *ops;
3058 fmode_t mode;
3059 int r;
3060
3061 r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3062 if (r < 0)
3063 return r;
3064
3065 ops = bdev->bd_disk->fops->pr_ops;
3066 if (ops && ops->pr_clear)
3067 r = ops->pr_clear(bdev, key);
3068 else
3069 r = -EOPNOTSUPP;
3070
3071 blkdev_put(bdev, mode);
3072 return r;
3073}
3074
3075static const struct pr_ops dm_pr_ops = {
3076 .pr_register = dm_pr_register,
3077 .pr_reserve = dm_pr_reserve,
3078 .pr_release = dm_pr_release,
3079 .pr_preempt = dm_pr_preempt,
3080 .pr_clear = dm_pr_clear,
3081};
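
/*
 * Illustrative sketch, not part of the original driver: how a kernel caller
 * (for example the block layer's persistent-reservation ioctl path) reaches
 * the dm_pr_ops table above -- it looks up pr_ops on the disk and
 * dispatches.  The function name is hypothetical.
 */
static int __maybe_unused dm_example_issue_pr_reserve(struct block_device *bdev,
                                                      u64 key, enum pr_type type)
{
        const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;

        if (!ops || !ops->pr_reserve)
                return -EOPNOTSUPP;

        /* For a dm device this ends up in dm_pr_reserve() above. */
        return ops->pr_reserve(bdev, key, type, 0);
}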
3082
3083static const struct block_device_operations dm_blk_dops = {
3084 .open = dm_blk_open,
3085 .release = dm_blk_close,
3086 .ioctl = dm_blk_ioctl,
3087 .getgeo = dm_blk_getgeo,
3088 .pr_ops = &dm_pr_ops,
3089 .owner = THIS_MODULE
3090};
3091
3092static const struct dax_operations dm_dax_ops = {
3093 .direct_access = dm_dax_direct_access,
3094};
3095
/*
 * module hooks
 */
3099module_init(dm_init);
3100module_exit(dm_exit);
3101
3102module_param(major, uint, 0);
3103MODULE_PARM_DESC(major, "The major number of the device mapper");
3104
3105module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3106MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3107
3108module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3109MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3110
3111MODULE_DESCRIPTION(DM_NAME " driver");
3112MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3113MODULE_LICENSE("GPL");
3114