1
2
3
4
5
6
7
8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/signal.h>
16#include <linux/blkpg.h>
17#include <linux/bio.h>
18#include <linux/mempool.h>
19#include <linux/dax.h>
20#include <linux/slab.h>
21#include <linux/idr.h>
22#include <linux/uio.h>
23#include <linux/hdreg.h>
24#include <linux/delay.h>
25#include <linux/wait.h>
26#include <linux/pr.h>
27#include <linux/refcount.h>
28
29#define DM_MSG_PREFIX "core"
30
31
32
33
34
35#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
36#define DM_COOKIE_LENGTH 24
37
38static const char *_name = DM_NAME;
39
40static unsigned int major = 0;
41static unsigned int _major = 0;
42
43static DEFINE_IDR(_minor_idr);
44
45static DEFINE_SPINLOCK(_minor_lock);
46
47static void do_deferred_remove(struct work_struct *w);
48
49static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
50
51static struct workqueue_struct *deferred_remove_workqueue;
52
53atomic_t dm_global_event_nr = ATOMIC_INIT(0);
54DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
55
56void dm_issue_global_event(void)
57{
58 atomic_inc(&dm_global_event_nr);
59 wake_up(&dm_global_eventq);
60}
61
62
63
64
65struct clone_info {
66 struct dm_table *map;
67 struct bio *bio;
68 struct dm_io *io;
69 sector_t sector;
70 unsigned sector_count;
71};
72
73
74
75
76#define DM_TIO_MAGIC 7282014
77struct dm_target_io {
78 unsigned magic;
79 struct dm_io *io;
80 struct dm_target *ti;
81 unsigned target_bio_nr;
82 unsigned *len_ptr;
83 bool inside_dm_io;
84 struct bio clone;
85};
86
87
88
89
90
91#define DM_IO_MAGIC 5191977
92struct dm_io {
93 unsigned magic;
94 struct mapped_device *md;
95 blk_status_t status;
96 atomic_t io_count;
97 struct bio *orig_bio;
98 unsigned long start_time;
99 spinlock_t endio_lock;
100 struct dm_stats_aux stats_aux;
101
102 struct dm_target_io tio;
103};
104
105void *dm_per_bio_data(struct bio *bio, size_t data_size)
106{
107 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
108 if (!tio->inside_dm_io)
109 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
110 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
111}
112EXPORT_SYMBOL_GPL(dm_per_bio_data);
113
114struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
115{
116 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
117 if (io->magic == DM_IO_MAGIC)
118 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
119 BUG_ON(io->magic != DM_TIO_MAGIC);
120 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
121}
122EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
123
124unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
125{
126 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
127}
128EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
129
130#define MINOR_ALLOCED ((void *)-1)
131
132
133
134
135#define DMF_BLOCK_IO_FOR_SUSPEND 0
136#define DMF_SUSPENDED 1
137#define DMF_FROZEN 2
138#define DMF_FREEING 3
139#define DMF_DELETING 4
140#define DMF_NOFLUSH_SUSPENDING 5
141#define DMF_DEFERRED_REMOVE 6
142#define DMF_SUSPENDED_INTERNALLY 7
143
144#define DM_NUMA_NODE NUMA_NO_NODE
145static int dm_numa_node = DM_NUMA_NODE;
146
147
148
149
150struct dm_md_mempools {
151 struct bio_set bs;
152 struct bio_set io_bs;
153};
154
155struct table_device {
156 struct list_head list;
157 refcount_t count;
158 struct dm_dev dm_dev;
159};
160
161
162
163
164#define RESERVED_BIO_BASED_IOS 16
165static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
166
167static int __dm_get_module_param_int(int *module_param, int min, int max)
168{
169 int param = READ_ONCE(*module_param);
170 int modified_param = 0;
171 bool modified = true;
172
173 if (param < min)
174 modified_param = min;
175 else if (param > max)
176 modified_param = max;
177 else
178 modified = false;
179
180 if (modified) {
181 (void)cmpxchg(module_param, param, modified_param);
182 param = modified_param;
183 }
184
185 return param;
186}
187
188unsigned __dm_get_module_param(unsigned *module_param,
189 unsigned def, unsigned max)
190{
191 unsigned param = READ_ONCE(*module_param);
192 unsigned modified_param = 0;
193
194 if (!param)
195 modified_param = def;
196 else if (param > max)
197 modified_param = max;
198
199 if (modified_param) {
200 (void)cmpxchg(module_param, param, modified_param);
201 param = modified_param;
202 }
203
204 return param;
205}
206
207unsigned dm_get_reserved_bio_based_ios(void)
208{
209 return __dm_get_module_param(&reserved_bio_based_ios,
210 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
211}
212EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
213
214static unsigned dm_get_numa_node(void)
215{
216 return __dm_get_module_param_int(&dm_numa_node,
217 DM_NUMA_NODE, num_online_nodes() - 1);
218}
219
220static int __init local_init(void)
221{
222 int r;
223
224 r = dm_uevent_init();
225 if (r)
226 return r;
227
228 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
229 if (!deferred_remove_workqueue) {
230 r = -ENOMEM;
231 goto out_uevent_exit;
232 }
233
234 _major = major;
235 r = register_blkdev(_major, _name);
236 if (r < 0)
237 goto out_free_workqueue;
238
239 if (!_major)
240 _major = r;
241
242 return 0;
243
244out_free_workqueue:
245 destroy_workqueue(deferred_remove_workqueue);
246out_uevent_exit:
247 dm_uevent_exit();
248
249 return r;
250}
251
252static void local_exit(void)
253{
254 flush_scheduled_work();
255 destroy_workqueue(deferred_remove_workqueue);
256
257 unregister_blkdev(_major, _name);
258 dm_uevent_exit();
259
260 _major = 0;
261
262 DMINFO("cleaned up");
263}
264
265static int (*_inits[])(void) __initdata = {
266 local_init,
267 dm_target_init,
268 dm_linear_init,
269 dm_stripe_init,
270 dm_io_init,
271 dm_kcopyd_init,
272 dm_interface_init,
273 dm_statistics_init,
274};
275
276static void (*_exits[])(void) = {
277 local_exit,
278 dm_target_exit,
279 dm_linear_exit,
280 dm_stripe_exit,
281 dm_io_exit,
282 dm_kcopyd_exit,
283 dm_interface_exit,
284 dm_statistics_exit,
285};
286
287static int __init dm_init(void)
288{
289 const int count = ARRAY_SIZE(_inits);
290
291 int r, i;
292
293 for (i = 0; i < count; i++) {
294 r = _inits[i]();
295 if (r)
296 goto bad;
297 }
298
299 return 0;
300
301 bad:
302 while (i--)
303 _exits[i]();
304
305 return r;
306}
307
308static void __exit dm_exit(void)
309{
310 int i = ARRAY_SIZE(_exits);
311
312 while (i--)
313 _exits[i]();
314
315
316
317
318 idr_destroy(&_minor_idr);
319}
320
321
322
323
324int dm_deleting_md(struct mapped_device *md)
325{
326 return test_bit(DMF_DELETING, &md->flags);
327}
328
329static int dm_blk_open(struct block_device *bdev, fmode_t mode)
330{
331 struct mapped_device *md;
332
333 spin_lock(&_minor_lock);
334
335 md = bdev->bd_disk->private_data;
336 if (!md)
337 goto out;
338
339 if (test_bit(DMF_FREEING, &md->flags) ||
340 dm_deleting_md(md)) {
341 md = NULL;
342 goto out;
343 }
344
345 dm_get(md);
346 atomic_inc(&md->open_count);
347out:
348 spin_unlock(&_minor_lock);
349
350 return md ? 0 : -ENXIO;
351}
352
353static void dm_blk_close(struct gendisk *disk, fmode_t mode)
354{
355 struct mapped_device *md;
356
357 spin_lock(&_minor_lock);
358
359 md = disk->private_data;
360 if (WARN_ON(!md))
361 goto out;
362
363 if (atomic_dec_and_test(&md->open_count) &&
364 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
365 queue_work(deferred_remove_workqueue, &deferred_remove_work);
366
367 dm_put(md);
368out:
369 spin_unlock(&_minor_lock);
370}
371
372int dm_open_count(struct mapped_device *md)
373{
374 return atomic_read(&md->open_count);
375}
376
377
378
379
380int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
381{
382 int r = 0;
383
384 spin_lock(&_minor_lock);
385
386 if (dm_open_count(md)) {
387 r = -EBUSY;
388 if (mark_deferred)
389 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
390 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
391 r = -EEXIST;
392 else
393 set_bit(DMF_DELETING, &md->flags);
394
395 spin_unlock(&_minor_lock);
396
397 return r;
398}
399
400int dm_cancel_deferred_remove(struct mapped_device *md)
401{
402 int r = 0;
403
404 spin_lock(&_minor_lock);
405
406 if (test_bit(DMF_DELETING, &md->flags))
407 r = -EBUSY;
408 else
409 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
410
411 spin_unlock(&_minor_lock);
412
413 return r;
414}
415
416static void do_deferred_remove(struct work_struct *w)
417{
418 dm_deferred_remove();
419}
420
421sector_t dm_get_size(struct mapped_device *md)
422{
423 return get_capacity(md->disk);
424}
425
426struct request_queue *dm_get_md_queue(struct mapped_device *md)
427{
428 return md->queue;
429}
430
431struct dm_stats *dm_get_stats(struct mapped_device *md)
432{
433 return &md->stats;
434}
435
436static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
437{
438 struct mapped_device *md = bdev->bd_disk->private_data;
439
440 return dm_get_geometry(md, geo);
441}
442
443static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
444 struct blk_zone *zones, unsigned int *nr_zones)
445{
446#ifdef CONFIG_BLK_DEV_ZONED
447 struct mapped_device *md = disk->private_data;
448 struct dm_target *tgt;
449 struct dm_table *map;
450 int srcu_idx, ret;
451
452 if (dm_suspended_md(md))
453 return -EAGAIN;
454
455 map = dm_get_live_table(md, &srcu_idx);
456 if (!map)
457 return -EIO;
458
459 tgt = dm_table_find_target(map, sector);
460 if (!dm_target_is_valid(tgt)) {
461 ret = -EIO;
462 goto out;
463 }
464
465
466
467
468
469
470
471 if (WARN_ON(!tgt->type->report_zones)) {
472 ret = -EIO;
473 goto out;
474 }
475
476
477
478
479
480
481
482 ret = tgt->type->report_zones(tgt, sector, zones, nr_zones);
483
484out:
485 dm_put_live_table(md, srcu_idx);
486 return ret;
487#else
488 return -ENOTSUPP;
489#endif
490}
491
492static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
493 struct block_device **bdev)
494 __acquires(md->io_barrier)
495{
496 struct dm_target *tgt;
497 struct dm_table *map;
498 int r;
499
500retry:
501 r = -ENOTTY;
502 map = dm_get_live_table(md, srcu_idx);
503 if (!map || !dm_table_get_size(map))
504 return r;
505
506
507 if (dm_table_get_num_targets(map) != 1)
508 return r;
509
510 tgt = dm_table_get_target(map, 0);
511 if (!tgt->type->prepare_ioctl)
512 return r;
513
514 if (dm_suspended_md(md))
515 return -EAGAIN;
516
517 r = tgt->type->prepare_ioctl(tgt, bdev);
518 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
519 dm_put_live_table(md, *srcu_idx);
520 msleep(10);
521 goto retry;
522 }
523
524 return r;
525}
526
527static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
528 __releases(md->io_barrier)
529{
530 dm_put_live_table(md, srcu_idx);
531}
532
533static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
534 unsigned int cmd, unsigned long arg)
535{
536 struct mapped_device *md = bdev->bd_disk->private_data;
537 int r, srcu_idx;
538
539 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
540 if (r < 0)
541 goto out;
542
543 if (r > 0) {
544
545
546
547
548 if (!capable(CAP_SYS_RAWIO)) {
549 DMWARN_LIMIT(
550 "%s: sending ioctl %x to DM device without required privilege.",
551 current->comm, cmd);
552 r = -ENOIOCTLCMD;
553 goto out;
554 }
555 }
556
557 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
558out:
559 dm_unprepare_ioctl(md, srcu_idx);
560 return r;
561}
562
563static void start_io_acct(struct dm_io *io);
564
565static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
566{
567 struct dm_io *io;
568 struct dm_target_io *tio;
569 struct bio *clone;
570
571 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
572 if (!clone)
573 return NULL;
574
575 tio = container_of(clone, struct dm_target_io, clone);
576 tio->inside_dm_io = true;
577 tio->io = NULL;
578
579 io = container_of(tio, struct dm_io, tio);
580 io->magic = DM_IO_MAGIC;
581 io->status = 0;
582 atomic_set(&io->io_count, 1);
583 io->orig_bio = bio;
584 io->md = md;
585 spin_lock_init(&io->endio_lock);
586
587 start_io_acct(io);
588
589 return io;
590}
591
592static void free_io(struct mapped_device *md, struct dm_io *io)
593{
594 bio_put(&io->tio.clone);
595}
596
597static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
598 unsigned target_bio_nr, gfp_t gfp_mask)
599{
600 struct dm_target_io *tio;
601
602 if (!ci->io->tio.io) {
603
604 tio = &ci->io->tio;
605 } else {
606 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
607 if (!clone)
608 return NULL;
609
610 tio = container_of(clone, struct dm_target_io, clone);
611 tio->inside_dm_io = false;
612 }
613
614 tio->magic = DM_TIO_MAGIC;
615 tio->io = ci->io;
616 tio->ti = ti;
617 tio->target_bio_nr = target_bio_nr;
618
619 return tio;
620}
621
622static void free_tio(struct dm_target_io *tio)
623{
624 if (tio->inside_dm_io)
625 return;
626 bio_put(&tio->clone);
627}
628
629static bool md_in_flight_bios(struct mapped_device *md)
630{
631 int cpu;
632 struct hd_struct *part = &dm_disk(md)->part0;
633 long sum = 0;
634
635 for_each_possible_cpu(cpu) {
636 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
637 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
638 }
639
640 return sum != 0;
641}
642
643static bool md_in_flight(struct mapped_device *md)
644{
645 if (queue_is_mq(md->queue))
646 return blk_mq_queue_inflight(md->queue);
647 else
648 return md_in_flight_bios(md);
649}
650
651static void start_io_acct(struct dm_io *io)
652{
653 struct mapped_device *md = io->md;
654 struct bio *bio = io->orig_bio;
655
656 io->start_time = jiffies;
657
658 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
659 &dm_disk(md)->part0);
660
661 if (unlikely(dm_stats_used(&md->stats)))
662 dm_stats_account_io(&md->stats, bio_data_dir(bio),
663 bio->bi_iter.bi_sector, bio_sectors(bio),
664 false, 0, &io->stats_aux);
665}
666
667static void end_io_acct(struct dm_io *io)
668{
669 struct mapped_device *md = io->md;
670 struct bio *bio = io->orig_bio;
671 unsigned long duration = jiffies - io->start_time;
672
673 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
674 io->start_time);
675
676 if (unlikely(dm_stats_used(&md->stats)))
677 dm_stats_account_io(&md->stats, bio_data_dir(bio),
678 bio->bi_iter.bi_sector, bio_sectors(bio),
679 true, duration, &io->stats_aux);
680
681
682 if (unlikely(wq_has_sleeper(&md->wait)))
683 wake_up(&md->wait);
684}
685
686
687
688
689static void queue_io(struct mapped_device *md, struct bio *bio)
690{
691 unsigned long flags;
692
693 spin_lock_irqsave(&md->deferred_lock, flags);
694 bio_list_add(&md->deferred, bio);
695 spin_unlock_irqrestore(&md->deferred_lock, flags);
696 queue_work(md->wq, &md->work);
697}
698
699
700
701
702
703
704struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
705{
706 *srcu_idx = srcu_read_lock(&md->io_barrier);
707
708 return srcu_dereference(md->map, &md->io_barrier);
709}
710
711void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
712{
713 srcu_read_unlock(&md->io_barrier, srcu_idx);
714}
715
716void dm_sync_table(struct mapped_device *md)
717{
718 synchronize_srcu(&md->io_barrier);
719 synchronize_rcu_expedited();
720}
721
722
723
724
725
726static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
727{
728 rcu_read_lock();
729 return rcu_dereference(md->map);
730}
731
732static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
733{
734 rcu_read_unlock();
735}
736
737static char *_dm_claim_ptr = "I belong to device-mapper";
738
739
740
741
742static int open_table_device(struct table_device *td, dev_t dev,
743 struct mapped_device *md)
744{
745 struct block_device *bdev;
746
747 int r;
748
749 BUG_ON(td->dm_dev.bdev);
750
751 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
752 if (IS_ERR(bdev))
753 return PTR_ERR(bdev);
754
755 r = bd_link_disk_holder(bdev, dm_disk(md));
756 if (r) {
757 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
758 return r;
759 }
760
761 td->dm_dev.bdev = bdev;
762 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
763 return 0;
764}
765
766
767
768
769static void close_table_device(struct table_device *td, struct mapped_device *md)
770{
771 if (!td->dm_dev.bdev)
772 return;
773
774 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
775 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
776 put_dax(td->dm_dev.dax_dev);
777 td->dm_dev.bdev = NULL;
778 td->dm_dev.dax_dev = NULL;
779}
780
781static struct table_device *find_table_device(struct list_head *l, dev_t dev,
782 fmode_t mode)
783{
784 struct table_device *td;
785
786 list_for_each_entry(td, l, list)
787 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
788 return td;
789
790 return NULL;
791}
792
793int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
794 struct dm_dev **result)
795{
796 int r;
797 struct table_device *td;
798
799 mutex_lock(&md->table_devices_lock);
800 td = find_table_device(&md->table_devices, dev, mode);
801 if (!td) {
802 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
803 if (!td) {
804 mutex_unlock(&md->table_devices_lock);
805 return -ENOMEM;
806 }
807
808 td->dm_dev.mode = mode;
809 td->dm_dev.bdev = NULL;
810
811 if ((r = open_table_device(td, dev, md))) {
812 mutex_unlock(&md->table_devices_lock);
813 kfree(td);
814 return r;
815 }
816
817 format_dev_t(td->dm_dev.name, dev);
818
819 refcount_set(&td->count, 1);
820 list_add(&td->list, &md->table_devices);
821 } else {
822 refcount_inc(&td->count);
823 }
824 mutex_unlock(&md->table_devices_lock);
825
826 *result = &td->dm_dev;
827 return 0;
828}
829EXPORT_SYMBOL_GPL(dm_get_table_device);
830
831void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
832{
833 struct table_device *td = container_of(d, struct table_device, dm_dev);
834
835 mutex_lock(&md->table_devices_lock);
836 if (refcount_dec_and_test(&td->count)) {
837 close_table_device(td, md);
838 list_del(&td->list);
839 kfree(td);
840 }
841 mutex_unlock(&md->table_devices_lock);
842}
843EXPORT_SYMBOL(dm_put_table_device);
844
845static void free_table_devices(struct list_head *devices)
846{
847 struct list_head *tmp, *next;
848
849 list_for_each_safe(tmp, next, devices) {
850 struct table_device *td = list_entry(tmp, struct table_device, list);
851
852 DMWARN("dm_destroy: %s still exists with %d references",
853 td->dm_dev.name, refcount_read(&td->count));
854 kfree(td);
855 }
856}
857
858
859
860
861int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
862{
863 *geo = md->geometry;
864
865 return 0;
866}
867
868
869
870
871int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
872{
873 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
874
875 if (geo->start > sz) {
876 DMWARN("Start sector is beyond the geometry limits.");
877 return -EINVAL;
878 }
879
880 md->geometry = *geo;
881
882 return 0;
883}
884
885static int __noflush_suspending(struct mapped_device *md)
886{
887 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
888}
889
890
891
892
893
894static void dec_pending(struct dm_io *io, blk_status_t error)
895{
896 unsigned long flags;
897 blk_status_t io_error;
898 struct bio *bio;
899 struct mapped_device *md = io->md;
900
901
902 if (unlikely(error)) {
903 spin_lock_irqsave(&io->endio_lock, flags);
904 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
905 io->status = error;
906 spin_unlock_irqrestore(&io->endio_lock, flags);
907 }
908
909 if (atomic_dec_and_test(&io->io_count)) {
910 if (io->status == BLK_STS_DM_REQUEUE) {
911
912
913
914 spin_lock_irqsave(&md->deferred_lock, flags);
915 if (__noflush_suspending(md))
916
917 bio_list_add_head(&md->deferred, io->orig_bio);
918 else
919
920 io->status = BLK_STS_IOERR;
921 spin_unlock_irqrestore(&md->deferred_lock, flags);
922 }
923
924 io_error = io->status;
925 bio = io->orig_bio;
926 end_io_acct(io);
927 free_io(md, io);
928
929 if (io_error == BLK_STS_DM_REQUEUE)
930 return;
931
932 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
933
934
935
936
937 bio->bi_opf &= ~REQ_PREFLUSH;
938 queue_io(md, bio);
939 } else {
940
941 if (io_error)
942 bio->bi_status = io_error;
943 bio_endio(bio);
944 }
945 }
946}
947
948void disable_discard(struct mapped_device *md)
949{
950 struct queue_limits *limits = dm_get_queue_limits(md);
951
952
953 limits->max_discard_sectors = 0;
954 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
955}
956
957void disable_write_same(struct mapped_device *md)
958{
959 struct queue_limits *limits = dm_get_queue_limits(md);
960
961
962 limits->max_write_same_sectors = 0;
963}
964
965void disable_write_zeroes(struct mapped_device *md)
966{
967 struct queue_limits *limits = dm_get_queue_limits(md);
968
969
970 limits->max_write_zeroes_sectors = 0;
971}
972
973static void clone_endio(struct bio *bio)
974{
975 blk_status_t error = bio->bi_status;
976 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
977 struct dm_io *io = tio->io;
978 struct mapped_device *md = tio->io->md;
979 dm_endio_fn endio = tio->ti->type->end_io;
980
981 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
982 if (bio_op(bio) == REQ_OP_DISCARD &&
983 !bio->bi_disk->queue->limits.max_discard_sectors)
984 disable_discard(md);
985 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
986 !bio->bi_disk->queue->limits.max_write_same_sectors)
987 disable_write_same(md);
988 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
989 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
990 disable_write_zeroes(md);
991 }
992
993 if (endio) {
994 int r = endio(tio->ti, bio, &error);
995 switch (r) {
996 case DM_ENDIO_REQUEUE:
997 error = BLK_STS_DM_REQUEUE;
998
999 case DM_ENDIO_DONE:
1000 break;
1001 case DM_ENDIO_INCOMPLETE:
1002
1003 return;
1004 default:
1005 DMWARN("unimplemented target endio return value: %d", r);
1006 BUG();
1007 }
1008 }
1009
1010 free_tio(tio);
1011 dec_pending(io, error);
1012}
1013
1014
1015
1016
1017
1018static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1019{
1020 sector_t target_offset = dm_target_offset(ti, sector);
1021
1022 return ti->len - target_offset;
1023}
1024
1025static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1026{
1027 sector_t len = max_io_len_target_boundary(sector, ti);
1028 sector_t offset, max_len;
1029
1030
1031
1032
1033 if (ti->max_io_len) {
1034 offset = dm_target_offset(ti, sector);
1035 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1036 max_len = sector_div(offset, ti->max_io_len);
1037 else
1038 max_len = offset & (ti->max_io_len - 1);
1039 max_len = ti->max_io_len - max_len;
1040
1041 if (len > max_len)
1042 len = max_len;
1043 }
1044
1045 return len;
1046}
1047
1048int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1049{
1050 if (len > UINT_MAX) {
1051 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1052 (unsigned long long)len, UINT_MAX);
1053 ti->error = "Maximum size of target IO is too large";
1054 return -EINVAL;
1055 }
1056
1057 ti->max_io_len = (uint32_t) len;
1058
1059 return 0;
1060}
1061EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1062
1063static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1064 sector_t sector, int *srcu_idx)
1065 __acquires(md->io_barrier)
1066{
1067 struct dm_table *map;
1068 struct dm_target *ti;
1069
1070 map = dm_get_live_table(md, srcu_idx);
1071 if (!map)
1072 return NULL;
1073
1074 ti = dm_table_find_target(map, sector);
1075 if (!dm_target_is_valid(ti))
1076 return NULL;
1077
1078 return ti;
1079}
1080
1081static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1082 long nr_pages, void **kaddr, pfn_t *pfn)
1083{
1084 struct mapped_device *md = dax_get_private(dax_dev);
1085 sector_t sector = pgoff * PAGE_SECTORS;
1086 struct dm_target *ti;
1087 long len, ret = -EIO;
1088 int srcu_idx;
1089
1090 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1091
1092 if (!ti)
1093 goto out;
1094 if (!ti->type->direct_access)
1095 goto out;
1096 len = max_io_len(sector, ti) / PAGE_SECTORS;
1097 if (len < 1)
1098 goto out;
1099 nr_pages = min(len, nr_pages);
1100 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1101
1102 out:
1103 dm_put_live_table(md, srcu_idx);
1104
1105 return ret;
1106}
1107
1108static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1109 int blocksize, sector_t start, sector_t len)
1110{
1111 struct mapped_device *md = dax_get_private(dax_dev);
1112 struct dm_table *map;
1113 int srcu_idx;
1114 bool ret;
1115
1116 map = dm_get_live_table(md, &srcu_idx);
1117 if (!map)
1118 return false;
1119
1120 ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1121
1122 dm_put_live_table(md, srcu_idx);
1123
1124 return ret;
1125}
1126
1127static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1128 void *addr, size_t bytes, struct iov_iter *i)
1129{
1130 struct mapped_device *md = dax_get_private(dax_dev);
1131 sector_t sector = pgoff * PAGE_SECTORS;
1132 struct dm_target *ti;
1133 long ret = 0;
1134 int srcu_idx;
1135
1136 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1137
1138 if (!ti)
1139 goto out;
1140 if (!ti->type->dax_copy_from_iter) {
1141 ret = copy_from_iter(addr, bytes, i);
1142 goto out;
1143 }
1144 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1145 out:
1146 dm_put_live_table(md, srcu_idx);
1147
1148 return ret;
1149}
1150
1151static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1152 void *addr, size_t bytes, struct iov_iter *i)
1153{
1154 struct mapped_device *md = dax_get_private(dax_dev);
1155 sector_t sector = pgoff * PAGE_SECTORS;
1156 struct dm_target *ti;
1157 long ret = 0;
1158 int srcu_idx;
1159
1160 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1161
1162 if (!ti)
1163 goto out;
1164 if (!ti->type->dax_copy_to_iter) {
1165 ret = copy_to_iter(addr, bytes, i);
1166 goto out;
1167 }
1168 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1169 out:
1170 dm_put_live_table(md, srcu_idx);
1171
1172 return ret;
1173}
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1204{
1205 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1206 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1207 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1208 BUG_ON(bi_size > *tio->len_ptr);
1209 BUG_ON(n_sectors > bi_size);
1210 *tio->len_ptr -= bi_size - n_sectors;
1211 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1212}
1213EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223void dm_remap_zone_report(struct dm_target *ti, sector_t start,
1224 struct blk_zone *zones, unsigned int *nr_zones)
1225{
1226#ifdef CONFIG_BLK_DEV_ZONED
1227 struct blk_zone *zone;
1228 unsigned int nrz = *nr_zones;
1229 int i;
1230
1231
1232
1233
1234
1235
1236
1237 for (i = 0; i < nrz; i++) {
1238 zone = zones + i;
1239 if (zone->start >= start + ti->len) {
1240 memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
1241 break;
1242 }
1243
1244 zone->start = zone->start + ti->begin - start;
1245 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
1246 continue;
1247
1248 if (zone->cond == BLK_ZONE_COND_FULL)
1249 zone->wp = zone->start + zone->len;
1250 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1251 zone->wp = zone->start;
1252 else
1253 zone->wp = zone->wp + ti->begin - start;
1254 }
1255
1256 *nr_zones = i;
1257#else
1258 *nr_zones = 0;
1259#endif
1260}
1261EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1262
1263static blk_qc_t __map_bio(struct dm_target_io *tio)
1264{
1265 int r;
1266 sector_t sector;
1267 struct bio *clone = &tio->clone;
1268 struct dm_io *io = tio->io;
1269 struct mapped_device *md = io->md;
1270 struct dm_target *ti = tio->ti;
1271 blk_qc_t ret = BLK_QC_T_NONE;
1272
1273 clone->bi_end_io = clone_endio;
1274
1275
1276
1277
1278
1279
1280 atomic_inc(&io->io_count);
1281 sector = clone->bi_iter.bi_sector;
1282
1283 r = ti->type->map(ti, clone);
1284 switch (r) {
1285 case DM_MAPIO_SUBMITTED:
1286 break;
1287 case DM_MAPIO_REMAPPED:
1288
1289 trace_block_bio_remap(clone->bi_disk->queue, clone,
1290 bio_dev(io->orig_bio), sector);
1291 if (md->type == DM_TYPE_NVME_BIO_BASED)
1292 ret = direct_make_request(clone);
1293 else
1294 ret = generic_make_request(clone);
1295 break;
1296 case DM_MAPIO_KILL:
1297 free_tio(tio);
1298 dec_pending(io, BLK_STS_IOERR);
1299 break;
1300 case DM_MAPIO_REQUEUE:
1301 free_tio(tio);
1302 dec_pending(io, BLK_STS_DM_REQUEUE);
1303 break;
1304 default:
1305 DMWARN("unimplemented target map return value: %d", r);
1306 BUG();
1307 }
1308
1309 return ret;
1310}
1311
1312static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1313{
1314 bio->bi_iter.bi_sector = sector;
1315 bio->bi_iter.bi_size = to_bytes(len);
1316}
1317
1318
1319
1320
1321static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1322 sector_t sector, unsigned len)
1323{
1324 struct bio *clone = &tio->clone;
1325
1326 __bio_clone_fast(clone, bio);
1327
1328 if (bio_integrity(bio)) {
1329 int r;
1330
1331 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1332 !dm_target_passes_integrity(tio->ti->type))) {
1333 DMWARN("%s: the target %s doesn't support integrity data.",
1334 dm_device_name(tio->io->md),
1335 tio->ti->type->name);
1336 return -EIO;
1337 }
1338
1339 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1340 if (r < 0)
1341 return r;
1342 }
1343
1344 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1345 clone->bi_iter.bi_size = to_bytes(len);
1346
1347 if (bio_integrity(bio))
1348 bio_integrity_trim(clone);
1349
1350 return 0;
1351}
1352
1353static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1354 struct dm_target *ti, unsigned num_bios)
1355{
1356 struct dm_target_io *tio;
1357 int try;
1358
1359 if (!num_bios)
1360 return;
1361
1362 if (num_bios == 1) {
1363 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1364 bio_list_add(blist, &tio->clone);
1365 return;
1366 }
1367
1368 for (try = 0; try < 2; try++) {
1369 int bio_nr;
1370 struct bio *bio;
1371
1372 if (try)
1373 mutex_lock(&ci->io->md->table_devices_lock);
1374 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1375 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1376 if (!tio)
1377 break;
1378
1379 bio_list_add(blist, &tio->clone);
1380 }
1381 if (try)
1382 mutex_unlock(&ci->io->md->table_devices_lock);
1383 if (bio_nr == num_bios)
1384 return;
1385
1386 while ((bio = bio_list_pop(blist))) {
1387 tio = container_of(bio, struct dm_target_io, clone);
1388 free_tio(tio);
1389 }
1390 }
1391}
1392
1393static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1394 struct dm_target_io *tio, unsigned *len)
1395{
1396 struct bio *clone = &tio->clone;
1397
1398 tio->len_ptr = len;
1399
1400 __bio_clone_fast(clone, ci->bio);
1401 if (len)
1402 bio_setup_sector(clone, ci->sector, *len);
1403
1404 return __map_bio(tio);
1405}
1406
1407static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1408 unsigned num_bios, unsigned *len)
1409{
1410 struct bio_list blist = BIO_EMPTY_LIST;
1411 struct bio *bio;
1412 struct dm_target_io *tio;
1413
1414 alloc_multiple_bios(&blist, ci, ti, num_bios);
1415
1416 while ((bio = bio_list_pop(&blist))) {
1417 tio = container_of(bio, struct dm_target_io, clone);
1418 (void) __clone_and_map_simple_bio(ci, tio, len);
1419 }
1420}
1421
1422static int __send_empty_flush(struct clone_info *ci)
1423{
1424 unsigned target_nr = 0;
1425 struct dm_target *ti;
1426
1427
1428
1429
1430
1431
1432
1433
1434 bio_set_dev(ci->bio, ci->io->md->bdev);
1435
1436 BUG_ON(bio_has_data(ci->bio));
1437 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1438 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1439
1440 bio_disassociate_blkg(ci->bio);
1441
1442 return 0;
1443}
1444
1445static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1446 sector_t sector, unsigned *len)
1447{
1448 struct bio *bio = ci->bio;
1449 struct dm_target_io *tio;
1450 int r;
1451
1452 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1453 tio->len_ptr = len;
1454 r = clone_bio(tio, bio, sector, *len);
1455 if (r < 0) {
1456 free_tio(tio);
1457 return r;
1458 }
1459 (void) __map_bio(tio);
1460
1461 return 0;
1462}
1463
1464typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1465
1466static unsigned get_num_discard_bios(struct dm_target *ti)
1467{
1468 return ti->num_discard_bios;
1469}
1470
1471static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1472{
1473 return ti->num_secure_erase_bios;
1474}
1475
1476static unsigned get_num_write_same_bios(struct dm_target *ti)
1477{
1478 return ti->num_write_same_bios;
1479}
1480
1481static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1482{
1483 return ti->num_write_zeroes_bios;
1484}
1485
1486static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1487 unsigned num_bios)
1488{
1489 unsigned len;
1490
1491
1492
1493
1494
1495
1496
1497 if (!num_bios)
1498 return -EOPNOTSUPP;
1499
1500 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1501
1502 __send_duplicate_bios(ci, ti, num_bios, &len);
1503
1504 ci->sector += len;
1505 ci->sector_count -= len;
1506
1507 return 0;
1508}
1509
1510static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1511{
1512 return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
1513}
1514
1515static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1516{
1517 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
1518}
1519
1520static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1521{
1522 return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
1523}
1524
1525static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1526{
1527 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
1528}
1529
1530static bool is_abnormal_io(struct bio *bio)
1531{
1532 bool r = false;
1533
1534 switch (bio_op(bio)) {
1535 case REQ_OP_DISCARD:
1536 case REQ_OP_SECURE_ERASE:
1537 case REQ_OP_WRITE_SAME:
1538 case REQ_OP_WRITE_ZEROES:
1539 r = true;
1540 break;
1541 }
1542
1543 return r;
1544}
1545
1546static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1547 int *result)
1548{
1549 struct bio *bio = ci->bio;
1550
1551 if (bio_op(bio) == REQ_OP_DISCARD)
1552 *result = __send_discard(ci, ti);
1553 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1554 *result = __send_secure_erase(ci, ti);
1555 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1556 *result = __send_write_same(ci, ti);
1557 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1558 *result = __send_write_zeroes(ci, ti);
1559 else
1560 return false;
1561
1562 return true;
1563}
1564
1565
1566
1567
1568static int __split_and_process_non_flush(struct clone_info *ci)
1569{
1570 struct dm_target *ti;
1571 unsigned len;
1572 int r;
1573
1574 ti = dm_table_find_target(ci->map, ci->sector);
1575 if (!dm_target_is_valid(ti))
1576 return -EIO;
1577
1578 if (__process_abnormal_io(ci, ti, &r))
1579 return r;
1580
1581 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1582
1583 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1584 if (r < 0)
1585 return r;
1586
1587 ci->sector += len;
1588 ci->sector_count -= len;
1589
1590 return 0;
1591}
1592
1593static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1594 struct dm_table *map, struct bio *bio)
1595{
1596 ci->map = map;
1597 ci->io = alloc_io(md, bio);
1598 ci->sector = bio->bi_iter.bi_sector;
1599}
1600
1601#define __dm_part_stat_sub(part, field, subnd) \
1602 (part_stat_get(part, field) -= (subnd))
1603
1604
1605
1606
1607static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1608 struct dm_table *map, struct bio *bio)
1609{
1610 struct clone_info ci;
1611 blk_qc_t ret = BLK_QC_T_NONE;
1612 int error = 0;
1613
1614 init_clone_info(&ci, md, map, bio);
1615
1616 if (bio->bi_opf & REQ_PREFLUSH) {
1617 struct bio flush_bio;
1618
1619
1620
1621
1622
1623
1624 bio_init(&flush_bio, NULL, 0);
1625 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1626 ci.bio = &flush_bio;
1627 ci.sector_count = 0;
1628 error = __send_empty_flush(&ci);
1629
1630 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1631 ci.bio = bio;
1632 ci.sector_count = 0;
1633 error = __split_and_process_non_flush(&ci);
1634 } else {
1635 ci.bio = bio;
1636 ci.sector_count = bio_sectors(bio);
1637 while (ci.sector_count && !error) {
1638 error = __split_and_process_non_flush(&ci);
1639 if (current->bio_list && ci.sector_count && !error) {
1640
1641
1642
1643
1644
1645
1646
1647
1648 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1649 GFP_NOIO, &md->queue->bio_split);
1650 ci.io->orig_bio = b;
1651
1652
1653
1654
1655
1656
1657
1658
1659 part_stat_lock();
1660 __dm_part_stat_sub(&dm_disk(md)->part0,
1661 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1662 part_stat_unlock();
1663
1664 bio_chain(b, bio);
1665 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1666 ret = generic_make_request(bio);
1667 break;
1668 }
1669 }
1670 }
1671
1672
1673 dec_pending(ci.io, errno_to_blk_status(error));
1674 return ret;
1675}
1676
1677
1678
1679
1680
1681static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
1682 struct bio *bio, struct dm_target *ti)
1683{
1684 struct clone_info ci;
1685 blk_qc_t ret = BLK_QC_T_NONE;
1686 int error = 0;
1687
1688 init_clone_info(&ci, md, map, bio);
1689
1690 if (bio->bi_opf & REQ_PREFLUSH) {
1691 struct bio flush_bio;
1692
1693
1694
1695
1696
1697
1698 bio_init(&flush_bio, NULL, 0);
1699 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1700 ci.bio = &flush_bio;
1701 ci.sector_count = 0;
1702 error = __send_empty_flush(&ci);
1703
1704 } else {
1705 struct dm_target_io *tio;
1706
1707 ci.bio = bio;
1708 ci.sector_count = bio_sectors(bio);
1709 if (__process_abnormal_io(&ci, ti, &error))
1710 goto out;
1711
1712 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1713 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1714 }
1715out:
1716
1717 dec_pending(ci.io, errno_to_blk_status(error));
1718 return ret;
1719}
1720
1721static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
1722{
1723 unsigned len, sector_count;
1724
1725 sector_count = bio_sectors(*bio);
1726 len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
1727
1728 if (sector_count > len) {
1729 struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
1730
1731 bio_chain(split, *bio);
1732 trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
1733 generic_make_request(*bio);
1734 *bio = split;
1735 }
1736}
1737
1738static blk_qc_t dm_process_bio(struct mapped_device *md,
1739 struct dm_table *map, struct bio *bio)
1740{
1741 blk_qc_t ret = BLK_QC_T_NONE;
1742 struct dm_target *ti = md->immutable_target;
1743
1744 if (unlikely(!map)) {
1745 bio_io_error(bio);
1746 return ret;
1747 }
1748
1749 if (!ti) {
1750 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1751 if (unlikely(!ti || !dm_target_is_valid(ti))) {
1752 bio_io_error(bio);
1753 return ret;
1754 }
1755 }
1756
1757
1758
1759
1760
1761
1762 if (current->bio_list) {
1763 blk_queue_split(md->queue, &bio);
1764 if (!is_abnormal_io(bio))
1765 dm_queue_split(md, ti, &bio);
1766 }
1767
1768 if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1769 return __process_bio(md, map, bio, ti);
1770 else
1771 return __split_and_process_bio(md, map, bio);
1772}
1773
1774static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1775{
1776 struct mapped_device *md = q->queuedata;
1777 blk_qc_t ret = BLK_QC_T_NONE;
1778 int srcu_idx;
1779 struct dm_table *map;
1780
1781 map = dm_get_live_table(md, &srcu_idx);
1782
1783
1784 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1785 dm_put_live_table(md, srcu_idx);
1786
1787 if (!(bio->bi_opf & REQ_RAHEAD))
1788 queue_io(md, bio);
1789 else
1790 bio_io_error(bio);
1791 return ret;
1792 }
1793
1794 ret = dm_process_bio(md, map, bio);
1795
1796 dm_put_live_table(md, srcu_idx);
1797 return ret;
1798}
1799
1800static int dm_any_congested(void *congested_data, int bdi_bits)
1801{
1802 int r = bdi_bits;
1803 struct mapped_device *md = congested_data;
1804 struct dm_table *map;
1805
1806 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1807 if (dm_request_based(md)) {
1808
1809
1810
1811
1812 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1813 } else {
1814 map = dm_get_live_table_fast(md);
1815 if (map)
1816 r = dm_table_any_congested(map, bdi_bits);
1817 dm_put_live_table_fast(md);
1818 }
1819 }
1820
1821 return r;
1822}
1823
1824
1825
1826
1827static void free_minor(int minor)
1828{
1829 spin_lock(&_minor_lock);
1830 idr_remove(&_minor_idr, minor);
1831 spin_unlock(&_minor_lock);
1832}
1833
1834
1835
1836
1837static int specific_minor(int minor)
1838{
1839 int r;
1840
1841 if (minor >= (1 << MINORBITS))
1842 return -EINVAL;
1843
1844 idr_preload(GFP_KERNEL);
1845 spin_lock(&_minor_lock);
1846
1847 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1848
1849 spin_unlock(&_minor_lock);
1850 idr_preload_end();
1851 if (r < 0)
1852 return r == -ENOSPC ? -EBUSY : r;
1853 return 0;
1854}
1855
1856static int next_free_minor(int *minor)
1857{
1858 int r;
1859
1860 idr_preload(GFP_KERNEL);
1861 spin_lock(&_minor_lock);
1862
1863 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1864
1865 spin_unlock(&_minor_lock);
1866 idr_preload_end();
1867 if (r < 0)
1868 return r;
1869 *minor = r;
1870 return 0;
1871}
1872
1873static const struct block_device_operations dm_blk_dops;
1874static const struct dax_operations dm_dax_ops;
1875
1876static void dm_wq_work(struct work_struct *work);
1877
1878static void dm_init_normal_md_queue(struct mapped_device *md)
1879{
1880
1881
1882
1883 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1884}
1885
1886static void cleanup_mapped_device(struct mapped_device *md)
1887{
1888 if (md->wq)
1889 destroy_workqueue(md->wq);
1890 bioset_exit(&md->bs);
1891 bioset_exit(&md->io_bs);
1892
1893 if (md->dax_dev) {
1894 kill_dax(md->dax_dev);
1895 put_dax(md->dax_dev);
1896 md->dax_dev = NULL;
1897 }
1898
1899 if (md->disk) {
1900 spin_lock(&_minor_lock);
1901 md->disk->private_data = NULL;
1902 spin_unlock(&_minor_lock);
1903 del_gendisk(md->disk);
1904 put_disk(md->disk);
1905 }
1906
1907 if (md->queue)
1908 blk_cleanup_queue(md->queue);
1909
1910 cleanup_srcu_struct(&md->io_barrier);
1911
1912 if (md->bdev) {
1913 bdput(md->bdev);
1914 md->bdev = NULL;
1915 }
1916
1917 mutex_destroy(&md->suspend_lock);
1918 mutex_destroy(&md->type_lock);
1919 mutex_destroy(&md->table_devices_lock);
1920
1921 dm_mq_cleanup_mapped_device(md);
1922}
1923
1924
1925
1926
1927static struct mapped_device *alloc_dev(int minor)
1928{
1929 int r, numa_node_id = dm_get_numa_node();
1930 struct mapped_device *md;
1931 void *old_md;
1932
1933 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1934 if (!md) {
1935 DMWARN("unable to allocate device, out of memory.");
1936 return NULL;
1937 }
1938
1939 if (!try_module_get(THIS_MODULE))
1940 goto bad_module_get;
1941
1942
1943 if (minor == DM_ANY_MINOR)
1944 r = next_free_minor(&minor);
1945 else
1946 r = specific_minor(minor);
1947 if (r < 0)
1948 goto bad_minor;
1949
1950 r = init_srcu_struct(&md->io_barrier);
1951 if (r < 0)
1952 goto bad_io_barrier;
1953
1954 md->numa_node_id = numa_node_id;
1955 md->init_tio_pdu = false;
1956 md->type = DM_TYPE_NONE;
1957 mutex_init(&md->suspend_lock);
1958 mutex_init(&md->type_lock);
1959 mutex_init(&md->table_devices_lock);
1960 spin_lock_init(&md->deferred_lock);
1961 atomic_set(&md->holders, 1);
1962 atomic_set(&md->open_count, 0);
1963 atomic_set(&md->event_nr, 0);
1964 atomic_set(&md->uevent_seq, 0);
1965 INIT_LIST_HEAD(&md->uevent_list);
1966 INIT_LIST_HEAD(&md->table_devices);
1967 spin_lock_init(&md->uevent_lock);
1968
1969 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1970 if (!md->queue)
1971 goto bad;
1972 md->queue->queuedata = md;
1973 md->queue->backing_dev_info->congested_data = md;
1974
1975 md->disk = alloc_disk_node(1, md->numa_node_id);
1976 if (!md->disk)
1977 goto bad;
1978
1979 init_waitqueue_head(&md->wait);
1980 INIT_WORK(&md->work, dm_wq_work);
1981 init_waitqueue_head(&md->eventq);
1982 init_completion(&md->kobj_holder.completion);
1983
1984 md->disk->major = _major;
1985 md->disk->first_minor = minor;
1986 md->disk->fops = &dm_blk_dops;
1987 md->disk->queue = md->queue;
1988 md->disk->private_data = md;
1989 sprintf(md->disk->disk_name, "dm-%d", minor);
1990
1991 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1992 md->dax_dev = alloc_dax(md, md->disk->disk_name,
1993 &dm_dax_ops, 0);
1994 if (!md->dax_dev)
1995 goto bad;
1996 }
1997
1998 add_disk_no_queue_reg(md->disk);
1999 format_dev_t(md->name, MKDEV(_major, minor));
2000
2001 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2002 if (!md->wq)
2003 goto bad;
2004
2005 md->bdev = bdget_disk(md->disk, 0);
2006 if (!md->bdev)
2007 goto bad;
2008
2009 dm_stats_init(&md->stats);
2010
2011
2012 spin_lock(&_minor_lock);
2013 old_md = idr_replace(&_minor_idr, md, minor);
2014 spin_unlock(&_minor_lock);
2015
2016 BUG_ON(old_md != MINOR_ALLOCED);
2017
2018 return md;
2019
2020bad:
2021 cleanup_mapped_device(md);
2022bad_io_barrier:
2023 free_minor(minor);
2024bad_minor:
2025 module_put(THIS_MODULE);
2026bad_module_get:
2027 kvfree(md);
2028 return NULL;
2029}
2030
2031static void unlock_fs(struct mapped_device *md);
2032
2033static void free_dev(struct mapped_device *md)
2034{
2035 int minor = MINOR(disk_devt(md->disk));
2036
2037 unlock_fs(md);
2038
2039 cleanup_mapped_device(md);
2040
2041 free_table_devices(&md->table_devices);
2042 dm_stats_cleanup(&md->stats);
2043 free_minor(minor);
2044
2045 module_put(THIS_MODULE);
2046 kvfree(md);
2047}
2048
2049static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2050{
2051 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2052 int ret = 0;
2053
2054 if (dm_table_bio_based(t)) {
2055
2056
2057
2058
2059
2060 bioset_exit(&md->bs);
2061 bioset_exit(&md->io_bs);
2062
2063 } else if (bioset_initialized(&md->bs)) {
2064
2065
2066
2067
2068
2069
2070
2071
2072 goto out;
2073 }
2074
2075 BUG_ON(!p ||
2076 bioset_initialized(&md->bs) ||
2077 bioset_initialized(&md->io_bs));
2078
2079 ret = bioset_init_from_src(&md->bs, &p->bs);
2080 if (ret)
2081 goto out;
2082 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2083 if (ret)
2084 bioset_exit(&md->bs);
2085out:
2086
2087 dm_table_free_md_mempools(t);
2088 return ret;
2089}
2090
2091
2092
2093
2094static void event_callback(void *context)
2095{
2096 unsigned long flags;
2097 LIST_HEAD(uevents);
2098 struct mapped_device *md = (struct mapped_device *) context;
2099
2100 spin_lock_irqsave(&md->uevent_lock, flags);
2101 list_splice_init(&md->uevent_list, &uevents);
2102 spin_unlock_irqrestore(&md->uevent_lock, flags);
2103
2104 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2105
2106 atomic_inc(&md->event_nr);
2107 wake_up(&md->eventq);
2108 dm_issue_global_event();
2109}
2110
2111
2112
2113
2114static void __set_size(struct mapped_device *md, sector_t size)
2115{
2116 lockdep_assert_held(&md->suspend_lock);
2117
2118 set_capacity(md->disk, size);
2119
2120 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2121}
2122
2123
2124
2125
2126static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2127 struct queue_limits *limits)
2128{
2129 struct dm_table *old_map;
2130 struct request_queue *q = md->queue;
2131 bool request_based = dm_table_request_based(t);
2132 sector_t size;
2133 int ret;
2134
2135 lockdep_assert_held(&md->suspend_lock);
2136
2137 size = dm_table_get_size(t);
2138
2139
2140
2141
2142 if (size != dm_get_size(md))
2143 memset(&md->geometry, 0, sizeof(md->geometry));
2144
2145 __set_size(md, size);
2146
2147 dm_table_event_callback(t, event_callback, md);
2148
2149
2150
2151
2152
2153
2154
2155
2156 if (request_based)
2157 dm_stop_queue(q);
2158
2159 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2160
2161
2162
2163
2164
2165
2166 md->immutable_target = dm_table_get_immutable_target(t);
2167 }
2168
2169 ret = __bind_mempools(md, t);
2170 if (ret) {
2171 old_map = ERR_PTR(ret);
2172 goto out;
2173 }
2174
2175 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2176 rcu_assign_pointer(md->map, (void *)t);
2177 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2178
2179 dm_table_set_restrictions(t, q, limits);
2180 if (old_map)
2181 dm_sync_table(md);
2182
2183out:
2184 return old_map;
2185}
2186
2187
2188
2189
2190static struct dm_table *__unbind(struct mapped_device *md)
2191{
2192 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2193
2194 if (!map)
2195 return NULL;
2196
2197 dm_table_event_callback(map, NULL, NULL);
2198 RCU_INIT_POINTER(md->map, NULL);
2199 dm_sync_table(md);
2200
2201 return map;
2202}
2203
2204
2205
2206
2207int dm_create(int minor, struct mapped_device **result)
2208{
2209 int r;
2210 struct mapped_device *md;
2211
2212 md = alloc_dev(minor);
2213 if (!md)
2214 return -ENXIO;
2215
2216 r = dm_sysfs_init(md);
2217 if (r) {
2218 free_dev(md);
2219 return r;
2220 }
2221
2222 *result = md;
2223 return 0;
2224}
2225
2226
2227
2228
2229
2230void dm_lock_md_type(struct mapped_device *md)
2231{
2232 mutex_lock(&md->type_lock);
2233}
2234
2235void dm_unlock_md_type(struct mapped_device *md)
2236{
2237 mutex_unlock(&md->type_lock);
2238}
2239
2240void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2241{
2242 BUG_ON(!mutex_is_locked(&md->type_lock));
2243 md->type = type;
2244}
2245
2246enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2247{
2248 return md->type;
2249}
2250
2251struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2252{
2253 return md->immutable_target_type;
2254}
2255
2256
2257
2258
2259
2260struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2261{
2262 BUG_ON(!atomic_read(&md->holders));
2263 return &md->queue->limits;
2264}
2265EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2266
2267
2268
2269
2270int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2271{
2272 int r;
2273 struct queue_limits limits;
2274 enum dm_queue_mode type = dm_get_md_type(md);
2275
2276 switch (type) {
2277 case DM_TYPE_REQUEST_BASED:
2278 r = dm_mq_init_request_queue(md, t);
2279 if (r) {
2280 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2281 return r;
2282 }
2283 break;
2284 case DM_TYPE_BIO_BASED:
2285 case DM_TYPE_DAX_BIO_BASED:
2286 case DM_TYPE_NVME_BIO_BASED:
2287 dm_init_normal_md_queue(md);
2288 blk_queue_make_request(md->queue, dm_make_request);
2289 break;
2290 case DM_TYPE_NONE:
2291 WARN_ON_ONCE(true);
2292 break;
2293 }
2294
2295 r = dm_calculate_queue_limits(t, &limits);
2296 if (r) {
2297 DMERR("Cannot calculate initial queue limits");
2298 return r;
2299 }
2300 dm_table_set_restrictions(t, md->queue, &limits);
2301 blk_register_queue(md->disk);
2302
2303 return 0;
2304}
2305
2306struct mapped_device *dm_get_md(dev_t dev)
2307{
2308 struct mapped_device *md;
2309 unsigned minor = MINOR(dev);
2310
2311 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2312 return NULL;
2313
2314 spin_lock(&_minor_lock);
2315
2316 md = idr_find(&_minor_idr, minor);
2317 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2318 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2319 md = NULL;
2320 goto out;
2321 }
2322 dm_get(md);
2323out:
2324 spin_unlock(&_minor_lock);
2325
2326 return md;
2327}
2328EXPORT_SYMBOL_GPL(dm_get_md);
2329
2330void *dm_get_mdptr(struct mapped_device *md)
2331{
2332 return md->interface_ptr;
2333}
2334
2335void dm_set_mdptr(struct mapped_device *md, void *ptr)
2336{
2337 md->interface_ptr = ptr;
2338}
2339
2340void dm_get(struct mapped_device *md)
2341{
2342 atomic_inc(&md->holders);
2343 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2344}
2345
2346int dm_hold(struct mapped_device *md)
2347{
2348 spin_lock(&_minor_lock);
2349 if (test_bit(DMF_FREEING, &md->flags)) {
2350 spin_unlock(&_minor_lock);
2351 return -EBUSY;
2352 }
2353 dm_get(md);
2354 spin_unlock(&_minor_lock);
2355 return 0;
2356}
2357EXPORT_SYMBOL_GPL(dm_hold);
2358
2359const char *dm_device_name(struct mapped_device *md)
2360{
2361 return md->name;
2362}
2363EXPORT_SYMBOL_GPL(dm_device_name);
2364
2365static void __dm_destroy(struct mapped_device *md, bool wait)
2366{
2367 struct dm_table *map;
2368 int srcu_idx;
2369
2370 might_sleep();
2371
2372 spin_lock(&_minor_lock);
2373 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2374 set_bit(DMF_FREEING, &md->flags);
2375 spin_unlock(&_minor_lock);
2376
2377 blk_set_queue_dying(md->queue);
2378
2379
2380
2381
2382
2383 mutex_lock(&md->suspend_lock);
2384 map = dm_get_live_table(md, &srcu_idx);
2385 if (!dm_suspended_md(md)) {
2386 dm_table_presuspend_targets(map);
2387 dm_table_postsuspend_targets(map);
2388 }
2389
2390 dm_put_live_table(md, srcu_idx);
2391 mutex_unlock(&md->suspend_lock);
2392
2393
2394
2395
2396
2397
2398
2399 if (wait)
2400 while (atomic_read(&md->holders))
2401 msleep(1);
2402 else if (atomic_read(&md->holders))
2403 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2404 dm_device_name(md), atomic_read(&md->holders));
2405
2406 dm_sysfs_exit(md);
2407 dm_table_destroy(__unbind(md));
2408 free_dev(md);
2409}
2410
2411void dm_destroy(struct mapped_device *md)
2412{
2413 __dm_destroy(md, true);
2414}
2415
2416void dm_destroy_immediate(struct mapped_device *md)
2417{
2418 __dm_destroy(md, false);
2419}
2420
2421void dm_put(struct mapped_device *md)
2422{
2423 atomic_dec(&md->holders);
2424}
2425EXPORT_SYMBOL_GPL(dm_put);
2426
2427static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2428{
2429 int r = 0;
2430 DEFINE_WAIT(wait);
2431
2432 while (1) {
2433 prepare_to_wait(&md->wait, &wait, task_state);
2434
2435 if (!md_in_flight(md))
2436 break;
2437
2438 if (signal_pending_state(task_state, current)) {
2439 r = -EINTR;
2440 break;
2441 }
2442
2443 io_schedule();
2444 }
2445 finish_wait(&md->wait, &wait);
2446
2447 return r;
2448}
2449
2450
2451
2452
2453static void dm_wq_work(struct work_struct *work)
2454{
2455 struct mapped_device *md = container_of(work, struct mapped_device,
2456 work);
2457 struct bio *c;
2458 int srcu_idx;
2459 struct dm_table *map;
2460
2461 map = dm_get_live_table(md, &srcu_idx);
2462
2463 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2464 spin_lock_irq(&md->deferred_lock);
2465 c = bio_list_pop(&md->deferred);
2466 spin_unlock_irq(&md->deferred_lock);
2467
2468 if (!c)
2469 break;
2470
2471 if (dm_request_based(md))
2472 (void) generic_make_request(c);
2473 else
2474 (void) dm_process_bio(md, map, c);
2475 }
2476
2477 dm_put_live_table(md, srcu_idx);
2478}
2479
2480static void dm_queue_flush(struct mapped_device *md)
2481{
2482 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2483 smp_mb__after_atomic();
2484 queue_work(md->wq, &md->work);
2485}
2486
2487
2488
2489
2490struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2491{
2492 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2493 struct queue_limits limits;
2494 int r;
2495
2496 mutex_lock(&md->suspend_lock);
2497
2498
2499 if (!dm_suspended_md(md))
2500 goto out;
2501
2502
2503
2504
2505
2506
2507
2508 if (dm_table_has_no_data_devices(table)) {
2509 live_map = dm_get_live_table_fast(md);
2510 if (live_map)
2511 limits = md->queue->limits;
2512 dm_put_live_table_fast(md);
2513 }
2514
2515 if (!live_map) {
2516 r = dm_calculate_queue_limits(table, &limits);
2517 if (r) {
2518 map = ERR_PTR(r);
2519 goto out;
2520 }
2521 }
2522
2523 map = __bind(md, table, &limits);
2524 dm_issue_global_event();
2525
2526out:
2527 mutex_unlock(&md->suspend_lock);
2528 return map;
2529}
2530
2531
2532
2533
2534
2535static int lock_fs(struct mapped_device *md)
2536{
2537 int r;
2538
2539 WARN_ON(md->frozen_sb);
2540
2541 md->frozen_sb = freeze_bdev(md->bdev);
2542 if (IS_ERR(md->frozen_sb)) {
2543 r = PTR_ERR(md->frozen_sb);
2544 md->frozen_sb = NULL;
2545 return r;
2546 }
2547
2548 set_bit(DMF_FROZEN, &md->flags);
2549
2550 return 0;
2551}
2552
2553static void unlock_fs(struct mapped_device *md)
2554{
2555 if (!test_bit(DMF_FROZEN, &md->flags))
2556 return;
2557
2558 thaw_bdev(md->bdev, md->frozen_sb);
2559 md->frozen_sb = NULL;
2560 clear_bit(DMF_FROZEN, &md->flags);
2561}
2562
/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 *
 * If __dm_suspend returns 0, the device is completely quiescent now:
 * there is no request-processing activity and all new requests are
 * being added to md->deferred.
 *
 * Caller must hold md->suspend_lock.
 */
2572static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2573 unsigned suspend_flags, long task_state,
2574 int dmf_suspended_flag)
2575{
2576 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2577 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2578 int r;
2579
2580 lockdep_assert_held(&md->suspend_lock);
2581
	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
2586 if (noflush)
2587 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2588 else
2589 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2590
	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
2595 dm_table_presuspend_targets(map);
2596
	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
2603 if (!noflush && do_lockfs) {
2604 r = lock_fs(md);
2605 if (r) {
2606 dm_table_presuspend_undo_targets(map);
2607 return r;
2608 }
2609 }
2610
	/*
	 * Stop new bios from reaching the targets: once
	 * DMF_BLOCK_IO_FOR_SUSPEND is set, the submission path parks
	 * incoming bios on md->deferred, and synchronize_srcu() waits for
	 * any submitter that already holds the live table to finish.
	 * dm_wq_work() checks the same flag, so flushing md->wq below
	 * quiesces the deferred-bio worker as well.
	 */
2623 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2624 if (map)
2625 synchronize_srcu(&md->io_barrier);
2626
	/*
	 * Stop md->queue before flushing md->wq so that no new requests are
	 * dispatched to the targets while we wait for in-flight I/O below.
	 */
2631 if (dm_request_based(md))
2632 dm_stop_queue(md->queue);
2633
2634 flush_workqueue(md->wq);
2635
	/*
	 * At this point no new I/O is entering the targets.  Wait for all
	 * requests that are already in flight to finish (or be requeued).
	 */
2641 r = dm_wait_for_completion(md, task_state);
2642 if (!r)
2643 set_bit(dmf_suspended_flag, &md->flags);
2644
2645 if (noflush)
2646 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2647 if (map)
2648 synchronize_srcu(&md->io_barrier);
2649
	/* were we interrupted ? */
2651 if (r < 0) {
2652 dm_queue_flush(md);
2653
2654 if (dm_request_based(md))
2655 dm_start_queue(md->queue);
2656
2657 unlock_fs(md);
2658 dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
2660 }
2661
2662 return r;
2663}
2664
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
2681int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2682{
2683 struct dm_table *map = NULL;
2684 int r = 0;
2685
2686retry:
2687 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2688
2689 if (dm_suspended_md(md)) {
2690 r = -EINVAL;
2691 goto out_unlock;
2692 }
2693
2694 if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
2696 mutex_unlock(&md->suspend_lock);
2697 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2698 if (r)
2699 return r;
2700 goto retry;
2701 }
2702
2703 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2704
2705 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2706 if (r)
2707 goto out_unlock;
2708
2709 dm_table_postsuspend_targets(map);
2710
2711out_unlock:
2712 mutex_unlock(&md->suspend_lock);
2713 return r;
2714}
2715
2716static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2717{
2718 if (map) {
2719 int r = dm_table_resume_targets(map);
2720 if (r)
2721 return r;
2722 }
2723
2724 dm_queue_flush(md);
2725
	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm queues its deferred I/Os in the request_queue.
	 */
2731 if (dm_request_based(md))
2732 dm_start_queue(md->queue);
2733
2734 unlock_fs(md);
2735
2736 return 0;
2737}
2738
2739int dm_resume(struct mapped_device *md)
2740{
2741 int r;
2742 struct dm_table *map = NULL;
2743
2744retry:
2745 r = -EINVAL;
2746 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2747
2748 if (!dm_suspended_md(md))
2749 goto out;
2750
2751 if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
2753 mutex_unlock(&md->suspend_lock);
2754 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2755 if (r)
2756 return r;
2757 goto retry;
2758 }
2759
2760 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2761 if (!map || !dm_table_get_size(map))
2762 goto out;
2763
2764 r = __dm_resume(md, map);
2765 if (r)
2766 goto out;
2767
2768 clear_bit(DMF_SUSPENDED, &md->flags);
2769out:
2770 mutex_unlock(&md->suspend_lock);
2771
2772 return r;
2773}
2774
/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */
2781static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2782{
2783 struct dm_table *map = NULL;
2784
2785 lockdep_assert_held(&md->suspend_lock);
2786
2787 if (md->internal_suspend_count++)
2788 return;
2789
2790 if (dm_suspended_md(md)) {
2791 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2792 return;
2793 }
2794
2795 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2796
	/*
	 * Use TASK_UNINTERRUPTIBLE so the wait inside __dm_suspend cannot
	 * be broken by a signal (internal suspend must not fail); the
	 * return value is deliberately ignored.
	 */
2803 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2804 DMF_SUSPENDED_INTERNALLY);
2805
2806 dm_table_postsuspend_targets(map);
2807}
2808
2809static void __dm_internal_resume(struct mapped_device *md)
2810{
2811 BUG_ON(!md->internal_suspend_count);
2812
2813 if (--md->internal_suspend_count)
2814 return;
2815
2816 if (dm_suspended_md(md))
2817 goto done;
2818
	/*
	 * Pass a NULL map so __dm_resume skips dm_table_resume_targets,
	 * which may fail; existing callers don't expect internal resume to fail.
	 */
2823 (void) __dm_resume(md, NULL);
2824
2825done:
2826 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2827 smp_mb__after_atomic();
2828 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2829}
2830
2831void dm_internal_suspend_noflush(struct mapped_device *md)
2832{
2833 mutex_lock(&md->suspend_lock);
2834 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2835 mutex_unlock(&md->suspend_lock);
2836}
2837EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2838
2839void dm_internal_resume(struct mapped_device *md)
2840{
2841 mutex_lock(&md->suspend_lock);
2842 __dm_internal_resume(md);
2843 mutex_unlock(&md->suspend_lock);
2844}
2845EXPORT_SYMBOL_GPL(dm_internal_resume);
2846
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend
 * (dm_internal_resume_fast releases the lock taken here).
 */
2852void dm_internal_suspend_fast(struct mapped_device *md)
2853{
2854 mutex_lock(&md->suspend_lock);
2855 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2856 return;
2857
2858 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2859 synchronize_srcu(&md->io_barrier);
2860 flush_workqueue(md->wq);
2861 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2862}
2863EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2864
2865void dm_internal_resume_fast(struct mapped_device *md)
2866{
2867 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2868 goto done;
2869
2870 dm_queue_flush(md);
2871
2872done:
2873 mutex_unlock(&md->suspend_lock);
2874}
2875EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2876
/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
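
/*
 * Send a KOBJ_* uevent for the dm disk.  If the caller supplied a cookie,
 * export it as DM_COOKIE=<value> in the event environment so that udev
 * rules can use it for synchronisation.
 */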
2880int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2881 unsigned cookie)
2882{
2883 char udev_cookie[DM_COOKIE_LENGTH];
2884 char *envp[] = { udev_cookie, NULL };
2885
	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);

	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
		 DM_COOKIE_ENV_VAR_NAME, cookie);
	return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
2894}
2895
2896uint32_t dm_next_uevent_seq(struct mapped_device *md)
2897{
2898 return atomic_add_return(1, &md->uevent_seq);
2899}
2900
2901uint32_t dm_get_event_nr(struct mapped_device *md)
2902{
2903 return atomic_read(&md->event_nr);
2904}
2905
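/*
 * Block until md's event counter advances past @event_nr (or a signal
 * arrives); this backs the DM_DEV_WAIT ioctl.
 */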
2906int dm_wait_event(struct mapped_device *md, int event_nr)
2907{
2908 return wait_event_interruptible(md->eventq,
2909 (event_nr != atomic_read(&md->event_nr)));
2910}
2911
2912void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2913{
2914 unsigned long flags;
2915
2916 spin_lock_irqsave(&md->uevent_lock, flags);
2917 list_add(elist, &md->uevent_list);
2918 spin_unlock_irqrestore(&md->uevent_lock, flags);
2919}
2920
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
2925struct gendisk *dm_disk(struct mapped_device *md)
2926{
2927 return md->disk;
2928}
2929EXPORT_SYMBOL_GPL(dm_disk);
2930
2931struct kobject *dm_kobject(struct mapped_device *md)
2932{
2933 return &md->kobj_holder.kobj;
2934}
2935
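/*
 * Map a sysfs kobject back to its mapped_device and take a reference,
 * unless the device is already being freed or deleted.
 */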
2936struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2937{
2938 struct mapped_device *md;
2939
2940 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2941
2942 spin_lock(&_minor_lock);
2943 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2944 md = NULL;
2945 goto out;
2946 }
2947 dm_get(md);
2948out:
2949 spin_unlock(&_minor_lock);
2950
2951 return md;
2952}
2953
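/* Test helpers for the mapped_device state flags. */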
2954int dm_suspended_md(struct mapped_device *md)
2955{
2956 return test_bit(DMF_SUSPENDED, &md->flags);
2957}
2958
2959int dm_suspended_internally_md(struct mapped_device *md)
2960{
2961 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2962}
2963
2964int dm_test_deferred_remove_flag(struct mapped_device *md)
2965{
2966 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2967}
2968
2969int dm_suspended(struct dm_target *ti)
2970{
2971 return dm_suspended_md(dm_table_get_md(ti->table));
2972}
2973EXPORT_SYMBOL_GPL(dm_suspended);
2974
2975int dm_noflush_suspending(struct dm_target *ti)
2976{
2977 return __noflush_suspending(dm_table_get_md(ti->table));
2978}
2979EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2980
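/*
 * Allocate the bio_sets a table of the given @type needs.  Bio-based types
 * size io_bs with enough front padding to embed struct dm_io plus
 * per_io_data_size; request-based dm only needs the clone bio_set sized
 * for struct dm_rq_clone_bio_info.
 */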
2981struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2982 unsigned integrity, unsigned per_io_data_size,
2983 unsigned min_pool_size)
2984{
2985 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2986 unsigned int pool_size = 0;
2987 unsigned int front_pad, io_front_pad;
2988 int ret;
2989
2990 if (!pools)
2991 return NULL;
2992
2993 switch (type) {
2994 case DM_TYPE_BIO_BASED:
2995 case DM_TYPE_DAX_BIO_BASED:
2996 case DM_TYPE_NVME_BIO_BASED:
2997 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2998 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2999 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
3000 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
3001 if (ret)
3002 goto out;
3003 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
3004 goto out;
3005 break;
3006 case DM_TYPE_REQUEST_BASED:
3007 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
3008 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
3010 break;
3011 default:
3012 BUG();
3013 }
3014
3015 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
3016 if (ret)
3017 goto out;
3018
3019 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
3020 goto out;
3021
3022 return pools;
3023
3024out:
3025 dm_free_md_mempools(pools);
3026
3027 return NULL;
3028}
3029
3030void dm_free_md_mempools(struct dm_md_mempools *pools)
3031{
3032 if (!pools)
3033 return;
3034
3035 bioset_exit(&pools->bs);
3036 bioset_exit(&pools->io_bs);
3037
3038 kfree(pools);
3039}
3040
3041struct dm_pr {
3042 u64 old_key;
3043 u64 new_key;
3044 u32 flags;
3045 bool fail_early;
3046};
3047
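/*
 * Run @fn on the device beneath the sole target of the live table.
 * Returns -ENOTTY if there is no usable single-target table and -EINVAL
 * if that target does not implement iterate_devices.
 */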
3048static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3049 void *data)
3050{
3051 struct mapped_device *md = bdev->bd_disk->private_data;
3052 struct dm_table *table;
3053 struct dm_target *ti;
3054 int ret = -ENOTTY, srcu_idx;
3055
3056 table = dm_get_live_table(md, &srcu_idx);
3057 if (!table || !dm_table_get_size(table))
3058 goto out;
3059
	/* We only support devices that have a single target */
3061 if (dm_table_get_num_targets(table) != 1)
3062 goto out;
3063 ti = dm_table_get_target(table, 0);
3064
3065 ret = -EINVAL;
3066 if (!ti->type->iterate_devices)
3067 goto out;
3068
3069 ret = ti->type->iterate_devices(ti, fn, data);
3070out:
3071 dm_put_live_table(md, srcu_idx);
3072 return ret;
3073}
3074
/*
 * Registration and unregistration must be applied to every underlying
 * device, so they go through dm_call_pr / iterate_devices.
 */
3078static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3079 sector_t start, sector_t len, void *data)
3080{
3081 struct dm_pr *pr = data;
3082 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3083
3084 if (!ops || !ops->pr_register)
3085 return -EOPNOTSUPP;
3086 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3087}
3088
3089static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3090 u32 flags)
3091{
3092 struct dm_pr pr = {
3093 .old_key = old_key,
3094 .new_key = new_key,
3095 .flags = flags,
3096 .fail_early = true,
3097 };
3098 int ret;
3099
3100 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3101 if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
3103 pr.old_key = new_key;
3104 pr.new_key = 0;
3105 pr.flags = 0;
3106 pr.fail_early = false;
3107 dm_call_pr(bdev, __dm_pr_register, &pr);
3108 }
3109
3110 return ret;
3111}
3112
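/*
 * The remaining pr_ops handlers resolve the underlying device with
 * dm_prepare_ioctl() and pass the call straight through, returning
 * -EOPNOTSUPP when that device has no matching pr_ops method.
 */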
3113static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3114 u32 flags)
3115{
3116 struct mapped_device *md = bdev->bd_disk->private_data;
3117 const struct pr_ops *ops;
3118 int r, srcu_idx;
3119
3120 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3121 if (r < 0)
3122 goto out;
3123
3124 ops = bdev->bd_disk->fops->pr_ops;
3125 if (ops && ops->pr_reserve)
3126 r = ops->pr_reserve(bdev, key, type, flags);
3127 else
3128 r = -EOPNOTSUPP;
3129out:
3130 dm_unprepare_ioctl(md, srcu_idx);
3131 return r;
3132}
3133
3134static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3135{
3136 struct mapped_device *md = bdev->bd_disk->private_data;
3137 const struct pr_ops *ops;
3138 int r, srcu_idx;
3139
3140 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3141 if (r < 0)
3142 goto out;
3143
3144 ops = bdev->bd_disk->fops->pr_ops;
3145 if (ops && ops->pr_release)
3146 r = ops->pr_release(bdev, key, type);
3147 else
3148 r = -EOPNOTSUPP;
3149out:
3150 dm_unprepare_ioctl(md, srcu_idx);
3151 return r;
3152}
3153
3154static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3155 enum pr_type type, bool abort)
3156{
3157 struct mapped_device *md = bdev->bd_disk->private_data;
3158 const struct pr_ops *ops;
3159 int r, srcu_idx;
3160
3161 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3162 if (r < 0)
3163 goto out;
3164
3165 ops = bdev->bd_disk->fops->pr_ops;
3166 if (ops && ops->pr_preempt)
3167 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3168 else
3169 r = -EOPNOTSUPP;
3170out:
3171 dm_unprepare_ioctl(md, srcu_idx);
3172 return r;
3173}
3174
3175static int dm_pr_clear(struct block_device *bdev, u64 key)
3176{
3177 struct mapped_device *md = bdev->bd_disk->private_data;
3178 const struct pr_ops *ops;
3179 int r, srcu_idx;
3180
3181 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3182 if (r < 0)
3183 goto out;
3184
3185 ops = bdev->bd_disk->fops->pr_ops;
3186 if (ops && ops->pr_clear)
3187 r = ops->pr_clear(bdev, key);
3188 else
3189 r = -EOPNOTSUPP;
3190out:
3191 dm_unprepare_ioctl(md, srcu_idx);
3192 return r;
3193}
3194
3195static const struct pr_ops dm_pr_ops = {
3196 .pr_register = dm_pr_register,
3197 .pr_reserve = dm_pr_reserve,
3198 .pr_release = dm_pr_release,
3199 .pr_preempt = dm_pr_preempt,
3200 .pr_clear = dm_pr_clear,
3201};
3202
3203static const struct block_device_operations dm_blk_dops = {
3204 .open = dm_blk_open,
3205 .release = dm_blk_close,
3206 .ioctl = dm_blk_ioctl,
3207 .getgeo = dm_blk_getgeo,
3208 .report_zones = dm_blk_report_zones,
3209 .pr_ops = &dm_pr_ops,
3210 .owner = THIS_MODULE
3211};
3212
3213static const struct dax_operations dm_dax_ops = {
3214 .direct_access = dm_dax_direct_access,
3215 .dax_supported = dm_dax_supported,
3216 .copy_from_iter = dm_dax_copy_from_iter,
3217 .copy_to_iter = dm_dax_copy_to_iter,
3218};
3219
/*
 * module hooks
 */
3223module_init(dm_init);
3224module_exit(dm_exit);
3225
3226module_param(major, uint, 0);
3227MODULE_PARM_DESC(major, "The major number of the device mapper");
3228
3229module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3230MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3231
3232module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3233MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3234
3235MODULE_DESCRIPTION(DM_NAME " driver");
3236MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3237MODULE_LICENSE("GPL");
3238