/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};

/*
 * One of these is allocated per clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned magic;
	struct dm_io *io;
	struct dm_target *ti;
	unsigned target_bio_nr;
	unsigned *len_ptr;
	bool inside_dm_io;
	struct bio clone;
};

/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned magic;
	struct mapped_device *md;
	blk_status_t status;
	atomic_t io_count;
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	if (!tio->inside_dm_io)
		return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
	return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS 16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r;

	r = dm_uevent_init();
	if (r)
		return r;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}
264
265static int (*_inits[])(void) __initdata = {
266 local_init,
267 dm_target_init,
268 dm_linear_init,
269 dm_stripe_init,
270 dm_io_init,
271 dm_kcopyd_init,
272 dm_interface_init,
273 dm_statistics_init,
274};
275
276static void (*_exits[])(void) = {
277 local_exit,
278 dm_target_exit,
279 dm_linear_exit,
280 dm_stripe_exit,
281 dm_io_exit,
282 dm_kcopyd_exit,
283 dm_interface_exit,
284 dm_statistics_exit,
285};
286
287static int __init dm_init(void)
288{
289 const int count = ARRAY_SIZE(_inits);
290
291 int r, i;
292
293 for (i = 0; i < count; i++) {
294 r = _inits[i]();
295 if (r)
296 goto bad;
297 }
298
299 return 0;
300
301 bad:
302 while (i--)
303 _exits[i]();
304
305 return r;
306}
307
308static void __exit dm_exit(void)
309{
310 int i = ARRAY_SIZE(_exits);
311
312 while (i--)
313 _exits[i]();
314
315
316
317
318 idr_destroy(&_minor_idr);
319}
320
/*
 * Block device functions
 */
324int dm_deleting_md(struct mapped_device *md)
325{
326 return test_bit(DMF_DELETING, &md->flags);
327}
328
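/*
 * Opens and closes are serialized by _minor_lock so md->open_count stays
 * consistent and a device that is being freed or deleted cannot be
 * (re)opened.
 */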
329static int dm_blk_open(struct block_device *bdev, fmode_t mode)
330{
331 struct mapped_device *md;
332
333 spin_lock(&_minor_lock);
334
335 md = bdev->bd_disk->private_data;
336 if (!md)
337 goto out;
338
339 if (test_bit(DMF_FREEING, &md->flags) ||
340 dm_deleting_md(md)) {
341 md = NULL;
342 goto out;
343 }
344
345 dm_get(md);
346 atomic_inc(&md->open_count);
347out:
348 spin_unlock(&_minor_lock);
349
350 return md ? 0 : -ENXIO;
351}
352
353static void dm_blk_close(struct gendisk *disk, fmode_t mode)
354{
355 struct mapped_device *md;
356
357 spin_lock(&_minor_lock);
358
359 md = disk->private_data;
360 if (WARN_ON(!md))
361 goto out;
362
363 if (atomic_dec_and_test(&md->open_count) &&
364 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
365 queue_work(deferred_remove_workqueue, &deferred_remove_work);
366
367 dm_put(md);
368out:
369 spin_unlock(&_minor_lock);
370}
371
372int dm_open_count(struct mapped_device *md)
373{
374 return atomic_read(&md->open_count);
375}
376
/*
 * Guarantees nothing is using the device before it's deleted.
 */
380int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
381{
382 int r = 0;
383
384 spin_lock(&_minor_lock);
385
386 if (dm_open_count(md)) {
387 r = -EBUSY;
388 if (mark_deferred)
389 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
390 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
391 r = -EEXIST;
392 else
393 set_bit(DMF_DELETING, &md->flags);
394
395 spin_unlock(&_minor_lock);
396
397 return r;
398}
399
400int dm_cancel_deferred_remove(struct mapped_device *md)
401{
402 int r = 0;
403
404 spin_lock(&_minor_lock);
405
406 if (test_bit(DMF_DELETING, &md->flags))
407 r = -EBUSY;
408 else
409 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
410
411 spin_unlock(&_minor_lock);
412
413 return r;
414}
415
416static void do_deferred_remove(struct work_struct *w)
417{
418 dm_deferred_remove();
419}
420
421sector_t dm_get_size(struct mapped_device *md)
422{
423 return get_capacity(md->disk);
424}
425
426struct request_queue *dm_get_md_queue(struct mapped_device *md)
427{
428 return md->queue;
429}
430
431struct dm_stats *dm_get_stats(struct mapped_device *md)
432{
433 return &md->stats;
434}
435
436static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
437{
438 struct mapped_device *md = bdev->bd_disk->private_data;
439
440 return dm_get_geometry(md, geo);
441}
442
443static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
444 struct blk_zone *zones, unsigned int *nr_zones,
445 gfp_t gfp_mask)
446{
447#ifdef CONFIG_BLK_DEV_ZONED
448 struct mapped_device *md = disk->private_data;
449 struct dm_target *tgt;
450 struct dm_table *map;
451 int srcu_idx, ret;
452
453 if (dm_suspended_md(md))
454 return -EAGAIN;
455
456 map = dm_get_live_table(md, &srcu_idx);
457 if (!map)
458 return -EIO;
459
460 tgt = dm_table_find_target(map, sector);
461 if (!dm_target_is_valid(tgt)) {
462 ret = -EIO;
463 goto out;
464 }
465
466
467
468
469
470
471
472 if (WARN_ON(!tgt->type->report_zones)) {
473 ret = -EIO;
474 goto out;
475 }
476
477
478
479
480
481
482
483 ret = tgt->type->report_zones(tgt, sector, zones,
484 nr_zones, gfp_mask);
485
486out:
487 dm_put_live_table(md, srcu_idx);
488 return ret;
489#else
490 return -ENOTSUPP;
491#endif
492}
493
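/*
 * Resolve the single underlying block device to which an ioctl can be
 * passed through.  Only tables with exactly one target implementing
 * ->prepare_ioctl qualify, and the lookup is retried while the target
 * returns -ENOTCONN (no usable path yet) unless a fatal signal is pending.
 */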
494static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
495 struct block_device **bdev)
496 __acquires(md->io_barrier)
497{
498 struct dm_target *tgt;
499 struct dm_table *map;
500 int r;
501
502retry:
503 r = -ENOTTY;
504 map = dm_get_live_table(md, srcu_idx);
505 if (!map || !dm_table_get_size(map))
506 return r;
507
	/* We only support devices that have a single target */
509 if (dm_table_get_num_targets(map) != 1)
510 return r;
511
512 tgt = dm_table_get_target(map, 0);
513 if (!tgt->type->prepare_ioctl)
514 return r;
515
516 if (dm_suspended_md(md))
517 return -EAGAIN;
518
519 r = tgt->type->prepare_ioctl(tgt, bdev);
520 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
521 dm_put_live_table(md, *srcu_idx);
522 msleep(10);
523 goto retry;
524 }
525
526 return r;
527}
528
529static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
530 __releases(md->io_barrier)
531{
532 dm_put_live_table(md, srcu_idx);
533}
534
535static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
536 unsigned int cmd, unsigned long arg)
537{
538 struct mapped_device *md = bdev->bd_disk->private_data;
539 int r, srcu_idx;
540
541 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
542 if (r < 0)
543 goto out;
544
545 if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
550 if (!capable(CAP_SYS_RAWIO)) {
551 DMWARN_LIMIT(
552 "%s: sending ioctl %x to DM device without required privilege.",
553 current->comm, cmd);
554 r = -ENOIOCTLCMD;
555 goto out;
556 }
557 }
558
559 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
560out:
561 dm_unprepare_ioctl(md, srcu_idx);
562 return r;
563}
564
565static void start_io_acct(struct dm_io *io);
566
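/*
 * A dm_io, its embedded dm_target_io and the first clone bio share one
 * front-padded allocation from md->io_bs; freeing the io is simply a
 * bio_put() of that embedded clone (see free_io below).
 */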
567static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
568{
569 struct dm_io *io;
570 struct dm_target_io *tio;
571 struct bio *clone;
572
573 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
574 if (!clone)
575 return NULL;
576
577 tio = container_of(clone, struct dm_target_io, clone);
578 tio->inside_dm_io = true;
579 tio->io = NULL;
580
581 io = container_of(tio, struct dm_io, tio);
582 io->magic = DM_IO_MAGIC;
583 io->status = 0;
584 atomic_set(&io->io_count, 1);
585 io->orig_bio = bio;
586 io->md = md;
587 spin_lock_init(&io->endio_lock);
588
589 start_io_acct(io);
590
591 return io;
592}
593
594static void free_io(struct mapped_device *md, struct dm_io *io)
595{
596 bio_put(&io->tio.clone);
597}
598
599static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
600 unsigned target_bio_nr, gfp_t gfp_mask)
601{
602 struct dm_target_io *tio;
603
604 if (!ci->io->tio.io) {
605
606 tio = &ci->io->tio;
607 } else {
608 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
609 if (!clone)
610 return NULL;
611
612 tio = container_of(clone, struct dm_target_io, clone);
613 tio->inside_dm_io = false;
614 }
615
616 tio->magic = DM_TIO_MAGIC;
617 tio->io = ci->io;
618 tio->ti = ti;
619 tio->target_bio_nr = target_bio_nr;
620
621 return tio;
622}
623
624static void free_tio(struct dm_target_io *tio)
625{
626 if (tio->inside_dm_io)
627 return;
628 bio_put(&tio->clone);
629}
630
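/* Sum the per-cpu in-flight counters of part0 to detect outstanding bios. */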
631static bool md_in_flight_bios(struct mapped_device *md)
632{
633 int cpu;
634 struct hd_struct *part = &dm_disk(md)->part0;
635 long sum = 0;
636
637 for_each_possible_cpu(cpu) {
638 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
639 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
640 }
641
642 return sum != 0;
643}
644
645static bool md_in_flight(struct mapped_device *md)
646{
647 if (queue_is_mq(md->queue))
648 return blk_mq_queue_inflight(md->queue);
649 else
650 return md_in_flight_bios(md);
651}
652
653static void start_io_acct(struct dm_io *io)
654{
655 struct mapped_device *md = io->md;
656 struct bio *bio = io->orig_bio;
657
658 io->start_time = jiffies;
659
660 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
661 &dm_disk(md)->part0);
662
663 if (unlikely(dm_stats_used(&md->stats)))
664 dm_stats_account_io(&md->stats, bio_data_dir(bio),
665 bio->bi_iter.bi_sector, bio_sectors(bio),
666 false, 0, &io->stats_aux);
667}
668
669static void end_io_acct(struct dm_io *io)
670{
671 struct mapped_device *md = io->md;
672 struct bio *bio = io->orig_bio;
673 unsigned long duration = jiffies - io->start_time;
674
675 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
676 io->start_time);
677
678 if (unlikely(dm_stats_used(&md->stats)))
679 dm_stats_account_io(&md->stats, bio_data_dir(bio),
680 bio->bi_iter.bi_sector, bio_sectors(bio),
681 true, duration, &io->stats_aux);
682
683
684 if (unlikely(wq_has_sleeper(&md->wait)))
685 wake_up(&md->wait);
686}
687
/*
 * Add the bio to the list of deferred io.
 */
691static void queue_io(struct mapped_device *md, struct bio *bio)
692{
693 unsigned long flags;
694
695 spin_lock_irqsave(&md->deferred_lock, flags);
696 bio_list_add(&md->deferred, bio);
697 spin_unlock_irqrestore(&md->deferred_lock, flags);
698 queue_work(md->wq, &md->work);
699}
700
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
706struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
707{
708 *srcu_idx = srcu_read_lock(&md->io_barrier);
709
710 return srcu_dereference(md->map, &md->io_barrier);
711}
712
713void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
714{
715 srcu_read_unlock(&md->io_barrier, srcu_idx);
716}
717
718void dm_sync_table(struct mapped_device *md)
719{
720 synchronize_srcu(&md->io_barrier);
721 synchronize_rcu_expedited();
722}
723
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
728static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
729{
730 rcu_read_lock();
731 return rcu_dereference(md->map);
732}
733
734static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
735{
736 rcu_read_unlock();
737}
738
739static char *_dm_claim_ptr = "I belong to device-mapper";
740
/*
 * Open a table device so we can use it as a map destination.
 */
744static int open_table_device(struct table_device *td, dev_t dev,
745 struct mapped_device *md)
746{
747 struct block_device *bdev;
748
749 int r;
750
751 BUG_ON(td->dm_dev.bdev);
752
753 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
754 if (IS_ERR(bdev))
755 return PTR_ERR(bdev);
756
757 r = bd_link_disk_holder(bdev, dm_disk(md));
758 if (r) {
759 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
760 return r;
761 }
762
763 td->dm_dev.bdev = bdev;
764 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
765 return 0;
766}
767
/*
 * Close a table device that we've been using.
 */
771static void close_table_device(struct table_device *td, struct mapped_device *md)
772{
773 if (!td->dm_dev.bdev)
774 return;
775
776 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
777 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
778 put_dax(td->dm_dev.dax_dev);
779 td->dm_dev.bdev = NULL;
780 td->dm_dev.dax_dev = NULL;
781}
782
783static struct table_device *find_table_device(struct list_head *l, dev_t dev,
784 fmode_t mode) {
785 struct table_device *td;
786
787 list_for_each_entry(td, l, list)
788 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
789 return td;
790
791 return NULL;
792}
793
794int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
795 struct dm_dev **result) {
796 int r;
797 struct table_device *td;
798
799 mutex_lock(&md->table_devices_lock);
800 td = find_table_device(&md->table_devices, dev, mode);
801 if (!td) {
802 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
803 if (!td) {
804 mutex_unlock(&md->table_devices_lock);
805 return -ENOMEM;
806 }
807
808 td->dm_dev.mode = mode;
809 td->dm_dev.bdev = NULL;
810
811 if ((r = open_table_device(td, dev, md))) {
812 mutex_unlock(&md->table_devices_lock);
813 kfree(td);
814 return r;
815 }
816
817 format_dev_t(td->dm_dev.name, dev);
818
819 refcount_set(&td->count, 1);
820 list_add(&td->list, &md->table_devices);
821 } else {
822 refcount_inc(&td->count);
823 }
824 mutex_unlock(&md->table_devices_lock);
825
826 *result = &td->dm_dev;
827 return 0;
828}
829EXPORT_SYMBOL_GPL(dm_get_table_device);
830
831void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
832{
833 struct table_device *td = container_of(d, struct table_device, dm_dev);
834
835 mutex_lock(&md->table_devices_lock);
836 if (refcount_dec_and_test(&td->count)) {
837 close_table_device(td, md);
838 list_del(&td->list);
839 kfree(td);
840 }
841 mutex_unlock(&md->table_devices_lock);
842}
843EXPORT_SYMBOL(dm_put_table_device);
844
845static void free_table_devices(struct list_head *devices)
846{
847 struct list_head *tmp, *next;
848
849 list_for_each_safe(tmp, next, devices) {
850 struct table_device *td = list_entry(tmp, struct table_device, list);
851
852 DMWARN("dm_destroy: %s still exists with %d references",
853 td->dm_dev.name, refcount_read(&td->count));
854 kfree(td);
855 }
856}
857
/*
 * Get the geometry associated with a dm device.
 */
861int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
862{
863 *geo = md->geometry;
864
865 return 0;
866}
867
/*
 * Set the geometry of a device mapped device.
 */
871int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
872{
873 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
874
875 if (geo->start > sz) {
876 DMWARN("Start sector is beyond the geometry limits.");
877 return -EINVAL;
878 }
879
880 md->geometry = *geo;
881
882 return 0;
883}
884
885static int __noflush_suspending(struct mapped_device *md)
886{
887 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
888}
889
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
894static void dec_pending(struct dm_io *io, blk_status_t error)
895{
896 unsigned long flags;
897 blk_status_t io_error;
898 struct bio *bio;
899 struct mapped_device *md = io->md;
900
901
902 if (unlikely(error)) {
903 spin_lock_irqsave(&io->endio_lock, flags);
904 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
905 io->status = error;
906 spin_unlock_irqrestore(&io->endio_lock, flags);
907 }
908
909 if (atomic_dec_and_test(&io->io_count)) {
910 if (io->status == BLK_STS_DM_REQUEUE) {
911
912
913
914 spin_lock_irqsave(&md->deferred_lock, flags);
915 if (__noflush_suspending(md))
916
917 bio_list_add_head(&md->deferred, io->orig_bio);
918 else
919
920 io->status = BLK_STS_IOERR;
921 spin_unlock_irqrestore(&md->deferred_lock, flags);
922 }
923
924 io_error = io->status;
925 bio = io->orig_bio;
926 end_io_acct(io);
927 free_io(md, io);
928
929 if (io_error == BLK_STS_DM_REQUEUE)
930 return;
931
932 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
933
934
935
936
937 bio->bi_opf &= ~REQ_PREFLUSH;
938 queue_io(md, bio);
939 } else {
940
941 if (io_error)
942 bio->bi_status = io_error;
943 bio_endio(bio);
944 }
945 }
946}
947
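/*
 * If a device reports BLK_STS_TARGET for WRITE SAME or WRITE ZEROES it does
 * not really support the operation, so clear the corresponding queue limit
 * to stop upper layers from issuing it again (see clone_endio).
 */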
948void disable_write_same(struct mapped_device *md)
949{
950 struct queue_limits *limits = dm_get_queue_limits(md);
951
952
953 limits->max_write_same_sectors = 0;
954}
955
956void disable_write_zeroes(struct mapped_device *md)
957{
958 struct queue_limits *limits = dm_get_queue_limits(md);
959
960
961 limits->max_write_zeroes_sectors = 0;
962}
963
964static void clone_endio(struct bio *bio)
965{
966 blk_status_t error = bio->bi_status;
967 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
968 struct dm_io *io = tio->io;
969 struct mapped_device *md = tio->io->md;
970 dm_endio_fn endio = tio->ti->type->end_io;
971
972 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
973 if (bio_op(bio) == REQ_OP_WRITE_SAME &&
974 !bio->bi_disk->queue->limits.max_write_same_sectors)
975 disable_write_same(md);
976 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
977 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
978 disable_write_zeroes(md);
979 }
980
981 if (endio) {
982 int r = endio(tio->ti, bio, &error);
983 switch (r) {
984 case DM_ENDIO_REQUEUE:
985 error = BLK_STS_DM_REQUEUE;
986
987 case DM_ENDIO_DONE:
988 break;
989 case DM_ENDIO_INCOMPLETE:
990
991 return;
992 default:
993 DMWARN("unimplemented target endio return value: %d", r);
994 BUG();
995 }
996 }
997
998 free_tio(tio);
999 dec_pending(io, error);
1000}
1001
/*
 * Return maximum size of I/O possible at the supplied sector up to the
 * current target boundary.
 */
1006static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1007{
1008 sector_t target_offset = dm_target_offset(ti, sector);
1009
1010 return ti->len - target_offset;
1011}
1012
1013static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1014{
1015 sector_t len = max_io_len_target_boundary(sector, ti);
1016 sector_t offset, max_len;
1017
1018
1019
1020
1021 if (ti->max_io_len) {
1022 offset = dm_target_offset(ti, sector);
1023 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1024 max_len = sector_div(offset, ti->max_io_len);
1025 else
1026 max_len = offset & (ti->max_io_len - 1);
1027 max_len = ti->max_io_len - max_len;
1028
1029 if (len > max_len)
1030 len = max_len;
1031 }
1032
1033 return len;
1034}
1035
1036int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1037{
1038 if (len > UINT_MAX) {
1039 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1040 (unsigned long long)len, UINT_MAX);
1041 ti->error = "Maximum size of target IO is too large";
1042 return -EINVAL;
1043 }
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053 ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
1054
1055 return 0;
1056}
1057EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1058
1059static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1060 sector_t sector, int *srcu_idx)
1061 __acquires(md->io_barrier)
1062{
1063 struct dm_table *map;
1064 struct dm_target *ti;
1065
1066 map = dm_get_live_table(md, srcu_idx);
1067 if (!map)
1068 return NULL;
1069
1070 ti = dm_table_find_target(map, sector);
1071 if (!dm_target_is_valid(ti))
1072 return NULL;
1073
1074 return ti;
1075}
1076
1077static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1078 long nr_pages, void **kaddr, pfn_t *pfn)
1079{
1080 struct mapped_device *md = dax_get_private(dax_dev);
1081 sector_t sector = pgoff * PAGE_SECTORS;
1082 struct dm_target *ti;
1083 long len, ret = -EIO;
1084 int srcu_idx;
1085
1086 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1087
1088 if (!ti)
1089 goto out;
1090 if (!ti->type->direct_access)
1091 goto out;
1092 len = max_io_len(sector, ti) / PAGE_SECTORS;
1093 if (len < 1)
1094 goto out;
1095 nr_pages = min(len, nr_pages);
1096 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1097
1098 out:
1099 dm_put_live_table(md, srcu_idx);
1100
1101 return ret;
1102}
1103
1104static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1105 int blocksize, sector_t start, sector_t len)
1106{
1107 struct mapped_device *md = dax_get_private(dax_dev);
1108 struct dm_table *map;
1109 int srcu_idx;
1110 bool ret;
1111
1112 map = dm_get_live_table(md, &srcu_idx);
1113 if (!map)
1114 return false;
1115
1116 ret = dm_table_supports_dax(map, blocksize);
1117
1118 dm_put_live_table(md, srcu_idx);
1119
1120 return ret;
1121}
1122
1123static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1124 void *addr, size_t bytes, struct iov_iter *i)
1125{
1126 struct mapped_device *md = dax_get_private(dax_dev);
1127 sector_t sector = pgoff * PAGE_SECTORS;
1128 struct dm_target *ti;
1129 long ret = 0;
1130 int srcu_idx;
1131
1132 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1133
1134 if (!ti)
1135 goto out;
1136 if (!ti->type->dax_copy_from_iter) {
1137 ret = copy_from_iter(addr, bytes, i);
1138 goto out;
1139 }
1140 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1141 out:
1142 dm_put_live_table(md, srcu_idx);
1143
1144 return ret;
1145}
1146
1147static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1148 void *addr, size_t bytes, struct iov_iter *i)
1149{
1150 struct mapped_device *md = dax_get_private(dax_dev);
1151 sector_t sector = pgoff * PAGE_SECTORS;
1152 struct dm_target *ti;
1153 long ret = 0;
1154 int srcu_idx;
1155
1156 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1157
1158 if (!ti)
1159 goto out;
1160 if (!ti->type->dax_copy_to_iter) {
1161 ret = copy_to_iter(addr, bytes, i);
1162 goto out;
1163 }
1164 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1165 out:
1166 dm_put_live_table(md, srcu_idx);
1167
1168 return ret;
1169}
1170
/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is always empty.
 */
1199void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1200{
1201 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1202 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1203 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1204 BUG_ON(bi_size > *tio->len_ptr);
1205 BUG_ON(n_sectors > bi_size);
1206 *tio->len_ptr -= bi_size - n_sectors;
1207 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1208}
1209EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1210
/*
 * The zone descriptors obtained with a zone report indicate zone positions
 * within the underlying device of the target.  They must be remapped to
 * match their position within the dm device.  The caller is expected to
 * have obtained the zone descriptors with blkdev_report_zones() so that
 * any partition offset has already been handled.
 */
1219void dm_remap_zone_report(struct dm_target *ti, sector_t start,
1220 struct blk_zone *zones, unsigned int *nr_zones)
1221{
1222#ifdef CONFIG_BLK_DEV_ZONED
1223 struct blk_zone *zone;
1224 unsigned int nrz = *nr_zones;
1225 int i;
1226
1227
1228
1229
1230
1231
1232
1233 for (i = 0; i < nrz; i++) {
1234 zone = zones + i;
1235 if (zone->start >= start + ti->len) {
1236 memset(zone, 0, sizeof(struct blk_zone) * (nrz - i));
1237 break;
1238 }
1239
1240 zone->start = zone->start + ti->begin - start;
1241 if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
1242 continue;
1243
1244 if (zone->cond == BLK_ZONE_COND_FULL)
1245 zone->wp = zone->start + zone->len;
1246 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1247 zone->wp = zone->start;
1248 else
1249 zone->wp = zone->wp + ti->begin - start;
1250 }
1251
1252 *nr_zones = i;
1253#else
1254 *nr_zones = 0;
1255#endif
1256}
1257EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1258
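/*
 * Pass a clone to the target's ->map method and act on the verdict:
 * DM_MAPIO_SUBMITTED means the target owns the bio, DM_MAPIO_REMAPPED
 * means we must submit it ourselves, and DM_MAPIO_KILL/DM_MAPIO_REQUEUE
 * complete the io with an error or a requeue respectively.
 */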
1259static blk_qc_t __map_bio(struct dm_target_io *tio)
1260{
1261 int r;
1262 sector_t sector;
1263 struct bio *clone = &tio->clone;
1264 struct dm_io *io = tio->io;
1265 struct mapped_device *md = io->md;
1266 struct dm_target *ti = tio->ti;
1267 blk_qc_t ret = BLK_QC_T_NONE;
1268
1269 clone->bi_end_io = clone_endio;
1270
1271
1272
1273
1274
1275
1276 atomic_inc(&io->io_count);
1277 sector = clone->bi_iter.bi_sector;
1278
1279 r = ti->type->map(ti, clone);
1280 switch (r) {
1281 case DM_MAPIO_SUBMITTED:
1282 break;
1283 case DM_MAPIO_REMAPPED:
1284
1285 trace_block_bio_remap(clone->bi_disk->queue, clone,
1286 bio_dev(io->orig_bio), sector);
1287 if (md->type == DM_TYPE_NVME_BIO_BASED)
1288 ret = direct_make_request(clone);
1289 else
1290 ret = generic_make_request(clone);
1291 break;
1292 case DM_MAPIO_KILL:
1293 free_tio(tio);
1294 dec_pending(io, BLK_STS_IOERR);
1295 break;
1296 case DM_MAPIO_REQUEUE:
1297 free_tio(tio);
1298 dec_pending(io, BLK_STS_DM_REQUEUE);
1299 break;
1300 default:
1301 DMWARN("unimplemented target map return value: %d", r);
1302 BUG();
1303 }
1304
1305 return ret;
1306}
1307
1308static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1309{
1310 bio->bi_iter.bi_sector = sector;
1311 bio->bi_iter.bi_size = to_bytes(len);
1312}
1313
/*
 * Creates a bio that consists of range of complete bvecs.
 */
1317static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1318 sector_t sector, unsigned len)
1319{
1320 struct bio *clone = &tio->clone;
1321
1322 __bio_clone_fast(clone, bio);
1323
1324 if (bio_integrity(bio)) {
1325 int r;
1326
1327 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1328 !dm_target_passes_integrity(tio->ti->type))) {
1329 DMWARN("%s: the target %s doesn't support integrity data.",
1330 dm_device_name(tio->io->md),
1331 tio->ti->type->name);
1332 return -EIO;
1333 }
1334
1335 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1336 if (r < 0)
1337 return r;
1338 }
1339
1340 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1341 clone->bi_iter.bi_size = to_bytes(len);
1342
1343 if (bio_integrity(bio))
1344 bio_integrity_trim(clone);
1345
1346 return 0;
1347}
1348
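/*
 * Allocate num_bios tios up front: a first pass uses GFP_NOWAIT, and if
 * that cannot satisfy all of them the partial allocation is dropped and a
 * second pass retries with GFP_NOIO under md->table_devices_lock.
 */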
1349static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1350 struct dm_target *ti, unsigned num_bios)
1351{
1352 struct dm_target_io *tio;
1353 int try;
1354
1355 if (!num_bios)
1356 return;
1357
1358 if (num_bios == 1) {
1359 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1360 bio_list_add(blist, &tio->clone);
1361 return;
1362 }
1363
1364 for (try = 0; try < 2; try++) {
1365 int bio_nr;
1366 struct bio *bio;
1367
1368 if (try)
1369 mutex_lock(&ci->io->md->table_devices_lock);
1370 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1371 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1372 if (!tio)
1373 break;
1374
1375 bio_list_add(blist, &tio->clone);
1376 }
1377 if (try)
1378 mutex_unlock(&ci->io->md->table_devices_lock);
1379 if (bio_nr == num_bios)
1380 return;
1381
1382 while ((bio = bio_list_pop(blist))) {
1383 tio = container_of(bio, struct dm_target_io, clone);
1384 free_tio(tio);
1385 }
1386 }
1387}
1388
1389static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1390 struct dm_target_io *tio, unsigned *len)
1391{
1392 struct bio *clone = &tio->clone;
1393
1394 tio->len_ptr = len;
1395
1396 __bio_clone_fast(clone, ci->bio);
1397 if (len)
1398 bio_setup_sector(clone, ci->sector, *len);
1399
1400 return __map_bio(tio);
1401}
1402
1403static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1404 unsigned num_bios, unsigned *len)
1405{
1406 struct bio_list blist = BIO_EMPTY_LIST;
1407 struct bio *bio;
1408 struct dm_target_io *tio;
1409
1410 alloc_multiple_bios(&blist, ci, ti, num_bios);
1411
1412 while ((bio = bio_list_pop(&blist))) {
1413 tio = container_of(bio, struct dm_target_io, clone);
1414 (void) __clone_and_map_simple_bio(ci, tio, len);
1415 }
1416}
1417
1418static int __send_empty_flush(struct clone_info *ci)
1419{
1420 unsigned target_nr = 0;
1421 struct dm_target *ti;
1422
	/*
	 * Empty flush uses a statically initialized bio, as the base for
	 * cloning.  However, blkg association requires that a bdev is
	 * associated with a gendisk, which doesn't happen until the bdev is
	 * opened.  So, blkg association is done at issue time of the flush
	 * rather than when the device is created in alloc_dev().
	 */
1430 bio_set_dev(ci->bio, ci->io->md->bdev);
1431
1432 BUG_ON(bio_has_data(ci->bio));
1433 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1434 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1435
1436 bio_disassociate_blkg(ci->bio);
1437
1438 return 0;
1439}
1440
1441static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1442 sector_t sector, unsigned *len)
1443{
1444 struct bio *bio = ci->bio;
1445 struct dm_target_io *tio;
1446 int r;
1447
1448 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1449 tio->len_ptr = len;
1450 r = clone_bio(tio, bio, sector, *len);
1451 if (r < 0) {
1452 free_tio(tio);
1453 return r;
1454 }
1455 (void) __map_bio(tio);
1456
1457 return 0;
1458}
1459
1460typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1461
1462static unsigned get_num_discard_bios(struct dm_target *ti)
1463{
1464 return ti->num_discard_bios;
1465}
1466
1467static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1468{
1469 return ti->num_secure_erase_bios;
1470}
1471
1472static unsigned get_num_write_same_bios(struct dm_target *ti)
1473{
1474 return ti->num_write_same_bios;
1475}
1476
1477static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1478{
1479 return ti->num_write_zeroes_bios;
1480}
1481
1482typedef bool (*is_split_required_fn)(struct dm_target *ti);
1483
1484static bool is_split_required_for_discard(struct dm_target *ti)
1485{
1486 return ti->split_discard_bios;
1487}
1488
1489static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1490 unsigned num_bios, bool is_split_required)
1491{
1492 unsigned len;

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
1500 if (!num_bios)
1501 return -EOPNOTSUPP;
1502
1503 if (!is_split_required)
1504 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1505 else
1506 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1507
1508 __send_duplicate_bios(ci, ti, num_bios, &len);
1509
1510 ci->sector += len;
1511 ci->sector_count -= len;
1512
1513 return 0;
1514}
1515
1516static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1517{
1518 return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti),
1519 is_split_required_for_discard(ti));
1520}
1521
1522static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1523{
1524 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti), false);
1525}
1526
1527static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1528{
1529 return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti), false);
1530}
1531
1532static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1533{
1534 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti), false);
1535}
1536
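/* Discard, secure erase, write same and write zeroes need special casing. */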
1537static bool is_abnormal_io(struct bio *bio)
1538{
1539 bool r = false;
1540
1541 switch (bio_op(bio)) {
1542 case REQ_OP_DISCARD:
1543 case REQ_OP_SECURE_ERASE:
1544 case REQ_OP_WRITE_SAME:
1545 case REQ_OP_WRITE_ZEROES:
1546 r = true;
1547 break;
1548 }
1549
1550 return r;
1551}
1552
1553static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1554 int *result)
1555{
1556 struct bio *bio = ci->bio;
1557
1558 if (bio_op(bio) == REQ_OP_DISCARD)
1559 *result = __send_discard(ci, ti);
1560 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1561 *result = __send_secure_erase(ci, ti);
1562 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1563 *result = __send_write_same(ci, ti);
1564 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1565 *result = __send_write_zeroes(ci, ti);
1566 else
1567 return false;
1568
1569 return true;
1570}
1571
/*
 * Select the correct strategy for processing a non-flush bio.
 */
1575static int __split_and_process_non_flush(struct clone_info *ci)
1576{
1577 struct dm_target *ti;
1578 unsigned len;
1579 int r;
1580
1581 ti = dm_table_find_target(ci->map, ci->sector);
1582 if (!dm_target_is_valid(ti))
1583 return -EIO;
1584
1585 if (__process_abnormal_io(ci, ti, &r))
1586 return r;
1587
1588 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1589
1590 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1591 if (r < 0)
1592 return r;
1593
1594 ci->sector += len;
1595 ci->sector_count -= len;
1596
1597 return 0;
1598}
1599
1600static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1601 struct dm_table *map, struct bio *bio)
1602{
1603 ci->map = map;
1604 ci->io = alloc_io(md, bio);
1605 ci->sector = bio->bi_iter.bi_sector;
1606}
1607
1608#define __dm_part_stat_sub(part, field, subnd) \
1609 (part_stat_get(part, field) -= (subnd))
1610
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1614static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1615 struct dm_table *map, struct bio *bio)
1616{
1617 struct clone_info ci;
1618 blk_qc_t ret = BLK_QC_T_NONE;
1619 int error = 0;
1620
1621 init_clone_info(&ci, md, map, bio);
1622
1623 if (bio->bi_opf & REQ_PREFLUSH) {
1624 struct bio flush_bio;
1625
1626
1627
1628
1629
1630
1631 bio_init(&flush_bio, NULL, 0);
1632 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1633 ci.bio = &flush_bio;
1634 ci.sector_count = 0;
1635 error = __send_empty_flush(&ci);
1636
1637 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1638 ci.bio = bio;
1639 ci.sector_count = 0;
1640 error = __split_and_process_non_flush(&ci);
1641 } else {
1642 ci.bio = bio;
1643 ci.sector_count = bio_sectors(bio);
1644 while (ci.sector_count && !error) {
1645 error = __split_and_process_non_flush(&ci);
1646 if (current->bio_list && ci.sector_count && !error) {
				/*
				 * Remainder must be passed to generic_make_request()
				 * so that it gets handled *after* bios already submitted
				 * have been completely processed.
				 * We take a clone of the original to store in
				 * ci.io->orig_bio to be used by end_io_acct() and
				 * for dec_pending to use for completion handling.
				 */
1655 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1656 GFP_NOIO, &md->queue->bio_split);
1657 ci.io->orig_bio = b;
1658
1659
1660
1661
1662
1663
1664
1665
1666 part_stat_lock();
1667 __dm_part_stat_sub(&dm_disk(md)->part0,
1668 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1669 part_stat_unlock();
1670
1671 bio_chain(b, bio);
1672 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1673 ret = generic_make_request(bio);
1674 break;
1675 }
1676 }
1677 }
1678
1679
1680 dec_pending(ci.io, errno_to_blk_status(error));
1681 return ret;
1682}
1683
/*
 * Optimized variant of __split_and_process_bio that leverages the
 * fact that targets that use it do _not_ have a need to split bios.
 */
1688static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
1689 struct bio *bio, struct dm_target *ti)
1690{
1691 struct clone_info ci;
1692 blk_qc_t ret = BLK_QC_T_NONE;
1693 int error = 0;
1694
1695 init_clone_info(&ci, md, map, bio);
1696
1697 if (bio->bi_opf & REQ_PREFLUSH) {
1698 struct bio flush_bio;
1699
1700
1701
1702
1703
1704
1705 bio_init(&flush_bio, NULL, 0);
1706 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1707 ci.bio = &flush_bio;
1708 ci.sector_count = 0;
1709 error = __send_empty_flush(&ci);
1710
1711 } else {
1712 struct dm_target_io *tio;
1713
1714 ci.bio = bio;
1715 ci.sector_count = bio_sectors(bio);
1716 if (__process_abnormal_io(&ci, ti, &error))
1717 goto out;
1718
1719 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1720 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1721 }
1722out:
1723
1724 dec_pending(ci.io, errno_to_blk_status(error));
1725 return ret;
1726}
1727
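/*
 * Split *bio at the target/max_io_len boundary: the remainder is chained
 * and resubmitted via generic_make_request() while processing continues
 * with the front piece.
 */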
1728static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
1729{
1730 unsigned len, sector_count;
1731
1732 sector_count = bio_sectors(*bio);
1733 len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
1734
1735 if (sector_count > len) {
1736 struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
1737
1738 bio_chain(split, *bio);
1739 trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
1740 generic_make_request(*bio);
1741 *bio = split;
1742 }
1743}
1744
1745static blk_qc_t dm_process_bio(struct mapped_device *md,
1746 struct dm_table *map, struct bio *bio)
1747{
1748 blk_qc_t ret = BLK_QC_T_NONE;
1749 struct dm_target *ti = md->immutable_target;
1750
1751 if (unlikely(!map)) {
1752 bio_io_error(bio);
1753 return ret;
1754 }
1755
1756 if (!ti) {
1757 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1758 if (unlikely(!ti || !dm_target_is_valid(ti))) {
1759 bio_io_error(bio);
1760 return ret;
1761 }
1762 }
1763
1764
1765
1766
1767
1768
1769 if (current->bio_list) {
1770 blk_queue_split(md->queue, &bio);
1771 if (!is_abnormal_io(bio))
1772 dm_queue_split(md, ti, &bio);
1773 }
1774
1775 if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1776 return __process_bio(md, map, bio, ti);
1777 else
1778 return __split_and_process_bio(md, map, bio);
1779}
1780
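/*
 * Bio-based entry point: while the device is suspended bios are deferred
 * to md->deferred (except readahead, which is failed), otherwise they are
 * mapped against the live table.
 */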
1781static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1782{
1783 struct mapped_device *md = q->queuedata;
1784 blk_qc_t ret = BLK_QC_T_NONE;
1785 int srcu_idx;
1786 struct dm_table *map;
1787
1788 map = dm_get_live_table(md, &srcu_idx);
1789
1790
1791 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1792 dm_put_live_table(md, srcu_idx);
1793
1794 if (!(bio->bi_opf & REQ_RAHEAD))
1795 queue_io(md, bio);
1796 else
1797 bio_io_error(bio);
1798 return ret;
1799 }
1800
1801 ret = dm_process_bio(md, map, bio);
1802
1803 dm_put_live_table(md, srcu_idx);
1804 return ret;
1805}
1806
1807static int dm_any_congested(void *congested_data, int bdi_bits)
1808{
1809 int r = bdi_bits;
1810 struct mapped_device *md = congested_data;
1811 struct dm_table *map;
1812
1813 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1814 if (dm_request_based(md)) {
1815
1816
1817
1818
1819 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1820 } else {
1821 map = dm_get_live_table_fast(md);
1822 if (map)
1823 r = dm_table_any_congested(map, bdi_bits);
1824 dm_put_live_table_fast(md);
1825 }
1826 }
1827
1828 return r;
1829}
1830
/*
 * An IDR is used to keep track of allocated minor numbers.
 */
1834static void free_minor(int minor)
1835{
1836 spin_lock(&_minor_lock);
1837 idr_remove(&_minor_idr, minor);
1838 spin_unlock(&_minor_lock);
1839}
1840
/*
 * See if the device with a specific minor # is free.
 */
1844static int specific_minor(int minor)
1845{
1846 int r;
1847
1848 if (minor >= (1 << MINORBITS))
1849 return -EINVAL;
1850
1851 idr_preload(GFP_KERNEL);
1852 spin_lock(&_minor_lock);
1853
1854 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1855
1856 spin_unlock(&_minor_lock);
1857 idr_preload_end();
1858 if (r < 0)
1859 return r == -ENOSPC ? -EBUSY : r;
1860 return 0;
1861}
1862
1863static int next_free_minor(int *minor)
1864{
1865 int r;
1866
1867 idr_preload(GFP_KERNEL);
1868 spin_lock(&_minor_lock);
1869
1870 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1871
1872 spin_unlock(&_minor_lock);
1873 idr_preload_end();
1874 if (r < 0)
1875 return r;
1876 *minor = r;
1877 return 0;
1878}
1879
1880static const struct block_device_operations dm_blk_dops;
1881static const struct dax_operations dm_dax_ops;
1882
1883static void dm_wq_work(struct work_struct *work);
1884
1885static void dm_init_normal_md_queue(struct mapped_device *md)
1886{
1887
1888
1889
1890 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1891}
1892
1893static void cleanup_mapped_device(struct mapped_device *md)
1894{
1895 if (md->wq)
1896 destroy_workqueue(md->wq);
1897 bioset_exit(&md->bs);
1898 bioset_exit(&md->io_bs);
1899
1900 if (md->dax_dev) {
1901 kill_dax(md->dax_dev);
1902 put_dax(md->dax_dev);
1903 md->dax_dev = NULL;
1904 }
1905
1906 if (md->disk) {
1907 spin_lock(&_minor_lock);
1908 md->disk->private_data = NULL;
1909 spin_unlock(&_minor_lock);
1910 del_gendisk(md->disk);
1911 put_disk(md->disk);
1912 }
1913
1914 if (md->queue)
1915 blk_cleanup_queue(md->queue);
1916
1917 cleanup_srcu_struct(&md->io_barrier);
1918
1919 if (md->bdev) {
1920 bdput(md->bdev);
1921 md->bdev = NULL;
1922 }
1923
1924 mutex_destroy(&md->suspend_lock);
1925 mutex_destroy(&md->type_lock);
1926 mutex_destroy(&md->table_devices_lock);
1927
1928 dm_mq_cleanup_mapped_device(md);
1929}
1930
/*
 * Allocate and initialise a blank device with a given minor.
 */
1934static struct mapped_device *alloc_dev(int minor)
1935{
1936 int r, numa_node_id = dm_get_numa_node();
1937 struct dax_device *dax_dev = NULL;
1938 struct mapped_device *md;
1939 void *old_md;
1940
1941 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1942 if (!md) {
1943 DMWARN("unable to allocate device, out of memory.");
1944 return NULL;
1945 }
1946
1947 if (!try_module_get(THIS_MODULE))
1948 goto bad_module_get;
1949
1950
1951 if (minor == DM_ANY_MINOR)
1952 r = next_free_minor(&minor);
1953 else
1954 r = specific_minor(minor);
1955 if (r < 0)
1956 goto bad_minor;
1957
1958 r = init_srcu_struct(&md->io_barrier);
1959 if (r < 0)
1960 goto bad_io_barrier;
1961
1962 md->numa_node_id = numa_node_id;
1963 md->init_tio_pdu = false;
1964 md->type = DM_TYPE_NONE;
1965 mutex_init(&md->suspend_lock);
1966 mutex_init(&md->type_lock);
1967 mutex_init(&md->table_devices_lock);
1968 spin_lock_init(&md->deferred_lock);
1969 atomic_set(&md->holders, 1);
1970 atomic_set(&md->open_count, 0);
1971 atomic_set(&md->event_nr, 0);
1972 atomic_set(&md->uevent_seq, 0);
1973 INIT_LIST_HEAD(&md->uevent_list);
1974 INIT_LIST_HEAD(&md->table_devices);
1975 spin_lock_init(&md->uevent_lock);
1976
1977 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1978 if (!md->queue)
1979 goto bad;
1980 md->queue->queuedata = md;
1981 md->queue->backing_dev_info->congested_data = md;
1982
1983 md->disk = alloc_disk_node(1, md->numa_node_id);
1984 if (!md->disk)
1985 goto bad;
1986
1987 init_waitqueue_head(&md->wait);
1988 INIT_WORK(&md->work, dm_wq_work);
1989 init_waitqueue_head(&md->eventq);
1990 init_completion(&md->kobj_holder.completion);
1991
1992 md->disk->major = _major;
1993 md->disk->first_minor = minor;
1994 md->disk->fops = &dm_blk_dops;
1995 md->disk->queue = md->queue;
1996 md->disk->private_data = md;
1997 sprintf(md->disk->disk_name, "dm-%d", minor);
1998
1999 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
2000 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
2001 if (!dax_dev)
2002 goto bad;
2003 }
2004 md->dax_dev = dax_dev;
2005
2006 add_disk_no_queue_reg(md->disk);
2007 format_dev_t(md->name, MKDEV(_major, minor));
2008
2009 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2010 if (!md->wq)
2011 goto bad;
2012
2013 md->bdev = bdget_disk(md->disk, 0);
2014 if (!md->bdev)
2015 goto bad;
2016
2017 dm_stats_init(&md->stats);
2018
2019
2020 spin_lock(&_minor_lock);
2021 old_md = idr_replace(&_minor_idr, md, minor);
2022 spin_unlock(&_minor_lock);
2023
2024 BUG_ON(old_md != MINOR_ALLOCED);
2025
2026 return md;
2027
2028bad:
2029 cleanup_mapped_device(md);
2030bad_io_barrier:
2031 free_minor(minor);
2032bad_minor:
2033 module_put(THIS_MODULE);
2034bad_module_get:
2035 kvfree(md);
2036 return NULL;
2037}
2038
2039static void unlock_fs(struct mapped_device *md);
2040
2041static void free_dev(struct mapped_device *md)
2042{
2043 int minor = MINOR(disk_devt(md->disk));
2044
2045 unlock_fs(md);
2046
2047 cleanup_mapped_device(md);
2048
2049 free_table_devices(&md->table_devices);
2050 dm_stats_cleanup(&md->stats);
2051 free_minor(minor);
2052
2053 module_put(THIS_MODULE);
2054 kvfree(md);
2055}
2056
2057static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2058{
2059 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2060 int ret = 0;
2061
2062 if (dm_table_bio_based(t)) {
2063
2064
2065
2066
2067
2068 bioset_exit(&md->bs);
2069 bioset_exit(&md->io_bs);
2070
2071 } else if (bioset_initialized(&md->bs)) {
2072
2073
2074
2075
2076
2077
2078
2079
2080 goto out;
2081 }
2082
2083 BUG_ON(!p ||
2084 bioset_initialized(&md->bs) ||
2085 bioset_initialized(&md->io_bs));
2086
2087 ret = bioset_init_from_src(&md->bs, &p->bs);
2088 if (ret)
2089 goto out;
2090 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2091 if (ret)
2092 bioset_exit(&md->bs);
2093out:
2094
2095 dm_table_free_md_mempools(t);
2096 return ret;
2097}
2098
/*
 * Event callback registered via dm_table_event_callback(): deliver queued
 * uevents and wake up anyone waiting on md->eventq.
 */
2102static void event_callback(void *context)
2103{
2104 unsigned long flags;
2105 LIST_HEAD(uevents);
2106 struct mapped_device *md = (struct mapped_device *) context;
2107
2108 spin_lock_irqsave(&md->uevent_lock, flags);
2109 list_splice_init(&md->uevent_list, &uevents);
2110 spin_unlock_irqrestore(&md->uevent_lock, flags);
2111
2112 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2113
2114 atomic_inc(&md->event_nr);
2115 wake_up(&md->eventq);
2116 dm_issue_global_event();
2117}
2118
/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
2122static void __set_size(struct mapped_device *md, sector_t size)
2123{
2124 lockdep_assert_held(&md->suspend_lock);
2125
2126 set_capacity(md->disk, size);
2127
2128 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2129}
2130
/*
 * Returns old map, which caller must destroy.
 */
2134static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2135 struct queue_limits *limits)
2136{
2137 struct dm_table *old_map;
2138 struct request_queue *q = md->queue;
2139 bool request_based = dm_table_request_based(t);
2140 sector_t size;
2141 int ret;
2142
2143 lockdep_assert_held(&md->suspend_lock);
2144
2145 size = dm_table_get_size(t);
2146
2147
2148
2149
2150 if (size != dm_get_size(md))
2151 memset(&md->geometry, 0, sizeof(md->geometry));
2152
2153 __set_size(md, size);
2154
2155 dm_table_event_callback(t, event_callback, md);
2156
2157
2158
2159
2160
2161
2162
2163
2164 if (request_based)
2165 dm_stop_queue(q);
2166
2167 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2168
2169
2170
2171
2172
2173
2174 md->immutable_target = dm_table_get_immutable_target(t);
2175 }
2176
2177 ret = __bind_mempools(md, t);
2178 if (ret) {
2179 old_map = ERR_PTR(ret);
2180 goto out;
2181 }
2182
2183 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2184 rcu_assign_pointer(md->map, (void *)t);
2185 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2186
2187 dm_table_set_restrictions(t, q, limits);
2188 if (old_map)
2189 dm_sync_table(md);
2190
2191out:
2192 return old_map;
2193}
2194
/*
 * Returns unbound table for the caller to free.
 */
2198static struct dm_table *__unbind(struct mapped_device *md)
2199{
2200 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2201
2202 if (!map)
2203 return NULL;
2204
2205 dm_table_event_callback(map, NULL, NULL);
2206 RCU_INIT_POINTER(md->map, NULL);
2207 dm_sync_table(md);
2208
2209 return map;
2210}
2211
/*
 * Constructor for a new device.
 */
2215int dm_create(int minor, struct mapped_device **result)
2216{
2217 int r;
2218 struct mapped_device *md;
2219
2220 md = alloc_dev(minor);
2221 if (!md)
2222 return -ENXIO;
2223
2224 r = dm_sysfs_init(md);
2225 if (r) {
2226 free_dev(md);
2227 return r;
2228 }
2229
2230 *result = md;
2231 return 0;
2232}
2233
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
2238void dm_lock_md_type(struct mapped_device *md)
2239{
2240 mutex_lock(&md->type_lock);
2241}
2242
2243void dm_unlock_md_type(struct mapped_device *md)
2244{
2245 mutex_unlock(&md->type_lock);
2246}
2247
2248void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2249{
2250 BUG_ON(!mutex_is_locked(&md->type_lock));
2251 md->type = type;
2252}
2253
2254enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2255{
2256 return md->type;
2257}
2258
2259struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2260{
2261 return md->immutable_target_type;
2262}
2263
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2268struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2269{
2270 BUG_ON(!atomic_read(&md->holders));
2271 return &md->queue->limits;
2272}
2273EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2274
/*
 * Setup the DM device's queue based on md's type.
 */
2278int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2279{
2280 int r;
2281 struct queue_limits limits;
2282 enum dm_queue_mode type = dm_get_md_type(md);
2283
2284 switch (type) {
2285 case DM_TYPE_REQUEST_BASED:
2286 r = dm_mq_init_request_queue(md, t);
2287 if (r) {
2288 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2289 return r;
2290 }
2291 break;
2292 case DM_TYPE_BIO_BASED:
2293 case DM_TYPE_DAX_BIO_BASED:
2294 case DM_TYPE_NVME_BIO_BASED:
2295 dm_init_normal_md_queue(md);
2296 blk_queue_make_request(md->queue, dm_make_request);
2297 break;
2298 case DM_TYPE_NONE:
2299 WARN_ON_ONCE(true);
2300 break;
2301 }
2302
2303 r = dm_calculate_queue_limits(t, &limits);
2304 if (r) {
2305 DMERR("Cannot calculate initial queue limits");
2306 return r;
2307 }
2308 dm_table_set_restrictions(t, md->queue, &limits);
2309 blk_register_queue(md->disk);
2310
2311 return 0;
2312}
2313
2314struct mapped_device *dm_get_md(dev_t dev)
2315{
2316 struct mapped_device *md;
2317 unsigned minor = MINOR(dev);
2318
2319 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2320 return NULL;
2321
2322 spin_lock(&_minor_lock);
2323
2324 md = idr_find(&_minor_idr, minor);
2325 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2326 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2327 md = NULL;
2328 goto out;
2329 }
2330 dm_get(md);
2331out:
2332 spin_unlock(&_minor_lock);
2333
2334 return md;
2335}
2336EXPORT_SYMBOL_GPL(dm_get_md);
2337
2338void *dm_get_mdptr(struct mapped_device *md)
2339{
2340 return md->interface_ptr;
2341}
2342
2343void dm_set_mdptr(struct mapped_device *md, void *ptr)
2344{
2345 md->interface_ptr = ptr;
2346}
2347
2348void dm_get(struct mapped_device *md)
2349{
2350 atomic_inc(&md->holders);
2351 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2352}
2353
2354int dm_hold(struct mapped_device *md)
2355{
2356 spin_lock(&_minor_lock);
2357 if (test_bit(DMF_FREEING, &md->flags)) {
2358 spin_unlock(&_minor_lock);
2359 return -EBUSY;
2360 }
2361 dm_get(md);
2362 spin_unlock(&_minor_lock);
2363 return 0;
2364}
2365EXPORT_SYMBOL_GPL(dm_hold);
2366
2367const char *dm_device_name(struct mapped_device *md)
2368{
2369 return md->name;
2370}
2371EXPORT_SYMBOL_GPL(dm_device_name);
2372
2373static void __dm_destroy(struct mapped_device *md, bool wait)
2374{
2375 struct dm_table *map;
2376 int srcu_idx;
2377
2378 might_sleep();
2379
2380 spin_lock(&_minor_lock);
2381 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2382 set_bit(DMF_FREEING, &md->flags);
2383 spin_unlock(&_minor_lock);
2384
2385 blk_set_queue_dying(md->queue);
2386
2387
2388
2389
2390
2391 mutex_lock(&md->suspend_lock);
2392 map = dm_get_live_table(md, &srcu_idx);
2393 if (!dm_suspended_md(md)) {
2394 dm_table_presuspend_targets(map);
2395 dm_table_postsuspend_targets(map);
2396 }
2397
2398 dm_put_live_table(md, srcu_idx);
2399 mutex_unlock(&md->suspend_lock);
2400
2401
2402
2403
2404
2405
2406
2407 if (wait)
2408 while (atomic_read(&md->holders))
2409 msleep(1);
2410 else if (atomic_read(&md->holders))
2411 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2412 dm_device_name(md), atomic_read(&md->holders));
2413
2414 dm_sysfs_exit(md);
2415 dm_table_destroy(__unbind(md));
2416 free_dev(md);
2417}
2418
2419void dm_destroy(struct mapped_device *md)
2420{
2421 __dm_destroy(md, true);
2422}
2423
2424void dm_destroy_immediate(struct mapped_device *md)
2425{
2426 __dm_destroy(md, false);
2427}
2428
2429void dm_put(struct mapped_device *md)
2430{
2431 atomic_dec(&md->holders);
2432}
2433EXPORT_SYMBOL_GPL(dm_put);
2434
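/*
 * Sleep on md->wait until no bios or requests remain in flight, or until
 * a signal arrives when task_state is interruptible.
 */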
2435static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2436{
2437 int r = 0;
2438 DEFINE_WAIT(wait);
2439
2440 while (1) {
2441 prepare_to_wait(&md->wait, &wait, task_state);
2442
2443 if (!md_in_flight(md))
2444 break;
2445
2446 if (signal_pending_state(task_state, current)) {
2447 r = -EINTR;
2448 break;
2449 }
2450
2451 io_schedule();
2452 }
2453 finish_wait(&md->wait, &wait);
2454
2455 return r;
2456}
2457
/*
 * Process the deferred bios.
 */
2461static void dm_wq_work(struct work_struct *work)
2462{
2463 struct mapped_device *md = container_of(work, struct mapped_device,
2464 work);
2465 struct bio *c;
2466 int srcu_idx;
2467 struct dm_table *map;
2468
2469 map = dm_get_live_table(md, &srcu_idx);
2470
2471 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2472 spin_lock_irq(&md->deferred_lock);
2473 c = bio_list_pop(&md->deferred);
2474 spin_unlock_irq(&md->deferred_lock);
2475
2476 if (!c)
2477 break;
2478
2479 if (dm_request_based(md))
2480 (void) generic_make_request(c);
2481 else
2482 (void) dm_process_bio(md, map, c);
2483 }
2484
2485 dm_put_live_table(md, srcu_idx);
2486}
2487
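/*
 * Re-enable bio submission and kick the workqueue so deferred bios get
 * reissued; the memory barrier makes the cleared flag visible before the
 * queued work runs.
 */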
2488static void dm_queue_flush(struct mapped_device *md)
2489{
2490 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2491 smp_mb__after_atomic();
2492 queue_work(md->wq, &md->work);
2493}
2494
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2498struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2499{
2500 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2501 struct queue_limits limits;
2502 int r;
2503
2504 mutex_lock(&md->suspend_lock);
2505
2506
2507 if (!dm_suspended_md(md))
2508 goto out;
2509
2510
2511
2512
2513
2514
2515
2516 if (dm_table_has_no_data_devices(table)) {
2517 live_map = dm_get_live_table_fast(md);
2518 if (live_map)
2519 limits = md->queue->limits;
2520 dm_put_live_table_fast(md);
2521 }
2522
2523 if (!live_map) {
2524 r = dm_calculate_queue_limits(table, &limits);
2525 if (r) {
2526 map = ERR_PTR(r);
2527 goto out;
2528 }
2529 }
2530
2531 map = __bind(md, table, &limits);
2532 dm_issue_global_event();
2533
2534out:
2535 mutex_unlock(&md->suspend_lock);
2536 return map;
2537}
2538
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
2543static int lock_fs(struct mapped_device *md)
2544{
2545 int r;
2546
2547 WARN_ON(md->frozen_sb);
2548
2549 md->frozen_sb = freeze_bdev(md->bdev);
2550 if (IS_ERR(md->frozen_sb)) {
2551 r = PTR_ERR(md->frozen_sb);
2552 md->frozen_sb = NULL;
2553 return r;
2554 }
2555
2556 set_bit(DMF_FROZEN, &md->flags);
2557
2558 return 0;
2559}
2560
2561static void unlock_fs(struct mapped_device *md)
2562{
2563 if (!test_bit(DMF_FROZEN, &md->flags))
2564 return;
2565
2566 thaw_bdev(md->bdev, md->frozen_sb);
2567 md->frozen_sb = NULL;
2568 clear_bit(DMF_FROZEN, &md->flags);
2569}
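
/*
 * lock_fs() uses freeze_bdev() to flush dirty data and quiesce any
 * filesystem mounted on the device; DMF_FROZEN records whether a matching
 * thaw_bdev() is needed, so unlock_fs() is safe to call unconditionally.
 */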

/*
 * If __dm_suspend() returns 0 the device is completely quiescent: there is
 * no request-processing activity and all new bios are being added to the
 * md->deferred list.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		pr_debug("%s: suspending with flush\n", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no process is submitting bios to the
	 * target drivers.  Setting DMF_BLOCK_IO_FOR_SUSPEND sends new bios to
	 * md->deferred, synchronize_srcu() waits out anyone already inside
	 * the live-table SRCU read section, and flush_workqueue() below
	 * quiesces dm_wq_work.
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request
	 * routines.  Wait for all outstanding I/O to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
	}

	return r;
}
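
/*
 * __dm_suspend() in brief:
 *   1. Optionally freeze the filesystem on the device (lock_fs).
 *   2. Gate new bios via DMF_BLOCK_IO_FOR_SUSPEND and an SRCU barrier,
 *      and stop the request queue for request-based devices.
 *   3. Flush md->wq and wait for in-flight I/O to drain.
 * On failure the steps are unwound so the device keeps running.
 */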

/*
 * We need to be able to change a mapping table under a mounted filesystem.
 * For example we might want to move some data in the background.  Before
 * the table can be swapped with dm_swap_table(), dm_suspend() must be
 * called to flush any in-flight bios and ensure that any further I/O gets
 * deferred.
 *
 * Suspend mechanism for request-based dm:
 *
 *   1. Flush all I/O via lock_fs() if needed.
 *   2. Stop dispatching any I/O by stopping the request_queue.
 *   3. Wait for all in-flight I/O to complete or be requeued.
 *
 * To abort suspend, restart the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
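
/*
 * Suspend flags, for reference: DM_SUSPEND_LOCKFS_FLAG asks __dm_suspend()
 * to freeze the filesystem first, while DM_SUSPEND_NOFLUSH_FLAG skips the
 * flush and lets targets queue I/O for the duration (noflush suspend).
 */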

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/O must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm queues the deferred I/O in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}
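
/*
 * dm_resume() mirrors dm_suspend(): it requires the device to be in the
 * user-suspended state, waits out any internal suspend, resumes the
 * targets, restarts the queue, replays deferred bios and thaws the
 * filesystem before clearing DMF_SUSPENDED.
 */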

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		/* already suspended from userspace: just record the nesting */
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Use TASK_UNINTERRUPTIBLE: internal suspend is only issued as
	 * NOFLUSH from the kernel and is not expected to be interrupted
	 * by signals.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* still suspended from userspace */

	/*
	 * NOTE: existing callers don't need dm_table_resume_targets()
	 * (which may fail), so pass a NULL map to __dm_resume().
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
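
/*
 * Internal suspend nests: each dm_internal_suspend_noflush() must be paired
 * with a dm_internal_resume(), and only the outermost pair actually
 * suspends/resumes the device.  Illustrative pattern (sketch only):
 *
 *	dm_internal_suspend_noflush(md);
 *	// ... briefly operate on the device while no bios are in flight ...
 *	dm_internal_resume(md);
 */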

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock, which
 * prevents interaction with userspace-driven suspend.  Note that
 * dm_internal_suspend_fast() deliberately keeps suspend_lock held (even on
 * the early return) until the paired dm_internal_resume_fast() drops it.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*
 * Event notification.
 */
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);

	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
		 DM_COOKIE_ENV_VAR_NAME, cookie);
	return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				  action, envp);
}
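
/*
 * When a cookie is supplied the uevent carries DM_COOKIE=<value> in its
 * environment, e.g. "DM_COOKIE=1234"; udev rules and libdevmapper use it
 * to match the uevent back to the ioctl that triggered it.
 */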

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}
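
/*
 * dm_get_from_kobject() takes a holder reference under _minor_lock and
 * refuses devices that are being freed or deleted; on success the caller
 * is responsible for the matching dm_put().
 */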

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
	case DM_TYPE_NVME_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is handled via the blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}
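
/*
 * Bio-based front_pad layout, roughly: each bio allocated from pools->io_bs
 * is preceded by a struct dm_io (which embeds the first struct
 * dm_target_io), and each bio from pools->bs by a bare struct dm_target_io;
 * per_io_data_size bytes of per-target data sit in front of those
 * structures, rounded up so they stay properly aligned.
 */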

struct dm_pr {
	u64 old_key;
	u64 new_key;
	u32 flags;
	bool fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
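
/*
 * dm_call_pr() is the common helper for persistent-reservation ioctls: it
 * looks up the single target of the live table and iterates the callback
 * over that target's underlying devices, returning -ENOTTY for empty or
 * multi-target tables and -EINVAL if the target cannot iterate devices.
 */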

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key = old_key,
		.new_key = new_key,
		.flags = flags,
		.fail_early = true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/*
		 * Registration failed part-way: unregister the key that was
		 * just registered so all paths end up in a consistent state.
		 */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};
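
/*
 * Unlike register, the remaining PR operations act on a single underlying
 * device: dm_prepare_ioctl() resolves the (single-target) mapped device to
 * that device's block_device, the matching pr_ops callback is invoked on
 * it if present, and dm_unprepare_ioctl() drops the table reference.
 */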

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.dax_supported = dm_dax_supported,
	.copy_from_iter = dm_dax_copy_from_iter,
	.copy_to_iter = dm_dax_copy_to_iter,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");