/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */
#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/keyslot-manager.h>

#define DM_MSG_PREFIX "core"

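/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */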
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

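/*
 * One of these is allocated (on-stack) per original bio.
 */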
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};

#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
	(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	if (!tio->inside_dm_io)
		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);

#define MINOR_ALLOCED ((void *)-1)

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
static int get_swap_bios(void)
{
	int latch = READ_ONCE(swap_bios);
	if (unlikely(latch <= 0))
		latch = DEFAULT_SWAP_BIOS;
	return latch;
}

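/*
 * For mempools pre-allocation at the table loading time.
 */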
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

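/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */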
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r;

	r = dm_uevent_init();
	if (r)
		return r;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);
	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	idr_destroy(&_minor_idr);
}

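/*
 * Block device functions
 */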
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

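/*
 * Guarantees nothing is using the device
 * before it's deleted.
 */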
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
			    struct block_device **bdev)
{
	struct dm_target *tgt;
	struct dm_table *map;
	int r;

retry:
	r = -ENOTTY;
	map = dm_get_live_table(md, srcu_idx);
	if (!map || !dm_table_get_size(map))
		return r;

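	/* We only support devices that have a single target */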
	if (dm_table_get_num_targets(map) != 1)
		return r;

	tgt = dm_table_get_target(map, 0);
	if (!tgt->type->prepare_ioctl)
		return r;

	if (dm_suspended_md(md))
		return -EAGAIN;

	r = tgt->type->prepare_ioctl(tgt, bdev);
	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		dm_put_live_table(md, *srcu_idx);
		msleep(10);
		goto retry;
	}

	return r;
}

static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
{
	dm_put_live_table(md, srcu_idx);
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	if (r > 0) {
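		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */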
		if (!capable(CAP_SYS_RAWIO)) {
			DMDEBUG_LIMIT("%s: sending ioctl %x to DM device without required privilege.",
				      current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	if (!bdev->bd_disk->fops->ioctl)
		r = -ENOTTY;
	else
		r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

u64 dm_start_time_ns_from_clone(struct bio *bio)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;

	return jiffies_to_nsecs(io->start_time);
}
EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;

	io->start_time = bio_start_io_acct(bio);
	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    false, 0, &io->stats_aux);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;
	unsigned long duration = jiffies - io->start_time;

	bio_end_io_acct(bio, io->start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    true, duration, &io->stats_aux);

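	/* nudge anyone waiting on suspend queue */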
	if (unlikely(wq_has_sleeper(&md->wait)))
		wake_up(&md->wait);
}

static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
	if (!clone)
		return NULL;

	tio = container_of(clone, struct dm_target_io, clone);
	tio->inside_dm_io = true;
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = 0;
	atomic_set(&io->io_count, 1);
	io->orig_bio = bio;
	io->md = md;
	spin_lock_init(&io->endio_lock);

	start_io_acct(io);

	return io;
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	bio_put(&io->tio.clone);
}

static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
				      unsigned target_bio_nr, gfp_t gfp_mask)
{
	struct dm_target_io *tio;

	if (!ci->io->tio.io) {
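		/* the dm_target_io embedded in ci->io is available */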
		tio = &ci->io->tio;
	} else {
		struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
		if (!clone)
			return NULL;

		tio = container_of(clone, struct dm_target_io, clone);
		tio->inside_dm_io = false;
	}

	tio->magic = DM_TIO_MAGIC;
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;

	return tio;
}

static void free_tio(struct dm_target_io *tio)
{
	if (tio->inside_dm_io)
		return;
	bio_put(&tio->clone);
}

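/*
 * Add the bio to the list of deferred io.
 */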
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

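/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished using the md->map.
 */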
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

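/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */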
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

static char *_dm_claim_ptr = "I belong to device-mapper";

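/*
 * Open a table device so we can use it as a map destination.
 */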
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	struct block_device *bdev;
	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	return 0;
}

static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	td->dm_dev.bdev = NULL;
	td->dm_dev.dax_dev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode)
{
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result)
{
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		refcount_set(&td->count, 1);
		list_add(&td->list, &md->table_devices);
	} else {
		refcount_inc(&td->count);
	}
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (refcount_dec_and_test(&td->count)) {
		close_table_device(td, md);
		list_del(&td->list);
		kfree(td);
	}
	mutex_unlock(&md->table_devices_lock);
}

static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, refcount_read(&td->count));
		kfree(td);
	}
}

int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

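/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */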
void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
{
	unsigned long flags;
	blk_status_t io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
			io->status = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		bio = io->orig_bio;
		if (io->status == BLK_STS_DM_REQUEUE) {
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md) &&
			    !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
				bio_list_add_head(&md->deferred, bio);
			} else {
				io->status = BLK_STS_IOERR;
			}
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->status;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == BLK_STS_DM_REQUEUE)
			return;

		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
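			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */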
			bio->bi_opf &= ~REQ_PREFLUSH;
			queue_io(md, bio);
		} else {
			if (io_error)
				bio->bi_status = io_error;
			bio_endio(bio);
		}
	}
}

void disable_discard(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	limits->max_discard_sectors = 0;
	blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
}

void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	limits->max_write_same_sectors = 0;
}

void disable_write_zeroes(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	limits->max_write_zeroes_sectors = 0;
}

static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
{
	return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
}

static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;

	if (unlikely(error == BLK_STS_TARGET)) {
		if (bio_op(bio) == REQ_OP_DISCARD &&
		    !q->limits.max_discard_sectors)
			disable_discard(md);
		else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
			 !q->limits.max_write_same_sectors)
			disable_write_same(md);
		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
			 !q->limits.max_write_zeroes_sectors)
			disable_write_zeroes(md);
	}

	if (blk_queue_is_zoned(q))
		dm_zone_endio(io, bio);

	if (endio) {
		int r = endio(tio->ti, bio, &error);
		switch (r) {
		case DM_ENDIO_REQUEUE:
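			/*
			 * Requeuing writes to a sequential zone of a zoned
			 * target will break the sequential write pattern:
			 * fail such IO.
			 */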
			if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
				error = BLK_STS_IOERR;
			else
				error = BLK_STS_DM_REQUEUE;
			fallthrough;
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			return;
		default:
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	if (unlikely(swap_bios_limit(tio->ti, bio))) {
		struct mapped_device *md = io->md;
		up(&md->swap_bios_semaphore);
	}

	free_tio(tio);
	dm_io_dec_pending(io, error);
}

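/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */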
static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
						  sector_t target_offset)
{
	return ti->len - target_offset;
}

static sector_t max_io_len(struct dm_target *ti, sector_t sector)
{
	sector_t target_offset = dm_target_offset(ti, sector);
	sector_t len = max_io_len_target_boundary(ti, target_offset);
	sector_t max_len;

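	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here, so pass in
	 *   ti->max_io_len to override stacked chunk_sectors.
	 */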
	if (ti->max_io_len) {
		max_len = blk_max_size_offset(ti->table->md->queue,
					      target_offset, ti->max_io_len);
		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	ti->max_io_len = (uint32_t) len;

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
	__acquires(md->io_barrier)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!ti)
		return NULL;

	return ti;
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(ti, sector) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);

out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
			     int blocksize, sector_t start, sector_t len)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	struct dm_table *map;
	bool ret = false;
	int srcu_idx;

	map = dm_get_live_table(md, &srcu_idx);
	if (!map)
		goto out;

	ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);

out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				    void *addr, size_t bytes, struct iov_iter *i)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long ret = 0;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->dax_copy_from_iter) {
		ret = copy_from_iter(addr, bytes, i);
		goto out;
	}
	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				  void *addr, size_t bytes, struct iov_iter *i)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long ret = 0;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->dax_copy_to_iter) {
		ret = copy_to_iter(addr, bytes, i);
		goto out;
	}
	ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
				  size_t nr_pages)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	int ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (WARN_ON(!ti->type->dax_zero_page_range))
		goto out;
	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

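/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH, zone management operations
 * and REQ_OP_ZONE_APPEND.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 */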
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;

	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
	BUG_ON(op_is_zone_mgmt(bio_op(bio)));
	BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);

	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);

static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
{
	mutex_lock(&md->swap_bios_lock);
	while (latch < md->swap_bios) {
		cond_resched();
		down(&md->swap_bios_semaphore);
		md->swap_bios--;
	}
	while (latch > md->swap_bios) {
		cond_resched();
		up(&md->swap_bios_semaphore);
		md->swap_bios++;
	}
	mutex_unlock(&md->swap_bios_lock);
}

static blk_qc_t __map_bio(struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct bio *clone = &tio->clone;
	struct dm_io *io = tio->io;
	struct dm_target *ti = tio->ti;
	blk_qc_t ret = BLK_QC_T_NONE;

	clone->bi_end_io = clone_endio;

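	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */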
	dm_io_inc_pending(io);
	sector = clone->bi_iter.bi_sector;

	if (unlikely(swap_bios_limit(ti, clone))) {
		struct mapped_device *md = io->md;
		int latch = get_swap_bios();
		if (unlikely(latch != md->swap_bios))
			__set_swap_bios_limit(md, latch);
		down(&md->swap_bios_semaphore);
	}

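	/*
	 * Check if the IO needs a special mapping due to zone append emulation
	 * on zoned target. In this case, dm_zone_map_bio() calls the target
	 * map operation.
	 */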
	if (dm_emulate_zone_append(io->md))
		r = dm_zone_map_bio(tio);
	else
		r = ti->type->map(ti, clone);

	switch (r) {
	case DM_MAPIO_SUBMITTED:
		break;
	case DM_MAPIO_REMAPPED:
		trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
		ret = submit_bio_noacct(clone);
		break;
	case DM_MAPIO_KILL:
		if (unlikely(swap_bios_limit(ti, clone))) {
			struct mapped_device *md = io->md;
			up(&md->swap_bios_semaphore);
		}
		free_tio(tio);
		dm_io_dec_pending(io, BLK_STS_IOERR);
		break;
	case DM_MAPIO_REQUEUE:
		if (unlikely(swap_bios_limit(ti, clone))) {
			struct mapped_device *md = io->md;
			up(&md->swap_bios_semaphore);
		}
		free_tio(tio);
		dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
		break;
	default:
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}

	return ret;
}

static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}

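/*
 * Creates a bio that consists of range of complete bvecs.
 */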
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
		     sector_t sector, unsigned len)
{
	struct bio *clone = &tio->clone;
	int r;

	__bio_clone_fast(clone, bio);

	r = bio_crypt_clone(clone, bio, GFP_NOIO);
	if (r < 0)
		return r;

	if (bio_integrity(bio)) {
		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
			     !dm_target_passes_integrity(tio->ti->type))) {
			DMWARN("%s: the target %s doesn't support integrity data.",
			       dm_device_name(tio->io->md),
			       tio->ti->type->name);
			return -EIO;
		}

		r = bio_integrity_clone(clone, bio, GFP_NOIO);
		if (r < 0)
			return r;
	}

	bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
	clone->bi_iter.bi_size = to_bytes(len);

	if (bio_integrity(bio))
		bio_integrity_trim(clone);

	return 0;
}

static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
				struct dm_target *ti, unsigned num_bios)
{
	struct dm_target_io *tio;
	int try;

	if (!num_bios)
		return;

	if (num_bios == 1) {
		tio = alloc_tio(ci, ti, 0, GFP_NOIO);
		bio_list_add(blist, &tio->clone);
		return;
	}

	for (try = 0; try < 2; try++) {
		int bio_nr;
		struct bio *bio;

		if (try)
			mutex_lock(&ci->io->md->table_devices_lock);
		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
			tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
			if (!tio)
				break;

			bio_list_add(blist, &tio->clone);
		}
		if (try)
			mutex_unlock(&ci->io->md->table_devices_lock);
		if (bio_nr == num_bios)
			return;

		while ((bio = bio_list_pop(blist))) {
			tio = container_of(bio, struct dm_target_io, clone);
			free_tio(tio);
		}
	}
}

static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
					   struct dm_target_io *tio, unsigned *len)
{
	struct bio *clone = &tio->clone;

	tio->len_ptr = len;

	__bio_clone_fast(clone, ci->bio);
	if (len)
		bio_setup_sector(clone, ci->sector, *len);

	return __map_bio(tio);
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	struct bio_list blist = BIO_EMPTY_LIST;
	struct bio *bio;
	struct dm_target_io *tio;

	alloc_multiple_bios(&blist, ci, ti, num_bios);

	while ((bio = bio_list_pop(&blist))) {
		tio = container_of(bio, struct dm_target_io, clone);
		(void) __clone_and_map_simple_bio(ci, tio, len);
	}
}

static int __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;
	struct bio flush_bio;

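	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */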
	bio_init(&flush_bio, NULL, 0);
	flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
	bio_set_dev(&flush_bio, ci->io->md->disk->part0);

	ci->bio = &flush_bio;
	ci->sector_count = 0;

	BUG_ON(bio_has_data(ci->bio));
	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	bio_uninit(ci->bio);
	return 0;
}

static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				    sector_t sector, unsigned *len)
{
	struct bio *bio = ci->bio;
	struct dm_target_io *tio;
	int r;

	tio = alloc_tio(ci, ti, 0, GFP_NOIO);
	tio->len_ptr = len;
	r = clone_bio(tio, bio, sector, *len);
	if (r < 0) {
		free_tio(tio);
		return r;
	}
	(void) __map_bio(tio);

	return 0;
}

static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
				       unsigned num_bios)
{
	unsigned len;

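	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */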
	if (!num_bios)
		return -EOPNOTSUPP;

	len = min_t(sector_t, ci->sector_count,
		    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));

	__send_duplicate_bios(ci, ti, num_bios, &len);

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

static bool is_abnormal_io(struct bio *bio)
{
	bool r = false;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
	case REQ_OP_SECURE_ERASE:
	case REQ_OP_WRITE_SAME:
	case REQ_OP_WRITE_ZEROES:
		r = true;
		break;
	}

	return r;
}

static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
				  int *result)
{
	struct bio *bio = ci->bio;
	unsigned num_bios = 0;

	switch (bio_op(bio)) {
	case REQ_OP_DISCARD:
		num_bios = ti->num_discard_bios;
		break;
	case REQ_OP_SECURE_ERASE:
		num_bios = ti->num_secure_erase_bios;
		break;
	case REQ_OP_WRITE_SAME:
		num_bios = ti->num_write_same_bios;
		break;
	case REQ_OP_WRITE_ZEROES:
		num_bios = ti->num_write_zeroes_bios;
		break;
	default:
		return false;
	}

	*result = __send_changing_extent_only(ci, ti, num_bios);
	return true;
}

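/*
 * Select the correct strategy for processing a non-flush bio.
 */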
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct dm_target *ti;
	unsigned len;
	int r;

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!ti)
		return -EIO;

	if (__process_abnormal_io(ci, ti, &r))
		return r;

	len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);

	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
	if (r < 0)
		return r;

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
			    struct dm_table *map, struct bio *bio)
{
	ci->map = map;
	ci->io = alloc_io(md, bio);
	ci->sector = bio->bi_iter.bi_sector;
}

#define __dm_part_stat_sub(part, field, subnd) \
	(part_stat_get(part, field) -= (subnd))

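/*
 * Entry point to split a bio into clones and submit them to the targets.
 */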
static blk_qc_t __split_and_process_bio(struct mapped_device *md,
					struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	blk_qc_t ret = BLK_QC_T_NONE;
	int error = 0;

	init_clone_info(&ci, md, map, bio);

	if (bio->bi_opf & REQ_PREFLUSH) {
		error = __send_empty_flush(&ci);
	} else if (op_is_zone_mgmt(bio_op(bio))) {
		ci.bio = bio;
		ci.sector_count = 0;
		error = __split_and_process_non_flush(&ci);
	} else {
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		error = __split_and_process_non_flush(&ci);
		if (ci.sector_count && !error) {
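			/*
			 * Remainder must be passed to submit_bio_noacct() so
			 * that it gets handled *after* bios already submitted
			 * have been completely processed.
			 */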
			struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
						  GFP_NOIO, &md->queue->bio_split);
			ci.io->orig_bio = b;

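			/*
			 * Adjust IO stats for each split, otherwise upon queue
			 * reentry there will be redundant IO accounting.
			 */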
			part_stat_lock();
			__dm_part_stat_sub(dm_disk(md)->part0,
					   sectors[op_stat_group(bio_op(bio))], ci.sector_count);
			part_stat_unlock();

			bio_chain(b, bio);
			trace_block_split(b, bio->bi_iter.bi_sector);
			ret = submit_bio_noacct(bio);
		}
	}

	dm_io_dec_pending(ci.io, errno_to_blk_status(error));
	return ret;
}

static blk_qc_t dm_submit_bio(struct bio *bio)
{
	struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);
	if (unlikely(!map)) {
		DMERR_LIMIT("%s: mapping table unavailable, erroring io",
			    dm_device_name(md));
		bio_io_error(bio);
		goto out;
	}

	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		if (bio->bi_opf & REQ_NOWAIT)
			bio_wouldblock_error(bio);
		else if (bio->bi_opf & REQ_RAHEAD)
			bio_io_error(bio);
		else
			queue_io(md, bio);
		goto out;
	}

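	/*
	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
	 * otherwise associated queue_limits won't be imposed.
	 */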
	if (is_abnormal_io(bio))
		blk_queue_split(&bio);

	ret = __split_and_process_bio(md, map, bio);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

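/*
 * An IDR is used to keep track of allocated minor numbers.
 */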
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;
static const struct block_device_operations dm_rq_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
{
	dm_destroy_keyslot_manager(q->ksm);
}

#else /* CONFIG_BLK_INLINE_ENCRYPTION */

static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
{
}
#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */

static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	bioset_exit(&md->bs);
	bioset_exit(&md->io_bs);

	if (md->dax_dev) {
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	if (md->disk) {
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		del_gendisk(md->disk);
	}

	if (md->queue)
		dm_queue_destroy_keyslot_manager(md->queue);

	if (md->disk)
		blk_cleanup_disk(md->disk);

	cleanup_srcu_struct(&md->io_barrier);

	mutex_destroy(&md->suspend_lock);
	mutex_destroy(&md->type_lock);
	mutex_destroy(&md->table_devices_lock);
	mutex_destroy(&md->swap_bios_lock);

	dm_mq_cleanup_mapped_device(md);
	dm_cleanup_zoned_dev(md);
}

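/*
 * Allocate and initialise a blank device with a given minor.
 */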
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct mapped_device *md;
	void *old_md;

	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

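	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */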
	md->disk = blk_alloc_disk(md->numa_node_id);
	if (!md->disk)
		goto bad;
	md->queue = md->disk->queue;

	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);

	md->swap_bios = get_swap_bios();
	sema_init(&md->swap_bios_semaphore, md->swap_bios);
	mutex_init(&md->swap_bios_lock);

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->minors = 1;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
		md->dax_dev = alloc_dax(md, md->disk->disk_name,
					&dm_dax_ops, 0);
		if (IS_ERR(md->dax_dev))
			goto bad;
	}

	add_disk_no_queue_reg(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
	if (!md->wq)
		goto bad;

	dm_stats_init(&md->stats);

	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kvfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);
	free_minor(minor);

	module_put(THIS_MODULE);
	kvfree(md);
}

static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);
	int ret = 0;

	if (dm_table_bio_based(t)) {
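		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */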
		bioset_exit(&md->bs);
		bioset_exit(&md->io_bs);

	} else if (bioset_initialized(&md->bs)) {
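		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */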
		goto out;
	}

	BUG_ON(!p ||
	       bioset_initialized(&md->bs) ||
	       bioset_initialized(&md->io_bs));

	ret = bioset_init_from_src(&md->bs, &p->bs);
	if (ret)
		goto out;
	ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
	if (ret)
		bioset_exit(&md->bs);
out:
	dm_table_free_md_mempools(t);
	return ret;
}

static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
	dm_issue_global_event();
}

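/*
 * Bind a table to the device: returns the old map, which the caller
 * must destroy.
 */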
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	bool request_based = dm_table_request_based(t);
	sector_t size;
	int ret;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	if (!get_capacity(md->disk))
		set_capacity(md->disk, size);
	else
		set_capacity_and_notify(md->disk, size);

	dm_table_event_callback(t, event_callback, md);

	if (request_based)
		dm_stop_queue(q);

	if (request_based) {
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	ret = __bind_mempools(md, t);
	if (ret) {
		old_map = ERR_PTR(ret);
		goto out;
	}

	ret = dm_table_set_restrictions(t, q, limits);
	if (ret) {
		old_map = ERR_PTR(ret);
		goto out;
	}

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	if (old_map)
		dm_sync_table(md);

out:
	return old_map;
}

static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

int dm_create(int minor, struct mapped_device **result)
{
	int r;
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	r = dm_sysfs_init(md);
	if (r) {
		free_dev(md);
		return r;
	}

	*result = md;
	return 0;
}

void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

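/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */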
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
	int r;
	struct queue_limits limits;
	enum dm_queue_mode type = dm_get_md_type(md);

	switch (type) {
	case DM_TYPE_REQUEST_BASED:
		md->disk->fops = &dm_rq_blk_dops;
		r = dm_mq_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based dm mapped device");
			return r;
		}
		break;
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		break;
	case DM_TYPE_NONE:
		WARN_ON_ONCE(true);
		break;
	}

	r = dm_calculate_queue_limits(t, &limits);
	if (r) {
		DMERR("Cannot calculate initial queue limits");
		return r;
	}
	r = dm_table_set_restrictions(t, md->queue, &limits);
	if (r)
		return r;

	blk_register_queue(md->disk);

	return 0;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_set_queue_dying(md->queue);

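	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */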
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		set_bit(DMF_SUSPENDED, &md->flags);
		set_bit(DMF_POST_SUSPENDING, &md->flags);
		dm_table_postsuspend_targets(map);
	}

	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

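	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */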
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

static bool md_in_flight_bios(struct mapped_device *md)
{
	int cpu;
	struct block_device *part = dm_disk(md)->part0;
	long sum = 0;

	for_each_possible_cpu(cpu) {
		sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
		sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
	}

	return sum != 0;
}

static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (true) {
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!md_in_flight_bios(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	return r;
}

static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
{
	int r = 0;

	if (!queue_is_mq(md->queue))
		return dm_wait_for_bios_completion(md, task_state);

	while (true) {
		if (!blk_mq_queue_inflight(md->queue))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		msleep(5);
	}

	return r;
}

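/*
 * Process the deferred bios
 */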
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device, work);
	struct bio *bio;

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		bio = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!bio)
			break;

		submit_bio_noacct(bio);
	}
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}

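/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */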
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	if (!dm_suspended_md(md))
		goto out;

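	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */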
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);
	dm_issue_global_event();

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(test_bit(DMF_FROZEN, &md->flags));

	r = freeze_bdev(md->disk->part0);
	if (!r)
		set_bit(DMF_FROZEN, &md->flags);
	return r;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;
	thaw_bdev(md->disk->part0);
	clear_bit(DMF_FROZEN, &md->flags);
}

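/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 */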
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, unsigned int task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		DMDEBUG("%s: suspending with flush", dm_device_name(md));

	dm_table_presuspend_targets(map);

	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

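	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio from dm_submit_bio.
	 *
	 * To get all processes out of __split_and_process_bio in
	 * dm_submit_bio, we set DMF_BLOCK_IO_FOR_SUSPEND and quiesce the
	 * workqueue thread (dm_wq_work) with flush_workqueue(md->wq).
	 */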
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
	}

	return r;
}

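/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 *
 * Suspend mechanism in request-based dm:
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */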
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

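/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */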
2590static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2591{
2592 struct dm_table *map = NULL;
2593
2594 lockdep_assert_held(&md->suspend_lock);
2595
2596 if (md->internal_suspend_count++)
2597 return;
2598
2599 if (dm_suspended_md(md)) {
2600 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2601 return;
2602 }
2603
2604 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2605
2606
2607
2608
2609
2610
2611
2612 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2613 DMF_SUSPENDED_INTERNALLY);
2614
2615 set_bit(DMF_POST_SUSPENDING, &md->flags);
2616 dm_table_postsuspend_targets(map);
2617 clear_bit(DMF_POST_SUSPENDING, &md->flags);
2618}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* interposed userspace suspend remains in effect */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return; /* suspend_lock stays held until dm_internal_resume_fast() */

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
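
/*
 * Example (sketch): the fast variants must always be used as a pair, with
 * md->suspend_lock held across the whole window -- dm_internal_suspend_fast()
 * takes it and dm_internal_resume_fast() releases it:
 *
 *	dm_internal_suspend_fast(md);
 *	... briefly touch state that must not race with in-flight I/O ...
 *	dm_internal_resume_fast(md);
 */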

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	int r;
	unsigned noio_flag;
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	/*
	 * kobject_uevent() allocates memory; do it in noio context so the
	 * allocation cannot recurse into this (possibly suspended) device.
	 */
	noio_flag = memalloc_noio_save();

	if (!cookie)
		r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				       action, envp);
	}

	memalloc_noio_restore(noio_flag);

	return r;
}
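
/*
 * A non-zero cookie appears in the uevent environment as
 * "DM_COOKIE=<value>"; userspace (e.g. libdevmapper) uses it to match udev
 * processing of the event back to the ioctl that triggered it.
 */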

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
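
/*
 * Example (sketch): the DM_DEV_WAIT ioctl path uses the pair above --
 * userspace records dm_get_event_nr(), then blocks in dm_wait_event() with
 * that value; the call returns once the device's event counter has moved on
 * (or -ERESTARTSYS if interrupted by a signal).
 */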

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static int dm_post_suspending_md(struct mapped_device *md)
{
	return test_bit(DMF_POST_SUSPENDING, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_post_suspending(struct dm_target *ti)
{
	return dm_post_suspending_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_post_suspending);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
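
/*
 * Example (sketch): targets typically consult these exported accessors from
 * their map function when a path or resource is unavailable, e.g.:
 *
 *	if (no_path_available)
 *		return dm_noflush_suspending(ti) ? DM_MAPIO_REQUEUE
 *						 : DM_MAPIO_KILL;
 *
 * i.e. during a noflush suspend the bio is requeued and retried after
 * resume rather than failed outright.
 */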

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
		io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}
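
/*
 * Front-pad layout sketch for bio-based I/O: bioset_init() reserves
 * front_pad bytes immediately before each struct bio it hands out, so an
 * allocation from io_bs looks like
 *
 *	[ per_io_data_size (rounded) | struct dm_io ... tio.clone == struct bio ]
 *	                             ^------ DM_IO_BIO_OFFSET bytes -----------^
 *
 * which is why dm_per_bio_data() can recover the target's data with plain
 * pointer arithmetic from the bio, with no extra allocation per I/O.
 */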

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

/*
 * The remaining PR operations are routed to the single underlying device
 * via dm_prepare_ioctl().
 */
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};
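
/*
 * Example (sketch): userspace reaches dm_pr_ops through the generic
 * persistent-reservation ioctls from <linux/pr.h>, e.g. to register a key
 * on a dm device node:
 *
 *	struct pr_registration reg = { .old_key = 0, .new_key = 0x1234, .flags = 0 };
 *	ioctl(fd, IOC_PR_REGISTER, &reg);
 *
 * The block layer dispatches this to dm_pr_register() above, which fans the
 * registration out to every underlying path.
 */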

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

/* request-based dm has no .submit_bio: blk-mq drives the queue instead */
static const struct block_device_operations dm_rq_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.dax_supported = dm_dax_supported,
	.copy_from_iter = dm_dax_copy_from_iter,
	.copy_to_iter = dm_dax_copy_to_iter,
	.zero_page_range = dm_dax_zero_page_range,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
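
/*
 * Example (sketch): the parameters above can be set at load time or, for
 * those marked writable (S_IWUSR), changed at runtime through sysfs:
 *
 *	modprobe dm_mod reserved_bio_based_ios=32
 *	echo 4096 > /sys/module/dm_mod/parameters/swap_bios
 */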

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");