/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"
#include "dm-ima.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

/*
 * For REQ_POLLED fs bios, dm core links the mapped dm_io structures into a
 * list and reuses bio->bi_private as the list head; the original
 * ->bi_private is saved in dm_io->data and restored before the bio is ended.
 */
#define REQ_DM_POLL_LIST REQ_DRV

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
	bool submit_as_polled;
};

#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
#define DM_IO_BIO_OFFSET \
	(offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))

static inline struct dm_target_io *clone_to_tio(struct bio *clone)
{
	return container_of(clone, struct dm_target_io, clone);
}

void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
		return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
	return (char *)bio - DM_IO_BIO_OFFSET - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);

	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
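
/*
 * Example (hypothetical target code, not part of dm core): a target that set
 * ti->per_io_data_size = sizeof(struct my_per_bio) in its .ctr can reach its
 * per-bio area from the clone bio passed to .map:
 *
 *	struct my_per_bio { sector_t start; };
 *
 *	static int my_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct my_per_bio *pb = dm_per_bio_data(bio, sizeof(*pb));
 *
 *		pb->start = bio->bi_iter.bi_sector;
 *		...
 *	}
 *
 * dm_bio_from_per_bio_data(pb, sizeof(*pb)) maps the same area back to the
 * clone bio, which is how targets recover the bio in completion paths.
 */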

#define MINOR_ALLOCED ((void *)-1)

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

#define DEFAULT_SWAP_BIOS	(8 * 1048576 / PAGE_SIZE)
static int swap_bios = DEFAULT_SWAP_BIOS;
static int get_swap_bios(void)
{
	int latch = READ_ONCE(swap_bios);
	if (unlikely(latch <= 0))
		latch = DEFAULT_SWAP_BIOS;
	return latch;
}

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS	16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
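
/*
 * Sketch (assumption: the module parameter wiring lives outside this
 * excerpt): reserved_bio_based_ios is expected to be exposed as a writable
 * module parameter, e.g.
 *
 *	module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
 *	MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
 *
 * __dm_get_module_param() then clamps whatever was written: 0 falls back to
 * RESERVED_BIO_BASED_IOS (16), values above DM_RESERVED_MAX_IOS are capped,
 * and cmpxchg() writes the clamped value back.
 */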

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

207static int __init local_init(void)
208{
209 int r;
210
211 r = dm_uevent_init();
212 if (r)
213 return r;
214
215 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
216 if (!deferred_remove_workqueue) {
217 r = -ENOMEM;
218 goto out_uevent_exit;
219 }
220
221 _major = major;
222 r = register_blkdev(_major, _name);
223 if (r < 0)
224 goto out_free_workqueue;
225
226 if (!_major)
227 _major = r;
228
229 return 0;
230
231out_free_workqueue:
232 destroy_workqueue(deferred_remove_workqueue);
233out_uevent_exit:
234 dm_uevent_exit();
235
236 return r;
237}
238
239static void local_exit(void)
240{
241 flush_scheduled_work();
242 destroy_workqueue(deferred_remove_workqueue);
243
244 unregister_blkdev(_major, _name);
245 dm_uevent_exit();
246
247 _major = 0;
248
249 DMINFO("cleaned up");
250}
251
252static int (*_inits[])(void) __initdata = {
253 local_init,
254 dm_target_init,
255 dm_linear_init,
256 dm_stripe_init,
257 dm_io_init,
258 dm_kcopyd_init,
259 dm_interface_init,
260 dm_statistics_init,
261};
262
263static void (*_exits[])(void) = {
264 local_exit,
265 dm_target_exit,
266 dm_linear_exit,
267 dm_stripe_exit,
268 dm_io_exit,
269 dm_kcopyd_exit,
270 dm_interface_exit,
271 dm_statistics_exit,
272};
273
274static int __init dm_init(void)
275{
276 const int count = ARRAY_SIZE(_inits);
277 int r, i;
278
279#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
280 DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
281 " Duplicate IMA measurements will not be recorded in the IMA log.");
282#endif
283
284 for (i = 0; i < count; i++) {
285 r = _inits[i]();
286 if (r)
287 goto bad;
288 }
289
290 return 0;
291bad:
292 while (i--)
293 _exits[i]();
294
295 return r;
296}
297
298static void __exit dm_exit(void)
299{
300 int i = ARRAY_SIZE(_exits);
301
302 while (i--)
303 _exits[i]();

	/*
	 * Should be empty by this point.
	 */
308 idr_destroy(&_minor_idr);
309}

/*
 * Block device functions
 */
314int dm_deleting_md(struct mapped_device *md)
315{
316 return test_bit(DMF_DELETING, &md->flags);
317}
318
319static int dm_blk_open(struct block_device *bdev, fmode_t mode)
320{
321 struct mapped_device *md;
322
323 spin_lock(&_minor_lock);
324
325 md = bdev->bd_disk->private_data;
326 if (!md)
327 goto out;
328
329 if (test_bit(DMF_FREEING, &md->flags) ||
330 dm_deleting_md(md)) {
331 md = NULL;
332 goto out;
333 }
334
335 dm_get(md);
336 atomic_inc(&md->open_count);
337out:
338 spin_unlock(&_minor_lock);
339
340 return md ? 0 : -ENXIO;
341}
342
343static void dm_blk_close(struct gendisk *disk, fmode_t mode)
344{
345 struct mapped_device *md;
346
347 spin_lock(&_minor_lock);
348
349 md = disk->private_data;
350 if (WARN_ON(!md))
351 goto out;
352
353 if (atomic_dec_and_test(&md->open_count) &&
354 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
355 queue_work(deferred_remove_workqueue, &deferred_remove_work);
356
357 dm_put(md);
358out:
359 spin_unlock(&_minor_lock);
360}
361
362int dm_open_count(struct mapped_device *md)
363{
364 return atomic_read(&md->open_count);
365}

/*
 * Guarantees nothing is using the device before it's deleted (i.e. deferred
 * removal).
 */
370int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
371{
372 int r = 0;
373
374 spin_lock(&_minor_lock);
375
376 if (dm_open_count(md)) {
377 r = -EBUSY;
378 if (mark_deferred)
379 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
380 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
381 r = -EEXIST;
382 else
383 set_bit(DMF_DELETING, &md->flags);
384
385 spin_unlock(&_minor_lock);
386
387 return r;
388}
389
390int dm_cancel_deferred_remove(struct mapped_device *md)
391{
392 int r = 0;
393
394 spin_lock(&_minor_lock);
395
396 if (test_bit(DMF_DELETING, &md->flags))
397 r = -EBUSY;
398 else
399 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
400
401 spin_unlock(&_minor_lock);
402
403 return r;
404}
405
406static void do_deferred_remove(struct work_struct *w)
407{
408 dm_deferred_remove();
409}
410
411static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
412{
413 struct mapped_device *md = bdev->bd_disk->private_data;
414
415 return dm_get_geometry(md, geo);
416}
417
418static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
419 struct block_device **bdev)
420{
421 struct dm_target *tgt;
422 struct dm_table *map;
423 int r;
424
425retry:
426 r = -ENOTTY;
427 map = dm_get_live_table(md, srcu_idx);
428 if (!map || !dm_table_get_size(map))
429 return r;

	/* We only support devices that have a single target */
432 if (dm_table_get_num_targets(map) != 1)
433 return r;
434
435 tgt = dm_table_get_target(map, 0);
436 if (!tgt->type->prepare_ioctl)
437 return r;
438
439 if (dm_suspended_md(md))
440 return -EAGAIN;
441
442 r = tgt->type->prepare_ioctl(tgt, bdev);
443 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
444 dm_put_live_table(md, *srcu_idx);
445 msleep(10);
446 goto retry;
447 }
448
449 return r;
450}
451
452static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
453{
454 dm_put_live_table(md, srcu_idx);
455}
456
457static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
458 unsigned int cmd, unsigned long arg)
459{
460 struct mapped_device *md = bdev->bd_disk->private_data;
461 int r, srcu_idx;
462
463 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
464 if (r < 0)
465 goto out;
466
467 if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
472 if (!capable(CAP_SYS_RAWIO)) {
473 DMDEBUG_LIMIT(
474 "%s: sending ioctl %x to DM device without required privilege.",
475 current->comm, cmd);
476 r = -ENOIOCTLCMD;
477 goto out;
478 }
479 }
480
481 if (!bdev->bd_disk->fops->ioctl)
482 r = -ENOTTY;
483 else
484 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
485out:
486 dm_unprepare_ioctl(md, srcu_idx);
487 return r;
488}
489
490u64 dm_start_time_ns_from_clone(struct bio *bio)
491{
492 return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
493}
494EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
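
/*
 * Example (hypothetical target code): a target's completion path can derive
 * a rough per-bio latency from the start time dm core records in alloc_io():
 *
 *	u64 elapsed_ns = jiffies_to_nsecs(jiffies) - dm_start_time_ns_from_clone(bio);
 *
 * The value is jiffies based, so its resolution is limited to the jiffy
 * granularity.
 */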
495
496static bool bio_is_flush_with_data(struct bio *bio)
497{
498 return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
499}
500
501static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
502 unsigned long start_time, struct dm_stats_aux *stats_aux)
503{
504 bool is_flush_with_data;
505 unsigned int bi_size;

	/* If REQ_PREFLUSH set, save any payload but do not account it */
508 is_flush_with_data = bio_is_flush_with_data(bio);
509 if (is_flush_with_data) {
510 bi_size = bio->bi_iter.bi_size;
511 bio->bi_iter.bi_size = 0;
512 }
513
514 if (!end)
515 bio_start_io_acct_time(bio, start_time);
516 else
517 bio_end_io_acct(bio, start_time);
518
519 if (unlikely(dm_stats_used(&md->stats)))
520 dm_stats_account_io(&md->stats, bio_data_dir(bio),
521 bio->bi_iter.bi_sector, bio_sectors(bio),
522 end, start_time, stats_aux);

	/* Restore bio's payload so it does get accounted upon requeue */
525 if (is_flush_with_data)
526 bio->bi_iter.bi_size = bi_size;
527}
528
529static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
530{
531 dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
532}
533
static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
{
	/* Must account IO to DM device in terms of orig_bio */
	struct bio *bio = io->orig_bio;

	/*
	 * Ensure IO accounting is only ever started once.
	 * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO.
	 */
	if (!clone ||
	    likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) {
		if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED)))
			return;
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
	} else {
		unsigned long flags;
		if (dm_io_flagged(io, DM_IO_ACCOUNTED))
			return;
		/* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
		spin_lock_irqsave(&io->lock, flags);
		dm_io_set_flag(io, DM_IO_ACCOUNTED);
		spin_unlock_irqrestore(&io->lock, flags);
	}

	__dm_start_io_acct(io, bio);
}
560
561static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
562{
563 dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
564}
565
566static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
567{
568 struct dm_io *io;
569 struct dm_target_io *tio;
570 struct bio *clone;
571
572 clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
573
574 tio = clone_to_tio(clone);
575 tio->flags = 0;
576 dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
577 tio->io = NULL;
578
579 io = container_of(tio, struct dm_io, tio);
580 io->magic = DM_IO_MAGIC;
581 io->status = 0;
582 atomic_set(&io->io_count, 1);
583 this_cpu_inc(*md->pending_io);
584 io->orig_bio = NULL;
585 io->md = md;
586 io->map_task = current;
587 spin_lock_init(&io->lock);
588 io->start_time = jiffies;
589 io->flags = 0;
590
591 dm_stats_record_start(&md->stats, &io->stats_aux);
592
593 return io;
594}
595
596static void free_io(struct dm_io *io)
597{
598 bio_put(&io->tio.clone);
599}
600
601static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
602 unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
603{
604 struct dm_target_io *tio;
605 struct bio *clone;
606
607 if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
609 tio = &ci->io->tio;
		/* alloc_io() already initialized embedded clone */
611 clone = &tio->clone;
612 } else {
613 clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
614 gfp_mask, &ci->io->md->bs);
615 if (!clone)
616 return NULL;

		/* REQ_DM_POLL_LIST shouldn't be inherited */
619 clone->bi_opf &= ~REQ_DM_POLL_LIST;
620
621 tio = clone_to_tio(clone);
622 tio->flags = 0;
623 }
624
625 tio->magic = DM_TIO_MAGIC;
626 tio->io = ci->io;
627 tio->ti = ti;
628 tio->target_bio_nr = target_bio_nr;
629 tio->len_ptr = len;
630 tio->old_sector = 0;
631
632 if (len) {
633 clone->bi_iter.bi_size = to_bytes(*len);
634 if (bio_integrity(clone))
635 bio_integrity_trim(clone);
636 }
637
638 return clone;
639}
640
641static void free_tio(struct bio *clone)
642{
643 if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
644 return;
645 bio_put(clone);
646}

/*
 * Add the bio to the list of deferred io.
 */
651static void queue_io(struct mapped_device *md, struct bio *bio)
652{
653 unsigned long flags;
654
655 spin_lock_irqsave(&md->deferred_lock, flags);
656 bio_list_add(&md->deferred, bio);
657 spin_unlock_irqrestore(&md->deferred_lock, flags);
658 queue_work(md->wq, &md->work);
659}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
666struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
667{
668 *srcu_idx = srcu_read_lock(&md->io_barrier);
669
670 return srcu_dereference(md->map, &md->io_barrier);
671}
672
673void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
674{
675 srcu_read_unlock(&md->io_barrier, srcu_idx);
676}
677
678void dm_sync_table(struct mapped_device *md)
679{
680 synchronize_srcu(&md->io_barrier);
681 synchronize_rcu_expedited();
682}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
688static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
689{
690 rcu_read_lock();
691 return rcu_dereference(md->map);
692}
693
694static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
695{
696 rcu_read_unlock();
697}
698
699static char *_dm_claim_ptr = "I belong to device-mapper";

/*
 * Open a table device so we can use it as a map destination.
 */
704static int open_table_device(struct table_device *td, dev_t dev,
705 struct mapped_device *md)
706{
707 struct block_device *bdev;
708 u64 part_off;
709 int r;
710
711 BUG_ON(td->dm_dev.bdev);
712
713 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
714 if (IS_ERR(bdev))
715 return PTR_ERR(bdev);
716
717 r = bd_link_disk_holder(bdev, dm_disk(md));
718 if (r) {
719 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
720 return r;
721 }
722
723 td->dm_dev.bdev = bdev;
724 td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
725 return 0;
726}

/*
 * Close a table device that we've been using.
 */
731static void close_table_device(struct table_device *td, struct mapped_device *md)
732{
733 if (!td->dm_dev.bdev)
734 return;
735
736 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
737 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
738 put_dax(td->dm_dev.dax_dev);
739 td->dm_dev.bdev = NULL;
740 td->dm_dev.dax_dev = NULL;
741}
742
743static struct table_device *find_table_device(struct list_head *l, dev_t dev,
744 fmode_t mode)
745{
746 struct table_device *td;
747
748 list_for_each_entry(td, l, list)
749 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
750 return td;
751
752 return NULL;
753}
754
755int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
756 struct dm_dev **result)
757{
758 int r;
759 struct table_device *td;
760
761 mutex_lock(&md->table_devices_lock);
762 td = find_table_device(&md->table_devices, dev, mode);
763 if (!td) {
764 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
765 if (!td) {
766 mutex_unlock(&md->table_devices_lock);
767 return -ENOMEM;
768 }
769
770 td->dm_dev.mode = mode;
771 td->dm_dev.bdev = NULL;
772
773 if ((r = open_table_device(td, dev, md))) {
774 mutex_unlock(&md->table_devices_lock);
775 kfree(td);
776 return r;
777 }
778
779 format_dev_t(td->dm_dev.name, dev);
780
781 refcount_set(&td->count, 1);
782 list_add(&td->list, &md->table_devices);
783 } else {
784 refcount_inc(&td->count);
785 }
786 mutex_unlock(&md->table_devices_lock);
787
788 *result = &td->dm_dev;
789 return 0;
790}
791
792void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
793{
794 struct table_device *td = container_of(d, struct table_device, dm_dev);
795
796 mutex_lock(&md->table_devices_lock);
797 if (refcount_dec_and_test(&td->count)) {
798 close_table_device(td, md);
799 list_del(&td->list);
800 kfree(td);
801 }
802 mutex_unlock(&md->table_devices_lock);
803}
804
805static void free_table_devices(struct list_head *devices)
806{
807 struct list_head *tmp, *next;
808
809 list_for_each_safe(tmp, next, devices) {
810 struct table_device *td = list_entry(tmp, struct table_device, list);
811
812 DMWARN("dm_destroy: %s still exists with %d references",
813 td->dm_dev.name, refcount_read(&td->count));
814 kfree(td);
815 }
816}

/*
 * Get the geometry associated with a dm device
 */
821int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
822{
823 *geo = md->geometry;
824
825 return 0;
826}

/*
 * Set the geometry of a device.
 */
831int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
832{
833 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
834
835 if (geo->start > sz) {
836 DMWARN("Start sector is beyond the geometry limits.");
837 return -EINVAL;
838 }
839
840 md->geometry = *geo;
841
842 return 0;
843}
844
845static int __noflush_suspending(struct mapped_device *md)
846{
847 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
848}
849
static void dm_io_complete(struct dm_io *io)
{
	blk_status_t io_error;
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;

	if (io->status == BLK_STS_DM_REQUEUE) {
		unsigned long flags;
		/*
		 * Target requested pushing back the I/O.
		 */
		spin_lock_irqsave(&md->deferred_lock, flags);
		if (__noflush_suspending(md) &&
		    !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
			/* NOTE early return due to BLK_STS_DM_REQUEUE below */
			bio_list_add_head(&md->deferred, bio);
		} else {
			/*
			 * noflush suspend was interrupted or this is
			 * a write to a zoned target.
			 */
			io->status = BLK_STS_IOERR;
		}
		spin_unlock_irqrestore(&md->deferred_lock, flags);
	}

	io_error = io->status;
	if (dm_io_flagged(io, DM_IO_ACCOUNTED))
		dm_end_io_acct(io, bio);
	else if (!io_error) {
		/*
		 * Must handle target that DM_MAPIO_SUBMITTED only to
		 * then bio_endio() rather than dm_submit_bio_remap()
		 */
		__dm_start_io_acct(io, bio);
		dm_end_io_acct(io, bio);
	}
	free_io(io);
	smp_wmb();
	this_cpu_dec(*md->pending_io);

	/* nudge anyone waiting on suspend queue */
	if (unlikely(wq_has_sleeper(&md->wait)))
		wake_up(&md->wait);

	if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) {
		if (bio->bi_opf & REQ_POLLED) {
			/*
			 * Upper layer won't help us poll split bio (io->orig_bio
			 * may only reflect a subset of the pre-split original),
			 * so clear REQ_POLLED in case of requeue.
			 */
			bio->bi_opf &= ~REQ_POLLED;
			if (io_error == BLK_STS_AGAIN) {
				/* io_uring doesn't handle BLK_STS_AGAIN (yet) */
				queue_io(md, bio);
			}
		}
		return;
	}

	if (bio_is_flush_with_data(bio)) {
		/*
		 * Preflush done for flush with data, reissue
		 * without REQ_PREFLUSH.
		 */
		bio->bi_opf &= ~REQ_PREFLUSH;
		queue_io(md, bio);
	} else {
		/* done with normal IO or empty flush */
		if (io_error)
			bio->bi_status = io_error;
		bio_endio(bio);
	}
}
925
926static inline bool dm_tio_is_normal(struct dm_target_io *tio)
927{
928 return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) &&
929 !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
930}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
936void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
937{
	/* Push-back supersedes any I/O errors */
939 if (unlikely(error)) {
940 unsigned long flags;
941 spin_lock_irqsave(&io->lock, flags);
942 if (!(io->status == BLK_STS_DM_REQUEUE &&
943 __noflush_suspending(io->md)))
944 io->status = error;
945 spin_unlock_irqrestore(&io->lock, flags);
946 }
947
948 if (atomic_dec_and_test(&io->io_count))
949 dm_io_complete(io);
950}
951
952void disable_discard(struct mapped_device *md)
953{
954 struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support DISCARD, disable it */
957 limits->max_discard_sectors = 0;
958 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
959}
960
961void disable_write_zeroes(struct mapped_device *md)
962{
963 struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE ZEROES, disable it */
966 limits->max_write_zeroes_sectors = 0;
967}
968
969static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
970{
971 return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
972}
973
974static void clone_endio(struct bio *bio)
975{
976 blk_status_t error = bio->bi_status;
977 struct dm_target_io *tio = clone_to_tio(bio);
978 struct dm_io *io = tio->io;
979 struct mapped_device *md = tio->io->md;
980 dm_endio_fn endio = tio->ti->type->end_io;
981 struct request_queue *q = bio->bi_bdev->bd_disk->queue;
982
983 if (unlikely(error == BLK_STS_TARGET)) {
984 if (bio_op(bio) == REQ_OP_DISCARD &&
985 !q->limits.max_discard_sectors)
986 disable_discard(md);
987 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
988 !q->limits.max_write_zeroes_sectors)
989 disable_write_zeroes(md);
990 }
991
992 if (blk_queue_is_zoned(q))
993 dm_zone_endio(io, bio);
994
995 if (endio) {
996 int r = endio(tio->ti, bio, &error);
997 switch (r) {
998 case DM_ENDIO_REQUEUE:
			/*
			 * Requeuing writes to a sequential zone of a zoned
			 * target will break the sequential write pattern:
			 * fail such IO.
			 */
1004 if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
1005 error = BLK_STS_IOERR;
1006 else
1007 error = BLK_STS_DM_REQUEUE;
1008 fallthrough;
1009 case DM_ENDIO_DONE:
1010 break;
1011 case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
1013 return;
1014 default:
1015 DMWARN("unimplemented target endio return value: %d", r);
1016 BUG();
1017 }
1018 }
1019
1020 if (unlikely(swap_bios_limit(tio->ti, bio))) {
1021 struct mapped_device *md = io->md;
1022 up(&md->swap_bios_semaphore);
1023 }
1024
1025 free_tio(bio);
1026 dm_io_dec_pending(io, error);
1027}

/*
 * Return maximum size of I/O possible at the supplied sector up to the
 * current target boundary.
 */
1033static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1034 sector_t target_offset)
1035{
1036 return ti->len - target_offset;
1037}
1038
1039static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1040{
1041 sector_t target_offset = dm_target_offset(ti, sector);
1042 sector_t len = max_io_len_target_boundary(ti, target_offset);
1043 sector_t max_len;

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here.
	 */
1052 if (ti->max_io_len) {
1053 max_len = blk_max_size_offset(ti->table->md->queue,
1054 target_offset, ti->max_io_len);
1055 if (len > max_len)
1056 len = max_len;
1057 }
1058
1059 return len;
1060}
1061
1062int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1063{
1064 if (len > UINT_MAX) {
1065 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1066 (unsigned long long)len, UINT_MAX);
1067 ti->error = "Maximum size of target IO is too large";
1068 return -EINVAL;
1069 }
1070
1071 ti->max_io_len = (uint32_t) len;
1072
1073 return 0;
1074}
1075EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
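
/*
 * Example (hypothetical target constructor): a target whose on-disk layout
 * works in fixed chunks can ask dm core never to hand its .map a bio that
 * crosses a chunk boundary (my_chunk_sectors is a hypothetical variable):
 *
 *	static int my_ctr(struct dm_target *ti, unsigned argc, char **argv)
 *	{
 *		int r = dm_set_target_max_io_len(ti, my_chunk_sectors);
 *
 *		if (r)
 *			return r;
 *		...
 *	}
 *
 * max_io_len() above then feeds ti->max_io_len into blk_max_size_offset() so
 * clones are split at those boundaries.
 */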
1076
1077static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1078 sector_t sector, int *srcu_idx)
1079 __acquires(md->io_barrier)
1080{
1081 struct dm_table *map;
1082 struct dm_target *ti;
1083
1084 map = dm_get_live_table(md, srcu_idx);
1085 if (!map)
1086 return NULL;
1087
1088 ti = dm_table_find_target(map, sector);
1089 if (!ti)
1090 return NULL;
1091
1092 return ti;
1093}
1094
1095static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1096 long nr_pages, void **kaddr, pfn_t *pfn)
1097{
1098 struct mapped_device *md = dax_get_private(dax_dev);
1099 sector_t sector = pgoff * PAGE_SECTORS;
1100 struct dm_target *ti;
1101 long len, ret = -EIO;
1102 int srcu_idx;
1103
1104 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1105
1106 if (!ti)
1107 goto out;
1108 if (!ti->type->direct_access)
1109 goto out;
1110 len = max_io_len(ti, sector) / PAGE_SECTORS;
1111 if (len < 1)
1112 goto out;
1113 nr_pages = min(len, nr_pages);
1114 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1115
1116 out:
1117 dm_put_live_table(md, srcu_idx);
1118
1119 return ret;
1120}
1121
1122static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1123 size_t nr_pages)
1124{
1125 struct mapped_device *md = dax_get_private(dax_dev);
1126 sector_t sector = pgoff * PAGE_SECTORS;
1127 struct dm_target *ti;
1128 int ret = -EIO;
1129 int srcu_idx;
1130
1131 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1132
1133 if (!ti)
1134 goto out;
1135 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1136
1137
1138
1139
1140 goto out;
1141 }
1142 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1143 out:
1144 dm_put_live_table(md, srcu_idx);
1145
1146 return ret;
1147}

/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
 * __send_duplicate_bios().
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is always empty.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
1179void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1180{
1181 struct dm_target_io *tio = clone_to_tio(bio);
1182 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1183
1184 BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
1185 BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1186 BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1187 BUG_ON(bi_size > *tio->len_ptr);
1188 BUG_ON(n_sectors > bi_size);
1189
1190 *tio->len_ptr -= bi_size - n_sectors;
1191 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1192}
1193EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
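
/*
 * Example (hypothetical target .map; my_serviceable_sectors() is a
 * hypothetical helper): a target that can only service the first part of an
 * incoming bio in one go may truncate the clone and let dm core resubmit the
 * remainder as a follow-up bio:
 *
 *	static int my_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		unsigned max = my_serviceable_sectors(ti, bio);
 *
 *		if (max < bio_sectors(bio))
 *			dm_accept_partial_bio(bio, max);
 *		...
 *		return DM_MAPIO_REMAPPED;
 *	}
 *
 * dm core then notices the reduced *tio->len_ptr and resubmits the rest of
 * the data as a new bio from dm_split_and_process_bio().
 */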
1194
1195static inline void __dm_submit_bio_remap(struct bio *clone,
1196 dev_t dev, sector_t old_sector)
1197{
1198 trace_block_bio_remap(clone, dev, old_sector);
1199 submit_bio_noacct(clone);
1200}

/*
 * @clone: clone bio that DM core passed to target's .map function
 * @tgt_clone: clone of @clone bio that target needs submitted
 *
 * Targets should use this interface to submit bios they take
 * ownership of when returning DM_MAPIO_SUBMITTED.
 *
 * Target should also enable ti->accounts_remapped_io
 */
1211void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
1212{
1213 struct dm_target_io *tio = clone_to_tio(clone);
1214 struct dm_io *io = tio->io;
1215
1216 WARN_ON_ONCE(!tio->ti->accounts_remapped_io);
1217
1218
1219 if (!tgt_clone)
1220 tgt_clone = clone;
1221
1222
1223
1224
1225
1226 if (io->map_task == current) {
		/* Still in target's map function */
1228 dm_io_set_flag(io, DM_IO_START_ACCT);
1229 } else {
		/*
		 * Called by another thread, managed by DM target,
		 * wait for dm_split_and_process_bio() to store
		 * io->orig_bio
		 */
1235 while (unlikely(!smp_load_acquire(&io->orig_bio)))
1236 msleep(1);
1237 dm_start_io_acct(io, clone);
1238 }
1239
1240 __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
1241 tio->old_sector);
1242}
1243EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
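
/*
 * Example (hypothetical target code; struct my_io and my_worker are
 * hypothetical): a target that defers the clone to a workqueue returns
 * DM_MAPIO_SUBMITTED from .map, sets ti->accounts_remapped_io in its .ctr,
 * and later hands the bio back to dm core from the worker:
 *
 *	static void my_worker(struct work_struct *work)
 *	{
 *		struct my_io *mio = container_of(work, struct my_io, work);
 *
 *		...prepare mio->clone...
 *		dm_submit_bio_remap(mio->clone, NULL);
 *	}
 *
 * Passing NULL as tgt_clone submits the original clone; IO accounting is
 * started here on the target's behalf (see dm_start_io_acct() above).
 */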
1244
1245static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1246{
1247 mutex_lock(&md->swap_bios_lock);
1248 while (latch < md->swap_bios) {
1249 cond_resched();
1250 down(&md->swap_bios_semaphore);
1251 md->swap_bios--;
1252 }
1253 while (latch > md->swap_bios) {
1254 cond_resched();
1255 up(&md->swap_bios_semaphore);
1256 md->swap_bios++;
1257 }
1258 mutex_unlock(&md->swap_bios_lock);
1259}
1260
1261static void __map_bio(struct bio *clone)
1262{
1263 struct dm_target_io *tio = clone_to_tio(clone);
1264 int r;
1265 struct dm_io *io = tio->io;
1266 struct dm_target *ti = tio->ti;
1267
1268 clone->bi_end_io = clone_endio;
1269
1270
1271
1272
1273 dm_io_inc_pending(io);
1274 tio->old_sector = clone->bi_iter.bi_sector;
1275
1276 if (unlikely(swap_bios_limit(ti, clone))) {
1277 struct mapped_device *md = io->md;
1278 int latch = get_swap_bios();
1279 if (unlikely(latch != md->swap_bios))
1280 __set_swap_bios_limit(md, latch);
1281 down(&md->swap_bios_semaphore);
1282 }

	/*
	 * Check if the IO needs a special mapping due to zone append
	 * emulation on zoned target. In this case, dm_zone_map_bio()
	 * calls the target map operation.
	 */
1289 if (dm_emulate_zone_append(io->md))
1290 r = dm_zone_map_bio(tio);
1291 else
1292 r = ti->type->map(ti, clone);
1293
1294 switch (r) {
1295 case DM_MAPIO_SUBMITTED:
		/* target has assumed ownership of this io */
1297 if (!ti->accounts_remapped_io)
1298 dm_io_set_flag(io, DM_IO_START_ACCT);
1299 break;
1300 case DM_MAPIO_REMAPPED:
		/*
		 * the bio has been remapped so dispatch it, but defer
		 * dm_start_io_acct() until after possible bio_split().
		 */
1305 __dm_submit_bio_remap(clone, disk_devt(io->md->disk),
1306 tio->old_sector);
1307 dm_io_set_flag(io, DM_IO_START_ACCT);
1308 break;
1309 case DM_MAPIO_KILL:
1310 case DM_MAPIO_REQUEUE:
1311 if (unlikely(swap_bios_limit(ti, clone)))
1312 up(&io->md->swap_bios_semaphore);
1313 free_tio(clone);
1314 if (r == DM_MAPIO_KILL)
1315 dm_io_dec_pending(io, BLK_STS_IOERR);
1316 else
1317 dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
1318 break;
1319 default:
1320 DMWARN("unimplemented target map return value: %d", r);
1321 BUG();
1322 }
1323}
1324
1325static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1326 struct dm_target *ti, unsigned num_bios)
1327{
1328 struct bio *bio;
1329 int try;
1330
1331 for (try = 0; try < 2; try++) {
1332 int bio_nr;
1333
1334 if (try)
1335 mutex_lock(&ci->io->md->table_devices_lock);
1336 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1337 bio = alloc_tio(ci, ti, bio_nr, NULL,
1338 try ? GFP_NOIO : GFP_NOWAIT);
1339 if (!bio)
1340 break;
1341
1342 bio_list_add(blist, bio);
1343 }
1344 if (try)
1345 mutex_unlock(&ci->io->md->table_devices_lock);
1346 if (bio_nr == num_bios)
1347 return;
1348
1349 while ((bio = bio_list_pop(blist)))
1350 free_tio(bio);
1351 }
1352}
1353
1354static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1355 unsigned num_bios, unsigned *len)
1356{
1357 struct bio_list blist = BIO_EMPTY_LIST;
1358 struct bio *clone;
1359
1360 switch (num_bios) {
1361 case 0:
1362 break;
1363 case 1:
1364 clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1365 __map_bio(clone);
1366 break;
1367 default:
		/* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
1369 alloc_multiple_bios(&blist, ci, ti, num_bios);
1370 while ((clone = bio_list_pop(&blist))) {
1371 dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
1372 __map_bio(clone);
1373 }
1374 break;
1375 }
1376}
1377
1378static void __send_empty_flush(struct clone_info *ci)
1379{
1380 unsigned target_nr = 0;
1381 struct dm_target *ti;
1382 struct bio flush_bio;

	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
1389 bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1390 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
1391
1392 ci->bio = &flush_bio;
1393 ci->sector_count = 0;
1394 ci->io->tio.clone.bi_iter.bi_size = 0;
1395
1396 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1397 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1398
1399 bio_uninit(ci->bio);
1400}
1401
1402static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1403 unsigned num_bios)
1404{
1405 unsigned len;
1406
1407 len = min_t(sector_t, ci->sector_count,
1408 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
1409
1410 __send_duplicate_bios(ci, ti, num_bios, &len);
1411
1412 ci->sector += len;
1413 ci->sector_count -= len;
1414}
1415
1416static bool is_abnormal_io(struct bio *bio)
1417{
1418 bool r = false;
1419
1420 switch (bio_op(bio)) {
1421 case REQ_OP_DISCARD:
1422 case REQ_OP_SECURE_ERASE:
1423 case REQ_OP_WRITE_ZEROES:
1424 r = true;
1425 break;
1426 }
1427
1428 return r;
1429}
1430
1431static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1432 int *result)
1433{
1434 unsigned num_bios = 0;
1435
1436 switch (bio_op(ci->bio)) {
1437 case REQ_OP_DISCARD:
1438 num_bios = ti->num_discard_bios;
1439 break;
1440 case REQ_OP_SECURE_ERASE:
1441 num_bios = ti->num_secure_erase_bios;
1442 break;
1443 case REQ_OP_WRITE_ZEROES:
1444 num_bios = ti->num_write_zeroes_bios;
1445 break;
1446 default:
1447 return false;
1448 }

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
1456 if (!num_bios)
1457 *result = -EOPNOTSUPP;
1458 else {
1459 __send_changing_extent_only(ci, ti, num_bios);
1460 *result = 0;
1461 }
1462 return true;
1463}

/*
 * Reuse ->bi_private as hlist head for storing all dm_io instances
 * associated with this bio, and this bio's bi_private needs to be
 * stored in dm_io->data before the reuse.
 *
 * bio->bi_private is owned by fs or upper layer, so block layer won't
 * touch it after splitting. Meantime it won't be changed by anyone after
 * bio is submitted. So this reuse is safe.
 */
1474static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
1475{
1476 return (struct hlist_head *)&bio->bi_private;
1477}
1478
1479static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
1480{
1481 struct hlist_head *head = dm_get_bio_hlist_head(bio);
1482
1483 if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
1484 bio->bi_opf |= REQ_DM_POLL_LIST;
		/*
		 * Save .bi_private into dm_io, so that we can reuse
		 * .bi_private as dm_io list head for storing dm_io list
		 */
1489 io->data = bio->bi_private;
1490
1491 INIT_HLIST_HEAD(head);

		/* tell block layer to poll for completion */
1494 bio->bi_cookie = ~BLK_QC_T_NONE;
1495 } else {
		/*
		 * bio recursed due to split, reuse original poll list,
		 * and save bio->bi_private too.
		 */
1500 io->data = hlist_entry(head->first, struct dm_io, node)->data;
1501 }
1502
1503 hlist_add_head(&io->node, head);
1504}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
1509static int __split_and_process_bio(struct clone_info *ci)
1510{
1511 struct bio *clone;
1512 struct dm_target *ti;
1513 unsigned len;
1514 int r;
1515
1516 ti = dm_table_find_target(ci->map, ci->sector);
1517 if (!ti)
1518 return -EIO;
1519
1520 if (__process_abnormal_io(ci, ti, &r))
1521 return r;

	/*
	 * Only support bio polling for normal IO, and the target io is
	 * exactly inside the dm_io instance (verified in dm_poll_dm_io)
	 */
1527 ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
1528
1529 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1530 clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
1531 __map_bio(clone);
1532
1533 ci->sector += len;
1534 ci->sector_count -= len;
1535
1536 return 0;
1537}
1538
1539static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1540 struct dm_table *map, struct bio *bio)
1541{
1542 ci->map = map;
1543 ci->io = alloc_io(md, bio);
1544 ci->bio = bio;
1545 ci->submit_as_polled = false;
1546 ci->sector = bio->bi_iter.bi_sector;
1547 ci->sector_count = bio_sectors(bio);
1548
1549
1550 if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
1551 ci->sector_count = 0;
1552}

/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1557static void dm_split_and_process_bio(struct mapped_device *md,
1558 struct dm_table *map, struct bio *bio)
1559{
1560 struct clone_info ci;
1561 struct bio *orig_bio = NULL;
1562 int error = 0;
1563
1564 init_clone_info(&ci, md, map, bio);
1565
1566 if (bio->bi_opf & REQ_PREFLUSH) {
1567 __send_empty_flush(&ci);
1568
1569 goto out;
1570 }
1571
1572 error = __split_and_process_bio(&ci);
1573 ci.io->map_task = NULL;
1574 if (error || !ci.sector_count)
1575 goto out;

	/*
	 * Remainder must be passed to submit_bio_noacct() so it gets handled
	 * *after* bios already submitted have been completely processed.
	 * We take a clone of the original to store in ci.io->orig_bio to be
	 * used by dm_end_io_acct() and for dm_io_complete() to use for
	 * completion handling.
	 */
1584 orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1585 GFP_NOIO, &md->queue->bio_split);
1586 bio_chain(orig_bio, bio);
1587 trace_block_split(orig_bio, bio->bi_iter.bi_sector);
1588 submit_bio_noacct(bio);
1589out:
1590 if (!orig_bio)
1591 orig_bio = bio;
1592 smp_store_release(&ci.io->orig_bio, orig_bio);
1593 if (dm_io_flagged(ci.io, DM_IO_START_ACCT))
1594 dm_start_io_acct(ci.io, NULL);

	/*
	 * Drop the extra reference count for non-POLLED bio, and hold one
	 * reference for POLLED bio, which will be released in dm_poll_bio
	 *
	 * Add every dm_io instance into the dm_io list head which is stored
	 * in bio->bi_private, so that dm_poll_bio can poll them all.
	 */
1603 if (error || !ci.submit_as_polled)
1604 dm_io_dec_pending(ci.io, errno_to_blk_status(error));
1605 else
1606 dm_queue_poll_io(bio, ci.io);
1607}
1608
1609static void dm_submit_bio(struct bio *bio)
1610{
1611 struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
1612 int srcu_idx;
1613 struct dm_table *map;
1614
1615 map = dm_get_live_table(md, &srcu_idx);

	/* If suspended, or map not yet available, queue this io for later */
1618 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1619 unlikely(!map)) {
1620 if (bio->bi_opf & REQ_NOWAIT)
1621 bio_wouldblock_error(bio);
1622 else if (bio->bi_opf & REQ_RAHEAD)
1623 bio_io_error(bio);
1624 else
1625 queue_io(md, bio);
1626 goto out;
1627 }

	/*
	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
	 * otherwise associated queue_limits won't be imposed.
	 */
1633 if (is_abnormal_io(bio))
1634 blk_queue_split(&bio);
1635
1636 dm_split_and_process_bio(md, map, bio);
1637out:
1638 dm_put_live_table(md, srcu_idx);
1639}
1640
1641static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
1642 unsigned int flags)
1643{
1644 WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
1645
1646
1647 if (atomic_read(&io->io_count) > 1)
1648 bio_poll(&io->tio.clone, iob, flags);
1649
1650
1651 return atomic_read(&io->io_count) == 1;
1652}
1653
1654static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
1655 unsigned int flags)
1656{
1657 struct hlist_head *head = dm_get_bio_hlist_head(bio);
1658 struct hlist_head tmp = HLIST_HEAD_INIT;
1659 struct hlist_node *next;
1660 struct dm_io *io;

	/* Only poll normal bio which was marked as REQ_DM_POLL_LIST */
1663 if (!(bio->bi_opf & REQ_DM_POLL_LIST))
1664 return 0;
1665
1666 WARN_ON_ONCE(hlist_empty(head));
1667
1668 hlist_move_list(head, &tmp);

	/*
	 * Restore .bi_private before possibly completing dm_io.
	 *
	 * bio_poll() is only possible once @bio has been completely
	 * submitted via submit_bio_noacct()'s depth-first submission.
	 * So there is no dm_queue_poll_io() race associated with
	 * clearing REQ_DM_POLL_LIST here.
	 */
1678 bio->bi_opf &= ~REQ_DM_POLL_LIST;
1679 bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
1680
1681 hlist_for_each_entry_safe(io, next, &tmp, node) {
1682 if (dm_poll_dm_io(io, iob, flags)) {
1683 hlist_del_init(&io->node);
			/*
			 * clone_endio() has already occurred, so no
			 * error handling is needed here.
			 */
1688 dm_io_dec_pending(io, 0);
1689 }
1690 }

	/* Not done? */
1693 if (!hlist_empty(&tmp)) {
1694 bio->bi_opf |= REQ_DM_POLL_LIST;
1695
1696 hlist_move_list(&tmp, head);
1697 return 0;
1698 }
1699 return 1;
1700}

/*---------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
1705static void free_minor(int minor)
1706{
1707 spin_lock(&_minor_lock);
1708 idr_remove(&_minor_idr, minor);
1709 spin_unlock(&_minor_lock);
1710}

/*
 * See if the device with a specific minor # is free.
 */
1715static int specific_minor(int minor)
1716{
1717 int r;
1718
1719 if (minor >= (1 << MINORBITS))
1720 return -EINVAL;
1721
1722 idr_preload(GFP_KERNEL);
1723 spin_lock(&_minor_lock);
1724
1725 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1726
1727 spin_unlock(&_minor_lock);
1728 idr_preload_end();
1729 if (r < 0)
1730 return r == -ENOSPC ? -EBUSY : r;
1731 return 0;
1732}
1733
1734static int next_free_minor(int *minor)
1735{
1736 int r;
1737
1738 idr_preload(GFP_KERNEL);
1739 spin_lock(&_minor_lock);
1740
1741 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1742
1743 spin_unlock(&_minor_lock);
1744 idr_preload_end();
1745 if (r < 0)
1746 return r;
1747 *minor = r;
1748 return 0;
1749}
1750
1751static const struct block_device_operations dm_blk_dops;
1752static const struct block_device_operations dm_rq_blk_dops;
1753static const struct dax_operations dm_dax_ops;
1754
1755static void dm_wq_work(struct work_struct *work);
1756
1757#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1758static void dm_queue_destroy_crypto_profile(struct request_queue *q)
1759{
1760 dm_destroy_crypto_profile(q->crypto_profile);
1761}
1762
1763#else
1764
1765static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
1766{
1767}
1768#endif
1769
1770static void cleanup_mapped_device(struct mapped_device *md)
1771{
1772 if (md->wq)
1773 destroy_workqueue(md->wq);
1774 bioset_exit(&md->bs);
1775 bioset_exit(&md->io_bs);
1776
1777 if (md->dax_dev) {
1778 dax_remove_host(md->disk);
1779 kill_dax(md->dax_dev);
1780 put_dax(md->dax_dev);
1781 md->dax_dev = NULL;
1782 }
1783
1784 dm_cleanup_zoned_dev(md);
1785 if (md->disk) {
1786 spin_lock(&_minor_lock);
1787 md->disk->private_data = NULL;
1788 spin_unlock(&_minor_lock);
1789 if (dm_get_md_type(md) != DM_TYPE_NONE) {
1790 dm_sysfs_exit(md);
1791 del_gendisk(md->disk);
1792 }
1793 dm_queue_destroy_crypto_profile(md->queue);
1794 blk_cleanup_disk(md->disk);
1795 }
1796
1797 if (md->pending_io) {
1798 free_percpu(md->pending_io);
1799 md->pending_io = NULL;
1800 }
1801
1802 cleanup_srcu_struct(&md->io_barrier);
1803
1804 mutex_destroy(&md->suspend_lock);
1805 mutex_destroy(&md->type_lock);
1806 mutex_destroy(&md->table_devices_lock);
1807 mutex_destroy(&md->swap_bios_lock);
1808
1809 dm_mq_cleanup_mapped_device(md);
1810}

/*
 * Allocate and initialise a blank device with a given minor.
 */
1815static struct mapped_device *alloc_dev(int minor)
1816{
1817 int r, numa_node_id = dm_get_numa_node();
1818 struct mapped_device *md;
1819 void *old_md;
1820
1821 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1822 if (!md) {
1823 DMWARN("unable to allocate device, out of memory.");
1824 return NULL;
1825 }
1826
1827 if (!try_module_get(THIS_MODULE))
1828 goto bad_module_get;
1829
	/* get a minor number for the dev */
1831 if (minor == DM_ANY_MINOR)
1832 r = next_free_minor(&minor);
1833 else
1834 r = specific_minor(minor);
1835 if (r < 0)
1836 goto bad_minor;
1837
1838 r = init_srcu_struct(&md->io_barrier);
1839 if (r < 0)
1840 goto bad_io_barrier;
1841
1842 md->numa_node_id = numa_node_id;
1843 md->init_tio_pdu = false;
1844 md->type = DM_TYPE_NONE;
1845 mutex_init(&md->suspend_lock);
1846 mutex_init(&md->type_lock);
1847 mutex_init(&md->table_devices_lock);
1848 spin_lock_init(&md->deferred_lock);
1849 atomic_set(&md->holders, 1);
1850 atomic_set(&md->open_count, 0);
1851 atomic_set(&md->event_nr, 0);
1852 atomic_set(&md->uevent_seq, 0);
1853 INIT_LIST_HEAD(&md->uevent_list);
1854 INIT_LIST_HEAD(&md->table_devices);
1855 spin_lock_init(&md->uevent_lock);

	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
1862 md->disk = blk_alloc_disk(md->numa_node_id);
1863 if (!md->disk)
1864 goto bad;
1865 md->queue = md->disk->queue;
1866
1867 init_waitqueue_head(&md->wait);
1868 INIT_WORK(&md->work, dm_wq_work);
1869 init_waitqueue_head(&md->eventq);
1870 init_completion(&md->kobj_holder.completion);
1871
1872 md->swap_bios = get_swap_bios();
1873 sema_init(&md->swap_bios_semaphore, md->swap_bios);
1874 mutex_init(&md->swap_bios_lock);
1875
1876 md->disk->major = _major;
1877 md->disk->first_minor = minor;
1878 md->disk->minors = 1;
1879 md->disk->flags |= GENHD_FL_NO_PART;
1880 md->disk->fops = &dm_blk_dops;
1881 md->disk->queue = md->queue;
1882 md->disk->private_data = md;
1883 sprintf(md->disk->disk_name, "dm-%d", minor);
1884
1885 if (IS_ENABLED(CONFIG_FS_DAX)) {
1886 md->dax_dev = alloc_dax(md, &dm_dax_ops);
1887 if (IS_ERR(md->dax_dev)) {
1888 md->dax_dev = NULL;
1889 goto bad;
1890 }
1891 set_dax_nocache(md->dax_dev);
1892 set_dax_nomc(md->dax_dev);
1893 if (dax_add_host(md->dax_dev, md->disk))
1894 goto bad;
1895 }
1896
1897 format_dev_t(md->name, MKDEV(_major, minor));
1898
1899 md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
1900 if (!md->wq)
1901 goto bad;
1902
1903 md->pending_io = alloc_percpu(unsigned long);
1904 if (!md->pending_io)
1905 goto bad;
1906
1907 dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
1910 spin_lock(&_minor_lock);
1911 old_md = idr_replace(&_minor_idr, md, minor);
1912 spin_unlock(&_minor_lock);
1913
1914 BUG_ON(old_md != MINOR_ALLOCED);
1915
1916 return md;
1917
1918bad:
1919 cleanup_mapped_device(md);
1920bad_io_barrier:
1921 free_minor(minor);
1922bad_minor:
1923 module_put(THIS_MODULE);
1924bad_module_get:
1925 kvfree(md);
1926 return NULL;
1927}
1928
1929static void unlock_fs(struct mapped_device *md);
1930
1931static void free_dev(struct mapped_device *md)
1932{
1933 int minor = MINOR(disk_devt(md->disk));
1934
1935 unlock_fs(md);
1936
1937 cleanup_mapped_device(md);
1938
1939 free_table_devices(&md->table_devices);
1940 dm_stats_cleanup(&md->stats);
1941 free_minor(minor);
1942
1943 module_put(THIS_MODULE);
1944 kvfree(md);
1945}
1946
1947static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1948{
1949 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1950 int ret = 0;
1951
1952 if (dm_table_bio_based(t)) {
1953
1954
1955
1956
1957
1958 bioset_exit(&md->bs);
1959 bioset_exit(&md->io_bs);
1960
1961 } else if (bioset_initialized(&md->bs)) {
1962
1963
1964
1965
1966
1967
1968
1969
1970 goto out;
1971 }
1972
1973 BUG_ON(!p ||
1974 bioset_initialized(&md->bs) ||
1975 bioset_initialized(&md->io_bs));
1976
1977 ret = bioset_init_from_src(&md->bs, &p->bs);
1978 if (ret)
1979 goto out;
1980 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1981 if (ret)
1982 bioset_exit(&md->bs);
1983out:
1984
1985 dm_table_free_md_mempools(t);
1986 return ret;
1987}
1988

/*
 * Bind a table to the device.
 */
1992static void event_callback(void *context)
1993{
1994 unsigned long flags;
1995 LIST_HEAD(uevents);
1996 struct mapped_device *md = (struct mapped_device *) context;
1997
1998 spin_lock_irqsave(&md->uevent_lock, flags);
1999 list_splice_init(&md->uevent_list, &uevents);
2000 spin_unlock_irqrestore(&md->uevent_lock, flags);
2001
2002 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2003
2004 atomic_inc(&md->event_nr);
2005 wake_up(&md->eventq);
2006 dm_issue_global_event();
2007}

/*
 * Returns old map, which caller must destroy.
 */
2012static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2013 struct queue_limits *limits)
2014{
2015 struct dm_table *old_map;
2016 sector_t size;
2017 int ret;
2018
2019 lockdep_assert_held(&md->suspend_lock);
2020
2021 size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
2026 if (size != dm_get_size(md))
2027 memset(&md->geometry, 0, sizeof(md->geometry));
2028
2029 if (!get_capacity(md->disk))
2030 set_capacity(md->disk, size);
2031 else
2032 set_capacity_and_notify(md->disk, size);
2033
2034 dm_table_event_callback(t, event_callback, md);
2035
2036 if (dm_table_request_based(t)) {
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons - used to optimize dm_mq_queue_rq.
		 */
2041 md->immutable_target = dm_table_get_immutable_target(t);
2042 }
2043
2044 ret = __bind_mempools(md, t);
2045 if (ret) {
2046 old_map = ERR_PTR(ret);
2047 goto out;
2048 }
2049
2050 ret = dm_table_set_restrictions(t, md->queue, limits);
2051 if (ret) {
2052 old_map = ERR_PTR(ret);
2053 goto out;
2054 }
2055
2056 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2057 rcu_assign_pointer(md->map, (void *)t);
2058 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2059
2060 if (old_map)
2061 dm_sync_table(md);
2062out:
2063 return old_map;
2064}

/*
 * Returns unbound table for the caller to free.
 */
2069static struct dm_table *__unbind(struct mapped_device *md)
2070{
2071 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2072
2073 if (!map)
2074 return NULL;
2075
2076 dm_table_event_callback(map, NULL, NULL);
2077 RCU_INIT_POINTER(md->map, NULL);
2078 dm_sync_table(md);
2079
2080 return map;
2081}

/*
 * Constructor for a new device.
 */
2086int dm_create(int minor, struct mapped_device **result)
2087{
2088 struct mapped_device *md;
2089
2090 md = alloc_dev(minor);
2091 if (!md)
2092 return -ENXIO;
2093
2094 dm_ima_reset_data(md);
2095
2096 *result = md;
2097 return 0;
2098}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
2104void dm_lock_md_type(struct mapped_device *md)
2105{
2106 mutex_lock(&md->type_lock);
2107}
2108
2109void dm_unlock_md_type(struct mapped_device *md)
2110{
2111 mutex_unlock(&md->type_lock);
2112}
2113
2114void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2115{
2116 BUG_ON(!mutex_is_locked(&md->type_lock));
2117 md->type = type;
2118}
2119
2120enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2121{
2122 return md->type;
2123}
2124
2125struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2126{
2127 return md->immutable_target_type;
2128}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2134struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2135{
2136 BUG_ON(!atomic_read(&md->holders));
2137 return &md->queue->limits;
2138}
2139EXPORT_SYMBOL_GPL(dm_get_queue_limits);
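
/*
 * Example (hypothetical caller): targets and dm core use this to inspect or
 * degrade the mapped device's limits at runtime, as disable_discard() above
 * does:
 *
 *	struct queue_limits *limits = dm_get_queue_limits(md);
 *
 *	if (!limits->max_discard_sectors)
 *		...fall back to a non-discard path...
 *
 * The md must still be referenced (md->holders != 0) or the BUG_ON fires.
 */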

/*
 * Setup the DM device's queue based on md's type
 */
2144int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2145{
2146 enum dm_queue_mode type = dm_table_get_type(t);
2147 struct queue_limits limits;
2148 int r;
2149
2150 switch (type) {
2151 case DM_TYPE_REQUEST_BASED:
2152 md->disk->fops = &dm_rq_blk_dops;
2153 r = dm_mq_init_request_queue(md, t);
2154 if (r) {
2155 DMERR("Cannot initialize queue for request-based dm mapped device");
2156 return r;
2157 }
2158 break;
2159 case DM_TYPE_BIO_BASED:
2160 case DM_TYPE_DAX_BIO_BASED:
2161 break;
2162 case DM_TYPE_NONE:
2163 WARN_ON_ONCE(true);
2164 break;
2165 }
2166
2167 r = dm_calculate_queue_limits(t, &limits);
2168 if (r) {
2169 DMERR("Cannot calculate initial queue limits");
2170 return r;
2171 }
2172 r = dm_table_set_restrictions(t, md->queue, &limits);
2173 if (r)
2174 return r;
2175
2176 r = add_disk(md->disk);
2177 if (r)
2178 return r;
2179
2180 r = dm_sysfs_init(md);
2181 if (r) {
2182 del_gendisk(md->disk);
2183 return r;
2184 }
2185 md->type = type;
2186 return 0;
2187}
2188
2189struct mapped_device *dm_get_md(dev_t dev)
2190{
2191 struct mapped_device *md;
2192 unsigned minor = MINOR(dev);
2193
2194 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2195 return NULL;
2196
2197 spin_lock(&_minor_lock);
2198
2199 md = idr_find(&_minor_idr, minor);
2200 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2201 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2202 md = NULL;
2203 goto out;
2204 }
2205 dm_get(md);
2206out:
2207 spin_unlock(&_minor_lock);
2208
2209 return md;
2210}
2211EXPORT_SYMBOL_GPL(dm_get_md);
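
/*
 * Example (hypothetical caller): given a dev_t of a dm device, take a
 * reference on the mapped_device and drop it when done:
 *
 *	struct mapped_device *md = dm_get_md(dev);
 *
 *	if (md) {
 *		...use md, e.g. dm_device_name(md)...
 *		dm_put(md);
 *	}
 *
 * dm_get_md() returns NULL for minors that are unallocated, being freed or
 * being deleted.
 */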
2212
2213void *dm_get_mdptr(struct mapped_device *md)
2214{
2215 return md->interface_ptr;
2216}
2217
2218void dm_set_mdptr(struct mapped_device *md, void *ptr)
2219{
2220 md->interface_ptr = ptr;
2221}
2222
2223void dm_get(struct mapped_device *md)
2224{
2225 atomic_inc(&md->holders);
2226 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2227}
2228
2229int dm_hold(struct mapped_device *md)
2230{
2231 spin_lock(&_minor_lock);
2232 if (test_bit(DMF_FREEING, &md->flags)) {
2233 spin_unlock(&_minor_lock);
2234 return -EBUSY;
2235 }
2236 dm_get(md);
2237 spin_unlock(&_minor_lock);
2238 return 0;
2239}
2240EXPORT_SYMBOL_GPL(dm_hold);
2241
2242const char *dm_device_name(struct mapped_device *md)
2243{
2244 return md->name;
2245}
2246EXPORT_SYMBOL_GPL(dm_device_name);
2247
2248static void __dm_destroy(struct mapped_device *md, bool wait)
2249{
2250 struct dm_table *map;
2251 int srcu_idx;
2252
2253 might_sleep();
2254
2255 spin_lock(&_minor_lock);
2256 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2257 set_bit(DMF_FREEING, &md->flags);
2258 spin_unlock(&_minor_lock);
2259
2260 blk_mark_disk_dead(md->disk);
2261
2262
2263
2264
2265
2266 mutex_lock(&md->suspend_lock);
2267 map = dm_get_live_table(md, &srcu_idx);
2268 if (!dm_suspended_md(md)) {
2269 dm_table_presuspend_targets(map);
2270 set_bit(DMF_SUSPENDED, &md->flags);
2271 set_bit(DMF_POST_SUSPENDING, &md->flags);
2272 dm_table_postsuspend_targets(map);
2273 }
2274
2275 dm_put_live_table(md, srcu_idx);
2276 mutex_unlock(&md->suspend_lock);
2277
2278
2279
2280
2281
2282
2283
2284 if (wait)
2285 while (atomic_read(&md->holders))
2286 msleep(1);
2287 else if (atomic_read(&md->holders))
2288 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2289 dm_device_name(md), atomic_read(&md->holders));
2290
2291 dm_table_destroy(__unbind(md));
2292 free_dev(md);
2293}
2294
2295void dm_destroy(struct mapped_device *md)
2296{
2297 __dm_destroy(md, true);
2298}
2299
2300void dm_destroy_immediate(struct mapped_device *md)
2301{
2302 __dm_destroy(md, false);
2303}
2304
2305void dm_put(struct mapped_device *md)
2306{
2307 atomic_dec(&md->holders);
2308}
2309EXPORT_SYMBOL_GPL(dm_put);
2310
2311static bool dm_in_flight_bios(struct mapped_device *md)
2312{
2313 int cpu;
2314 unsigned long sum = 0;
2315
2316 for_each_possible_cpu(cpu)
2317 sum += *per_cpu_ptr(md->pending_io, cpu);
2318
2319 return sum != 0;
2320}
2321
2322static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
2323{
2324 int r = 0;
2325 DEFINE_WAIT(wait);
2326
2327 while (true) {
2328 prepare_to_wait(&md->wait, &wait, task_state);
2329
2330 if (!dm_in_flight_bios(md))
2331 break;
2332
2333 if (signal_pending_state(task_state, current)) {
2334 r = -EINTR;
2335 break;
2336 }
2337
2338 io_schedule();
2339 }
2340 finish_wait(&md->wait, &wait);
2341
2342 smp_rmb();
2343
2344 return r;
2345}
2346
2347static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
2348{
2349 int r = 0;
2350
2351 if (!queue_is_mq(md->queue))
2352 return dm_wait_for_bios_completion(md, task_state);
2353
2354 while (true) {
2355 if (!blk_mq_queue_inflight(md->queue))
2356 break;
2357
2358 if (signal_pending_state(task_state, current)) {
2359 r = -EINTR;
2360 break;
2361 }
2362
2363 msleep(5);
2364 }
2365
2366 return r;
2367}

/*
 * Process the deferred bios
 */
2372static void dm_wq_work(struct work_struct *work)
2373{
2374 struct mapped_device *md = container_of(work, struct mapped_device, work);
2375 struct bio *bio;
2376
2377 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2378 spin_lock_irq(&md->deferred_lock);
2379 bio = bio_list_pop(&md->deferred);
2380 spin_unlock_irq(&md->deferred_lock);
2381
2382 if (!bio)
2383 break;
2384
2385 submit_bio_noacct(bio);
2386 }
2387}
2388
2389static void dm_queue_flush(struct mapped_device *md)
2390{
2391 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2392 smp_mb__after_atomic();
2393 queue_work(md->wq, &md->work);
2394}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2399struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2400{
2401 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2402 struct queue_limits limits;
2403 int r;
2404
2405 mutex_lock(&md->suspend_lock);
2406
2407
2408 if (!dm_suspended_md(md))
2409 goto out;
2410
2411
2412
2413
2414
2415
2416
2417 if (dm_table_has_no_data_devices(table)) {
2418 live_map = dm_get_live_table_fast(md);
2419 if (live_map)
2420 limits = md->queue->limits;
2421 dm_put_live_table_fast(md);
2422 }
2423
2424 if (!live_map) {
2425 r = dm_calculate_queue_limits(table, &limits);
2426 if (r) {
2427 map = ERR_PTR(r);
2428 goto out;
2429 }
2430 }
2431
2432 map = __bind(md, table, &limits);
2433 dm_issue_global_event();
2434
2435out:
2436 mutex_unlock(&md->suspend_lock);
2437 return map;
2438}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
2444static int lock_fs(struct mapped_device *md)
2445{
2446 int r;
2447
2448 WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2449
2450 r = freeze_bdev(md->disk->part0);
2451 if (!r)
2452 set_bit(DMF_FROZEN, &md->flags);
2453 return r;
2454}
2455
2456static void unlock_fs(struct mapped_device *md)
2457{
2458 if (!test_bit(DMF_FROZEN, &md->flags))
2459 return;
2460 thaw_bdev(md->disk->part0);
2461 clear_bit(DMF_FROZEN, &md->flags);
2462}
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2474 unsigned suspend_flags, unsigned int task_state,
2475 int dmf_suspended_flag)
2476{
2477 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2478 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2479 int r;
2480
2481 lockdep_assert_held(&md->suspend_lock);
2482
2483
2484
2485
2486
2487 if (noflush)
2488 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2489 else
2490 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2491
2492
2493
2494
2495
2496 dm_table_presuspend_targets(map);
2497
2498
2499
2500
2501
2502
2503
2504 if (!noflush && do_lockfs) {
2505 r = lock_fs(md);
2506 if (r) {
2507 dm_table_presuspend_undo_targets(map);
2508 return r;
2509 }
2510 }
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
2521
2522
2523 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2524 if (map)
2525 synchronize_srcu(&md->io_barrier);
2526
2527
2528
2529
2530
2531 if (dm_request_based(md))
2532 dm_stop_queue(md->queue);
2533
2534 flush_workqueue(md->wq);
2535
2536
2537
2538
2539
2540
2541 r = dm_wait_for_completion(md, task_state);
2542 if (!r)
2543 set_bit(dmf_suspended_flag, &md->flags);
2544
2545 if (noflush)
2546 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2547 if (map)
2548 synchronize_srcu(&md->io_barrier);
2549
2550
2551 if (r < 0) {
2552 dm_queue_flush(md);
2553
2554 if (dm_request_based(md))
2555 dm_start_queue(md->queue);
2556
2557 unlock_fs(md);
2558 dm_table_presuspend_undo_targets(map);
2559
2560 }
2561
2562 return r;
2563}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
2581int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2582{
2583 struct dm_table *map = NULL;
2584 int r = 0;
2585
2586retry:
2587 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2588
2589 if (dm_suspended_md(md)) {
2590 r = -EINVAL;
2591 goto out_unlock;
2592 }
2593
2594 if (dm_suspended_internally_md(md)) {
2595
2596 mutex_unlock(&md->suspend_lock);
2597 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2598 if (r)
2599 return r;
2600 goto retry;
2601 }
2602
2603 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2604
2605 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2606 if (r)
2607 goto out_unlock;
2608
2609 set_bit(DMF_POST_SUSPENDING, &md->flags);
2610 dm_table_postsuspend_targets(map);
2611 clear_bit(DMF_POST_SUSPENDING, &md->flags);
2612
2613out_unlock:
2614 mutex_unlock(&md->suspend_lock);
2615 return r;
2616}
2617
2618static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2619{
2620 if (map) {
2621 int r = dm_table_resume_targets(map);
2622 if (r)
2623 return r;
2624 }
2625
2626 dm_queue_flush(md);
2627
2628
2629
2630
2631
2632
2633 if (dm_request_based(md))
2634 dm_start_queue(md->queue);
2635
2636 unlock_fs(md);
2637
2638 return 0;
2639}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}
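/*
 * Internal suspend/resume works like userspace-driven suspend: it waits
 * until all bios finish and prevents new bios from being issued to the
 * target drivers.  It is only used from within the kernel.
 */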
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return;

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

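	/*
	 * TASK_UNINTERRUPTIBLE is used because only a noflush internal
	 * suspend is issued here; the return value is deliberately ignored.
	 */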
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return;

	if (dm_suspended_md(md))
		goto done;

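	/*
	 * A NULL map is passed so that dm_table_resume_targets() (which may
	 * fail) is skipped; callers of internal resume do not need it here.
	 */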
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
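/*
 * Fast variants of internal suspend/resume: dm_internal_suspend_fast()
 * takes md->suspend_lock and deliberately keeps holding it (even when it
 * returns early because the device is already suspended);
 * dm_internal_resume_fast() releases it.
 */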
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
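/*
 * Event notification.
 */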
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	int r;
	unsigned noio_flag;
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	noio_flag = memalloc_noio_save();

	if (!cookie)
		r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				       action, envp);
	}

	memalloc_noio_restore(noio_flag);

	return r;
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}
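/*
 * The gendisk returned here is only valid while a reference is held
 * on 'md'.
 */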
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static int dm_post_suspending_md(struct mapped_device *md)
{
	return test_bit(DMF_POST_SUSPENDING, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_post_suspending(struct dm_target *ti)
{
	return dm_post_suspending_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_post_suspending);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
		io_front_pad = roundup(per_io_data_size, __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

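	/* Persistent reservations are only supported for single-target tables. */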
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
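/*
 * Register/unregister is propagated to every underlying device of the
 * target via its iterate_devices callout.
 */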
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
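		/* Registration failed on some path: roll back by unregistering new_key everywhere. */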
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.poll_bio = dm_poll_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct block_device_operations dm_rq_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.zero_page_range = dm_dax_zero_page_range,
};
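/*
 * Module hooks.
 */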
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

module_param(swap_bios, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");