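/*
 * Device-mapper core: routes bios submitted to a mapped_device to the
 * targets of its live dm_table (bio-based), or hands them off to the
 * request-based blk-mq path (see dm-rq.h).
 */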
8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/mm.h>
16#include <linux/sched/signal.h>
17#include <linux/blkpg.h>
18#include <linux/bio.h>
19#include <linux/mempool.h>
20#include <linux/dax.h>
21#include <linux/slab.h>
22#include <linux/idr.h>
23#include <linux/uio.h>
24#include <linux/hdreg.h>
25#include <linux/delay.h>
26#include <linux/wait.h>
27#include <linux/pr.h>
28#include <linux/refcount.h>
29#include <linux/part_stat.h>
30#include <linux/blk-crypto.h>
31
32#define DM_MSG_PREFIX "core"
33
34
35
36
37
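/*
 * DM_COOKIE is a uevent environment variable carrying a numeric cookie
 * that lets userspace (e.g. udev) correlate device events it triggered.
 */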
38#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
39#define DM_COOKIE_LENGTH 24
40
41static const char *_name = DM_NAME;
42
43static unsigned int major = 0;
44static unsigned int _major = 0;
45
46static DEFINE_IDR(_minor_idr);
47
48static DEFINE_SPINLOCK(_minor_lock);
49
50static void do_deferred_remove(struct work_struct *w);
51
52static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
53
54static struct workqueue_struct *deferred_remove_workqueue;
55
56atomic_t dm_global_event_nr = ATOMIC_INIT(0);
57DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
58
59void dm_issue_global_event(void)
60{
61 atomic_inc(&dm_global_event_nr);
62 wake_up(&dm_global_eventq);
63}
64
65
66
67
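/*
 * One of these is allocated on the stack (in __split_and_process_bio /
 * __process_bio) per original bio while it is being split and mapped.
 */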
68struct clone_info {
69 struct dm_table *map;
70 struct bio *bio;
71 struct dm_io *io;
72 sector_t sector;
73 unsigned sector_count;
74};
75
76
77
78
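/*
 * One of these is allocated per clone bio dispatched to a target.
 */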
79#define DM_TIO_MAGIC 7282014
80struct dm_target_io {
81 unsigned magic;
82 struct dm_io *io;
83 struct dm_target *ti;
84 unsigned target_bio_nr;
85 unsigned *len_ptr;
86 bool inside_dm_io;
87 struct bio clone;
88};
89
90
91
92
93
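/*
 * One of these is allocated per original bio.  It embeds the first
 * dm_target_io/clone so the common single-clone case needs no extra
 * allocation (see alloc_io() and alloc_tio()).
 */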
94#define DM_IO_MAGIC 5191977
95struct dm_io {
96 unsigned magic;
97 struct mapped_device *md;
98 blk_status_t status;
99 atomic_t io_count;
100 struct bio *orig_bio;
101 unsigned long start_time;
102 spinlock_t endio_lock;
103 struct dm_stats_aux stats_aux;
104
105 struct dm_target_io tio;
106};
107
108void *dm_per_bio_data(struct bio *bio, size_t data_size)
109{
110 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
111 if (!tio->inside_dm_io)
112 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
113 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
114}
115EXPORT_SYMBOL_GPL(dm_per_bio_data);
116
117struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
118{
119 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
120 if (io->magic == DM_IO_MAGIC)
121 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
122 BUG_ON(io->magic != DM_TIO_MAGIC);
123 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
124}
125EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
126
127unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
128{
129 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
130}
131EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
132
133#define MINOR_ALLOCED ((void *)-1)
134
135
136
137
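/*
 * Bits for the md->flags field.
 */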
138#define DMF_BLOCK_IO_FOR_SUSPEND 0
139#define DMF_SUSPENDED 1
140#define DMF_FROZEN 2
141#define DMF_FREEING 3
142#define DMF_DELETING 4
143#define DMF_NOFLUSH_SUSPENDING 5
144#define DMF_DEFERRED_REMOVE 6
145#define DMF_SUSPENDED_INTERNALLY 7
146#define DMF_POST_SUSPENDING 8
147
148#define DM_NUMA_NODE NUMA_NO_NODE
149static int dm_numa_node = DM_NUMA_NODE;
150
151
152
153
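/*
 * Biosets used for cloning bios; built alongside a table and transferred
 * to the mapped_device by __bind_mempools().
 */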
154struct dm_md_mempools {
155 struct bio_set bs;
156 struct bio_set io_bs;
157};
158
159struct table_device {
160 struct list_head list;
161 refcount_t count;
162 struct dm_dev dm_dev;
163};
164
165
166
167
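/*
 * Default number of IOs reserved for bio-based DM's biosets, clamped to
 * DM_RESERVED_MAX_IOS by dm_get_reserved_bio_based_ios().
 */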
168#define RESERVED_BIO_BASED_IOS 16
169static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
170
171static int __dm_get_module_param_int(int *module_param, int min, int max)
172{
173 int param = READ_ONCE(*module_param);
174 int modified_param = 0;
175 bool modified = true;
176
177 if (param < min)
178 modified_param = min;
179 else if (param > max)
180 modified_param = max;
181 else
182 modified = false;
183
184 if (modified) {
185 (void)cmpxchg(module_param, param, modified_param);
186 param = modified_param;
187 }
188
189 return param;
190}
191
192unsigned __dm_get_module_param(unsigned *module_param,
193 unsigned def, unsigned max)
194{
195 unsigned param = READ_ONCE(*module_param);
196 unsigned modified_param = 0;
197
198 if (!param)
199 modified_param = def;
200 else if (param > max)
201 modified_param = max;
202
203 if (modified_param) {
204 (void)cmpxchg(module_param, param, modified_param);
205 param = modified_param;
206 }
207
208 return param;
209}
210
211unsigned dm_get_reserved_bio_based_ios(void)
212{
213 return __dm_get_module_param(&reserved_bio_based_ios,
214 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
215}
216EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
217
218static unsigned dm_get_numa_node(void)
219{
220 return __dm_get_module_param_int(&dm_numa_node,
221 DM_NUMA_NODE, num_online_nodes() - 1);
222}
223
224static int __init local_init(void)
225{
226 int r;
227
228 r = dm_uevent_init();
229 if (r)
230 return r;
231
232 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
233 if (!deferred_remove_workqueue) {
234 r = -ENOMEM;
235 goto out_uevent_exit;
236 }
237
238 _major = major;
239 r = register_blkdev(_major, _name);
240 if (r < 0)
241 goto out_free_workqueue;
242
243 if (!_major)
244 _major = r;
245
246 return 0;
247
248out_free_workqueue:
249 destroy_workqueue(deferred_remove_workqueue);
250out_uevent_exit:
251 dm_uevent_exit();
252
253 return r;
254}
255
256static void local_exit(void)
257{
258 flush_scheduled_work();
259 destroy_workqueue(deferred_remove_workqueue);
260
261 unregister_blkdev(_major, _name);
262 dm_uevent_exit();
263
264 _major = 0;
265
266 DMINFO("cleaned up");
267}
268
269static int (*_inits[])(void) __initdata = {
270 local_init,
271 dm_target_init,
272 dm_linear_init,
273 dm_stripe_init,
274 dm_io_init,
275 dm_kcopyd_init,
276 dm_interface_init,
277 dm_statistics_init,
278};
279
280static void (*_exits[])(void) = {
281 local_exit,
282 dm_target_exit,
283 dm_linear_exit,
284 dm_stripe_exit,
285 dm_io_exit,
286 dm_kcopyd_exit,
287 dm_interface_exit,
288 dm_statistics_exit,
289};
290
291static int __init dm_init(void)
292{
293 const int count = ARRAY_SIZE(_inits);
294
295 int r, i;
296
297 for (i = 0; i < count; i++) {
298 r = _inits[i]();
299 if (r)
300 goto bad;
301 }
302
303 return 0;
304
305 bad:
306 while (i--)
307 _exits[i]();
308
309 return r;
310}
311
312static void __exit dm_exit(void)
313{
314 int i = ARRAY_SIZE(_exits);
315
316 while (i--)
317 _exits[i]();
318
319
320
321
322 idr_destroy(&_minor_idr);
323}
324
325
326
327
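/*
 * Block device open/close and deletion bookkeeping.
 */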
328int dm_deleting_md(struct mapped_device *md)
329{
330 return test_bit(DMF_DELETING, &md->flags);
331}
332
333static int dm_blk_open(struct block_device *bdev, fmode_t mode)
334{
335 struct mapped_device *md;
336
337 spin_lock(&_minor_lock);
338
339 md = bdev->bd_disk->private_data;
340 if (!md)
341 goto out;
342
343 if (test_bit(DMF_FREEING, &md->flags) ||
344 dm_deleting_md(md)) {
345 md = NULL;
346 goto out;
347 }
348
349 dm_get(md);
350 atomic_inc(&md->open_count);
351out:
352 spin_unlock(&_minor_lock);
353
354 return md ? 0 : -ENXIO;
355}
356
357static void dm_blk_close(struct gendisk *disk, fmode_t mode)
358{
359 struct mapped_device *md;
360
361 spin_lock(&_minor_lock);
362
363 md = disk->private_data;
364 if (WARN_ON(!md))
365 goto out;
366
367 if (atomic_dec_and_test(&md->open_count) &&
368 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
369 queue_work(deferred_remove_workqueue, &deferred_remove_work);
370
371 dm_put(md);
372out:
373 spin_unlock(&_minor_lock);
374}
375
376int dm_open_count(struct mapped_device *md)
377{
378 return atomic_read(&md->open_count);
379}
380
381
382
383
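/*
 * Guarantees nothing is using the device before it's deleted (i.e. no
 * opens), or marks it for deferred removal if it is still open.
 */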
384int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
385{
386 int r = 0;
387
388 spin_lock(&_minor_lock);
389
390 if (dm_open_count(md)) {
391 r = -EBUSY;
392 if (mark_deferred)
393 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
394 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
395 r = -EEXIST;
396 else
397 set_bit(DMF_DELETING, &md->flags);
398
399 spin_unlock(&_minor_lock);
400
401 return r;
402}
403
404int dm_cancel_deferred_remove(struct mapped_device *md)
405{
406 int r = 0;
407
408 spin_lock(&_minor_lock);
409
410 if (test_bit(DMF_DELETING, &md->flags))
411 r = -EBUSY;
412 else
413 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
414
415 spin_unlock(&_minor_lock);
416
417 return r;
418}
419
420static void do_deferred_remove(struct work_struct *w)
421{
422 dm_deferred_remove();
423}
424
425sector_t dm_get_size(struct mapped_device *md)
426{
427 return get_capacity(md->disk);
428}
429
430struct request_queue *dm_get_md_queue(struct mapped_device *md)
431{
432 return md->queue;
433}
434
435struct dm_stats *dm_get_stats(struct mapped_device *md)
436{
437 return &md->stats;
438}
439
440static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
441{
442 struct mapped_device *md = bdev->bd_disk->private_data;
443
444 return dm_get_geometry(md, geo);
445}
446
447#ifdef CONFIG_BLK_DEV_ZONED
448int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
449{
450 struct dm_report_zones_args *args = data;
451 sector_t sector_diff = args->tgt->begin - args->start;
452
453
454
455
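 /* Ignore zones beyond the target's range. */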
456 if (zone->start >= args->start + args->tgt->len)
457 return 0;
458
459
460
461
462
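 /*
  * Remap the start sector and write pointer position of the zone to
  * match its position within the target.
  */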
463 zone->start += sector_diff;
464 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
465 if (zone->cond == BLK_ZONE_COND_FULL)
466 zone->wp = zone->start + zone->len;
467 else if (zone->cond == BLK_ZONE_COND_EMPTY)
468 zone->wp = zone->start;
469 else
470 zone->wp += sector_diff;
471 }
472
473 args->next_sector = zone->start + zone->len;
474 return args->orig_cb(zone, args->zone_idx++, args->orig_data);
475}
476EXPORT_SYMBOL_GPL(dm_report_zones_cb);
477
478static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
479 unsigned int nr_zones, report_zones_cb cb, void *data)
480{
481 struct mapped_device *md = disk->private_data;
482 struct dm_table *map;
483 int srcu_idx, ret;
484 struct dm_report_zones_args args = {
485 .next_sector = sector,
486 .orig_data = data,
487 .orig_cb = cb,
488 };
489
490 if (dm_suspended_md(md))
491 return -EAGAIN;
492
493 map = dm_get_live_table(md, &srcu_idx);
494 if (!map)
495 return -EIO;
496
497 do {
498 struct dm_target *tgt;
499
500 tgt = dm_table_find_target(map, args.next_sector);
501 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
502 ret = -EIO;
503 goto out;
504 }
505
506 args.tgt = tgt;
507 ret = tgt->type->report_zones(tgt, &args,
508 nr_zones - args.zone_idx);
509 if (ret < 0)
510 goto out;
511 } while (args.zone_idx < nr_zones &&
512 args.next_sector < get_capacity(disk));
513
514 ret = args.zone_idx;
515out:
516 dm_put_live_table(md, srcu_idx);
517 return ret;
518}
519#else
520#define dm_blk_report_zones NULL
521#endif
522
523static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
524 struct block_device **bdev)
525 __acquires(md->io_barrier)
526{
527 struct dm_target *tgt;
528 struct dm_table *map;
529 int r;
530
531retry:
532 r = -ENOTTY;
533 map = dm_get_live_table(md, srcu_idx);
534 if (!map || !dm_table_get_size(map))
535 return r;
536
537
538 if (dm_table_get_num_targets(map) != 1)
539 return r;
540
541 tgt = dm_table_get_target(map, 0);
542 if (!tgt->type->prepare_ioctl)
543 return r;
544
545 if (dm_suspended_md(md))
546 return -EAGAIN;
547
548 r = tgt->type->prepare_ioctl(tgt, bdev);
549 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
550 dm_put_live_table(md, *srcu_idx);
551 msleep(10);
552 goto retry;
553 }
554
555 return r;
556}
557
558static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
559 __releases(md->io_barrier)
560{
561 dm_put_live_table(md, srcu_idx);
562}
563
564static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
565 unsigned int cmd, unsigned long arg)
566{
567 struct mapped_device *md = bdev->bd_disk->private_data;
568 int r, srcu_idx;
569
570 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
571 if (r < 0)
572 goto out;
573
574 if (r > 0) {
575
576
577
578
579 if (!capable(CAP_SYS_RAWIO)) {
580 DMWARN_LIMIT(
581 "%s: sending ioctl %x to DM device without required privilege.",
582 current->comm, cmd);
583 r = -ENOIOCTLCMD;
584 goto out;
585 }
586 }
587
588 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
589out:
590 dm_unprepare_ioctl(md, srcu_idx);
591 return r;
592}
593
594static void start_io_acct(struct dm_io *io);
595
596static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
597{
598 struct dm_io *io;
599 struct dm_target_io *tio;
600 struct bio *clone;
601
602 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
603 if (!clone)
604 return NULL;
605
606 tio = container_of(clone, struct dm_target_io, clone);
607 tio->inside_dm_io = true;
608 tio->io = NULL;
609
610 io = container_of(tio, struct dm_io, tio);
611 io->magic = DM_IO_MAGIC;
612 io->status = 0;
613 atomic_set(&io->io_count, 1);
614 io->orig_bio = bio;
615 io->md = md;
616 spin_lock_init(&io->endio_lock);
617
618 start_io_acct(io);
619
620 return io;
621}
622
623static void free_io(struct mapped_device *md, struct dm_io *io)
624{
625 bio_put(&io->tio.clone);
626}
627
628static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
629 unsigned target_bio_nr, gfp_t gfp_mask)
630{
631 struct dm_target_io *tio;
632
633 if (!ci->io->tio.io) {
634
635 tio = &ci->io->tio;
636 } else {
637 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
638 if (!clone)
639 return NULL;
640
641 tio = container_of(clone, struct dm_target_io, clone);
642 tio->inside_dm_io = false;
643 }
644
645 tio->magic = DM_TIO_MAGIC;
646 tio->io = ci->io;
647 tio->ti = ti;
648 tio->target_bio_nr = target_bio_nr;
649
650 return tio;
651}
652
653static void free_tio(struct dm_target_io *tio)
654{
655 if (tio->inside_dm_io)
656 return;
657 bio_put(&tio->clone);
658}
659
660u64 dm_start_time_ns_from_clone(struct bio *bio)
661{
662 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
663 struct dm_io *io = tio->io;
664
665 return jiffies_to_nsecs(io->start_time);
666}
667EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
668
669static void start_io_acct(struct dm_io *io)
670{
671 struct mapped_device *md = io->md;
672 struct bio *bio = io->orig_bio;
673
674 io->start_time = bio_start_io_acct(bio);
675 if (unlikely(dm_stats_used(&md->stats)))
676 dm_stats_account_io(&md->stats, bio_data_dir(bio),
677 bio->bi_iter.bi_sector, bio_sectors(bio),
678 false, 0, &io->stats_aux);
679}
680
681static void end_io_acct(struct dm_io *io)
682{
683 struct mapped_device *md = io->md;
684 struct bio *bio = io->orig_bio;
685 unsigned long duration = jiffies - io->start_time;
686
687 bio_end_io_acct(bio, io->start_time);
688
689 if (unlikely(dm_stats_used(&md->stats)))
690 dm_stats_account_io(&md->stats, bio_data_dir(bio),
691 bio->bi_iter.bi_sector, bio_sectors(bio),
692 true, duration, &io->stats_aux);
693
694
695 if (unlikely(wq_has_sleeper(&md->wait)))
696 wake_up(&md->wait);
697}
698
699
700
701
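/*
 * Add the bio to the deferred list for later processing by dm_wq_work().
 */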
702static void queue_io(struct mapped_device *md, struct bio *bio)
703{
704 unsigned long flags;
705
706 spin_lock_irqsave(&md->deferred_lock, flags);
707 bio_list_add(&md->deferred, bio);
708 spin_unlock_irqrestore(&md->deferred_lock, flags);
709 queue_work(md->wq, &md->work);
710}
711
712
713
714
715
716
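/*
 * Everyone (including functions in this file) should use this accessor
 * to read md->map and must call dm_put_live_table() when finished.
 */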
717struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
718{
719 *srcu_idx = srcu_read_lock(&md->io_barrier);
720
721 return srcu_dereference(md->map, &md->io_barrier);
722}
723
724void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
725{
726 srcu_read_unlock(&md->io_barrier, srcu_idx);
727}
728
729void dm_sync_table(struct mapped_device *md)
730{
731 synchronize_srcu(&md->io_barrier);
732 synchronize_rcu_expedited();
733}
734
735
736
737
738
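/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */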
739static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
740{
741 rcu_read_lock();
742 return rcu_dereference(md->map);
743}
744
745static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
746{
747 rcu_read_unlock();
748}
749
750static char *_dm_claim_ptr = "I belong to device-mapper";
751
752
753
754
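/*
 * Open a table device so we can use it as a map destination.
 */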
755static int open_table_device(struct table_device *td, dev_t dev,
756 struct mapped_device *md)
757{
758 struct block_device *bdev;
759
760 int r;
761
762 BUG_ON(td->dm_dev.bdev);
763
764 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
765 if (IS_ERR(bdev))
766 return PTR_ERR(bdev);
767
768 r = bd_link_disk_holder(bdev, dm_disk(md));
769 if (r) {
770 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
771 return r;
772 }
773
774 td->dm_dev.bdev = bdev;
775 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
776 return 0;
777}
778
779
780
781
782static void close_table_device(struct table_device *td, struct mapped_device *md)
783{
784 if (!td->dm_dev.bdev)
785 return;
786
787 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
788 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
789 put_dax(td->dm_dev.dax_dev);
790 td->dm_dev.bdev = NULL;
791 td->dm_dev.dax_dev = NULL;
792}
793
794static struct table_device *find_table_device(struct list_head *l, dev_t dev,
795 fmode_t mode)
796{
797 struct table_device *td;
798
799 list_for_each_entry(td, l, list)
800 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
801 return td;
802
803 return NULL;
804}
805
806int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
807 struct dm_dev **result)
808{
809 int r;
810 struct table_device *td;
811
812 mutex_lock(&md->table_devices_lock);
813 td = find_table_device(&md->table_devices, dev, mode);
814 if (!td) {
815 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
816 if (!td) {
817 mutex_unlock(&md->table_devices_lock);
818 return -ENOMEM;
819 }
820
821 td->dm_dev.mode = mode;
822 td->dm_dev.bdev = NULL;
823
824 if ((r = open_table_device(td, dev, md))) {
825 mutex_unlock(&md->table_devices_lock);
826 kfree(td);
827 return r;
828 }
829
830 format_dev_t(td->dm_dev.name, dev);
831
832 refcount_set(&td->count, 1);
833 list_add(&td->list, &md->table_devices);
834 } else {
835 refcount_inc(&td->count);
836 }
837 mutex_unlock(&md->table_devices_lock);
838
839 *result = &td->dm_dev;
840 return 0;
841}
842EXPORT_SYMBOL_GPL(dm_get_table_device);
843
844void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
845{
846 struct table_device *td = container_of(d, struct table_device, dm_dev);
847
848 mutex_lock(&md->table_devices_lock);
849 if (refcount_dec_and_test(&td->count)) {
850 close_table_device(td, md);
851 list_del(&td->list);
852 kfree(td);
853 }
854 mutex_unlock(&md->table_devices_lock);
855}
856EXPORT_SYMBOL(dm_put_table_device);
857
858static void free_table_devices(struct list_head *devices)
859{
860 struct list_head *tmp, *next;
861
862 list_for_each_safe(tmp, next, devices) {
863 struct table_device *td = list_entry(tmp, struct table_device, list);
864
865 DMWARN("dm_destroy: %s still exists with %d references",
866 td->dm_dev.name, refcount_read(&td->count));
867 kfree(td);
868 }
869}
870
871
872
873
874int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
875{
876 *geo = md->geometry;
877
878 return 0;
879}
880
881
882
883
884int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
885{
886 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
887
888 if (geo->start > sz) {
889 DMWARN("Start sector is beyond the geometry limits.");
890 return -EINVAL;
891 }
892
893 md->geometry = *geo;
894
895 return 0;
896}
897
898static int __noflush_suspending(struct mapped_device *md)
899{
900 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
901}
902
903
904
905
906
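/*
 * Decrements the number of outstanding ios that a bio has been cloned
 * into, completing the original io if necessary.
 */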
907static void dec_pending(struct dm_io *io, blk_status_t error)
908{
909 unsigned long flags;
910 blk_status_t io_error;
911 struct bio *bio;
912 struct mapped_device *md = io->md;
913
914
915 if (unlikely(error)) {
916 spin_lock_irqsave(&io->endio_lock, flags);
917 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
918 io->status = error;
919 spin_unlock_irqrestore(&io->endio_lock, flags);
920 }
921
922 if (atomic_dec_and_test(&io->io_count)) {
923 if (io->status == BLK_STS_DM_REQUEUE) {
924
925
926
927 spin_lock_irqsave(&md->deferred_lock, flags);
928 if (__noflush_suspending(md))
929
930 bio_list_add_head(&md->deferred, io->orig_bio);
931 else
932
933 io->status = BLK_STS_IOERR;
934 spin_unlock_irqrestore(&md->deferred_lock, flags);
935 }
936
937 io_error = io->status;
938 bio = io->orig_bio;
939 end_io_acct(io);
940 free_io(md, io);
941
942 if (io_error == BLK_STS_DM_REQUEUE)
943 return;
944
945 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
946
947
948
949
950 bio->bi_opf &= ~REQ_PREFLUSH;
951 queue_io(md, bio);
952 } else {
953
954 if (io_error)
955 bio->bi_status = io_error;
956 bio_endio(bio);
957 }
958 }
959}
960
961void disable_discard(struct mapped_device *md)
962{
963 struct queue_limits *limits = dm_get_queue_limits(md);
964
965
966 limits->max_discard_sectors = 0;
967 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
968}
969
970void disable_write_same(struct mapped_device *md)
971{
972 struct queue_limits *limits = dm_get_queue_limits(md);
973
974
975 limits->max_write_same_sectors = 0;
976}
977
978void disable_write_zeroes(struct mapped_device *md)
979{
980 struct queue_limits *limits = dm_get_queue_limits(md);
981
982
983 limits->max_write_zeroes_sectors = 0;
984}
985
986static void clone_endio(struct bio *bio)
987{
988 blk_status_t error = bio->bi_status;
989 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
990 struct dm_io *io = tio->io;
991 struct mapped_device *md = tio->io->md;
992 dm_endio_fn endio = tio->ti->type->end_io;
993 struct bio *orig_bio = io->orig_bio;
994
995 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
996 if (bio_op(bio) == REQ_OP_DISCARD &&
997 !bio->bi_disk->queue->limits.max_discard_sectors)
998 disable_discard(md);
999 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
1000 !bio->bi_disk->queue->limits.max_write_same_sectors)
1001 disable_write_same(md);
1002 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1003 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
1004 disable_write_zeroes(md);
1005 }
1006
1007
1008
1009
1010
1011 if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
1012 sector_t written_sector = bio->bi_iter.bi_sector;
1013 struct request_queue *q = orig_bio->bi_disk->queue;
1014 u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
1015
1016 orig_bio->bi_iter.bi_sector += written_sector & mask;
1017 }
1018
1019 if (endio) {
1020 int r = endio(tio->ti, bio, &error);
1021 switch (r) {
1022 case DM_ENDIO_REQUEUE:
1023 error = BLK_STS_DM_REQUEUE;
1024 fallthrough;
1025 case DM_ENDIO_DONE:
1026 break;
1027 case DM_ENDIO_INCOMPLETE:
1028
1029 return;
1030 default:
1031 DMWARN("unimplemented target endio return value: %d", r);
1032 BUG();
1033 }
1034 }
1035
1036 free_tio(tio);
1037 dec_pending(io, error);
1038}
1039
1040
1041
1042
1043
1044static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1045{
1046 sector_t target_offset = dm_target_offset(ti, sector);
1047
1048 return ti->len - target_offset;
1049}
1050
1051static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1052{
1053 sector_t len = max_io_len_target_boundary(sector, ti);
1054 sector_t offset, max_len;
1055
1056
1057
1058
1059 if (ti->max_io_len) {
1060 offset = dm_target_offset(ti, sector);
1061 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1062 max_len = sector_div(offset, ti->max_io_len);
1063 else
1064 max_len = offset & (ti->max_io_len - 1);
1065 max_len = ti->max_io_len - max_len;
1066
1067 if (len > max_len)
1068 len = max_len;
1069 }
1070
1071 return len;
1072}
1073
1074int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1075{
1076 if (len > UINT_MAX) {
1077 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1078 (unsigned long long)len, UINT_MAX);
1079 ti->error = "Maximum size of target IO is too large";
1080 return -EINVAL;
1081 }
1082
1083 ti->max_io_len = (uint32_t) len;
1084
1085 return 0;
1086}
1087EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1088
1089static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1090 sector_t sector, int *srcu_idx)
1091 __acquires(md->io_barrier)
1092{
1093 struct dm_table *map;
1094 struct dm_target *ti;
1095
1096 map = dm_get_live_table(md, srcu_idx);
1097 if (!map)
1098 return NULL;
1099
1100 ti = dm_table_find_target(map, sector);
1101 if (!ti)
1102 return NULL;
1103
1104 return ti;
1105}
1106
1107static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1108 long nr_pages, void **kaddr, pfn_t *pfn)
1109{
1110 struct mapped_device *md = dax_get_private(dax_dev);
1111 sector_t sector = pgoff * PAGE_SECTORS;
1112 struct dm_target *ti;
1113 long len, ret = -EIO;
1114 int srcu_idx;
1115
1116 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1117
1118 if (!ti)
1119 goto out;
1120 if (!ti->type->direct_access)
1121 goto out;
1122 len = max_io_len(sector, ti) / PAGE_SECTORS;
1123 if (len < 1)
1124 goto out;
1125 nr_pages = min(len, nr_pages);
1126 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1127
1128 out:
1129 dm_put_live_table(md, srcu_idx);
1130
1131 return ret;
1132}
1133
1134static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1135 int blocksize, sector_t start, sector_t len)
1136{
1137 struct mapped_device *md = dax_get_private(dax_dev);
1138 struct dm_table *map;
1139 bool ret = false;
1140 int srcu_idx;
1141
1142 map = dm_get_live_table(md, &srcu_idx);
1143 if (!map)
1144 goto out;
1145
1146 ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1147
1148out:
1149 dm_put_live_table(md, srcu_idx);
1150
1151 return ret;
1152}
1153
1154static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1155 void *addr, size_t bytes, struct iov_iter *i)
1156{
1157 struct mapped_device *md = dax_get_private(dax_dev);
1158 sector_t sector = pgoff * PAGE_SECTORS;
1159 struct dm_target *ti;
1160 long ret = 0;
1161 int srcu_idx;
1162
1163 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1164
1165 if (!ti)
1166 goto out;
1167 if (!ti->type->dax_copy_from_iter) {
1168 ret = copy_from_iter(addr, bytes, i);
1169 goto out;
1170 }
1171 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1172 out:
1173 dm_put_live_table(md, srcu_idx);
1174
1175 return ret;
1176}
1177
1178static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1179 void *addr, size_t bytes, struct iov_iter *i)
1180{
1181 struct mapped_device *md = dax_get_private(dax_dev);
1182 sector_t sector = pgoff * PAGE_SECTORS;
1183 struct dm_target *ti;
1184 long ret = 0;
1185 int srcu_idx;
1186
1187 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1188
1189 if (!ti)
1190 goto out;
1191 if (!ti->type->dax_copy_to_iter) {
1192 ret = copy_to_iter(addr, bytes, i);
1193 goto out;
1194 }
1195 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1196 out:
1197 dm_put_live_table(md, srcu_idx);
1198
1199 return ret;
1200}
1201
1202static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1203 size_t nr_pages)
1204{
1205 struct mapped_device *md = dax_get_private(dax_dev);
1206 sector_t sector = pgoff * PAGE_SECTORS;
1207 struct dm_target *ti;
1208 int ret = -EIO;
1209 int srcu_idx;
1210
1211 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1212
1213 if (!ti)
1214 goto out;
1215 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1216
1217
1218
1219
1220 dm_put_live_table(md, srcu_idx);
1221 goto out;
1222 }
1223 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1224
1225 out:
1226 dm_put_live_table(md, srcu_idx);
1227
1228 return ret;
1229}
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
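/*
 * A target may call dm_accept_partial_bio() from its map routine to tell
 * DM it will only process the first n_sectors of the clone; the remainder
 * is re-issued in a subsequent bio.  It must not be used on REQ_PREFLUSH
 * bios (see the BUG_ON()s below).
 */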
1260void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1261{
1262 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1263 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1264 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1265 BUG_ON(bi_size > *tio->len_ptr);
1266 BUG_ON(n_sectors > bi_size);
1267 *tio->len_ptr -= bi_size - n_sectors;
1268 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1269}
1270EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1271
1272static blk_qc_t __map_bio(struct dm_target_io *tio)
1273{
1274 int r;
1275 sector_t sector;
1276 struct bio *clone = &tio->clone;
1277 struct dm_io *io = tio->io;
1278 struct dm_target *ti = tio->ti;
1279 blk_qc_t ret = BLK_QC_T_NONE;
1280
1281 clone->bi_end_io = clone_endio;
1282
1283
1284
1285
1286
1287
1288 atomic_inc(&io->io_count);
1289 sector = clone->bi_iter.bi_sector;
1290
1291 r = ti->type->map(ti, clone);
1292 switch (r) {
1293 case DM_MAPIO_SUBMITTED:
1294 break;
1295 case DM_MAPIO_REMAPPED:
1296
1297 trace_block_bio_remap(clone->bi_disk->queue, clone,
1298 bio_dev(io->orig_bio), sector);
1299 ret = submit_bio_noacct(clone);
1300 break;
1301 case DM_MAPIO_KILL:
1302 free_tio(tio);
1303 dec_pending(io, BLK_STS_IOERR);
1304 break;
1305 case DM_MAPIO_REQUEUE:
1306 free_tio(tio);
1307 dec_pending(io, BLK_STS_DM_REQUEUE);
1308 break;
1309 default:
1310 DMWARN("unimplemented target map return value: %d", r);
1311 BUG();
1312 }
1313
1314 return ret;
1315}
1316
1317static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1318{
1319 bio->bi_iter.bi_sector = sector;
1320 bio->bi_iter.bi_size = to_bytes(len);
1321}
1322
1323
1324
1325
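/*
 * Creates a clone covering a range of complete bvecs of the original bio.
 */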
1326static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1327 sector_t sector, unsigned len)
1328{
1329 struct bio *clone = &tio->clone;
1330
1331 __bio_clone_fast(clone, bio);
1332
1333 bio_crypt_clone(clone, bio, GFP_NOIO);
1334
1335 if (bio_integrity(bio)) {
1336 int r;
1337
1338 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1339 !dm_target_passes_integrity(tio->ti->type))) {
1340 DMWARN("%s: the target %s doesn't support integrity data.",
1341 dm_device_name(tio->io->md),
1342 tio->ti->type->name);
1343 return -EIO;
1344 }
1345
1346 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1347 if (r < 0)
1348 return r;
1349 }
1350
1351 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1352 clone->bi_iter.bi_size = to_bytes(len);
1353
1354 if (bio_integrity(bio))
1355 bio_integrity_trim(clone);
1356
1357 return 0;
1358}
1359
1360static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1361 struct dm_target *ti, unsigned num_bios)
1362{
1363 struct dm_target_io *tio;
1364 int try;
1365
1366 if (!num_bios)
1367 return;
1368
1369 if (num_bios == 1) {
1370 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1371 bio_list_add(blist, &tio->clone);
1372 return;
1373 }
1374
1375 for (try = 0; try < 2; try++) {
1376 int bio_nr;
1377 struct bio *bio;
1378
1379 if (try)
1380 mutex_lock(&ci->io->md->table_devices_lock);
1381 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1382 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1383 if (!tio)
1384 break;
1385
1386 bio_list_add(blist, &tio->clone);
1387 }
1388 if (try)
1389 mutex_unlock(&ci->io->md->table_devices_lock);
1390 if (bio_nr == num_bios)
1391 return;
1392
1393 while ((bio = bio_list_pop(blist))) {
1394 tio = container_of(bio, struct dm_target_io, clone);
1395 free_tio(tio);
1396 }
1397 }
1398}
1399
1400static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1401 struct dm_target_io *tio, unsigned *len)
1402{
1403 struct bio *clone = &tio->clone;
1404
1405 tio->len_ptr = len;
1406
1407 __bio_clone_fast(clone, ci->bio);
1408 if (len)
1409 bio_setup_sector(clone, ci->sector, *len);
1410
1411 return __map_bio(tio);
1412}
1413
1414static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1415 unsigned num_bios, unsigned *len)
1416{
1417 struct bio_list blist = BIO_EMPTY_LIST;
1418 struct bio *bio;
1419 struct dm_target_io *tio;
1420
1421 alloc_multiple_bios(&blist, ci, ti, num_bios);
1422
1423 while ((bio = bio_list_pop(&blist))) {
1424 tio = container_of(bio, struct dm_target_io, clone);
1425 (void) __clone_and_map_simple_bio(ci, tio, len);
1426 }
1427}
1428
1429static int __send_empty_flush(struct clone_info *ci)
1430{
1431 unsigned target_nr = 0;
1432 struct dm_target *ti;
1433
1434
1435
1436
1437
1438
1439
1440
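 /*
  * The empty flush is cloned from a statically initialized bio, so
  * associate it with the mapped device's cached bdev here; blkg
  * association needs an opened bdev, which is not available at
  * alloc_dev() time.
  */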
1441 bio_set_dev(ci->bio, ci->io->md->bdev);
1442
1443 BUG_ON(bio_has_data(ci->bio));
1444 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1445 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1446 return 0;
1447}
1448
1449static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1450 sector_t sector, unsigned *len)
1451{
1452 struct bio *bio = ci->bio;
1453 struct dm_target_io *tio;
1454 int r;
1455
1456 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1457 tio->len_ptr = len;
1458 r = clone_bio(tio, bio, sector, *len);
1459 if (r < 0) {
1460 free_tio(tio);
1461 return r;
1462 }
1463 (void) __map_bio(tio);
1464
1465 return 0;
1466}
1467
1468typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1469
1470static unsigned get_num_discard_bios(struct dm_target *ti)
1471{
1472 return ti->num_discard_bios;
1473}
1474
1475static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1476{
1477 return ti->num_secure_erase_bios;
1478}
1479
1480static unsigned get_num_write_same_bios(struct dm_target *ti)
1481{
1482 return ti->num_write_same_bios;
1483}
1484
1485static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1486{
1487 return ti->num_write_zeroes_bios;
1488}
1489
1490static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1491 unsigned num_bios)
1492{
1493 unsigned len;
1494
1495
1496
1497
1498
1499
1500
1501 if (!num_bios)
1502 return -EOPNOTSUPP;
1503
1504 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1505
1506 __send_duplicate_bios(ci, ti, num_bios, &len);
1507
1508 ci->sector += len;
1509 ci->sector_count -= len;
1510
1511 return 0;
1512}
1513
1514static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1515{
1516 return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
1517}
1518
1519static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1520{
1521 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
1522}
1523
1524static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1525{
1526 return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
1527}
1528
1529static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1530{
1531 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
1532}
1533
1534static bool is_abnormal_io(struct bio *bio)
1535{
1536 bool r = false;
1537
1538 switch (bio_op(bio)) {
1539 case REQ_OP_DISCARD:
1540 case REQ_OP_SECURE_ERASE:
1541 case REQ_OP_WRITE_SAME:
1542 case REQ_OP_WRITE_ZEROES:
1543 r = true;
1544 break;
1545 }
1546
1547 return r;
1548}
1549
1550static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1551 int *result)
1552{
1553 struct bio *bio = ci->bio;
1554
1555 if (bio_op(bio) == REQ_OP_DISCARD)
1556 *result = __send_discard(ci, ti);
1557 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1558 *result = __send_secure_erase(ci, ti);
1559 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1560 *result = __send_write_same(ci, ti);
1561 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1562 *result = __send_write_zeroes(ci, ti);
1563 else
1564 return false;
1565
1566 return true;
1567}
1568
1569
1570
1571
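/*
 * Select the target for the next range of sectors and clone/map a bio to it.
 */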
1572static int __split_and_process_non_flush(struct clone_info *ci)
1573{
1574 struct dm_target *ti;
1575 unsigned len;
1576 int r;
1577
1578 ti = dm_table_find_target(ci->map, ci->sector);
1579 if (!ti)
1580 return -EIO;
1581
1582 if (__process_abnormal_io(ci, ti, &r))
1583 return r;
1584
1585 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1586
1587 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1588 if (r < 0)
1589 return r;
1590
1591 ci->sector += len;
1592 ci->sector_count -= len;
1593
1594 return 0;
1595}
1596
1597static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1598 struct dm_table *map, struct bio *bio)
1599{
1600 ci->map = map;
1601 ci->io = alloc_io(md, bio);
1602 ci->sector = bio->bi_iter.bi_sector;
1603}
1604
1605#define __dm_part_stat_sub(part, field, subnd) \
1606 (part_stat_get(part, field) -= (subnd))
1607
1608
1609
1610
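/*
 * Entry point for splitting a bio into clones and submitting them to the
 * targets of the live table.
 */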
1611static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1612 struct dm_table *map, struct bio *bio)
1613{
1614 struct clone_info ci;
1615 blk_qc_t ret = BLK_QC_T_NONE;
1616 int error = 0;
1617
1618 init_clone_info(&ci, md, map, bio);
1619
1620 if (bio->bi_opf & REQ_PREFLUSH) {
1621 struct bio flush_bio;
1622
1623
1624
1625
1626
1627
1628 bio_init(&flush_bio, NULL, 0);
1629 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1630 ci.bio = &flush_bio;
1631 ci.sector_count = 0;
1632 error = __send_empty_flush(&ci);
1633 bio_uninit(ci.bio);
1634
1635 } else if (op_is_zone_mgmt(bio_op(bio))) {
1636 ci.bio = bio;
1637 ci.sector_count = 0;
1638 error = __split_and_process_non_flush(&ci);
1639 } else {
1640 ci.bio = bio;
1641 ci.sector_count = bio_sectors(bio);
1642 while (ci.sector_count && !error) {
1643 error = __split_and_process_non_flush(&ci);
1644 if (current->bio_list && ci.sector_count && !error) {
1645
1646
1647
1648
1649
1650
1651
1652
1653 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1654 GFP_NOIO, &md->queue->bio_split);
1655 ci.io->orig_bio = b;
1656
1657
1658
1659
1660
1661
1662
1663
1664 part_stat_lock();
1665 __dm_part_stat_sub(&dm_disk(md)->part0,
1666 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1667 part_stat_unlock();
1668
1669 bio_chain(b, bio);
1670 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1671 ret = submit_bio_noacct(bio);
1672 break;
1673 }
1674 }
1675 }
1676
1677
1678 dec_pending(ci.io, errno_to_blk_status(error));
1679 return ret;
1680}
1681
1682
1683
1684
1685
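/*
 * Optimized variant of __split_and_process_bio for devices (e.g.
 * DM_TYPE_NVME_BIO_BASED) whose single immutable target never needs the
 * bio to be split.
 */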
1686static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
1687 struct bio *bio, struct dm_target *ti)
1688{
1689 struct clone_info ci;
1690 blk_qc_t ret = BLK_QC_T_NONE;
1691 int error = 0;
1692
1693 init_clone_info(&ci, md, map, bio);
1694
1695 if (bio->bi_opf & REQ_PREFLUSH) {
1696 struct bio flush_bio;
1697
1698
1699
1700
1701
1702
1703 bio_init(&flush_bio, NULL, 0);
1704 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1705 ci.bio = &flush_bio;
1706 ci.sector_count = 0;
1707 error = __send_empty_flush(&ci);
1708 bio_uninit(ci.bio);
1709
1710 } else {
1711 struct dm_target_io *tio;
1712
1713 ci.bio = bio;
1714 ci.sector_count = bio_sectors(bio);
1715 if (__process_abnormal_io(&ci, ti, &error))
1716 goto out;
1717
1718 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1719 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1720 }
1721out:
1722
1723 dec_pending(ci.io, errno_to_blk_status(error));
1724 return ret;
1725}
1726
1727static blk_qc_t dm_process_bio(struct mapped_device *md,
1728 struct dm_table *map, struct bio *bio)
1729{
1730 blk_qc_t ret = BLK_QC_T_NONE;
1731 struct dm_target *ti = md->immutable_target;
1732
1733 if (unlikely(!map)) {
1734 bio_io_error(bio);
1735 return ret;
1736 }
1737
1738 if (!ti) {
1739 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1740 if (unlikely(!ti)) {
1741 bio_io_error(bio);
1742 return ret;
1743 }
1744 }
1745
1746
1747
1748
1749
1750
1751
1752
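 /*
  * Abnormal bios (discard, secure erase, write same/zeroes) are split
  * here with blk_queue_split() so that queue_limits are honoured;
  * regular I/O is split by __split_and_process_bio().
  */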
1753 if (current->bio_list) {
1754 if (is_abnormal_io(bio))
1755 blk_queue_split(&bio);
1756
1757 }
1758
1759 if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1760 return __process_bio(md, map, bio, ti);
1761 return __split_and_process_bio(md, map, bio);
1762}
1763
1764static blk_qc_t dm_submit_bio(struct bio *bio)
1765{
1766 struct mapped_device *md = bio->bi_disk->private_data;
1767 blk_qc_t ret = BLK_QC_T_NONE;
1768 int srcu_idx;
1769 struct dm_table *map;
1770
1771 if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
1772
1773
1774
1775
1776
1777
1778
1779 percpu_ref_get(&bio->bi_disk->queue->q_usage_counter);
1780 return blk_mq_submit_bio(bio);
1781 }
1782
1783 map = dm_get_live_table(md, &srcu_idx);
1784
1785
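 /* If we're suspended, defer the bio (but fail readahead immediately). */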
1786 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1787 dm_put_live_table(md, srcu_idx);
1788
1789 if (!(bio->bi_opf & REQ_RAHEAD))
1790 queue_io(md, bio);
1791 else
1792 bio_io_error(bio);
1793 return ret;
1794 }
1795
1796 ret = dm_process_bio(md, map, bio);
1797
1798 dm_put_live_table(md, srcu_idx);
1799 return ret;
1800}
1801
1802
1803
1804
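/*
 * Minor number allocation: an IDR keeps track of allocated minors.
 */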
1805static void free_minor(int minor)
1806{
1807 spin_lock(&_minor_lock);
1808 idr_remove(&_minor_idr, minor);
1809 spin_unlock(&_minor_lock);
1810}
1811
1812
1813
1814
1815static int specific_minor(int minor)
1816{
1817 int r;
1818
1819 if (minor >= (1 << MINORBITS))
1820 return -EINVAL;
1821
1822 idr_preload(GFP_KERNEL);
1823 spin_lock(&_minor_lock);
1824
1825 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1826
1827 spin_unlock(&_minor_lock);
1828 idr_preload_end();
1829 if (r < 0)
1830 return r == -ENOSPC ? -EBUSY : r;
1831 return 0;
1832}
1833
1834static int next_free_minor(int *minor)
1835{
1836 int r;
1837
1838 idr_preload(GFP_KERNEL);
1839 spin_lock(&_minor_lock);
1840
1841 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1842
1843 spin_unlock(&_minor_lock);
1844 idr_preload_end();
1845 if (r < 0)
1846 return r;
1847 *minor = r;
1848 return 0;
1849}
1850
1851static const struct block_device_operations dm_blk_dops;
1852static const struct dax_operations dm_dax_ops;
1853
1854static void dm_wq_work(struct work_struct *work);
1855
1856static void cleanup_mapped_device(struct mapped_device *md)
1857{
1858 if (md->wq)
1859 destroy_workqueue(md->wq);
1860 bioset_exit(&md->bs);
1861 bioset_exit(&md->io_bs);
1862
1863 if (md->dax_dev) {
1864 kill_dax(md->dax_dev);
1865 put_dax(md->dax_dev);
1866 md->dax_dev = NULL;
1867 }
1868
1869 if (md->disk) {
1870 spin_lock(&_minor_lock);
1871 md->disk->private_data = NULL;
1872 spin_unlock(&_minor_lock);
1873 del_gendisk(md->disk);
1874 put_disk(md->disk);
1875 }
1876
1877 if (md->queue)
1878 blk_cleanup_queue(md->queue);
1879
1880 cleanup_srcu_struct(&md->io_barrier);
1881
1882 if (md->bdev) {
1883 bdput(md->bdev);
1884 md->bdev = NULL;
1885 }
1886
1887 mutex_destroy(&md->suspend_lock);
1888 mutex_destroy(&md->type_lock);
1889 mutex_destroy(&md->table_devices_lock);
1890
1891 dm_mq_cleanup_mapped_device(md);
1892}
1893
1894
1895
1896
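/*
 * Allocate and initialise a blank device with a given minor.
 */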
1897static struct mapped_device *alloc_dev(int minor)
1898{
1899 int r, numa_node_id = dm_get_numa_node();
1900 struct mapped_device *md;
1901 void *old_md;
1902
1903 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1904 if (!md) {
1905 DMWARN("unable to allocate device, out of memory.");
1906 return NULL;
1907 }
1908
1909 if (!try_module_get(THIS_MODULE))
1910 goto bad_module_get;
1911
1912
1913 if (minor == DM_ANY_MINOR)
1914 r = next_free_minor(&minor);
1915 else
1916 r = specific_minor(minor);
1917 if (r < 0)
1918 goto bad_minor;
1919
1920 r = init_srcu_struct(&md->io_barrier);
1921 if (r < 0)
1922 goto bad_io_barrier;
1923
1924 md->numa_node_id = numa_node_id;
1925 md->init_tio_pdu = false;
1926 md->type = DM_TYPE_NONE;
1927 mutex_init(&md->suspend_lock);
1928 mutex_init(&md->type_lock);
1929 mutex_init(&md->table_devices_lock);
1930 spin_lock_init(&md->deferred_lock);
1931 atomic_set(&md->holders, 1);
1932 atomic_set(&md->open_count, 0);
1933 atomic_set(&md->event_nr, 0);
1934 atomic_set(&md->uevent_seq, 0);
1935 INIT_LIST_HEAD(&md->uevent_list);
1936 INIT_LIST_HEAD(&md->table_devices);
1937 spin_lock_init(&md->uevent_lock);
1938
1939
1940
1941
1942
1943
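 /*
  * The queue is allocated here without a table type; dm_setup_md_queue()
  * finishes its initialization (blk-mq for request-based tables) once
  * the type is known.
  */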
1944 md->queue = blk_alloc_queue(numa_node_id);
1945 if (!md->queue)
1946 goto bad;
1947
1948 md->disk = alloc_disk_node(1, md->numa_node_id);
1949 if (!md->disk)
1950 goto bad;
1951
1952 init_waitqueue_head(&md->wait);
1953 INIT_WORK(&md->work, dm_wq_work);
1954 init_waitqueue_head(&md->eventq);
1955 init_completion(&md->kobj_holder.completion);
1956
1957 md->disk->major = _major;
1958 md->disk->first_minor = minor;
1959 md->disk->fops = &dm_blk_dops;
1960 md->disk->queue = md->queue;
1961 md->disk->private_data = md;
1962 sprintf(md->disk->disk_name, "dm-%d", minor);
1963
1964 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1965 md->dax_dev = alloc_dax(md, md->disk->disk_name,
1966 &dm_dax_ops, 0);
1967 if (IS_ERR(md->dax_dev))
1968 goto bad;
1969 }
1970
1971 add_disk_no_queue_reg(md->disk);
1972 format_dev_t(md->name, MKDEV(_major, minor));
1973
1974 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1975 if (!md->wq)
1976 goto bad;
1977
1978 md->bdev = bdget_disk(md->disk, 0);
1979 if (!md->bdev)
1980 goto bad;
1981
1982 dm_stats_init(&md->stats);
1983
1984
1985 spin_lock(&_minor_lock);
1986 old_md = idr_replace(&_minor_idr, md, minor);
1987 spin_unlock(&_minor_lock);
1988
1989 BUG_ON(old_md != MINOR_ALLOCED);
1990
1991 return md;
1992
1993bad:
1994 cleanup_mapped_device(md);
1995bad_io_barrier:
1996 free_minor(minor);
1997bad_minor:
1998 module_put(THIS_MODULE);
1999bad_module_get:
2000 kvfree(md);
2001 return NULL;
2002}
2003
2004static void unlock_fs(struct mapped_device *md);
2005
2006static void free_dev(struct mapped_device *md)
2007{
2008 int minor = MINOR(disk_devt(md->disk));
2009
2010 unlock_fs(md);
2011
2012 cleanup_mapped_device(md);
2013
2014 free_table_devices(&md->table_devices);
2015 dm_stats_cleanup(&md->stats);
2016 free_minor(minor);
2017
2018 module_put(THIS_MODULE);
2019 kvfree(md);
2020}
2021
2022static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2023{
2024 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2025 int ret = 0;
2026
2027 if (dm_table_bio_based(t)) {
2028
2029
2030
2031
2032
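 /*
  * For bio-based tables, always release the existing biosets so they
  * can be rebuilt from the new table's mempools (the required front_pad
  * may have changed).
  */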
2033 bioset_exit(&md->bs);
2034 bioset_exit(&md->io_bs);
2035
2036 } else if (bioset_initialized(&md->bs)) {
2037
2038
2039
2040
2041
2042
2043
2044
2045 goto out;
2046 }
2047
2048 BUG_ON(!p ||
2049 bioset_initialized(&md->bs) ||
2050 bioset_initialized(&md->io_bs));
2051
2052 ret = bioset_init_from_src(&md->bs, &p->bs);
2053 if (ret)
2054 goto out;
2055 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2056 if (ret)
2057 bioset_exit(&md->bs);
2058out:
2059
2060 dm_table_free_md_mempools(t);
2061 return ret;
2062}
2063
2064
2065
2066
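/*
 * Table event callback: deliver queued uevents and wake anyone waiting
 * on the event counters.
 */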
2067static void event_callback(void *context)
2068{
2069 unsigned long flags;
2070 LIST_HEAD(uevents);
2071 struct mapped_device *md = (struct mapped_device *) context;
2072
2073 spin_lock_irqsave(&md->uevent_lock, flags);
2074 list_splice_init(&md->uevent_list, &uevents);
2075 spin_unlock_irqrestore(&md->uevent_lock, flags);
2076
2077 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2078
2079 atomic_inc(&md->event_nr);
2080 wake_up(&md->eventq);
2081 dm_issue_global_event();
2082}
2083
2084
2085
2086
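/*
 * Protected by md->suspend_lock, obtained by dm_swap_table().
 */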
2087static void __set_size(struct mapped_device *md, sector_t size)
2088{
2089 lockdep_assert_held(&md->suspend_lock);
2090
2091 set_capacity(md->disk, size);
2092
2093 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2094}
2095
2096
2097
2098
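/*
 * Returns old map, which the caller must destroy.
 */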
2099static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2100 struct queue_limits *limits)
2101{
2102 struct dm_table *old_map;
2103 struct request_queue *q = md->queue;
2104 bool request_based = dm_table_request_based(t);
2105 sector_t size;
2106 int ret;
2107
2108 lockdep_assert_held(&md->suspend_lock);
2109
2110 size = dm_table_get_size(t);
2111
2112
2113
2114
2115 if (size != dm_get_size(md))
2116 memset(&md->geometry, 0, sizeof(md->geometry));
2117
2118 __set_size(md, size);
2119
2120 dm_table_event_callback(t, event_callback, md);
2121
2122
2123
2124
2125
2126
2127
2128
2129 if (request_based)
2130 dm_stop_queue(q);
2131
2132 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2133
2134
2135
2136
2137
2138
2139 md->immutable_target = dm_table_get_immutable_target(t);
2140 }
2141
2142 ret = __bind_mempools(md, t);
2143 if (ret) {
2144 old_map = ERR_PTR(ret);
2145 goto out;
2146 }
2147
2148 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2149 rcu_assign_pointer(md->map, (void *)t);
2150 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2151
2152 dm_table_set_restrictions(t, q, limits);
2153 if (old_map)
2154 dm_sync_table(md);
2155
2156out:
2157 return old_map;
2158}
2159
2160
2161
2162
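/*
 * Returns the unbound table for the caller to free.
 */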
2163static struct dm_table *__unbind(struct mapped_device *md)
2164{
2165 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2166
2167 if (!map)
2168 return NULL;
2169
2170 dm_table_event_callback(map, NULL, NULL);
2171 RCU_INIT_POINTER(md->map, NULL);
2172 dm_sync_table(md);
2173
2174 return map;
2175}
2176
2177
2178
2179
2180int dm_create(int minor, struct mapped_device **result)
2181{
2182 int r;
2183 struct mapped_device *md;
2184
2185 md = alloc_dev(minor);
2186 if (!md)
2187 return -ENXIO;
2188
2189 r = dm_sysfs_init(md);
2190 if (r) {
2191 free_dev(md);
2192 return r;
2193 }
2194
2195 *result = md;
2196 return 0;
2197}
2198
2199
2200
2201
2202
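/*
 * Functions to manage md->type; writers must hold md->type_lock.
 */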
2203void dm_lock_md_type(struct mapped_device *md)
2204{
2205 mutex_lock(&md->type_lock);
2206}
2207
2208void dm_unlock_md_type(struct mapped_device *md)
2209{
2210 mutex_unlock(&md->type_lock);
2211}
2212
2213void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2214{
2215 BUG_ON(!mutex_is_locked(&md->type_lock));
2216 md->type = type;
2217}
2218
2219enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2220{
2221 return md->type;
2222}
2223
2224struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2225{
2226 return md->immutable_target_type;
2227}
2228
2229
2230
2231
2232
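/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */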
2233struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2234{
2235 BUG_ON(!atomic_read(&md->holders));
2236 return &md->queue->limits;
2237}
2238EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2239
2240
2241
2242
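/*
 * Setup the DM device's queue based on md's type.
 */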
2243int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2244{
2245 int r;
2246 struct queue_limits limits;
2247 enum dm_queue_mode type = dm_get_md_type(md);
2248
2249 switch (type) {
2250 case DM_TYPE_REQUEST_BASED:
2251 r = dm_mq_init_request_queue(md, t);
2252 if (r) {
2253 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2254 return r;
2255 }
2256 break;
2257 case DM_TYPE_BIO_BASED:
2258 case DM_TYPE_DAX_BIO_BASED:
2259 case DM_TYPE_NVME_BIO_BASED:
2260 break;
2261 case DM_TYPE_NONE:
2262 WARN_ON_ONCE(true);
2263 break;
2264 }
2265
2266 r = dm_calculate_queue_limits(t, &limits);
2267 if (r) {
2268 DMERR("Cannot calculate initial queue limits");
2269 return r;
2270 }
2271 dm_table_set_restrictions(t, md->queue, &limits);
2272 blk_register_queue(md->disk);
2273
2274 return 0;
2275}
2276
2277struct mapped_device *dm_get_md(dev_t dev)
2278{
2279 struct mapped_device *md;
2280 unsigned minor = MINOR(dev);
2281
2282 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2283 return NULL;
2284
2285 spin_lock(&_minor_lock);
2286
2287 md = idr_find(&_minor_idr, minor);
2288 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2289 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2290 md = NULL;
2291 goto out;
2292 }
2293 dm_get(md);
2294out:
2295 spin_unlock(&_minor_lock);
2296
2297 return md;
2298}
2299EXPORT_SYMBOL_GPL(dm_get_md);
2300
2301void *dm_get_mdptr(struct mapped_device *md)
2302{
2303 return md->interface_ptr;
2304}
2305
2306void dm_set_mdptr(struct mapped_device *md, void *ptr)
2307{
2308 md->interface_ptr = ptr;
2309}
2310
2311void dm_get(struct mapped_device *md)
2312{
2313 atomic_inc(&md->holders);
2314 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2315}
2316
2317int dm_hold(struct mapped_device *md)
2318{
2319 spin_lock(&_minor_lock);
2320 if (test_bit(DMF_FREEING, &md->flags)) {
2321 spin_unlock(&_minor_lock);
2322 return -EBUSY;
2323 }
2324 dm_get(md);
2325 spin_unlock(&_minor_lock);
2326 return 0;
2327}
2328EXPORT_SYMBOL_GPL(dm_hold);
2329
2330const char *dm_device_name(struct mapped_device *md)
2331{
2332 return md->name;
2333}
2334EXPORT_SYMBOL_GPL(dm_device_name);
2335
2336static void __dm_destroy(struct mapped_device *md, bool wait)
2337{
2338 struct dm_table *map;
2339 int srcu_idx;
2340
2341 might_sleep();
2342
2343 spin_lock(&_minor_lock);
2344 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2345 set_bit(DMF_FREEING, &md->flags);
2346 spin_unlock(&_minor_lock);
2347
2348 blk_set_queue_dying(md->queue);
2349
2350
2351
2352
2353
2354 mutex_lock(&md->suspend_lock);
2355 map = dm_get_live_table(md, &srcu_idx);
2356 if (!dm_suspended_md(md)) {
2357 dm_table_presuspend_targets(map);
2358 set_bit(DMF_SUSPENDED, &md->flags);
2359 set_bit(DMF_POST_SUSPENDING, &md->flags);
2360 dm_table_postsuspend_targets(map);
2361 }
2362
2363 dm_put_live_table(md, srcu_idx);
2364 mutex_unlock(&md->suspend_lock);
2365
2366
2367
2368
2369
2370
2371
2372 if (wait)
2373 while (atomic_read(&md->holders))
2374 msleep(1);
2375 else if (atomic_read(&md->holders))
2376 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2377 dm_device_name(md), atomic_read(&md->holders));
2378
2379 dm_sysfs_exit(md);
2380 dm_table_destroy(__unbind(md));
2381 free_dev(md);
2382}
2383
2384void dm_destroy(struct mapped_device *md)
2385{
2386 __dm_destroy(md, true);
2387}
2388
2389void dm_destroy_immediate(struct mapped_device *md)
2390{
2391 __dm_destroy(md, false);
2392}
2393
2394void dm_put(struct mapped_device *md)
2395{
2396 atomic_dec(&md->holders);
2397}
2398EXPORT_SYMBOL_GPL(dm_put);
2399
2400static bool md_in_flight_bios(struct mapped_device *md)
2401{
2402 int cpu;
2403 struct hd_struct *part = &dm_disk(md)->part0;
2404 long sum = 0;
2405
2406 for_each_possible_cpu(cpu) {
2407 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2408 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2409 }
2410
2411 return sum != 0;
2412}
2413
2414static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
2415{
2416 int r = 0;
2417 DEFINE_WAIT(wait);
2418
2419 while (true) {
2420 prepare_to_wait(&md->wait, &wait, task_state);
2421
2422 if (!md_in_flight_bios(md))
2423 break;
2424
2425 if (signal_pending_state(task_state, current)) {
2426 r = -EINTR;
2427 break;
2428 }
2429
2430 io_schedule();
2431 }
2432 finish_wait(&md->wait, &wait);
2433
2434 return r;
2435}
2436
2437static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2438{
2439 int r = 0;
2440
2441 if (!queue_is_mq(md->queue))
2442 return dm_wait_for_bios_completion(md, task_state);
2443
2444 while (true) {
2445 if (!blk_mq_queue_inflight(md->queue))
2446 break;
2447
2448 if (signal_pending_state(task_state, current)) {
2449 r = -EINTR;
2450 break;
2451 }
2452
2453 msleep(5);
2454 }
2455
2456 return r;
2457}
2458
2459
2460
2461
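/*
 * Process the deferred bios.
 */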
2462static void dm_wq_work(struct work_struct *work)
2463{
2464 struct mapped_device *md = container_of(work, struct mapped_device,
2465 work);
2466 struct bio *c;
2467 int srcu_idx;
2468 struct dm_table *map;
2469
2470 map = dm_get_live_table(md, &srcu_idx);
2471
2472 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2473 spin_lock_irq(&md->deferred_lock);
2474 c = bio_list_pop(&md->deferred);
2475 spin_unlock_irq(&md->deferred_lock);
2476
2477 if (!c)
2478 break;
2479
2480 if (dm_request_based(md))
2481 (void) submit_bio_noacct(c);
2482 else
2483 (void) dm_process_bio(md, map, c);
2484 }
2485
2486 dm_put_live_table(md, srcu_idx);
2487}
2488
2489static void dm_queue_flush(struct mapped_device *md)
2490{
2491 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2492 smp_mb__after_atomic();
2493 queue_work(md->wq, &md->work);
2494}
2495
2496
2497
2498
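/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */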
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);
	dm_issue_global_event();

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}


/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		DMDEBUG("%s: suspending with flush", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting bios to
	 * target drivers, i.e. no one may be executing dm_process_bio
	 * from dm_submit_bio or dm_wq_work.
	 *
	 * To get all processes out of the bio submission path we
	 * synchronize the SRCU io_barrier (taken for read on submission).
	 * To prevent the deferred-bio worker (dm_wq_work) from reissuing
	 * bios, we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request
	 * routines.  We call dm_wait_for_completion to wait for all
	 * existing requests to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

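/*
 * Second half of resume: resume the targets, flush any deferred bios and,
 * for request-based dm, restart the request_queue, then thaw the
 * filesystem if lock_fs() froze it during suspend.
 */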
static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to targets until
 * dm_internal_resume() is called.  It may be nested; only the outermost
 * suspend/resume pair actually quiesces and restarts the device.
 * Caller must hold md->suspend_lock.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return;

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return;

	if (dm_suspended_md(md))
		goto done;

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

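/*
 * Locked wrappers around __dm_internal_suspend()/__dm_internal_resume()
 * for callers that do not already hold md->suspend_lock.
 */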
void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend
 * (the lock is taken in suspend_fast and released in resume_fast).
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	int r;
	unsigned noio_flag;
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	/* use noio allocations: a uevent may be sent while the device is suspended */
	noio_flag = memalloc_noio_save();

	if (!cookie)
		r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				       action, envp);
	}

	memalloc_noio_restore(noio_flag);

	return r;
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

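/*
 * Take a reference on the mapped_device that embeds this kobject.
 * Returns NULL (without taking a reference) if the device is already
 * being freed or deleted.
 */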
struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static int dm_post_suspending_md(struct mapped_device *md)
{
	return test_bit(DMF_POST_SUSPENDING, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_post_suspending(struct dm_target *ti)
{
	return dm_post_suspending_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_post_suspending);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

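/*
 * Allocate the bio sets used to clone bios for this device.  front_pad is
 * sized so that the per-io data requested by the target and the dm_target_io
 * sit in front of each clone; io_front_pad additionally makes room for the
 * enclosing dm_io in pools->io_bs.
 */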
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
	case DM_TYPE_NVME_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64 old_key;
	u64 new_key;
	u32 flags;
	bool fail_early;
};

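/*
 * Persistent reservations are only supported on tables with a single
 * target; iterate that target's underlying devices with the given callout.
 */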
static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key = old_key,
		.new_key = new_key,
		.flags = flags,
		.fail_early = true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

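/*
 * The remaining PR operations act on the single underlying device obtained
 * via dm_prepare_ioctl() and are passed straight through to its pr_ops.
 */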
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register = dm_pr_register,
	.pr_reserve = dm_pr_reserve,
	.pr_release = dm_pr_release,
	.pr_preempt = dm_pr_preempt,
	.pr_clear = dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.dax_supported = dm_dax_supported,
	.copy_from_iter = dm_dax_copy_from_iter,
	.copy_to_iter = dm_dax_copy_to_iter,
	.zero_page_range = dm_dax_zero_page_range,
};

/*
 * Module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");