/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/mm.h>
16#include <linux/sched/signal.h>
17#include <linux/blkpg.h>
18#include <linux/bio.h>
19#include <linux/mempool.h>
20#include <linux/dax.h>
21#include <linux/slab.h>
22#include <linux/idr.h>
23#include <linux/uio.h>
24#include <linux/hdreg.h>
25#include <linux/delay.h>
26#include <linux/wait.h>
27#include <linux/pr.h>
28#include <linux/refcount.h>
29#include <linux/part_stat.h>
30#include <linux/blk-crypto.h>
31
32#define DM_MSG_PREFIX "core"
33
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
38#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
39#define DM_COOKIE_LENGTH 24
40
41static const char *_name = DM_NAME;
42
43static unsigned int major = 0;
44static unsigned int _major = 0;
45
46static DEFINE_IDR(_minor_idr);
47
48static DEFINE_SPINLOCK(_minor_lock);
49
50static void do_deferred_remove(struct work_struct *w);
51
52static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
53
54static struct workqueue_struct *deferred_remove_workqueue;
55
56atomic_t dm_global_event_nr = ATOMIC_INIT(0);
57DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
58
59void dm_issue_global_event(void)
60{
61 atomic_inc(&dm_global_event_nr);
62 wake_up(&dm_global_eventq);
63}
64
/*
 * One of these is allocated (on-stack) per original bio.
 */
68struct clone_info {
69 struct dm_table *map;
70 struct bio *bio;
71 struct dm_io *io;
72 sector_t sector;
73 unsigned sector_count;
74};
75
/*
 * One of these is allocated per clone bio.
 */
79#define DM_TIO_MAGIC 7282014
80struct dm_target_io {
81 unsigned magic;
82 struct dm_io *io;
83 struct dm_target *ti;
84 unsigned target_bio_nr;
85 unsigned *len_ptr;
86 bool inside_dm_io;
87 struct bio clone;
88};
89
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
94#define DM_IO_MAGIC 5191977
95struct dm_io {
96 unsigned magic;
97 struct mapped_device *md;
98 blk_status_t status;
99 atomic_t io_count;
100 struct bio *orig_bio;
101 unsigned long start_time;
102 spinlock_t endio_lock;
103 struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
105 struct dm_target_io tio;
106};
107
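/*
 * The per-bio data requested by a target (via per_io_data_size) sits in the
 * bioset front_pad, directly in front of the embedded dm_target_io (and, for
 * the first clone, in front of the enclosing dm_io). These helpers convert
 * between the clone bio and that per-bio data area.
 */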
108void *dm_per_bio_data(struct bio *bio, size_t data_size)
109{
110 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
111 if (!tio->inside_dm_io)
112 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
113 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
114}
115EXPORT_SYMBOL_GPL(dm_per_bio_data);
116
117struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
118{
119 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
120 if (io->magic == DM_IO_MAGIC)
121 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
122 BUG_ON(io->magic != DM_TIO_MAGIC);
123 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
124}
125EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
126
127unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
128{
129 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
130}
131EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
132
133#define MINOR_ALLOCED ((void *)-1)
134
/*
 * Bits for the md->flags field.
 */
138#define DMF_BLOCK_IO_FOR_SUSPEND 0
139#define DMF_SUSPENDED 1
140#define DMF_FROZEN 2
141#define DMF_FREEING 3
142#define DMF_DELETING 4
143#define DMF_NOFLUSH_SUSPENDING 5
144#define DMF_DEFERRED_REMOVE 6
145#define DMF_SUSPENDED_INTERNALLY 7
146#define DMF_POST_SUSPENDING 8
147
148#define DM_NUMA_NODE NUMA_NO_NODE
149static int dm_numa_node = DM_NUMA_NODE;
150
/*
 * For mempools pre-allocation at the table loading time.
 */
154struct dm_md_mempools {
155 struct bio_set bs;
156 struct bio_set io_bs;
157};
158
159struct table_device {
160 struct list_head list;
161 refcount_t count;
162 struct dm_dev dm_dev;
163};
164
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
168#define RESERVED_BIO_BASED_IOS 16
169static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
170
171static int __dm_get_module_param_int(int *module_param, int min, int max)
172{
173 int param = READ_ONCE(*module_param);
174 int modified_param = 0;
175 bool modified = true;
176
177 if (param < min)
178 modified_param = min;
179 else if (param > max)
180 modified_param = max;
181 else
182 modified = false;
183
184 if (modified) {
185 (void)cmpxchg(module_param, param, modified_param);
186 param = modified_param;
187 }
188
189 return param;
190}
191
192unsigned __dm_get_module_param(unsigned *module_param,
193 unsigned def, unsigned max)
194{
195 unsigned param = READ_ONCE(*module_param);
196 unsigned modified_param = 0;
197
198 if (!param)
199 modified_param = def;
200 else if (param > max)
201 modified_param = max;
202
203 if (modified_param) {
204 (void)cmpxchg(module_param, param, modified_param);
205 param = modified_param;
206 }
207
208 return param;
209}
210
211unsigned dm_get_reserved_bio_based_ios(void)
212{
213 return __dm_get_module_param(&reserved_bio_based_ios,
214 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
215}
216EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
217
218static unsigned dm_get_numa_node(void)
219{
220 return __dm_get_module_param_int(&dm_numa_node,
221 DM_NUMA_NODE, num_online_nodes() - 1);
222}
223
224static int __init local_init(void)
225{
226 int r;
227
228 r = dm_uevent_init();
229 if (r)
230 return r;
231
232 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
233 if (!deferred_remove_workqueue) {
234 r = -ENOMEM;
235 goto out_uevent_exit;
236 }
237
238 _major = major;
239 r = register_blkdev(_major, _name);
240 if (r < 0)
241 goto out_free_workqueue;
242
243 if (!_major)
244 _major = r;
245
246 return 0;
247
248out_free_workqueue:
249 destroy_workqueue(deferred_remove_workqueue);
250out_uevent_exit:
251 dm_uevent_exit();
252
253 return r;
254}
255
256static void local_exit(void)
257{
258 flush_scheduled_work();
259 destroy_workqueue(deferred_remove_workqueue);
260
261 unregister_blkdev(_major, _name);
262 dm_uevent_exit();
263
264 _major = 0;
265
266 DMINFO("cleaned up");
267}
268
269static int (*_inits[])(void) __initdata = {
270 local_init,
271 dm_target_init,
272 dm_linear_init,
273 dm_stripe_init,
274 dm_io_init,
275 dm_kcopyd_init,
276 dm_interface_init,
277 dm_statistics_init,
278};
279
280static void (*_exits[])(void) = {
281 local_exit,
282 dm_target_exit,
283 dm_linear_exit,
284 dm_stripe_exit,
285 dm_io_exit,
286 dm_kcopyd_exit,
287 dm_interface_exit,
288 dm_statistics_exit,
289};
290
291static int __init dm_init(void)
292{
293 const int count = ARRAY_SIZE(_inits);
294
295 int r, i;
296
297 for (i = 0; i < count; i++) {
298 r = _inits[i]();
299 if (r)
300 goto bad;
301 }
302
303 return 0;
304
305 bad:
306 while (i--)
307 _exits[i]();
308
309 return r;
310}
311
312static void __exit dm_exit(void)
313{
314 int i = ARRAY_SIZE(_exits);
315
316 while (i--)
317 _exits[i]();

	/*
	 * Should be empty by this point.
	 */
322 idr_destroy(&_minor_idr);
323}
324
/*
 * Block device functions
 */
328int dm_deleting_md(struct mapped_device *md)
329{
330 return test_bit(DMF_DELETING, &md->flags);
331}
332
333static int dm_blk_open(struct block_device *bdev, fmode_t mode)
334{
335 struct mapped_device *md;
336
337 spin_lock(&_minor_lock);
338
339 md = bdev->bd_disk->private_data;
340 if (!md)
341 goto out;
342
343 if (test_bit(DMF_FREEING, &md->flags) ||
344 dm_deleting_md(md)) {
345 md = NULL;
346 goto out;
347 }
348
349 dm_get(md);
350 atomic_inc(&md->open_count);
351out:
352 spin_unlock(&_minor_lock);
353
354 return md ? 0 : -ENXIO;
355}
356
357static void dm_blk_close(struct gendisk *disk, fmode_t mode)
358{
359 struct mapped_device *md;
360
361 spin_lock(&_minor_lock);
362
363 md = disk->private_data;
364 if (WARN_ON(!md))
365 goto out;
366
367 if (atomic_dec_and_test(&md->open_count) &&
368 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
369 queue_work(deferred_remove_workqueue, &deferred_remove_work);
370
371 dm_put(md);
372out:
373 spin_unlock(&_minor_lock);
374}
375
376int dm_open_count(struct mapped_device *md)
377{
378 return atomic_read(&md->open_count);
379}
380
/*
 * Guarantees nothing is using the device
 * before it's deleted.
 */
384int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
385{
386 int r = 0;
387
388 spin_lock(&_minor_lock);
389
390 if (dm_open_count(md)) {
391 r = -EBUSY;
392 if (mark_deferred)
393 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
394 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
395 r = -EEXIST;
396 else
397 set_bit(DMF_DELETING, &md->flags);
398
399 spin_unlock(&_minor_lock);
400
401 return r;
402}
403
404int dm_cancel_deferred_remove(struct mapped_device *md)
405{
406 int r = 0;
407
408 spin_lock(&_minor_lock);
409
410 if (test_bit(DMF_DELETING, &md->flags))
411 r = -EBUSY;
412 else
413 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
414
415 spin_unlock(&_minor_lock);
416
417 return r;
418}
419
420static void do_deferred_remove(struct work_struct *w)
421{
422 dm_deferred_remove();
423}
424
425static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
426{
427 struct mapped_device *md = bdev->bd_disk->private_data;
428
429 return dm_get_geometry(md, geo);
430}
431
432#ifdef CONFIG_BLK_DEV_ZONED
433int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
434{
435 struct dm_report_zones_args *args = data;
436 sector_t sector_diff = args->tgt->begin - args->start;
437
	/*
	 * Ignore zones beyond the target range.
	 */
441 if (zone->start >= args->start + args->tgt->len)
442 return 0;
443
	/*
	 * Remap the start sector of the reported zones. For sequential zones,
	 * also remap the write pointer position.
	 */
448 zone->start += sector_diff;
449 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
450 if (zone->cond == BLK_ZONE_COND_FULL)
451 zone->wp = zone->start + zone->len;
452 else if (zone->cond == BLK_ZONE_COND_EMPTY)
453 zone->wp = zone->start;
454 else
455 zone->wp += sector_diff;
456 }
457
458 args->next_sector = zone->start + zone->len;
459 return args->orig_cb(zone, args->zone_idx++, args->orig_data);
460}
461EXPORT_SYMBOL_GPL(dm_report_zones_cb);
462
463static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
464 unsigned int nr_zones, report_zones_cb cb, void *data)
465{
466 struct mapped_device *md = disk->private_data;
467 struct dm_table *map;
468 int srcu_idx, ret;
469 struct dm_report_zones_args args = {
470 .next_sector = sector,
471 .orig_data = data,
472 .orig_cb = cb,
473 };
474
475 if (dm_suspended_md(md))
476 return -EAGAIN;
477
478 map = dm_get_live_table(md, &srcu_idx);
479 if (!map) {
480 ret = -EIO;
481 goto out;
482 }
483
484 do {
485 struct dm_target *tgt;
486
487 tgt = dm_table_find_target(map, args.next_sector);
488 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
489 ret = -EIO;
490 goto out;
491 }
492
493 args.tgt = tgt;
494 ret = tgt->type->report_zones(tgt, &args,
495 nr_zones - args.zone_idx);
496 if (ret < 0)
497 goto out;
498 } while (args.zone_idx < nr_zones &&
499 args.next_sector < get_capacity(disk));
500
501 ret = args.zone_idx;
502out:
503 dm_put_live_table(md, srcu_idx);
504 return ret;
505}
506#else
507#define dm_blk_report_zones NULL
508#endif
509
510static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
511 struct block_device **bdev)
512{
513 struct dm_target *tgt;
514 struct dm_table *map;
515 int r;
516
517retry:
518 r = -ENOTTY;
519 map = dm_get_live_table(md, srcu_idx);
520 if (!map || !dm_table_get_size(map))
521 return r;
522
	/* We only support devices that have a single target */
524 if (dm_table_get_num_targets(map) != 1)
525 return r;
526
527 tgt = dm_table_get_target(map, 0);
528 if (!tgt->type->prepare_ioctl)
529 return r;
530
531 if (dm_suspended_md(md))
532 return -EAGAIN;
533
534 r = tgt->type->prepare_ioctl(tgt, bdev);
535 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
536 dm_put_live_table(md, *srcu_idx);
537 msleep(10);
538 goto retry;
539 }
540
541 return r;
542}
543
544static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
545{
546 dm_put_live_table(md, srcu_idx);
547}
548
549static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
550 unsigned int cmd, unsigned long arg)
551{
552 struct mapped_device *md = bdev->bd_disk->private_data;
553 int r, srcu_idx;
554
555 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
556 if (r < 0)
557 goto out;
558
559 if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
564 if (!capable(CAP_SYS_RAWIO)) {
565 DMWARN_LIMIT(
566 "%s: sending ioctl %x to DM device without required privilege.",
567 current->comm, cmd);
568 r = -ENOIOCTLCMD;
569 goto out;
570 }
571 }
572
573 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
574out:
575 dm_unprepare_ioctl(md, srcu_idx);
576 return r;
577}
578
579u64 dm_start_time_ns_from_clone(struct bio *bio)
580{
581 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
582 struct dm_io *io = tio->io;
583
584 return jiffies_to_nsecs(io->start_time);
585}
586EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
587
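/*
 * Start block-layer accounting (and optional dm-stats accounting) for the
 * original bio associated with this dm_io.
 */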
588static void start_io_acct(struct dm_io *io)
589{
590 struct mapped_device *md = io->md;
591 struct bio *bio = io->orig_bio;
592
593 io->start_time = bio_start_io_acct(bio);
594 if (unlikely(dm_stats_used(&md->stats)))
595 dm_stats_account_io(&md->stats, bio_data_dir(bio),
596 bio->bi_iter.bi_sector, bio_sectors(bio),
597 false, 0, &io->stats_aux);
598}
599
600static void end_io_acct(struct dm_io *io)
601{
602 struct mapped_device *md = io->md;
603 struct bio *bio = io->orig_bio;
604 unsigned long duration = jiffies - io->start_time;
605
606 bio_end_io_acct(bio, io->start_time);
607
608 if (unlikely(dm_stats_used(&md->stats)))
609 dm_stats_account_io(&md->stats, bio_data_dir(bio),
610 bio->bi_iter.bi_sector, bio_sectors(bio),
611 true, duration, &io->stats_aux);

	/* nudge anyone waiting on suspend queue */
614 if (unlikely(wq_has_sleeper(&md->wait)))
615 wake_up(&md->wait);
616}
617
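/*
 * Allocate a dm_io for an original bio. The dm_io lives in the front_pad of
 * a clone bio allocated from md->io_bs, so the embedded dm_target_io and its
 * clone bio come along with it.
 */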
618static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
619{
620 struct dm_io *io;
621 struct dm_target_io *tio;
622 struct bio *clone;
623
624 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
625 if (!clone)
626 return NULL;
627
628 tio = container_of(clone, struct dm_target_io, clone);
629 tio->inside_dm_io = true;
630 tio->io = NULL;
631
632 io = container_of(tio, struct dm_io, tio);
633 io->magic = DM_IO_MAGIC;
634 io->status = 0;
635 atomic_set(&io->io_count, 1);
636 io->orig_bio = bio;
637 io->md = md;
638 spin_lock_init(&io->endio_lock);
639
640 start_io_acct(io);
641
642 return io;
643}
644
645static void free_io(struct mapped_device *md, struct dm_io *io)
646{
647 bio_put(&io->tio.clone);
648}
649
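/*
 * Allocate a dm_target_io: reuse the one embedded in ci->io if it is still
 * unused, otherwise carve a new one out of a clone bio from md->bs.
 */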
650static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
651 unsigned target_bio_nr, gfp_t gfp_mask)
652{
653 struct dm_target_io *tio;
654
655 if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
657 tio = &ci->io->tio;
658 } else {
659 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
660 if (!clone)
661 return NULL;
662
663 tio = container_of(clone, struct dm_target_io, clone);
664 tio->inside_dm_io = false;
665 }
666
667 tio->magic = DM_TIO_MAGIC;
668 tio->io = ci->io;
669 tio->ti = ti;
670 tio->target_bio_nr = target_bio_nr;
671
672 return tio;
673}
674
675static void free_tio(struct dm_target_io *tio)
676{
677 if (tio->inside_dm_io)
678 return;
679 bio_put(&tio->clone);
680}
681
/*
 * Add the bio to the list of deferred io.
 */
685static void queue_io(struct mapped_device *md, struct bio *bio)
686{
687 unsigned long flags;
688
689 spin_lock_irqsave(&md->deferred_lock, flags);
690 bio_list_add(&md->deferred, bio);
691 spin_unlock_irqrestore(&md->deferred_lock, flags);
692 queue_work(md->wq, &md->work);
693}
694
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
700struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
701{
702 *srcu_idx = srcu_read_lock(&md->io_barrier);
703
704 return srcu_dereference(md->map, &md->io_barrier);
705}
706
707void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
708{
709 srcu_read_unlock(&md->io_barrier, srcu_idx);
710}
711
712void dm_sync_table(struct mapped_device *md)
713{
714 synchronize_srcu(&md->io_barrier);
715 synchronize_rcu_expedited();
716}
717
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
722static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
723{
724 rcu_read_lock();
725 return rcu_dereference(md->map);
726}
727
728static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
729{
730 rcu_read_unlock();
731}
732
733static char *_dm_claim_ptr = "I belong to device-mapper";
734
/*
 * Open a table device so we can use it as a map destination.
 */
738static int open_table_device(struct table_device *td, dev_t dev,
739 struct mapped_device *md)
740{
741 struct block_device *bdev;
742
743 int r;
744
745 BUG_ON(td->dm_dev.bdev);
746
747 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
748 if (IS_ERR(bdev))
749 return PTR_ERR(bdev);
750
751 r = bd_link_disk_holder(bdev, dm_disk(md));
752 if (r) {
753 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
754 return r;
755 }
756
757 td->dm_dev.bdev = bdev;
758 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
759 return 0;
760}
761
/*
 * Close a table device that we've been using.
 */
765static void close_table_device(struct table_device *td, struct mapped_device *md)
766{
767 if (!td->dm_dev.bdev)
768 return;
769
770 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
771 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
772 put_dax(td->dm_dev.dax_dev);
773 td->dm_dev.bdev = NULL;
774 td->dm_dev.dax_dev = NULL;
775}
776
777static struct table_device *find_table_device(struct list_head *l, dev_t dev,
778 fmode_t mode)
779{
780 struct table_device *td;
781
782 list_for_each_entry(td, l, list)
783 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
784 return td;
785
786 return NULL;
787}
788
789int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
790 struct dm_dev **result)
791{
792 int r;
793 struct table_device *td;
794
795 mutex_lock(&md->table_devices_lock);
796 td = find_table_device(&md->table_devices, dev, mode);
797 if (!td) {
798 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
799 if (!td) {
800 mutex_unlock(&md->table_devices_lock);
801 return -ENOMEM;
802 }
803
804 td->dm_dev.mode = mode;
805 td->dm_dev.bdev = NULL;
806
807 if ((r = open_table_device(td, dev, md))) {
808 mutex_unlock(&md->table_devices_lock);
809 kfree(td);
810 return r;
811 }
812
813 format_dev_t(td->dm_dev.name, dev);
814
815 refcount_set(&td->count, 1);
816 list_add(&td->list, &md->table_devices);
817 } else {
818 refcount_inc(&td->count);
819 }
820 mutex_unlock(&md->table_devices_lock);
821
822 *result = &td->dm_dev;
823 return 0;
824}
825EXPORT_SYMBOL_GPL(dm_get_table_device);
826
827void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
828{
829 struct table_device *td = container_of(d, struct table_device, dm_dev);
830
831 mutex_lock(&md->table_devices_lock);
832 if (refcount_dec_and_test(&td->count)) {
833 close_table_device(td, md);
834 list_del(&td->list);
835 kfree(td);
836 }
837 mutex_unlock(&md->table_devices_lock);
838}
839EXPORT_SYMBOL(dm_put_table_device);
840
841static void free_table_devices(struct list_head *devices)
842{
843 struct list_head *tmp, *next;
844
845 list_for_each_safe(tmp, next, devices) {
846 struct table_device *td = list_entry(tmp, struct table_device, list);
847
848 DMWARN("dm_destroy: %s still exists with %d references",
849 td->dm_dev.name, refcount_read(&td->count));
850 kfree(td);
851 }
852}
853
/*
 * Get the geometry associated with a dm device
 */
857int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
858{
859 *geo = md->geometry;
860
861 return 0;
862}
863
/*
 * Set the geometry of a device.
 */
867int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
868{
869 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
870
871 if (geo->start > sz) {
872 DMWARN("Start sector is beyond the geometry limits.");
873 return -EINVAL;
874 }
875
876 md->geometry = *geo;
877
878 return 0;
879}
880
881static int __noflush_suspending(struct mapped_device *md)
882{
883 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
884}
885
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
890static void dec_pending(struct dm_io *io, blk_status_t error)
891{
892 unsigned long flags;
893 blk_status_t io_error;
894 struct bio *bio;
895 struct mapped_device *md = io->md;
896
	/* Push-back supersedes any I/O errors */
898 if (unlikely(error)) {
899 spin_lock_irqsave(&io->endio_lock, flags);
900 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
901 io->status = error;
902 spin_unlock_irqrestore(&io->endio_lock, flags);
903 }
904
905 if (atomic_dec_and_test(&io->io_count)) {
906 if (io->status == BLK_STS_DM_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
910 spin_lock_irqsave(&md->deferred_lock, flags);
911 if (__noflush_suspending(md))
				/* NOTE early return due to BLK_STS_DM_REQUEUE below */
913 bio_list_add_head(&md->deferred, io->orig_bio);
914 else
				/* noflush suspend was interrupted. */
916 io->status = BLK_STS_IOERR;
917 spin_unlock_irqrestore(&md->deferred_lock, flags);
918 }
919
920 io_error = io->status;
921 bio = io->orig_bio;
922 end_io_acct(io);
923 free_io(md, io);
924
925 if (io_error == BLK_STS_DM_REQUEUE)
926 return;
927
928 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */
933 bio->bi_opf &= ~REQ_PREFLUSH;
934 queue_io(md, bio);
935 } else {
			/* done with normal IO or empty flush */
937 if (io_error)
938 bio->bi_status = io_error;
939 bio_endio(bio);
940 }
941 }
942}
943
944void disable_discard(struct mapped_device *md)
945{
946 struct queue_limits *limits = dm_get_queue_limits(md);
947
	/* device doesn't really support DISCARD, disable it */
949 limits->max_discard_sectors = 0;
950 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
951}
952
953void disable_write_same(struct mapped_device *md)
954{
955 struct queue_limits *limits = dm_get_queue_limits(md);
956
	/* device doesn't really support WRITE SAME, disable it */
958 limits->max_write_same_sectors = 0;
959}
960
961void disable_write_zeroes(struct mapped_device *md)
962{
963 struct queue_limits *limits = dm_get_queue_limits(md);
964
	/* device doesn't really support WRITE ZEROES, disable it */
966 limits->max_write_zeroes_sectors = 0;
967}
968
969static void clone_endio(struct bio *bio)
970{
971 blk_status_t error = bio->bi_status;
972 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
973 struct dm_io *io = tio->io;
974 struct mapped_device *md = tio->io->md;
975 dm_endio_fn endio = tio->ti->type->end_io;
976 struct bio *orig_bio = io->orig_bio;
977
978 if (unlikely(error == BLK_STS_TARGET)) {
979 if (bio_op(bio) == REQ_OP_DISCARD &&
980 !bio->bi_disk->queue->limits.max_discard_sectors)
981 disable_discard(md);
982 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
983 !bio->bi_disk->queue->limits.max_write_same_sectors)
984 disable_write_same(md);
985 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
986 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
987 disable_write_zeroes(md);
988 }
989
	/*
	 * For zone-append bios get offset in zone of the written
	 * sector and add that to the original bio sector pos.
	 */
994 if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
995 sector_t written_sector = bio->bi_iter.bi_sector;
996 struct request_queue *q = orig_bio->bi_disk->queue;
997 u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
998
999 orig_bio->bi_iter.bi_sector += written_sector & mask;
1000 }
1001
1002 if (endio) {
1003 int r = endio(tio->ti, bio, &error);
1004 switch (r) {
1005 case DM_ENDIO_REQUEUE:
1006 error = BLK_STS_DM_REQUEUE;
1007 fallthrough;
1008 case DM_ENDIO_DONE:
1009 break;
1010 case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
1012 return;
1013 default:
1014 DMWARN("unimplemented target endio return value: %d", r);
1015 BUG();
1016 }
1017 }
1018
1019 free_tio(tio);
1020 dec_pending(io, error);
1021}
1022
/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
1027static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1028 sector_t target_offset)
1029{
1030 return ti->len - target_offset;
1031}
1032
1033static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1034{
1035 sector_t target_offset = dm_target_offset(ti, sector);
1036 sector_t len = max_io_len_target_boundary(ti, target_offset);
1037 sector_t max_len;

	/*
	 * Does the target need to split IO even further?
	 * - varied (per target) IO splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here. So pass in
	 *   ti->max_io_len to override stacked chunk_sectors.
	 */
1046 if (ti->max_io_len) {
1047 max_len = blk_max_size_offset(ti->table->md->queue,
1048 target_offset, ti->max_io_len);
1049 if (len > max_len)
1050 len = max_len;
1051 }
1052
1053 return len;
1054}
1055
1056int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1057{
1058 if (len > UINT_MAX) {
1059 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1060 (unsigned long long)len, UINT_MAX);
1061 ti->error = "Maximum size of target IO is too large";
1062 return -EINVAL;
1063 }
1064
1065 ti->max_io_len = (uint32_t) len;
1066
1067 return 0;
1068}
1069EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1070
1071static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1072 sector_t sector, int *srcu_idx)
1073 __acquires(md->io_barrier)
1074{
1075 struct dm_table *map;
1076 struct dm_target *ti;
1077
1078 map = dm_get_live_table(md, srcu_idx);
1079 if (!map)
1080 return NULL;
1081
1082 ti = dm_table_find_target(map, sector);
1083 if (!ti)
1084 return NULL;
1085
1086 return ti;
1087}
1088
1089static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1090 long nr_pages, void **kaddr, pfn_t *pfn)
1091{
1092 struct mapped_device *md = dax_get_private(dax_dev);
1093 sector_t sector = pgoff * PAGE_SECTORS;
1094 struct dm_target *ti;
1095 long len, ret = -EIO;
1096 int srcu_idx;
1097
1098 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1099
1100 if (!ti)
1101 goto out;
1102 if (!ti->type->direct_access)
1103 goto out;
1104 len = max_io_len(ti, sector) / PAGE_SECTORS;
1105 if (len < 1)
1106 goto out;
1107 nr_pages = min(len, nr_pages);
1108 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1109
1110 out:
1111 dm_put_live_table(md, srcu_idx);
1112
1113 return ret;
1114}
1115
1116static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1117 int blocksize, sector_t start, sector_t len)
1118{
1119 struct mapped_device *md = dax_get_private(dax_dev);
1120 struct dm_table *map;
1121 bool ret = false;
1122 int srcu_idx;
1123
1124 map = dm_get_live_table(md, &srcu_idx);
1125 if (!map)
1126 goto out;
1127
1128 ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1129
1130out:
1131 dm_put_live_table(md, srcu_idx);
1132
1133 return ret;
1134}
1135
1136static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1137 void *addr, size_t bytes, struct iov_iter *i)
1138{
1139 struct mapped_device *md = dax_get_private(dax_dev);
1140 sector_t sector = pgoff * PAGE_SECTORS;
1141 struct dm_target *ti;
1142 long ret = 0;
1143 int srcu_idx;
1144
1145 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1146
1147 if (!ti)
1148 goto out;
1149 if (!ti->type->dax_copy_from_iter) {
1150 ret = copy_from_iter(addr, bytes, i);
1151 goto out;
1152 }
1153 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1154 out:
1155 dm_put_live_table(md, srcu_idx);
1156
1157 return ret;
1158}
1159
1160static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1161 void *addr, size_t bytes, struct iov_iter *i)
1162{
1163 struct mapped_device *md = dax_get_private(dax_dev);
1164 sector_t sector = pgoff * PAGE_SECTORS;
1165 struct dm_target *ti;
1166 long ret = 0;
1167 int srcu_idx;
1168
1169 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1170
1171 if (!ti)
1172 goto out;
1173 if (!ti->type->dax_copy_to_iter) {
1174 ret = copy_to_iter(addr, bytes, i);
1175 goto out;
1176 }
1177 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1178 out:
1179 dm_put_live_table(md, srcu_idx);
1180
1181 return ret;
1182}
1183
1184static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1185 size_t nr_pages)
1186{
1187 struct mapped_device *md = dax_get_private(dax_dev);
1188 sector_t sector = pgoff * PAGE_SECTORS;
1189 struct dm_target *ti;
1190 int ret = -EIO;
1191 int srcu_idx;
1192
1193 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1194
1195 if (!ti)
1196 goto out;
1197 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1198
1199
1200
1201
1202 goto out;
1203 }
1204 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1205 out:
1206 dm_put_live_table(md, srcu_idx);
1207
1208 return ret;
1209}
1210
/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH and zone management
 * operations.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * The accepted sector count is subtracted from *tio->len_ptr so that only the
 * accepted portion is accounted to this clone; DM core sends the remainder of
 * the original bio in one or more further clones.
 */
1240void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1241{
1242 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1243 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1244 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1245 BUG_ON(bi_size > *tio->len_ptr);
1246 BUG_ON(n_sectors > bi_size);
1247 *tio->len_ptr -= bi_size - n_sectors;
1248 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1249}
1250EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1251
1252static blk_qc_t __map_bio(struct dm_target_io *tio)
1253{
1254 int r;
1255 sector_t sector;
1256 struct bio *clone = &tio->clone;
1257 struct dm_io *io = tio->io;
1258 struct dm_target *ti = tio->ti;
1259 blk_qc_t ret = BLK_QC_T_NONE;
1260
1261 clone->bi_end_io = clone_endio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
1268 atomic_inc(&io->io_count);
1269 sector = clone->bi_iter.bi_sector;
1270
1271 r = ti->type->map(ti, clone);
1272 switch (r) {
1273 case DM_MAPIO_SUBMITTED:
1274 break;
1275 case DM_MAPIO_REMAPPED:
		/* the bio has been remapped so dispatch it */
1277 trace_block_bio_remap(clone->bi_disk->queue, clone,
1278 bio_dev(io->orig_bio), sector);
1279 ret = submit_bio_noacct(clone);
1280 break;
1281 case DM_MAPIO_KILL:
1282 free_tio(tio);
1283 dec_pending(io, BLK_STS_IOERR);
1284 break;
1285 case DM_MAPIO_REQUEUE:
1286 free_tio(tio);
1287 dec_pending(io, BLK_STS_DM_REQUEUE);
1288 break;
1289 default:
1290 DMWARN("unimplemented target map return value: %d", r);
1291 BUG();
1292 }
1293
1294 return ret;
1295}
1296
1297static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1298{
1299 bio->bi_iter.bi_sector = sector;
1300 bio->bi_iter.bi_size = to_bytes(len);
1301}
1302
/*
 * Creates a bio that consists of range of complete bvecs.
 */
1306static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1307 sector_t sector, unsigned len)
1308{
1309 struct bio *clone = &tio->clone;
1310 int r;
1311
1312 __bio_clone_fast(clone, bio);
1313
1314 r = bio_crypt_clone(clone, bio, GFP_NOIO);
1315 if (r < 0)
1316 return r;
1317
1318 if (bio_integrity(bio)) {
1319 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1320 !dm_target_passes_integrity(tio->ti->type))) {
1321 DMWARN("%s: the target %s doesn't support integrity data.",
1322 dm_device_name(tio->io->md),
1323 tio->ti->type->name);
1324 return -EIO;
1325 }
1326
1327 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1328 if (r < 0)
1329 return r;
1330 }
1331
1332 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1333 clone->bi_iter.bi_size = to_bytes(len);
1334
1335 if (bio_integrity(bio))
1336 bio_integrity_trim(clone);
1337
1338 return 0;
1339}
1340
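/*
 * Allocate num_bios target clones up front: first try GFP_NOWAIT for all of
 * them; if any allocation fails, free what was allocated and retry with
 * GFP_NOIO while holding md->table_devices_lock.
 */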
1341static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1342 struct dm_target *ti, unsigned num_bios)
1343{
1344 struct dm_target_io *tio;
1345 int try;
1346
1347 if (!num_bios)
1348 return;
1349
1350 if (num_bios == 1) {
1351 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1352 bio_list_add(blist, &tio->clone);
1353 return;
1354 }
1355
1356 for (try = 0; try < 2; try++) {
1357 int bio_nr;
1358 struct bio *bio;
1359
1360 if (try)
1361 mutex_lock(&ci->io->md->table_devices_lock);
1362 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1363 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1364 if (!tio)
1365 break;
1366
1367 bio_list_add(blist, &tio->clone);
1368 }
1369 if (try)
1370 mutex_unlock(&ci->io->md->table_devices_lock);
1371 if (bio_nr == num_bios)
1372 return;
1373
1374 while ((bio = bio_list_pop(blist))) {
1375 tio = container_of(bio, struct dm_target_io, clone);
1376 free_tio(tio);
1377 }
1378 }
1379}
1380
1381static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1382 struct dm_target_io *tio, unsigned *len)
1383{
1384 struct bio *clone = &tio->clone;
1385
1386 tio->len_ptr = len;
1387
1388 __bio_clone_fast(clone, ci->bio);
1389 if (len)
1390 bio_setup_sector(clone, ci->sector, *len);
1391
1392 return __map_bio(tio);
1393}
1394
1395static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1396 unsigned num_bios, unsigned *len)
1397{
1398 struct bio_list blist = BIO_EMPTY_LIST;
1399 struct bio *bio;
1400 struct dm_target_io *tio;
1401
1402 alloc_multiple_bios(&blist, ci, ti, num_bios);
1403
1404 while ((bio = bio_list_pop(&blist))) {
1405 tio = container_of(bio, struct dm_target_io, clone);
1406 (void) __clone_and_map_simple_bio(ci, tio, len);
1407 }
1408}
1409
1410static int __send_empty_flush(struct clone_info *ci)
1411{
1412 unsigned target_nr = 0;
1413 struct dm_target *ti;
1414 struct bio flush_bio;
1415
	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
1421 bio_init(&flush_bio, NULL, 0);
1422 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1423 ci->bio = &flush_bio;
1424 ci->sector_count = 0;
1425
	/*
	 * Empty flush uses a statically initialized bio, as the base for
	 * cloning.  However, blkg association requires that a bdev is
	 * associated with a gendisk, which doesn't happen until the bdev is
	 * opened.  So, blkg association is done at issue time of the flush
	 * rather than when the device is created in alloc_dev().
	 */
1433 bio_set_dev(ci->bio, ci->io->md->bdev);
1434
1435 BUG_ON(bio_has_data(ci->bio));
1436 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1437 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1438
1439 bio_uninit(ci->bio);
1440 return 0;
1441}
1442
1443static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1444 sector_t sector, unsigned *len)
1445{
1446 struct bio *bio = ci->bio;
1447 struct dm_target_io *tio;
1448 int r;
1449
1450 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1451 tio->len_ptr = len;
1452 r = clone_bio(tio, bio, sector, *len);
1453 if (r < 0) {
1454 free_tio(tio);
1455 return r;
1456 }
1457 (void) __map_bio(tio);
1458
1459 return 0;
1460}
1461
1462static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1463 unsigned num_bios)
1464{
1465 unsigned len;

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
1473 if (!num_bios)
1474 return -EOPNOTSUPP;
1475
1476 len = min_t(sector_t, ci->sector_count,
1477 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
1478
1479 __send_duplicate_bios(ci, ti, num_bios, &len);
1480
1481 ci->sector += len;
1482 ci->sector_count -= len;
1483
1484 return 0;
1485}
1486
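/*
 * Does this bio carry one of the operations DM treats as "abnormal" IO
 * (discard, secure erase, write same, write zeroes)?
 */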
1487static bool is_abnormal_io(struct bio *bio)
1488{
1489 bool r = false;
1490
1491 switch (bio_op(bio)) {
1492 case REQ_OP_DISCARD:
1493 case REQ_OP_SECURE_ERASE:
1494 case REQ_OP_WRITE_SAME:
1495 case REQ_OP_WRITE_ZEROES:
1496 r = true;
1497 break;
1498 }
1499
1500 return r;
1501}
1502
1503static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1504 int *result)
1505{
1506 struct bio *bio = ci->bio;
1507 unsigned num_bios = 0;
1508
1509 switch (bio_op(bio)) {
1510 case REQ_OP_DISCARD:
1511 num_bios = ti->num_discard_bios;
1512 break;
1513 case REQ_OP_SECURE_ERASE:
1514 num_bios = ti->num_secure_erase_bios;
1515 break;
1516 case REQ_OP_WRITE_SAME:
1517 num_bios = ti->num_write_same_bios;
1518 break;
1519 case REQ_OP_WRITE_ZEROES:
1520 num_bios = ti->num_write_zeroes_bios;
1521 break;
1522 default:
1523 return false;
1524 }
1525
1526 *result = __send_changing_extent_only(ci, ti, num_bios);
1527 return true;
1528}
1529
/*
 * Select the target for ci->sector and clone/map the data portion of the bio
 * to it, handling abnormal IO separately.
 */
1533static int __split_and_process_non_flush(struct clone_info *ci)
1534{
1535 struct dm_target *ti;
1536 unsigned len;
1537 int r;
1538
1539 ti = dm_table_find_target(ci->map, ci->sector);
1540 if (!ti)
1541 return -EIO;
1542
1543 if (__process_abnormal_io(ci, ti, &r))
1544 return r;
1545
1546 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1547
1548 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1549 if (r < 0)
1550 return r;
1551
1552 ci->sector += len;
1553 ci->sector_count -= len;
1554
1555 return 0;
1556}
1557
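/*
 * Initialise the on-stack clone_info used to walk an original bio and clone
 * it to the targets.
 */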
1558static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1559 struct dm_table *map, struct bio *bio)
1560{
1561 ci->map = map;
1562 ci->io = alloc_io(md, bio);
1563 ci->sector = bio->bi_iter.bi_sector;
1564}
1565
1566#define __dm_part_stat_sub(part, field, subnd) \
1567 (part_stat_get(part, field) -= (subnd))
1568
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1572static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1573 struct dm_table *map, struct bio *bio)
1574{
1575 struct clone_info ci;
1576 blk_qc_t ret = BLK_QC_T_NONE;
1577 int error = 0;
1578
1579 init_clone_info(&ci, md, map, bio);
1580
1581 if (bio->bi_opf & REQ_PREFLUSH) {
1582 error = __send_empty_flush(&ci);
1583
1584 } else if (op_is_zone_mgmt(bio_op(bio))) {
1585 ci.bio = bio;
1586 ci.sector_count = 0;
1587 error = __split_and_process_non_flush(&ci);
1588 } else {
1589 ci.bio = bio;
1590 ci.sector_count = bio_sectors(bio);
1591 while (ci.sector_count && !error) {
1592 error = __split_and_process_non_flush(&ci);
1593 if (current->bio_list && ci.sector_count && !error) {
				/*
				 * Remainder must be passed to submit_bio_noacct()
				 * so that it gets handled *after* bios already submitted
				 * have been completely processed.
				 * We take a clone of the original to store in
				 * ci.io->orig_bio to be used by end_io_acct() and
				 * for dec_pending to use for completion handling.
				 */
1602 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1603 GFP_NOIO, &md->queue->bio_split);
1604 ci.io->orig_bio = b;

				/*
				 * Adjust IO stats for each split, otherwise upon queue
				 * reentry there will be redundant IO accounting.
				 * NOTE: this is a stop-gap fix, a proper fix involves
				 * significant refactoring of DM core's bio splitting
				 * (by eliminating DM's splitting and just using bio_split)
				 */
1613 part_stat_lock();
1614 __dm_part_stat_sub(&dm_disk(md)->part0,
1615 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1616 part_stat_unlock();
1617
1618 bio_chain(b, bio);
1619 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1620 ret = submit_bio_noacct(bio);
1621 break;
1622 }
1623 }
1624 }
1625
	/* drop the extra reference count */
1627 dec_pending(ci.io, errno_to_blk_status(error));
1628 return ret;
1629}
1630
1631static blk_qc_t dm_submit_bio(struct bio *bio)
1632{
1633 struct mapped_device *md = bio->bi_disk->private_data;
1634 blk_qc_t ret = BLK_QC_T_NONE;
1635 int srcu_idx;
1636 struct dm_table *map;
1637
1638 map = dm_get_live_table(md, &srcu_idx);
1639 if (unlikely(!map)) {
1640 DMERR_LIMIT("%s: mapping table unavailable, erroring io",
1641 dm_device_name(md));
1642 bio_io_error(bio);
1643 goto out;
1644 }
1645
	/* If suspended, queue this IO for later */
1647 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1648 if (bio->bi_opf & REQ_NOWAIT)
1649 bio_wouldblock_error(bio);
1650 else if (bio->bi_opf & REQ_RAHEAD)
1651 bio_io_error(bio);
1652 else
1653 queue_io(md, bio);
1654 goto out;
1655 }
1656
	/*
	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
	 * otherwise associated queue_limits won't be imposed.
	 */
1661 if (is_abnormal_io(bio))
1662 blk_queue_split(&bio);
1663
1664 ret = __split_and_process_bio(md, map, bio);
1665out:
1666 dm_put_live_table(md, srcu_idx);
1667 return ret;
1668}
1669
/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
1673static void free_minor(int minor)
1674{
1675 spin_lock(&_minor_lock);
1676 idr_remove(&_minor_idr, minor);
1677 spin_unlock(&_minor_lock);
1678}
1679
/*
 * See if the device with a specific minor # is free.
 */
1683static int specific_minor(int minor)
1684{
1685 int r;
1686
1687 if (minor >= (1 << MINORBITS))
1688 return -EINVAL;
1689
1690 idr_preload(GFP_KERNEL);
1691 spin_lock(&_minor_lock);
1692
1693 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1694
1695 spin_unlock(&_minor_lock);
1696 idr_preload_end();
1697 if (r < 0)
1698 return r == -ENOSPC ? -EBUSY : r;
1699 return 0;
1700}
1701
1702static int next_free_minor(int *minor)
1703{
1704 int r;
1705
1706 idr_preload(GFP_KERNEL);
1707 spin_lock(&_minor_lock);
1708
1709 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1710
1711 spin_unlock(&_minor_lock);
1712 idr_preload_end();
1713 if (r < 0)
1714 return r;
1715 *minor = r;
1716 return 0;
1717}
1718
1719static const struct block_device_operations dm_blk_dops;
1720static const struct block_device_operations dm_rq_blk_dops;
1721static const struct dax_operations dm_dax_ops;
1722
1723static void dm_wq_work(struct work_struct *work);
1724
1725static void cleanup_mapped_device(struct mapped_device *md)
1726{
1727 if (md->wq)
1728 destroy_workqueue(md->wq);
1729 bioset_exit(&md->bs);
1730 bioset_exit(&md->io_bs);
1731
1732 if (md->dax_dev) {
1733 kill_dax(md->dax_dev);
1734 put_dax(md->dax_dev);
1735 md->dax_dev = NULL;
1736 }
1737
1738 if (md->disk) {
1739 spin_lock(&_minor_lock);
1740 md->disk->private_data = NULL;
1741 spin_unlock(&_minor_lock);
1742 del_gendisk(md->disk);
1743 put_disk(md->disk);
1744 }
1745
1746 if (md->queue)
1747 blk_cleanup_queue(md->queue);
1748
1749 cleanup_srcu_struct(&md->io_barrier);
1750
1751 if (md->bdev) {
1752 bdput(md->bdev);
1753 md->bdev = NULL;
1754 }
1755
1756 mutex_destroy(&md->suspend_lock);
1757 mutex_destroy(&md->type_lock);
1758 mutex_destroy(&md->table_devices_lock);
1759
1760 dm_mq_cleanup_mapped_device(md);
1761}
1762
/*
 * Allocate and initialise a blank device with a given minor.
 */
1766static struct mapped_device *alloc_dev(int minor)
1767{
1768 int r, numa_node_id = dm_get_numa_node();
1769 struct mapped_device *md;
1770 void *old_md;
1771
1772 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1773 if (!md) {
1774 DMWARN("unable to allocate device, out of memory.");
1775 return NULL;
1776 }
1777
1778 if (!try_module_get(THIS_MODULE))
1779 goto bad_module_get;
1780
	/* get a minor number for the dev */
1782 if (minor == DM_ANY_MINOR)
1783 r = next_free_minor(&minor);
1784 else
1785 r = specific_minor(minor);
1786 if (r < 0)
1787 goto bad_minor;
1788
1789 r = init_srcu_struct(&md->io_barrier);
1790 if (r < 0)
1791 goto bad_io_barrier;
1792
1793 md->numa_node_id = numa_node_id;
1794 md->init_tio_pdu = false;
1795 md->type = DM_TYPE_NONE;
1796 mutex_init(&md->suspend_lock);
1797 mutex_init(&md->type_lock);
1798 mutex_init(&md->table_devices_lock);
1799 spin_lock_init(&md->deferred_lock);
1800 atomic_set(&md->holders, 1);
1801 atomic_set(&md->open_count, 0);
1802 atomic_set(&md->event_nr, 0);
1803 atomic_set(&md->uevent_seq, 0);
1804 INIT_LIST_HEAD(&md->uevent_list);
1805 INIT_LIST_HEAD(&md->table_devices);
1806 spin_lock_init(&md->uevent_lock);

	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
1813 md->queue = blk_alloc_queue(numa_node_id);
1814 if (!md->queue)
1815 goto bad;
1816
1817 md->disk = alloc_disk_node(1, md->numa_node_id);
1818 if (!md->disk)
1819 goto bad;
1820
1821 init_waitqueue_head(&md->wait);
1822 INIT_WORK(&md->work, dm_wq_work);
1823 init_waitqueue_head(&md->eventq);
1824 init_completion(&md->kobj_holder.completion);
1825
1826 md->disk->major = _major;
1827 md->disk->first_minor = minor;
1828 md->disk->fops = &dm_blk_dops;
1829 md->disk->queue = md->queue;
1830 md->disk->private_data = md;
1831 sprintf(md->disk->disk_name, "dm-%d", minor);
1832
1833 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1834 md->dax_dev = alloc_dax(md, md->disk->disk_name,
1835 &dm_dax_ops, 0);
1836 if (IS_ERR(md->dax_dev))
1837 goto bad;
1838 }
1839
1840 add_disk_no_queue_reg(md->disk);
1841 format_dev_t(md->name, MKDEV(_major, minor));
1842
1843 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1844 if (!md->wq)
1845 goto bad;
1846
1847 md->bdev = bdget_disk(md->disk, 0);
1848 if (!md->bdev)
1849 goto bad;
1850
1851 dm_stats_init(&md->stats);
1852
	/* Populate the mapping, nobody knows we exist yet */
1854 spin_lock(&_minor_lock);
1855 old_md = idr_replace(&_minor_idr, md, minor);
1856 spin_unlock(&_minor_lock);
1857
1858 BUG_ON(old_md != MINOR_ALLOCED);
1859
1860 return md;
1861
1862bad:
1863 cleanup_mapped_device(md);
1864bad_io_barrier:
1865 free_minor(minor);
1866bad_minor:
1867 module_put(THIS_MODULE);
1868bad_module_get:
1869 kvfree(md);
1870 return NULL;
1871}
1872
1873static void unlock_fs(struct mapped_device *md);
1874
1875static void free_dev(struct mapped_device *md)
1876{
1877 int minor = MINOR(disk_devt(md->disk));
1878
1879 unlock_fs(md);
1880
1881 cleanup_mapped_device(md);
1882
1883 free_table_devices(&md->table_devices);
1884 dm_stats_cleanup(&md->stats);
1885 free_minor(minor);
1886
1887 module_put(THIS_MODULE);
1888 kvfree(md);
1889}
1890
1891static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1892{
1893 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1894 int ret = 0;
1895
1896 if (dm_table_bio_based(t)) {
		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */
1902 bioset_exit(&md->bs);
1903 bioset_exit(&md->io_bs);
1904
1905 } else if (bioset_initialized(&md->bs)) {
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */
1914 goto out;
1915 }
1916
1917 BUG_ON(!p ||
1918 bioset_initialized(&md->bs) ||
1919 bioset_initialized(&md->io_bs));
1920
1921 ret = bioset_init_from_src(&md->bs, &p->bs);
1922 if (ret)
1923 goto out;
1924 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1925 if (ret)
1926 bioset_exit(&md->bs);
1927out:
	/* mempool bind completed, no longer need any mempools in the table */
1929 dm_table_free_md_mempools(t);
1930 return ret;
1931}
1932
/*
 * Bind a table to the device.
 */
1936static void event_callback(void *context)
1937{
1938 unsigned long flags;
1939 LIST_HEAD(uevents);
1940 struct mapped_device *md = (struct mapped_device *) context;
1941
1942 spin_lock_irqsave(&md->uevent_lock, flags);
1943 list_splice_init(&md->uevent_list, &uevents);
1944 spin_unlock_irqrestore(&md->uevent_lock, flags);
1945
1946 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1947
1948 atomic_inc(&md->event_nr);
1949 wake_up(&md->eventq);
1950 dm_issue_global_event();
1951}
1952
/*
 * Returns old map, which caller must destroy.
 */
1956static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1957 struct queue_limits *limits)
1958{
1959 struct dm_table *old_map;
1960 struct request_queue *q = md->queue;
1961 bool request_based = dm_table_request_based(t);
1962 sector_t size;
1963 int ret;
1964
1965 lockdep_assert_held(&md->suspend_lock);
1966
1967 size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
1972 if (size != dm_get_size(md))
1973 memset(&md->geometry, 0, sizeof(md->geometry));
1974
1975 set_capacity(md->disk, size);
1976 bd_set_nr_sectors(md->bdev, size);
1977
1978 dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
1987 if (request_based)
1988 dm_stop_queue(q);
1989
1990 if (request_based) {
		/*
		 * Leverage the fact that request-based DM targets are
		 * immutable singletons - used to optimize dm_mq_queue_rq.
		 */
1995 md->immutable_target = dm_table_get_immutable_target(t);
1996 }
1997
1998 ret = __bind_mempools(md, t);
1999 if (ret) {
2000 old_map = ERR_PTR(ret);
2001 goto out;
2002 }
2003
2004 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2005 rcu_assign_pointer(md->map, (void *)t);
2006 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2007
2008 dm_table_set_restrictions(t, q, limits);
2009 if (old_map)
2010 dm_sync_table(md);
2011
2012out:
2013 return old_map;
2014}
2015
/*
 * Returns unbound table for the caller to free.
 */
2019static struct dm_table *__unbind(struct mapped_device *md)
2020{
2021 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2022
2023 if (!map)
2024 return NULL;
2025
2026 dm_table_event_callback(map, NULL, NULL);
2027 RCU_INIT_POINTER(md->map, NULL);
2028 dm_sync_table(md);
2029
2030 return map;
2031}
2032
/*
 * Constructor for a new device.
 */
2036int dm_create(int minor, struct mapped_device **result)
2037{
2038 int r;
2039 struct mapped_device *md;
2040
2041 md = alloc_dev(minor);
2042 if (!md)
2043 return -ENXIO;
2044
2045 r = dm_sysfs_init(md);
2046 if (r) {
2047 free_dev(md);
2048 return r;
2049 }
2050
2051 *result = md;
2052 return 0;
2053}
2054
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
2059void dm_lock_md_type(struct mapped_device *md)
2060{
2061 mutex_lock(&md->type_lock);
2062}
2063
2064void dm_unlock_md_type(struct mapped_device *md)
2065{
2066 mutex_unlock(&md->type_lock);
2067}
2068
2069void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2070{
2071 BUG_ON(!mutex_is_locked(&md->type_lock));
2072 md->type = type;
2073}
2074
2075enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2076{
2077 return md->type;
2078}
2079
2080struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2081{
2082 return md->immutable_target_type;
2083}
2084
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2089struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2090{
2091 BUG_ON(!atomic_read(&md->holders));
2092 return &md->queue->limits;
2093}
2094EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2095
/*
 * Setup the DM device's queue based on md's type
 */
2099int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2100{
2101 int r;
2102 struct queue_limits limits;
2103 enum dm_queue_mode type = dm_get_md_type(md);
2104
2105 switch (type) {
2106 case DM_TYPE_REQUEST_BASED:
2107 md->disk->fops = &dm_rq_blk_dops;
2108 r = dm_mq_init_request_queue(md, t);
2109 if (r) {
2110 DMERR("Cannot initialize queue for request-based dm mapped device");
2111 return r;
2112 }
2113 break;
2114 case DM_TYPE_BIO_BASED:
2115 case DM_TYPE_DAX_BIO_BASED:
2116 break;
2117 case DM_TYPE_NONE:
2118 WARN_ON_ONCE(true);
2119 break;
2120 }
2121
2122 r = dm_calculate_queue_limits(t, &limits);
2123 if (r) {
2124 DMERR("Cannot calculate initial queue limits");
2125 return r;
2126 }
2127 dm_table_set_restrictions(t, md->queue, &limits);
2128 blk_register_queue(md->disk);
2129
2130 return 0;
2131}
2132
2133struct mapped_device *dm_get_md(dev_t dev)
2134{
2135 struct mapped_device *md;
2136 unsigned minor = MINOR(dev);
2137
2138 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2139 return NULL;
2140
2141 spin_lock(&_minor_lock);
2142
2143 md = idr_find(&_minor_idr, minor);
2144 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2145 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2146 md = NULL;
2147 goto out;
2148 }
2149 dm_get(md);
2150out:
2151 spin_unlock(&_minor_lock);
2152
2153 return md;
2154}
2155EXPORT_SYMBOL_GPL(dm_get_md);
2156
2157void *dm_get_mdptr(struct mapped_device *md)
2158{
2159 return md->interface_ptr;
2160}
2161
2162void dm_set_mdptr(struct mapped_device *md, void *ptr)
2163{
2164 md->interface_ptr = ptr;
2165}
2166
2167void dm_get(struct mapped_device *md)
2168{
2169 atomic_inc(&md->holders);
2170 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2171}
2172
2173int dm_hold(struct mapped_device *md)
2174{
2175 spin_lock(&_minor_lock);
2176 if (test_bit(DMF_FREEING, &md->flags)) {
2177 spin_unlock(&_minor_lock);
2178 return -EBUSY;
2179 }
2180 dm_get(md);
2181 spin_unlock(&_minor_lock);
2182 return 0;
2183}
2184EXPORT_SYMBOL_GPL(dm_hold);
2185
2186const char *dm_device_name(struct mapped_device *md)
2187{
2188 return md->name;
2189}
2190EXPORT_SYMBOL_GPL(dm_device_name);
2191
2192static void __dm_destroy(struct mapped_device *md, bool wait)
2193{
2194 struct dm_table *map;
2195 int srcu_idx;
2196
2197 might_sleep();
2198
2199 spin_lock(&_minor_lock);
2200 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2201 set_bit(DMF_FREEING, &md->flags);
2202 spin_unlock(&_minor_lock);
2203
2204 blk_set_queue_dying(md->queue);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
2210 mutex_lock(&md->suspend_lock);
2211 map = dm_get_live_table(md, &srcu_idx);
2212 if (!dm_suspended_md(md)) {
2213 dm_table_presuspend_targets(map);
2214 set_bit(DMF_SUSPENDED, &md->flags);
2215 set_bit(DMF_POST_SUSPENDING, &md->flags);
2216 dm_table_postsuspend_targets(map);
2217 }
2218
2219 dm_put_live_table(md, srcu_idx);
2220 mutex_unlock(&md->suspend_lock);
2221
	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
2228 if (wait)
2229 while (atomic_read(&md->holders))
2230 msleep(1);
2231 else if (atomic_read(&md->holders))
2232 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2233 dm_device_name(md), atomic_read(&md->holders));
2234
2235 dm_sysfs_exit(md);
2236 dm_table_destroy(__unbind(md));
2237 free_dev(md);
2238}
2239
2240void dm_destroy(struct mapped_device *md)
2241{
2242 __dm_destroy(md, true);
2243}
2244
2245void dm_destroy_immediate(struct mapped_device *md)
2246{
2247 __dm_destroy(md, false);
2248}
2249
2250void dm_put(struct mapped_device *md)
2251{
2252 atomic_dec(&md->holders);
2253}
2254EXPORT_SYMBOL_GPL(dm_put);
2255
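/*
 * Sum the per-cpu in-flight counters of the device's part0 to see whether
 * any bios are still in flight.
 */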
2256static bool md_in_flight_bios(struct mapped_device *md)
2257{
2258 int cpu;
2259 struct hd_struct *part = &dm_disk(md)->part0;
2260 long sum = 0;
2261
2262 for_each_possible_cpu(cpu) {
2263 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2264 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2265 }
2266
2267 return sum != 0;
2268}
2269
2270static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
2271{
2272 int r = 0;
2273 DEFINE_WAIT(wait);
2274
2275 while (true) {
2276 prepare_to_wait(&md->wait, &wait, task_state);
2277
2278 if (!md_in_flight_bios(md))
2279 break;
2280
2281 if (signal_pending_state(task_state, current)) {
2282 r = -EINTR;
2283 break;
2284 }
2285
2286 io_schedule();
2287 }
2288 finish_wait(&md->wait, &wait);
2289
2290 return r;
2291}
2292
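/*
 * Wait until all in-flight IO has completed: poll blk-mq for request-based
 * devices, otherwise sleep on md->wait until the bio counters drain.
 */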
2293static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2294{
2295 int r = 0;
2296
2297 if (!queue_is_mq(md->queue))
2298 return dm_wait_for_bios_completion(md, task_state);
2299
2300 while (true) {
2301 if (!blk_mq_queue_inflight(md->queue))
2302 break;
2303
2304 if (signal_pending_state(task_state, current)) {
2305 r = -EINTR;
2306 break;
2307 }
2308
2309 msleep(5);
2310 }
2311
2312 return r;
2313}
2314
/*
 * Process the deferred bios
 */
2318static void dm_wq_work(struct work_struct *work)
2319{
2320 struct mapped_device *md = container_of(work, struct mapped_device, work);
2321 struct bio *bio;
2322
2323 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2324 spin_lock_irq(&md->deferred_lock);
2325 bio = bio_list_pop(&md->deferred);
2326 spin_unlock_irq(&md->deferred_lock);
2327
2328 if (!bio)
2329 break;
2330
2331 submit_bio_noacct(bio);
2332 }
2333}
2334
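/*
 * Clear DMF_BLOCK_IO_FOR_SUSPEND and kick the workqueue so that any
 * deferred bios are resubmitted.
 */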
2335static void dm_queue_flush(struct mapped_device *md)
2336{
2337 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2338 smp_mb__after_atomic();
2339 queue_work(md->wq, &md->work);
2340}
2341
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2345struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2346{
2347 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2348 struct queue_limits limits;
2349 int r;
2350
2351 mutex_lock(&md->suspend_lock);
2352
	/* device must be suspended */
2354 if (!dm_suspended_md(md))
2355 goto out;
2356
	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
2363 if (dm_table_has_no_data_devices(table)) {
2364 live_map = dm_get_live_table_fast(md);
2365 if (live_map)
2366 limits = md->queue->limits;
2367 dm_put_live_table_fast(md);
2368 }
2369
2370 if (!live_map) {
2371 r = dm_calculate_queue_limits(table, &limits);
2372 if (r) {
2373 map = ERR_PTR(r);
2374 goto out;
2375 }
2376 }
2377
2378 map = __bind(md, table, &limits);
2379 dm_issue_global_event();
2380
2381out:
2382 mutex_unlock(&md->suspend_lock);
2383 return map;
2384}
2385
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
2390static int lock_fs(struct mapped_device *md)
2391{
2392 int r;
2393
2394 WARN_ON(md->frozen_sb);
2395
2396 md->frozen_sb = freeze_bdev(md->bdev);
2397 if (IS_ERR(md->frozen_sb)) {
2398 r = PTR_ERR(md->frozen_sb);
2399 md->frozen_sb = NULL;
2400 return r;
2401 }
2402
2403 set_bit(DMF_FROZEN, &md->flags);
2404
2405 return 0;
2406}
2407
2408static void unlock_fs(struct mapped_device *md)
2409{
2410 if (!test_bit(DMF_FROZEN, &md->flags))
2411 return;
2412
2413 thaw_bdev(md->bdev, md->frozen_sb);
2414 md->frozen_sb = NULL;
2415 clear_bit(DMF_FROZEN, &md->flags);
2416}
2417
/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
2427static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2428 unsigned suspend_flags, long task_state,
2429 int dmf_suspended_flag)
2430{
2431 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2432 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2433 int r;
2434
2435 lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
2441 if (noflush)
2442 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2443 else
2444 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2445
	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
2450 dm_table_presuspend_targets(map);
2451
	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
2458 if (!noflush && do_lockfs) {
2459 r = lock_fs(md);
2460 if (r) {
2461 dm_table_presuspend_undo_targets(map);
2462 return r;
2463 }
2464 }
2465
	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio from dm_submit_bio.
	 *
	 * To get all processes out of __split_and_process_bio in dm_submit_bio,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_submit_bio and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
2477 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2478 if (map)
2479 synchronize_srcu(&md->io_barrier);
2480
	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
2485 if (dm_request_based(md))
2486 dm_stop_queue(md->queue);
2487
2488 flush_workqueue(md->wq);
2489
	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
2495 r = dm_wait_for_completion(md, task_state);
2496 if (!r)
2497 set_bit(dmf_suspended_flag, &md->flags);
2498
2499 if (noflush)
2500 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2501 if (map)
2502 synchronize_srcu(&md->io_barrier);
2503
	/* were we interrupted ? */
2505 if (r < 0) {
2506 dm_queue_flush(md);
2507
2508 if (dm_request_based(md))
2509 dm_start_queue(md->queue);
2510
2511 unlock_fs(md);
2512 dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
2514 }
2515
2516 return r;
2517}
2518
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
2535int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2536{
2537 struct dm_table *map = NULL;
2538 int r = 0;
2539
2540retry:
2541 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2542
2543 if (dm_suspended_md(md)) {
2544 r = -EINVAL;
2545 goto out_unlock;
2546 }
2547
2548 if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
2550 mutex_unlock(&md->suspend_lock);
2551 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2552 if (r)
2553 return r;
2554 goto retry;
2555 }
2556
2557 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2558
2559 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2560 if (r)
2561 goto out_unlock;
2562
2563 set_bit(DMF_POST_SUSPENDING, &md->flags);
2564 dm_table_postsuspend_targets(map);
2565 clear_bit(DMF_POST_SUSPENDING, &md->flags);
2566
2567out_unlock:
2568 mutex_unlock(&md->suspend_lock);
2569 return r;
2570}
2571
2572static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2573{
2574 if (map) {
2575 int r = dm_table_resume_targets(map);
2576 if (r)
2577 return r;
2578 }
2579
2580 dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
2587 if (dm_request_based(md))
2588 dm_start_queue(md->queue);
2589
2590 unlock_fs(md);
2591
2592 return 0;
2593}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend.  It waits
 * until all bios finish and prevents issuing new bios to targets until
 * the corresponding internal resume happens.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		/* already suspended by userspace; just mark the nesting */
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Only the noflush variant is used here and task_state is
	 * TASK_UNINTERRUPTIBLE, so lock_fs() is skipped and the wait cannot
	 * be interrupted; __dm_suspend()'s return value is safely ignored.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	set_bit(DMF_POST_SUSPENDING, &md->flags);
	dm_table_postsuspend_targets(map);
	clear_bit(DMF_POST_SUSPENDING, &md->flags);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* still suspended by userspace, only clear the flag */

	/*
	 * NOTE: a NULL map is passed so __dm_resume() skips
	 * dm_table_resume_targets(), which may fail; existing internal-resume
	 * callers don't need it.
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}
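
/*
 * Clarifying note (added, not original source text): internal suspends
 * nest via md->internal_suspend_count; only the outermost
 * __dm_internal_suspend()/__dm_internal_resume() pair actually quiesces
 * and restarts I/O.  The final resume clears DMF_SUSPENDED_INTERNALLY and
 * calls wake_up_bit(), which releases callers of dm_suspend() and
 * dm_resume() sleeping in wait_on_bit() above.
 */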

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
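
/*
 * Illustrative note (added, not original source text): the _fast variants
 * are meant to be used as a strict pair.  dm_internal_suspend_fast()
 * returns with md->suspend_lock still held (even on its early-return path),
 * and dm_internal_resume_fast() is what drops it:
 *
 *	dm_internal_suspend_fast(md);
 *	... operate while I/O is quiesced ...
 *	dm_internal_resume_fast(md);
 */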

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	int r;
	unsigned noio_flag;
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	noio_flag = memalloc_noio_save();

	if (!cookie)
		r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				       action, envp);
	}

	memalloc_noio_restore(noio_flag);

	return r;
}
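
/*
 * Note added for clarity (an assumption, not original source text): when a
 * non-zero cookie is passed, the uevent carries an extra environment string
 * of the form "DM_COOKIE=<value>" (formatted just above), which userspace
 * tooling such as udev rules can use to synchronize with the ioctl that
 * generated the event.  The memalloc_noio_save()/restore() pair keeps the
 * uevent's allocations from recursing into block I/O while the device may
 * be suspended.
 */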

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}
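
/*
 * Clarifying note (added, not original source text): dm_get_from_kobject()
 * takes _minor_lock so that checking the freeing/deleting state and taking
 * the reference with dm_get() happen atomically with respect to device
 * teardown; callers therefore get either a referenced md or NULL, never an
 * md that is in the middle of being freed.
 */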

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

static int dm_post_suspending_md(struct mapped_device *md)
{
	return test_bit(DMF_POST_SUSPENDING, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_post_suspending(struct dm_target *ti)
{
	return dm_post_suspending_md(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_post_suspending);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(ti->table->md);
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}
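
/*
 * Worked example (added for illustration; the numbers are hypothetical and
 * depend on the architecture's struct layouts): for a bio-based target with
 * per_io_data_size = 192 and an 8-byte alignment of struct dm_target_io,
 * the ->bs bioset reserves
 *
 *	front_pad = roundup(192, 8) + offsetof(struct dm_target_io, clone)
 *
 * bytes in front of every bio it allocates, so the target's per-bio data
 * and the dm_target_io header sit immediately before the embedded clone
 * bio.  io_front_pad further rounds that up for struct dm_io and adds
 * offsetof(struct dm_io, tio), so bios from ->io_bs carry a full struct
 * dm_io (with its embedded dm_target_io) in front of the bio.
 */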

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64 old_key;
	u64 new_key;
	u32 flags;
	bool fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}
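
/*
 * Clarifying note (added, not original source text): dm_call_pr() is the
 * common helper for the persistent-reservation paths below.  It only
 * handles single-target tables and fans the callback out to each
 * underlying device via the target's ->iterate_devices hook, returning
 * -ENOTTY for empty or multi-target tables and -EINVAL when the target
 * cannot iterate its devices.
 */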

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key = old_key,
		.new_key = new_key,
		.flags = flags,
		.fail_early = true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}
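
/*
 * Clarifying note (added, not original source text): registration is
 * all-or-nothing across paths.  If registering new_key fails on any
 * underlying device, the code above re-issues a registration with the keys
 * swapped (old_key = new_key, new_key = 0, fail_early = false), which
 * unregisters the key from the paths that did accept it; the original
 * error is still returned to the caller.
 */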

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register = dm_pr_register,
	.pr_reserve = dm_pr_reserve,
	.pr_release = dm_pr_release,
	.pr_preempt = dm_pr_preempt,
	.pr_clear = dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.submit_bio = dm_submit_bio,
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct block_device_operations dm_rq_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};
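
/*
 * Clarifying note (added, not original source text): bio-based mapped
 * devices use dm_blk_dops, which provides ->submit_bio and zone reporting;
 * request-based mapped devices use dm_rq_blk_dops instead, where I/O is
 * driven through the block layer's request queue rather than a
 * ->submit_bio hook.
 */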

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.dax_supported = dm_dax_supported,
	.copy_from_iter = dm_dax_copy_from_iter,
	.copy_to_iter = dm_dax_copy_to_iter,
	.zero_page_range = dm_dax_zero_page_range,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");