/*
 * Device-mapper (dm) core: implements the mapped_device block devices that
 * remap bio-based and request-based I/O through a table of targets.
 */

8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/mm.h>
16#include <linux/sched/signal.h>
17#include <linux/blkpg.h>
18#include <linux/bio.h>
19#include <linux/mempool.h>
20#include <linux/dax.h>
21#include <linux/slab.h>
22#include <linux/idr.h>
23#include <linux/uio.h>
24#include <linux/hdreg.h>
25#include <linux/delay.h>
26#include <linux/wait.h>
27#include <linux/pr.h>
28#include <linux/refcount.h>
29#include <linux/part_stat.h>
30#include <linux/blk-crypto.h>
31
32#define DM_MSG_PREFIX "core"
33
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
38#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
39#define DM_COOKIE_LENGTH 24
40
41static const char *_name = DM_NAME;
42
43static unsigned int major = 0;
44static unsigned int _major = 0;
45
46static DEFINE_IDR(_minor_idr);
47
48static DEFINE_SPINLOCK(_minor_lock);
49
50static void do_deferred_remove(struct work_struct *w);
51
52static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
53
54static struct workqueue_struct *deferred_remove_workqueue;
55
56atomic_t dm_global_event_nr = ATOMIC_INIT(0);
57DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
58
59void dm_issue_global_event(void)
60{
61 atomic_inc(&dm_global_event_nr);
62 wake_up(&dm_global_eventq);
63}
64
/*
 * One of these is allocated (on-stack) per original bio.
 */
68struct clone_info {
69 struct dm_table *map;
70 struct bio *bio;
71 struct dm_io *io;
72 sector_t sector;
73 unsigned sector_count;
74};
75
/*
 * One of these is allocated per clone bio; any per-target per-bio data is
 * laid out immediately in front of it (see dm_per_bio_data() below).
 */
79#define DM_TIO_MAGIC 7282014
80struct dm_target_io {
81 unsigned magic;
82 struct dm_io *io;
83 struct dm_target *ti;
84 unsigned target_bio_nr;
85 unsigned *len_ptr;
86 bool inside_dm_io;
87 struct bio clone;
88};
89
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
94#define DM_IO_MAGIC 5191977
95struct dm_io {
96 unsigned magic;
97 struct mapped_device *md;
98 blk_status_t status;
99 atomic_t io_count;
100 struct bio *orig_bio;
101 unsigned long start_time;
102 spinlock_t endio_lock;
103 struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
105 struct dm_target_io tio;
106};
107
108void *dm_per_bio_data(struct bio *bio, size_t data_size)
109{
110 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
111 if (!tio->inside_dm_io)
112 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
113 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
114}
115EXPORT_SYMBOL_GPL(dm_per_bio_data);
116
117struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
118{
119 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
120 if (io->magic == DM_IO_MAGIC)
121 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
122 BUG_ON(io->magic != DM_TIO_MAGIC);
123 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
124}
125EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
126
127unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
128{
129 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
130}
131EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
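
/*
 * Illustrative sketch (not compiled here): a hypothetical bio-based target
 * that sets ti->per_io_data_size in its constructor can stash per-bio state
 * with dm_per_bio_data() in ->map() and later recover the bio from that state
 * with dm_bio_from_per_bio_data().  All my_* names below are invented for the
 * example.
 *
 *	struct my_per_bio_data {
 *		sector_t orig_sector;
 *	};
 *
 *	static int my_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct my_per_bio_data *pb =
 *			dm_per_bio_data(bio, sizeof(struct my_per_bio_data));
 *
 *		pb->orig_sector = bio->bi_iter.bi_sector;
 *		my_queue_for_processing(pb);	(made-up deferral helper)
 *		return DM_MAPIO_SUBMITTED;
 *	}
 *
 *	static void my_complete(struct my_per_bio_data *pb)
 *	{
 *		struct bio *bio =
 *			dm_bio_from_per_bio_data(pb, sizeof(struct my_per_bio_data));
 *
 *		bio_endio(bio);
 *	}
 */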
132
133#define MINOR_ALLOCED ((void *)-1)
134
/*
 * Bits for the md->flags field.
 */
138#define DMF_BLOCK_IO_FOR_SUSPEND 0
139#define DMF_SUSPENDED 1
140#define DMF_FROZEN 2
141#define DMF_FREEING 3
142#define DMF_DELETING 4
143#define DMF_NOFLUSH_SUSPENDING 5
144#define DMF_DEFERRED_REMOVE 6
145#define DMF_SUSPENDED_INTERNALLY 7
146#define DMF_POST_SUSPENDING 8
147
148#define DM_NUMA_NODE NUMA_NO_NODE
149static int dm_numa_node = DM_NUMA_NODE;
150
/*
 * For mempools pre-allocation at the table loading time.
 */
154struct dm_md_mempools {
155 struct bio_set bs;
156 struct bio_set io_bs;
157};
158
159struct table_device {
160 struct list_head list;
161 refcount_t count;
162 struct dm_dev dm_dev;
163};
164
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
168#define RESERVED_BIO_BASED_IOS 16
169static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
170
171static int __dm_get_module_param_int(int *module_param, int min, int max)
172{
173 int param = READ_ONCE(*module_param);
174 int modified_param = 0;
175 bool modified = true;
176
177 if (param < min)
178 modified_param = min;
179 else if (param > max)
180 modified_param = max;
181 else
182 modified = false;
183
184 if (modified) {
185 (void)cmpxchg(module_param, param, modified_param);
186 param = modified_param;
187 }
188
189 return param;
190}
191
192unsigned __dm_get_module_param(unsigned *module_param,
193 unsigned def, unsigned max)
194{
195 unsigned param = READ_ONCE(*module_param);
196 unsigned modified_param = 0;
197
198 if (!param)
199 modified_param = def;
200 else if (param > max)
201 modified_param = max;
202
203 if (modified_param) {
204 (void)cmpxchg(module_param, param, modified_param);
205 param = modified_param;
206 }
207
208 return param;
209}
210
211unsigned dm_get_reserved_bio_based_ios(void)
212{
213 return __dm_get_module_param(&reserved_bio_based_ios,
214 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
215}
216EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
217
218static unsigned dm_get_numa_node(void)
219{
220 return __dm_get_module_param_int(&dm_numa_node,
221 DM_NUMA_NODE, num_online_nodes() - 1);
222}
223
224static int __init local_init(void)
225{
226 int r;
227
228 r = dm_uevent_init();
229 if (r)
230 return r;
231
232 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
233 if (!deferred_remove_workqueue) {
234 r = -ENOMEM;
235 goto out_uevent_exit;
236 }
237
238 _major = major;
239 r = register_blkdev(_major, _name);
240 if (r < 0)
241 goto out_free_workqueue;
242
243 if (!_major)
244 _major = r;
245
246 return 0;
247
248out_free_workqueue:
249 destroy_workqueue(deferred_remove_workqueue);
250out_uevent_exit:
251 dm_uevent_exit();
252
253 return r;
254}
255
256static void local_exit(void)
257{
258 flush_scheduled_work();
259 destroy_workqueue(deferred_remove_workqueue);
260
261 unregister_blkdev(_major, _name);
262 dm_uevent_exit();
263
264 _major = 0;
265
266 DMINFO("cleaned up");
267}
268
269static int (*_inits[])(void) __initdata = {
270 local_init,
271 dm_target_init,
272 dm_linear_init,
273 dm_stripe_init,
274 dm_io_init,
275 dm_kcopyd_init,
276 dm_interface_init,
277 dm_statistics_init,
278};
279
280static void (*_exits[])(void) = {
281 local_exit,
282 dm_target_exit,
283 dm_linear_exit,
284 dm_stripe_exit,
285 dm_io_exit,
286 dm_kcopyd_exit,
287 dm_interface_exit,
288 dm_statistics_exit,
289};
290
291static int __init dm_init(void)
292{
293 const int count = ARRAY_SIZE(_inits);
294
295 int r, i;
296
297 for (i = 0; i < count; i++) {
298 r = _inits[i]();
299 if (r)
300 goto bad;
301 }
302
303 return 0;
304
305 bad:
306 while (i--)
307 _exits[i]();
308
309 return r;
310}
311
312static void __exit dm_exit(void)
313{
314 int i = ARRAY_SIZE(_exits);
315
316 while (i--)
317 _exits[i]();
318
	/*
	 * Should be empty by this point.
	 */
322 idr_destroy(&_minor_idr);
323}
324
/*
 * Block device functions.
 */
328int dm_deleting_md(struct mapped_device *md)
329{
330 return test_bit(DMF_DELETING, &md->flags);
331}
332
333static int dm_blk_open(struct block_device *bdev, fmode_t mode)
334{
335 struct mapped_device *md;
336
337 spin_lock(&_minor_lock);
338
339 md = bdev->bd_disk->private_data;
340 if (!md)
341 goto out;
342
343 if (test_bit(DMF_FREEING, &md->flags) ||
344 dm_deleting_md(md)) {
345 md = NULL;
346 goto out;
347 }
348
349 dm_get(md);
350 atomic_inc(&md->open_count);
351out:
352 spin_unlock(&_minor_lock);
353
354 return md ? 0 : -ENXIO;
355}
356
357static void dm_blk_close(struct gendisk *disk, fmode_t mode)
358{
359 struct mapped_device *md;
360
361 spin_lock(&_minor_lock);
362
363 md = disk->private_data;
364 if (WARN_ON(!md))
365 goto out;
366
367 if (atomic_dec_and_test(&md->open_count) &&
368 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
369 queue_work(deferred_remove_workqueue, &deferred_remove_work);
370
371 dm_put(md);
372out:
373 spin_unlock(&_minor_lock);
374}
375
376int dm_open_count(struct mapped_device *md)
377{
378 return atomic_read(&md->open_count);
379}
380
/*
 * Guarantees nothing is using the device before it's deleted.
 */
384int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
385{
386 int r = 0;
387
388 spin_lock(&_minor_lock);
389
390 if (dm_open_count(md)) {
391 r = -EBUSY;
392 if (mark_deferred)
393 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
394 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
395 r = -EEXIST;
396 else
397 set_bit(DMF_DELETING, &md->flags);
398
399 spin_unlock(&_minor_lock);
400
401 return r;
402}
403
404int dm_cancel_deferred_remove(struct mapped_device *md)
405{
406 int r = 0;
407
408 spin_lock(&_minor_lock);
409
410 if (test_bit(DMF_DELETING, &md->flags))
411 r = -EBUSY;
412 else
413 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
414
415 spin_unlock(&_minor_lock);
416
417 return r;
418}
419
420static void do_deferred_remove(struct work_struct *w)
421{
422 dm_deferred_remove();
423}
424
425static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
426{
427 struct mapped_device *md = bdev->bd_disk->private_data;
428
429 return dm_get_geometry(md, geo);
430}
431
432#ifdef CONFIG_BLK_DEV_ZONED
433int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
434{
435 struct dm_report_zones_args *args = data;
436 sector_t sector_diff = args->tgt->begin - args->start;
437
	/*
	 * Ignore zones beyond the target range.
	 */
441 if (zone->start >= args->start + args->tgt->len)
442 return 0;
443
	/*
	 * Remap the start sector and write pointer position of the zone
	 * to match its position in the target range.
	 */
448 zone->start += sector_diff;
449 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
450 if (zone->cond == BLK_ZONE_COND_FULL)
451 zone->wp = zone->start + zone->len;
452 else if (zone->cond == BLK_ZONE_COND_EMPTY)
453 zone->wp = zone->start;
454 else
455 zone->wp += sector_diff;
456 }
457
458 args->next_sector = zone->start + zone->len;
459 return args->orig_cb(zone, args->zone_idx++, args->orig_data);
460}
461EXPORT_SYMBOL_GPL(dm_report_zones_cb);
462
463static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
464 unsigned int nr_zones, report_zones_cb cb, void *data)
465{
466 struct mapped_device *md = disk->private_data;
467 struct dm_table *map;
468 int srcu_idx, ret;
469 struct dm_report_zones_args args = {
470 .next_sector = sector,
471 .orig_data = data,
472 .orig_cb = cb,
473 };
474
475 if (dm_suspended_md(md))
476 return -EAGAIN;
477
478 map = dm_get_live_table(md, &srcu_idx);
479 if (!map) {
480 ret = -EIO;
481 goto out;
482 }
483
484 do {
485 struct dm_target *tgt;
486
487 tgt = dm_table_find_target(map, args.next_sector);
488 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
489 ret = -EIO;
490 goto out;
491 }
492
493 args.tgt = tgt;
494 ret = tgt->type->report_zones(tgt, &args,
495 nr_zones - args.zone_idx);
496 if (ret < 0)
497 goto out;
498 } while (args.zone_idx < nr_zones &&
499 args.next_sector < get_capacity(disk));
500
501 ret = args.zone_idx;
502out:
503 dm_put_live_table(md, srcu_idx);
504 return ret;
505}
506#else
507#define dm_blk_report_zones NULL
508#endif
509
510static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
511 struct block_device **bdev)
512{
513 struct dm_target *tgt;
514 struct dm_table *map;
515 int r;
516
517retry:
518 r = -ENOTTY;
519 map = dm_get_live_table(md, srcu_idx);
520 if (!map || !dm_table_get_size(map))
521 return r;
522
	/* We only support devices that have a single target */
524 if (dm_table_get_num_targets(map) != 1)
525 return r;
526
527 tgt = dm_table_get_target(map, 0);
528 if (!tgt->type->prepare_ioctl)
529 return r;
530
531 if (dm_suspended_md(md))
532 return -EAGAIN;
533
534 r = tgt->type->prepare_ioctl(tgt, bdev);
535 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
536 dm_put_live_table(md, *srcu_idx);
537 msleep(10);
538 goto retry;
539 }
540
541 return r;
542}
543
544static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
545{
546 dm_put_live_table(md, srcu_idx);
547}
548
549static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
550 unsigned int cmd, unsigned long arg)
551{
552 struct mapped_device *md = bdev->bd_disk->private_data;
553 int r, srcu_idx;
554
555 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
556 if (r < 0)
557 goto out;
558
559 if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
564 if (!capable(CAP_SYS_RAWIO)) {
565 DMDEBUG_LIMIT(
566 "%s: sending ioctl %x to DM device without required privilege.",
567 current->comm, cmd);
568 r = -ENOIOCTLCMD;
569 goto out;
570 }
571 }
572
573 if (!bdev->bd_disk->fops->ioctl)
574 r = -ENOTTY;
575 else
576 r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
577out:
578 dm_unprepare_ioctl(md, srcu_idx);
579 return r;
580}
581
582u64 dm_start_time_ns_from_clone(struct bio *bio)
583{
584 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
585 struct dm_io *io = tio->io;
586
587 return jiffies_to_nsecs(io->start_time);
588}
589EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
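
/*
 * Illustrative sketch (not compiled here): a target's ->end_io hook may use
 * dm_start_time_ns_from_clone() on the clone it was handed in ->map() to see
 * how long dm has been processing the original bio.  Since the start time is
 * recorded in jiffies above, compare against jiffies converted the same way.
 * The my_* names are invented for the example.
 *
 *	static int my_end_io(struct dm_target *ti, struct bio *clone,
 *			     blk_status_t *error)
 *	{
 *		u64 elapsed_ns = jiffies_to_nsecs(jiffies) -
 *				 dm_start_time_ns_from_clone(clone);
 *
 *		my_account_latency(elapsed_ns);	(made-up statistics helper)
 *		return DM_ENDIO_DONE;
 *	}
 */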
590
591static void start_io_acct(struct dm_io *io)
592{
593 struct mapped_device *md = io->md;
594 struct bio *bio = io->orig_bio;
595
596 io->start_time = bio_start_io_acct(bio);
597 if (unlikely(dm_stats_used(&md->stats)))
598 dm_stats_account_io(&md->stats, bio_data_dir(bio),
599 bio->bi_iter.bi_sector, bio_sectors(bio),
600 false, 0, &io->stats_aux);
601}
602
603static void end_io_acct(struct dm_io *io)
604{
605 struct mapped_device *md = io->md;
606 struct bio *bio = io->orig_bio;
607 unsigned long duration = jiffies - io->start_time;
608
609 bio_end_io_acct(bio, io->start_time);
610
611 if (unlikely(dm_stats_used(&md->stats)))
612 dm_stats_account_io(&md->stats, bio_data_dir(bio),
613 bio->bi_iter.bi_sector, bio_sectors(bio),
614 true, duration, &io->stats_aux);
615
616
617 if (unlikely(wq_has_sleeper(&md->wait)))
618 wake_up(&md->wait);
619}
620
621static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
622{
623 struct dm_io *io;
624 struct dm_target_io *tio;
625 struct bio *clone;
626
627 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
628 if (!clone)
629 return NULL;
630
631 tio = container_of(clone, struct dm_target_io, clone);
632 tio->inside_dm_io = true;
633 tio->io = NULL;
634
635 io = container_of(tio, struct dm_io, tio);
636 io->magic = DM_IO_MAGIC;
637 io->status = 0;
638 atomic_set(&io->io_count, 1);
639 io->orig_bio = bio;
640 io->md = md;
641 spin_lock_init(&io->endio_lock);
642
643 start_io_acct(io);
644
645 return io;
646}
647
648static void free_io(struct mapped_device *md, struct dm_io *io)
649{
650 bio_put(&io->tio.clone);
651}
652
653static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
654 unsigned target_bio_nr, gfp_t gfp_mask)
655{
656 struct dm_target_io *tio;
657
658 if (!ci->io->tio.io) {
659
660 tio = &ci->io->tio;
661 } else {
662 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
663 if (!clone)
664 return NULL;
665
666 tio = container_of(clone, struct dm_target_io, clone);
667 tio->inside_dm_io = false;
668 }
669
670 tio->magic = DM_TIO_MAGIC;
671 tio->io = ci->io;
672 tio->ti = ti;
673 tio->target_bio_nr = target_bio_nr;
674
675 return tio;
676}
677
678static void free_tio(struct dm_target_io *tio)
679{
680 if (tio->inside_dm_io)
681 return;
682 bio_put(&tio->clone);
683}
684
/*
 * Add the bio to the list of deferred io.
 */
688static void queue_io(struct mapped_device *md, struct bio *bio)
689{
690 unsigned long flags;
691
692 spin_lock_irqsave(&md->deferred_lock, flags);
693 bio_list_add(&md->deferred, bio);
694 spin_unlock_irqrestore(&md->deferred_lock, flags);
695 queue_work(md->wq, &md->work);
696}
697
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
703struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
704{
705 *srcu_idx = srcu_read_lock(&md->io_barrier);
706
707 return srcu_dereference(md->map, &md->io_barrier);
708}
709
710void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
711{
712 srcu_read_unlock(&md->io_barrier, srcu_idx);
713}
714
715void dm_sync_table(struct mapped_device *md)
716{
717 synchronize_srcu(&md->io_barrier);
718 synchronize_rcu_expedited();
719}
720
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
725static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
726{
727 rcu_read_lock();
728 return rcu_dereference(md->map);
729}
730
731static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
732{
733 rcu_read_unlock();
734}
735
736static char *_dm_claim_ptr = "I belong to device-mapper";
737
/*
 * Open a table device so we can use it as a map destination.
 */
741static int open_table_device(struct table_device *td, dev_t dev,
742 struct mapped_device *md)
743{
744 struct block_device *bdev;
745
746 int r;
747
748 BUG_ON(td->dm_dev.bdev);
749
750 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
751 if (IS_ERR(bdev))
752 return PTR_ERR(bdev);
753
754 r = bd_link_disk_holder(bdev, dm_disk(md));
755 if (r) {
756 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
757 return r;
758 }
759
760 td->dm_dev.bdev = bdev;
761 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
762 return 0;
763}
764
/*
 * Close a table device that we've been using.
 */
768static void close_table_device(struct table_device *td, struct mapped_device *md)
769{
770 if (!td->dm_dev.bdev)
771 return;
772
773 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
774 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
775 put_dax(td->dm_dev.dax_dev);
776 td->dm_dev.bdev = NULL;
777 td->dm_dev.dax_dev = NULL;
778}
779
780static struct table_device *find_table_device(struct list_head *l, dev_t dev,
781 fmode_t mode)
782{
783 struct table_device *td;
784
785 list_for_each_entry(td, l, list)
786 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
787 return td;
788
789 return NULL;
790}
791
792int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
793 struct dm_dev **result)
794{
795 int r;
796 struct table_device *td;
797
798 mutex_lock(&md->table_devices_lock);
799 td = find_table_device(&md->table_devices, dev, mode);
800 if (!td) {
801 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
802 if (!td) {
803 mutex_unlock(&md->table_devices_lock);
804 return -ENOMEM;
805 }
806
807 td->dm_dev.mode = mode;
808 td->dm_dev.bdev = NULL;
809
810 if ((r = open_table_device(td, dev, md))) {
811 mutex_unlock(&md->table_devices_lock);
812 kfree(td);
813 return r;
814 }
815
816 format_dev_t(td->dm_dev.name, dev);
817
818 refcount_set(&td->count, 1);
819 list_add(&td->list, &md->table_devices);
820 } else {
821 refcount_inc(&td->count);
822 }
823 mutex_unlock(&md->table_devices_lock);
824
825 *result = &td->dm_dev;
826 return 0;
827}
828EXPORT_SYMBOL_GPL(dm_get_table_device);
829
830void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
831{
832 struct table_device *td = container_of(d, struct table_device, dm_dev);
833
834 mutex_lock(&md->table_devices_lock);
835 if (refcount_dec_and_test(&td->count)) {
836 close_table_device(td, md);
837 list_del(&td->list);
838 kfree(td);
839 }
840 mutex_unlock(&md->table_devices_lock);
841}
842EXPORT_SYMBOL(dm_put_table_device);
843
844static void free_table_devices(struct list_head *devices)
845{
846 struct list_head *tmp, *next;
847
848 list_for_each_safe(tmp, next, devices) {
849 struct table_device *td = list_entry(tmp, struct table_device, list);
850
851 DMWARN("dm_destroy: %s still exists with %d references",
852 td->dm_dev.name, refcount_read(&td->count));
853 kfree(td);
854 }
855}
856
/*
 * Get the geometry associated with a dm device.
 */
860int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
861{
862 *geo = md->geometry;
863
864 return 0;
865}
866
/*
 * Set the geometry of a device.
 */
870int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
871{
872 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
873
874 if (geo->start > sz) {
875 DMWARN("Start sector is beyond the geometry limits.");
876 return -EINVAL;
877 }
878
879 md->geometry = *geo;
880
881 return 0;
882}
883
884static int __noflush_suspending(struct mapped_device *md)
885{
886 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
887}
888
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
893static void dec_pending(struct dm_io *io, blk_status_t error)
894{
895 unsigned long flags;
896 blk_status_t io_error;
897 struct bio *bio;
898 struct mapped_device *md = io->md;
899
900
901 if (unlikely(error)) {
902 spin_lock_irqsave(&io->endio_lock, flags);
903 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
904 io->status = error;
905 spin_unlock_irqrestore(&io->endio_lock, flags);
906 }
907
908 if (atomic_dec_and_test(&io->io_count)) {
909 if (io->status == BLK_STS_DM_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
913 spin_lock_irqsave(&md->deferred_lock, flags);
914 if (__noflush_suspending(md))
915
916 bio_list_add_head(&md->deferred, io->orig_bio);
917 else
918
919 io->status = BLK_STS_IOERR;
920 spin_unlock_irqrestore(&md->deferred_lock, flags);
921 }
922
923 io_error = io->status;
924 bio = io->orig_bio;
925 end_io_acct(io);
926 free_io(md, io);
927
928 if (io_error == BLK_STS_DM_REQUEUE)
929 return;
930
931 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */
936 bio->bi_opf &= ~REQ_PREFLUSH;
937 queue_io(md, bio);
938 } else {
939
940 if (io_error)
941 bio->bi_status = io_error;
942 bio_endio(bio);
943 }
944 }
945}
946
947void disable_discard(struct mapped_device *md)
948{
949 struct queue_limits *limits = dm_get_queue_limits(md);
950
951
952 limits->max_discard_sectors = 0;
953 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
954}
955
956void disable_write_same(struct mapped_device *md)
957{
958 struct queue_limits *limits = dm_get_queue_limits(md);
959
960
961 limits->max_write_same_sectors = 0;
962}
963
964void disable_write_zeroes(struct mapped_device *md)
965{
966 struct queue_limits *limits = dm_get_queue_limits(md);
967
968
969 limits->max_write_zeroes_sectors = 0;
970}
971
972static void clone_endio(struct bio *bio)
973{
974 blk_status_t error = bio->bi_status;
975 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
976 struct dm_io *io = tio->io;
977 struct mapped_device *md = tio->io->md;
978 dm_endio_fn endio = tio->ti->type->end_io;
979 struct bio *orig_bio = io->orig_bio;
980
981 if (unlikely(error == BLK_STS_TARGET)) {
982 if (bio_op(bio) == REQ_OP_DISCARD &&
983 !bio->bi_disk->queue->limits.max_discard_sectors)
984 disable_discard(md);
985 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
986 !bio->bi_disk->queue->limits.max_write_same_sectors)
987 disable_write_same(md);
988 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
989 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
990 disable_write_zeroes(md);
991 }
992
	/*
	 * For zone-append bios get the offset in the zone of the written
	 * sector and add that to the original bio sector position.
	 */
997 if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
998 sector_t written_sector = bio->bi_iter.bi_sector;
999 struct request_queue *q = orig_bio->bi_disk->queue;
1000 u64 mask = (u64)blk_queue_zone_sectors(q) - 1;
1001
1002 orig_bio->bi_iter.bi_sector += written_sector & mask;
1003 }
1004
1005 if (endio) {
1006 int r = endio(tio->ti, bio, &error);
1007 switch (r) {
1008 case DM_ENDIO_REQUEUE:
1009 error = BLK_STS_DM_REQUEUE;
1010 fallthrough;
1011 case DM_ENDIO_DONE:
1012 break;
1013 case DM_ENDIO_INCOMPLETE:
1014
1015 return;
1016 default:
1017 DMWARN("unimplemented target endio return value: %d", r);
1018 BUG();
1019 }
1020 }
1021
1022 free_tio(tio);
1023 dec_pending(io, error);
1024}
1025
/*
 * Return maximum size of I/O possible at the supplied sector up to the
 * current target boundary.
 */
1030static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1031 sector_t target_offset)
1032{
1033 return ti->len - target_offset;
1034}
1035
1036static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1037{
1038 sector_t target_offset = dm_target_offset(ti, sector);
1039 sector_t len = max_io_len_target_boundary(ti, target_offset);
1040 sector_t max_len;

	/*
	 * Does the target need to split I/O even further?
	 * - varied (per target) I/O splitting is a tenet of DM; this
	 *   explains why stacked chunk_sectors based splitting via
	 *   blk_max_size_offset() isn't possible here, so pass in
	 *   ti->max_io_len to override stacked chunk_sectors.
	 */
1049 if (ti->max_io_len) {
1050 max_len = blk_max_size_offset(ti->table->md->queue,
1051 target_offset, ti->max_io_len);
1052 if (len > max_len)
1053 len = max_len;
1054 }
1055
1056 return len;
1057}
1058
1059int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1060{
1061 if (len > UINT_MAX) {
1062 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1063 (unsigned long long)len, UINT_MAX);
1064 ti->error = "Maximum size of target IO is too large";
1065 return -EINVAL;
1066 }
1067
1068 ti->max_io_len = (uint32_t) len;
1069
1070 return 0;
1071}
1072EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
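
/*
 * Illustrative sketch (not compiled here): a target constructor that operates
 * on fixed-size chunks would typically cap the size of I/O it can be handed:
 *
 *	static int my_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 *	{
 *		int r;
 *
 *		...parse arguments, set up ti->private...
 *
 *		r = dm_set_target_max_io_len(ti, MY_CHUNK_SECTORS);
 *		if (r)
 *			return r;
 *
 *		return 0;
 *	}
 *
 * MY_CHUNK_SECTORS and my_ctr are invented for the example; real targets
 * derive the value from their table arguments.
 */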
1073
1074static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1075 sector_t sector, int *srcu_idx)
1076 __acquires(md->io_barrier)
1077{
1078 struct dm_table *map;
1079 struct dm_target *ti;
1080
1081 map = dm_get_live_table(md, srcu_idx);
1082 if (!map)
1083 return NULL;
1084
1085 ti = dm_table_find_target(map, sector);
1086 if (!ti)
1087 return NULL;
1088
1089 return ti;
1090}
1091
1092static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1093 long nr_pages, void **kaddr, pfn_t *pfn)
1094{
1095 struct mapped_device *md = dax_get_private(dax_dev);
1096 sector_t sector = pgoff * PAGE_SECTORS;
1097 struct dm_target *ti;
1098 long len, ret = -EIO;
1099 int srcu_idx;
1100
1101 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1102
1103 if (!ti)
1104 goto out;
1105 if (!ti->type->direct_access)
1106 goto out;
1107 len = max_io_len(ti, sector) / PAGE_SECTORS;
1108 if (len < 1)
1109 goto out;
1110 nr_pages = min(len, nr_pages);
1111 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1112
1113 out:
1114 dm_put_live_table(md, srcu_idx);
1115
1116 return ret;
1117}
1118
1119static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1120 int blocksize, sector_t start, sector_t len)
1121{
1122 struct mapped_device *md = dax_get_private(dax_dev);
1123 struct dm_table *map;
1124 bool ret = false;
1125 int srcu_idx;
1126
1127 map = dm_get_live_table(md, &srcu_idx);
1128 if (!map)
1129 goto out;
1130
1131 ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1132
1133out:
1134 dm_put_live_table(md, srcu_idx);
1135
1136 return ret;
1137}
1138
1139static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1140 void *addr, size_t bytes, struct iov_iter *i)
1141{
1142 struct mapped_device *md = dax_get_private(dax_dev);
1143 sector_t sector = pgoff * PAGE_SECTORS;
1144 struct dm_target *ti;
1145 long ret = 0;
1146 int srcu_idx;
1147
1148 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1149
1150 if (!ti)
1151 goto out;
1152 if (!ti->type->dax_copy_from_iter) {
1153 ret = copy_from_iter(addr, bytes, i);
1154 goto out;
1155 }
1156 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1157 out:
1158 dm_put_live_table(md, srcu_idx);
1159
1160 return ret;
1161}
1162
1163static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1164 void *addr, size_t bytes, struct iov_iter *i)
1165{
1166 struct mapped_device *md = dax_get_private(dax_dev);
1167 sector_t sector = pgoff * PAGE_SECTORS;
1168 struct dm_target *ti;
1169 long ret = 0;
1170 int srcu_idx;
1171
1172 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1173
1174 if (!ti)
1175 goto out;
1176 if (!ti->type->dax_copy_to_iter) {
1177 ret = copy_to_iter(addr, bytes, i);
1178 goto out;
1179 }
1180 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1181 out:
1182 dm_put_live_table(md, srcu_idx);
1183
1184 return ret;
1185}
1186
1187static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1188 size_t nr_pages)
1189{
1190 struct mapped_device *md = dax_get_private(dax_dev);
1191 sector_t sector = pgoff * PAGE_SECTORS;
1192 struct dm_target *ti;
1193 int ret = -EIO;
1194 int srcu_idx;
1195
1196 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1197
1198 if (!ti)
1199 goto out;
1200 if (WARN_ON(!ti->type->dax_zero_page_range)) {
1201
1202
1203
1204
1205 goto out;
1206 }
1207 ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1208 out:
1209 dm_put_live_table(md, srcu_idx);
1210
1211 return ret;
1212}
1213
/*
 * A target may call dm_accept_partial_bio() only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH, zone management operations
 * and REQ_OP_ZONE_APPEND.
 *
 * dm_accept_partial_bio() informs dm that the target only wants to process
 * the first n_sectors of the bio; the remainder will be submitted to the
 * device in one or more further bios.
 *
 * A diagram that explains the arithmetic:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or a similar function.
 *	The sector number of the end of region 1 is bio->bi_iter.bi_sector.
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for
 * all copies of the bio.
 */
1243void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1244{
1245 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1246 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1247 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1248 BUG_ON(bi_size > *tio->len_ptr);
1249 BUG_ON(n_sectors > bi_size);
1250 *tio->len_ptr -= bi_size - n_sectors;
1251 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1252}
1253EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
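
/*
 * Illustrative sketch (not compiled here): a ->map() routine that must not
 * cross a fixed, power-of-two region boundary can accept only the sectors up
 * to that boundary and let dm resubmit the remainder, as described in the
 * comment above.  MY_REGION_SECTORS, my_map and my_dev are invented for the
 * example.
 *
 *	static int my_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		sector_t offset = dm_target_offset(ti, bio->bi_iter.bi_sector);
 *		sector_t left = MY_REGION_SECTORS - (offset & (MY_REGION_SECTORS - 1));
 *
 *		if (bio_sectors(bio) > left)
 *			dm_accept_partial_bio(bio, left);
 *
 *		bio_set_dev(bio, my_dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 */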
1254
1255static blk_qc_t __map_bio(struct dm_target_io *tio)
1256{
1257 int r;
1258 sector_t sector;
1259 struct bio *clone = &tio->clone;
1260 struct dm_io *io = tio->io;
1261 struct dm_target *ti = tio->ti;
1262 blk_qc_t ret = BLK_QC_T_NONE;
1263
1264 clone->bi_end_io = clone_endio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
1271 atomic_inc(&io->io_count);
1272 sector = clone->bi_iter.bi_sector;
1273
1274 r = ti->type->map(ti, clone);
1275 switch (r) {
1276 case DM_MAPIO_SUBMITTED:
1277 break;
1278 case DM_MAPIO_REMAPPED:
1279
1280 trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
1281 ret = submit_bio_noacct(clone);
1282 break;
1283 case DM_MAPIO_KILL:
1284 free_tio(tio);
1285 dec_pending(io, BLK_STS_IOERR);
1286 break;
1287 case DM_MAPIO_REQUEUE:
1288 free_tio(tio);
1289 dec_pending(io, BLK_STS_DM_REQUEUE);
1290 break;
1291 default:
1292 DMWARN("unimplemented target map return value: %d", r);
1293 BUG();
1294 }
1295
1296 return ret;
1297}
1298
1299static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1300{
1301 bio->bi_iter.bi_sector = sector;
1302 bio->bi_iter.bi_size = to_bytes(len);
1303}
1304
/*
 * Creates a bio that consists of range of complete bvecs.
 */
1308static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1309 sector_t sector, unsigned len)
1310{
1311 struct bio *clone = &tio->clone;
1312 int r;
1313
1314 __bio_clone_fast(clone, bio);
1315
1316 r = bio_crypt_clone(clone, bio, GFP_NOIO);
1317 if (r < 0)
1318 return r;
1319
1320 if (bio_integrity(bio)) {
1321 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1322 !dm_target_passes_integrity(tio->ti->type))) {
1323 DMWARN("%s: the target %s doesn't support integrity data.",
1324 dm_device_name(tio->io->md),
1325 tio->ti->type->name);
1326 return -EIO;
1327 }
1328
1329 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1330 if (r < 0)
1331 return r;
1332 }
1333
1334 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1335 clone->bi_iter.bi_size = to_bytes(len);
1336
1337 if (bio_integrity(bio))
1338 bio_integrity_trim(clone);
1339
1340 return 0;
1341}
1342
1343static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1344 struct dm_target *ti, unsigned num_bios)
1345{
1346 struct dm_target_io *tio;
1347 int try;
1348
1349 if (!num_bios)
1350 return;
1351
1352 if (num_bios == 1) {
1353 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1354 bio_list_add(blist, &tio->clone);
1355 return;
1356 }
1357
1358 for (try = 0; try < 2; try++) {
1359 int bio_nr;
1360 struct bio *bio;
1361
1362 if (try)
1363 mutex_lock(&ci->io->md->table_devices_lock);
1364 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1365 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1366 if (!tio)
1367 break;
1368
1369 bio_list_add(blist, &tio->clone);
1370 }
1371 if (try)
1372 mutex_unlock(&ci->io->md->table_devices_lock);
1373 if (bio_nr == num_bios)
1374 return;
1375
1376 while ((bio = bio_list_pop(blist))) {
1377 tio = container_of(bio, struct dm_target_io, clone);
1378 free_tio(tio);
1379 }
1380 }
1381}
1382
1383static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1384 struct dm_target_io *tio, unsigned *len)
1385{
1386 struct bio *clone = &tio->clone;
1387
1388 tio->len_ptr = len;
1389
1390 __bio_clone_fast(clone, ci->bio);
1391 if (len)
1392 bio_setup_sector(clone, ci->sector, *len);
1393
1394 return __map_bio(tio);
1395}
1396
1397static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1398 unsigned num_bios, unsigned *len)
1399{
1400 struct bio_list blist = BIO_EMPTY_LIST;
1401 struct bio *bio;
1402 struct dm_target_io *tio;
1403
1404 alloc_multiple_bios(&blist, ci, ti, num_bios);
1405
1406 while ((bio = bio_list_pop(&blist))) {
1407 tio = container_of(bio, struct dm_target_io, clone);
1408 (void) __clone_and_map_simple_bio(ci, tio, len);
1409 }
1410}
1411
1412static int __send_empty_flush(struct clone_info *ci)
1413{
1414 unsigned target_nr = 0;
1415 struct dm_target *ti;
1416 struct bio flush_bio;
1417
	/*
	 * Use an on-stack bio for this, it's safe since we don't
	 * need to reference it after submit. It's just used as
	 * the basis for the clone(s).
	 */
1423 bio_init(&flush_bio, NULL, 0);
1424 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1425 flush_bio.bi_disk = ci->io->md->disk;
1426 bio_associate_blkg(&flush_bio);
1427
1428 ci->bio = &flush_bio;
1429 ci->sector_count = 0;
1430
1431 BUG_ON(bio_has_data(ci->bio));
1432 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1433 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1434
1435 bio_uninit(ci->bio);
1436 return 0;
1437}
1438
1439static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1440 sector_t sector, unsigned *len)
1441{
1442 struct bio *bio = ci->bio;
1443 struct dm_target_io *tio;
1444 int r;
1445
1446 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1447 tio->len_ptr = len;
1448 r = clone_bio(tio, bio, sector, *len);
1449 if (r < 0) {
1450 free_tio(tio);
1451 return r;
1452 }
1453 (void) __map_bio(tio);
1454
1455 return 0;
1456}
1457
1458static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1459 unsigned num_bios)
1460{
1461 unsigned len;
1462
	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
1469 if (!num_bios)
1470 return -EOPNOTSUPP;
1471
1472 len = min_t(sector_t, ci->sector_count,
1473 max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
1474
1475 __send_duplicate_bios(ci, ti, num_bios, &len);
1476
1477 ci->sector += len;
1478 ci->sector_count -= len;
1479
1480 return 0;
1481}
1482
1483static bool is_abnormal_io(struct bio *bio)
1484{
1485 bool r = false;
1486
1487 switch (bio_op(bio)) {
1488 case REQ_OP_DISCARD:
1489 case REQ_OP_SECURE_ERASE:
1490 case REQ_OP_WRITE_SAME:
1491 case REQ_OP_WRITE_ZEROES:
1492 r = true;
1493 break;
1494 }
1495
1496 return r;
1497}
1498
1499static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1500 int *result)
1501{
1502 struct bio *bio = ci->bio;
1503 unsigned num_bios = 0;
1504
1505 switch (bio_op(bio)) {
1506 case REQ_OP_DISCARD:
1507 num_bios = ti->num_discard_bios;
1508 break;
1509 case REQ_OP_SECURE_ERASE:
1510 num_bios = ti->num_secure_erase_bios;
1511 break;
1512 case REQ_OP_WRITE_SAME:
1513 num_bios = ti->num_write_same_bios;
1514 break;
1515 case REQ_OP_WRITE_ZEROES:
1516 num_bios = ti->num_write_zeroes_bios;
1517 break;
1518 default:
1519 return false;
1520 }
1521
1522 *result = __send_changing_extent_only(ci, ti, num_bios);
1523 return true;
1524}
1525
/*
 * Select the target for remap and map the next portion of the bio to it.
 */
1529static int __split_and_process_non_flush(struct clone_info *ci)
1530{
1531 struct dm_target *ti;
1532 unsigned len;
1533 int r;
1534
1535 ti = dm_table_find_target(ci->map, ci->sector);
1536 if (!ti)
1537 return -EIO;
1538
1539 if (__process_abnormal_io(ci, ti, &r))
1540 return r;
1541
1542 len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1543
1544 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1545 if (r < 0)
1546 return r;
1547
1548 ci->sector += len;
1549 ci->sector_count -= len;
1550
1551 return 0;
1552}
1553
1554static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1555 struct dm_table *map, struct bio *bio)
1556{
1557 ci->map = map;
1558 ci->io = alloc_io(md, bio);
1559 ci->sector = bio->bi_iter.bi_sector;
1560}
1561
1562#define __dm_part_stat_sub(part, field, subnd) \
1563 (part_stat_get(part, field) -= (subnd))
1564
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1568static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1569 struct dm_table *map, struct bio *bio)
1570{
1571 struct clone_info ci;
1572 blk_qc_t ret = BLK_QC_T_NONE;
1573 int error = 0;
1574
1575 init_clone_info(&ci, md, map, bio);
1576
1577 if (bio->bi_opf & REQ_PREFLUSH) {
1578 error = __send_empty_flush(&ci);
1579
1580 } else if (op_is_zone_mgmt(bio_op(bio))) {
1581 ci.bio = bio;
1582 ci.sector_count = 0;
1583 error = __split_and_process_non_flush(&ci);
1584 } else {
1585 ci.bio = bio;
1586 ci.sector_count = bio_sectors(bio);
1587 while (ci.sector_count && !error) {
1588 error = __split_and_process_non_flush(&ci);
1589 if (ci.sector_count && !error) {
				/*
				 * Remainder must be passed to submit_bio_noacct()
				 * so that it gets handled *after* bios already submitted
				 * have been completely processed.
				 * We take a clone of the original to store in
				 * ci.io->orig_bio to be used by end_io_acct() and for
				 * dec_pending() to use for completion handling.
				 */
1598 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1599 GFP_NOIO, &md->queue->bio_split);
1600 ci.io->orig_bio = b;
1601
				/*
				 * Adjust IO stats for each split, otherwise upon queue
				 * reentry there will be redundant IO accounting.
				 * NOTE: this is a stop-gap fix, a proper fix involves
				 * significant refactoring of DM core's bio splitting
				 * (by eliminating DM's splitting and just using bio_split).
				 */
1609 part_stat_lock();
1610 __dm_part_stat_sub(dm_disk(md)->part0,
1611 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1612 part_stat_unlock();
1613
1614 bio_chain(b, bio);
1615 trace_block_split(b, bio->bi_iter.bi_sector);
1616 ret = submit_bio_noacct(bio);
1617 break;
1618 }
1619 }
1620 }
1621
1622
1623 dec_pending(ci.io, errno_to_blk_status(error));
1624 return ret;
1625}
1626
1627static blk_qc_t dm_submit_bio(struct bio *bio)
1628{
1629 struct mapped_device *md = bio->bi_disk->private_data;
1630 blk_qc_t ret = BLK_QC_T_NONE;
1631 int srcu_idx;
1632 struct dm_table *map;
1633
1634 map = dm_get_live_table(md, &srcu_idx);
1635 if (unlikely(!map)) {
1636 DMERR_LIMIT("%s: mapping table unavailable, erroring io",
1637 dm_device_name(md));
1638 bio_io_error(bio);
1639 goto out;
1640 }
1641
	/* If suspended, defer or fail the IO depending on its flags */
1643 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1644 if (bio->bi_opf & REQ_NOWAIT)
1645 bio_wouldblock_error(bio);
1646 else if (bio->bi_opf & REQ_RAHEAD)
1647 bio_io_error(bio);
1648 else
1649 queue_io(md, bio);
1650 goto out;
1651 }

	/*
	 * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
	 * otherwise associated queue_limits won't be imposed.
	 */
1657 if (is_abnormal_io(bio))
1658 blk_queue_split(&bio);
1659
1660 ret = __split_and_process_bio(md, map, bio);
1661out:
1662 dm_put_live_table(md, srcu_idx);
1663 return ret;
1664}
1665
/*---------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
1669static void free_minor(int minor)
1670{
1671 spin_lock(&_minor_lock);
1672 idr_remove(&_minor_idr, minor);
1673 spin_unlock(&_minor_lock);
1674}
1675
/*
 * See if the device with a specific minor # is free.
 */
1679static int specific_minor(int minor)
1680{
1681 int r;
1682
1683 if (minor >= (1 << MINORBITS))
1684 return -EINVAL;
1685
1686 idr_preload(GFP_KERNEL);
1687 spin_lock(&_minor_lock);
1688
1689 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1690
1691 spin_unlock(&_minor_lock);
1692 idr_preload_end();
1693 if (r < 0)
1694 return r == -ENOSPC ? -EBUSY : r;
1695 return 0;
1696}
1697
1698static int next_free_minor(int *minor)
1699{
1700 int r;
1701
1702 idr_preload(GFP_KERNEL);
1703 spin_lock(&_minor_lock);
1704
1705 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1706
1707 spin_unlock(&_minor_lock);
1708 idr_preload_end();
1709 if (r < 0)
1710 return r;
1711 *minor = r;
1712 return 0;
1713}
1714
1715static const struct block_device_operations dm_blk_dops;
1716static const struct block_device_operations dm_rq_blk_dops;
1717static const struct dax_operations dm_dax_ops;
1718
1719static void dm_wq_work(struct work_struct *work);
1720
1721static void cleanup_mapped_device(struct mapped_device *md)
1722{
1723 if (md->wq)
1724 destroy_workqueue(md->wq);
1725 bioset_exit(&md->bs);
1726 bioset_exit(&md->io_bs);
1727
1728 if (md->dax_dev) {
1729 kill_dax(md->dax_dev);
1730 put_dax(md->dax_dev);
1731 md->dax_dev = NULL;
1732 }
1733
1734 if (md->disk) {
1735 spin_lock(&_minor_lock);
1736 md->disk->private_data = NULL;
1737 spin_unlock(&_minor_lock);
1738 del_gendisk(md->disk);
1739 put_disk(md->disk);
1740 }
1741
1742 if (md->queue)
1743 blk_cleanup_queue(md->queue);
1744
1745 cleanup_srcu_struct(&md->io_barrier);
1746
1747 mutex_destroy(&md->suspend_lock);
1748 mutex_destroy(&md->type_lock);
1749 mutex_destroy(&md->table_devices_lock);
1750
1751 dm_mq_cleanup_mapped_device(md);
1752}
1753
/*
 * Allocate and initialise a blank device with a given minor.
 */
1757static struct mapped_device *alloc_dev(int minor)
1758{
1759 int r, numa_node_id = dm_get_numa_node();
1760 struct mapped_device *md;
1761 void *old_md;
1762
1763 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1764 if (!md) {
1765 DMWARN("unable to allocate device, out of memory.");
1766 return NULL;
1767 }
1768
1769 if (!try_module_get(THIS_MODULE))
1770 goto bad_module_get;
1771
	/* get a minor number for the dev */
1773 if (minor == DM_ANY_MINOR)
1774 r = next_free_minor(&minor);
1775 else
1776 r = specific_minor(minor);
1777 if (r < 0)
1778 goto bad_minor;
1779
1780 r = init_srcu_struct(&md->io_barrier);
1781 if (r < 0)
1782 goto bad_io_barrier;
1783
1784 md->numa_node_id = numa_node_id;
1785 md->init_tio_pdu = false;
1786 md->type = DM_TYPE_NONE;
1787 mutex_init(&md->suspend_lock);
1788 mutex_init(&md->type_lock);
1789 mutex_init(&md->table_devices_lock);
1790 spin_lock_init(&md->deferred_lock);
1791 atomic_set(&md->holders, 1);
1792 atomic_set(&md->open_count, 0);
1793 atomic_set(&md->event_nr, 0);
1794 atomic_set(&md->uevent_seq, 0);
1795 INIT_LIST_HEAD(&md->uevent_list);
1796 INIT_LIST_HEAD(&md->table_devices);
1797 spin_lock_init(&md->uevent_lock);
1798
	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
1804 md->queue = blk_alloc_queue(numa_node_id);
1805 if (!md->queue)
1806 goto bad;
1807
1808 md->disk = alloc_disk_node(1, md->numa_node_id);
1809 if (!md->disk)
1810 goto bad;
1811
1812 init_waitqueue_head(&md->wait);
1813 INIT_WORK(&md->work, dm_wq_work);
1814 init_waitqueue_head(&md->eventq);
1815 init_completion(&md->kobj_holder.completion);
1816
1817 md->disk->major = _major;
1818 md->disk->first_minor = minor;
1819 md->disk->fops = &dm_blk_dops;
1820 md->disk->queue = md->queue;
1821 md->disk->private_data = md;
1822 sprintf(md->disk->disk_name, "dm-%d", minor);
1823
1824 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1825 md->dax_dev = alloc_dax(md, md->disk->disk_name,
1826 &dm_dax_ops, 0);
1827 if (IS_ERR(md->dax_dev))
1828 goto bad;
1829 }
1830
1831 add_disk_no_queue_reg(md->disk);
1832 format_dev_t(md->name, MKDEV(_major, minor));
1833
1834 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1835 if (!md->wq)
1836 goto bad;
1837
1838 dm_stats_init(&md->stats);
1839
1840
1841 spin_lock(&_minor_lock);
1842 old_md = idr_replace(&_minor_idr, md, minor);
1843 spin_unlock(&_minor_lock);
1844
1845 BUG_ON(old_md != MINOR_ALLOCED);
1846
1847 return md;
1848
1849bad:
1850 cleanup_mapped_device(md);
1851bad_io_barrier:
1852 free_minor(minor);
1853bad_minor:
1854 module_put(THIS_MODULE);
1855bad_module_get:
1856 kvfree(md);
1857 return NULL;
1858}
1859
1860static void unlock_fs(struct mapped_device *md);
1861
1862static void free_dev(struct mapped_device *md)
1863{
1864 int minor = MINOR(disk_devt(md->disk));
1865
1866 unlock_fs(md);
1867
1868 cleanup_mapped_device(md);
1869
1870 free_table_devices(&md->table_devices);
1871 dm_stats_cleanup(&md->stats);
1872 free_minor(minor);
1873
1874 module_put(THIS_MODULE);
1875 kvfree(md);
1876}
1877
1878static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1879{
1880 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1881 int ret = 0;
1882
1883 if (dm_table_bio_based(t)) {
		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */
1889 bioset_exit(&md->bs);
1890 bioset_exit(&md->io_bs);
1891
1892 } else if (bioset_initialized(&md->bs)) {
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */
1901 goto out;
1902 }
1903
1904 BUG_ON(!p ||
1905 bioset_initialized(&md->bs) ||
1906 bioset_initialized(&md->io_bs));
1907
1908 ret = bioset_init_from_src(&md->bs, &p->bs);
1909 if (ret)
1910 goto out;
1911 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1912 if (ret)
1913 bioset_exit(&md->bs);
1914out:
1915
1916 dm_table_free_md_mempools(t);
1917 return ret;
1918}
1919
/*
 * Table event callback: deliver any queued uevents and wake event waiters.
 */
1923static void event_callback(void *context)
1924{
1925 unsigned long flags;
1926 LIST_HEAD(uevents);
1927 struct mapped_device *md = (struct mapped_device *) context;
1928
1929 spin_lock_irqsave(&md->uevent_lock, flags);
1930 list_splice_init(&md->uevent_list, &uevents);
1931 spin_unlock_irqrestore(&md->uevent_lock, flags);
1932
1933 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1934
1935 atomic_inc(&md->event_nr);
1936 wake_up(&md->eventq);
1937 dm_issue_global_event();
1938}
1939
/*
 * Bind a table to the device.  Returns the old map, which the caller must
 * destroy.
 */
1943static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1944 struct queue_limits *limits)
1945{
1946 struct dm_table *old_map;
1947 struct request_queue *q = md->queue;
1948 bool request_based = dm_table_request_based(t);
1949 sector_t size;
1950 int ret;
1951
1952 lockdep_assert_held(&md->suspend_lock);
1953
1954 size = dm_table_get_size(t);
1955
1956
1957
1958
1959 if (size != dm_get_size(md))
1960 memset(&md->geometry, 0, sizeof(md->geometry));
1961
1962 set_capacity_and_notify(md->disk, size);
1963
1964 dm_table_event_callback(t, event_callback, md);
1965
1966
1967
1968
1969
1970
1971
1972
1973 if (request_based)
1974 dm_stop_queue(q);
1975
1976 if (request_based) {
1977
1978
1979
1980
1981 md->immutable_target = dm_table_get_immutable_target(t);
1982 }
1983
1984 ret = __bind_mempools(md, t);
1985 if (ret) {
1986 old_map = ERR_PTR(ret);
1987 goto out;
1988 }
1989
1990 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1991 rcu_assign_pointer(md->map, (void *)t);
1992 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1993
1994 dm_table_set_restrictions(t, q, limits);
1995 if (old_map)
1996 dm_sync_table(md);
1997
1998out:
1999 return old_map;
2000}
2001
/*
 * Returns unbound table for the caller to free.
 */
2005static struct dm_table *__unbind(struct mapped_device *md)
2006{
2007 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2008
2009 if (!map)
2010 return NULL;
2011
2012 dm_table_event_callback(map, NULL, NULL);
2013 RCU_INIT_POINTER(md->map, NULL);
2014 dm_sync_table(md);
2015
2016 return map;
2017}
2018
/*
 * Constructor for a new device.
 */
2022int dm_create(int minor, struct mapped_device **result)
2023{
2024 int r;
2025 struct mapped_device *md;
2026
2027 md = alloc_dev(minor);
2028 if (!md)
2029 return -ENXIO;
2030
2031 r = dm_sysfs_init(md);
2032 if (r) {
2033 free_dev(md);
2034 return r;
2035 }
2036
2037 *result = md;
2038 return 0;
2039}
2040
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
2045void dm_lock_md_type(struct mapped_device *md)
2046{
2047 mutex_lock(&md->type_lock);
2048}
2049
2050void dm_unlock_md_type(struct mapped_device *md)
2051{
2052 mutex_unlock(&md->type_lock);
2053}
2054
2055void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2056{
2057 BUG_ON(!mutex_is_locked(&md->type_lock));
2058 md->type = type;
2059}
2060
2061enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2062{
2063 return md->type;
2064}
2065
2066struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2067{
2068 return md->immutable_target_type;
2069}
2070
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2075struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2076{
2077 BUG_ON(!atomic_read(&md->holders));
2078 return &md->queue->limits;
2079}
2080EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2081
/*
 * Setup the DM device's queue based on md's type.
 */
2085int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2086{
2087 int r;
2088 struct queue_limits limits;
2089 enum dm_queue_mode type = dm_get_md_type(md);
2090
2091 switch (type) {
2092 case DM_TYPE_REQUEST_BASED:
2093 md->disk->fops = &dm_rq_blk_dops;
2094 r = dm_mq_init_request_queue(md, t);
2095 if (r) {
2096 DMERR("Cannot initialize queue for request-based dm mapped device");
2097 return r;
2098 }
2099 break;
2100 case DM_TYPE_BIO_BASED:
2101 case DM_TYPE_DAX_BIO_BASED:
2102 break;
2103 case DM_TYPE_NONE:
2104 WARN_ON_ONCE(true);
2105 break;
2106 }
2107
2108 r = dm_calculate_queue_limits(t, &limits);
2109 if (r) {
2110 DMERR("Cannot calculate initial queue limits");
2111 return r;
2112 }
2113 dm_table_set_restrictions(t, md->queue, &limits);
2114 blk_register_queue(md->disk);
2115
2116 return 0;
2117}
2118
2119struct mapped_device *dm_get_md(dev_t dev)
2120{
2121 struct mapped_device *md;
2122 unsigned minor = MINOR(dev);
2123
2124 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2125 return NULL;
2126
2127 spin_lock(&_minor_lock);
2128
2129 md = idr_find(&_minor_idr, minor);
2130 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2131 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2132 md = NULL;
2133 goto out;
2134 }
2135 dm_get(md);
2136out:
2137 spin_unlock(&_minor_lock);
2138
2139 return md;
2140}
2141EXPORT_SYMBOL_GPL(dm_get_md);
2142
2143void *dm_get_mdptr(struct mapped_device *md)
2144{
2145 return md->interface_ptr;
2146}
2147
2148void dm_set_mdptr(struct mapped_device *md, void *ptr)
2149{
2150 md->interface_ptr = ptr;
2151}
2152
2153void dm_get(struct mapped_device *md)
2154{
2155 atomic_inc(&md->holders);
2156 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2157}
2158
2159int dm_hold(struct mapped_device *md)
2160{
2161 spin_lock(&_minor_lock);
2162 if (test_bit(DMF_FREEING, &md->flags)) {
2163 spin_unlock(&_minor_lock);
2164 return -EBUSY;
2165 }
2166 dm_get(md);
2167 spin_unlock(&_minor_lock);
2168 return 0;
2169}
2170EXPORT_SYMBOL_GPL(dm_hold);
2171
2172const char *dm_device_name(struct mapped_device *md)
2173{
2174 return md->name;
2175}
2176EXPORT_SYMBOL_GPL(dm_device_name);
2177
2178static void __dm_destroy(struct mapped_device *md, bool wait)
2179{
2180 struct dm_table *map;
2181 int srcu_idx;
2182
2183 might_sleep();
2184
2185 spin_lock(&_minor_lock);
2186 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2187 set_bit(DMF_FREEING, &md->flags);
2188 spin_unlock(&_minor_lock);
2189
2190 blk_set_queue_dying(md->queue);
2191
2192
2193
2194
2195
2196 mutex_lock(&md->suspend_lock);
2197 map = dm_get_live_table(md, &srcu_idx);
2198 if (!dm_suspended_md(md)) {
2199 dm_table_presuspend_targets(map);
2200 set_bit(DMF_SUSPENDED, &md->flags);
2201 set_bit(DMF_POST_SUSPENDING, &md->flags);
2202 dm_table_postsuspend_targets(map);
2203 }
2204
2205 dm_put_live_table(md, srcu_idx);
2206 mutex_unlock(&md->suspend_lock);
2207
	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
2214 if (wait)
2215 while (atomic_read(&md->holders))
2216 msleep(1);
2217 else if (atomic_read(&md->holders))
2218 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2219 dm_device_name(md), atomic_read(&md->holders));
2220
2221 dm_sysfs_exit(md);
2222 dm_table_destroy(__unbind(md));
2223 free_dev(md);
2224}
2225
2226void dm_destroy(struct mapped_device *md)
2227{
2228 __dm_destroy(md, true);
2229}
2230
2231void dm_destroy_immediate(struct mapped_device *md)
2232{
2233 __dm_destroy(md, false);
2234}
2235
2236void dm_put(struct mapped_device *md)
2237{
2238 atomic_dec(&md->holders);
2239}
2240EXPORT_SYMBOL_GPL(dm_put);
2241
2242static bool md_in_flight_bios(struct mapped_device *md)
2243{
2244 int cpu;
2245 struct block_device *part = dm_disk(md)->part0;
2246 long sum = 0;
2247
2248 for_each_possible_cpu(cpu) {
2249 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2250 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2251 }
2252
2253 return sum != 0;
2254}
2255
2256static int dm_wait_for_bios_completion(struct mapped_device *md, long task_state)
2257{
2258 int r = 0;
2259 DEFINE_WAIT(wait);
2260
2261 while (true) {
2262 prepare_to_wait(&md->wait, &wait, task_state);
2263
2264 if (!md_in_flight_bios(md))
2265 break;
2266
2267 if (signal_pending_state(task_state, current)) {
2268 r = -EINTR;
2269 break;
2270 }
2271
2272 io_schedule();
2273 }
2274 finish_wait(&md->wait, &wait);
2275
2276 return r;
2277}
2278
2279static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2280{
2281 int r = 0;
2282
2283 if (!queue_is_mq(md->queue))
2284 return dm_wait_for_bios_completion(md, task_state);
2285
2286 while (true) {
2287 if (!blk_mq_queue_inflight(md->queue))
2288 break;
2289
2290 if (signal_pending_state(task_state, current)) {
2291 r = -EINTR;
2292 break;
2293 }
2294
2295 msleep(5);
2296 }
2297
2298 return r;
2299}
2300
/*
 * Process the deferred bios.
 */
2304static void dm_wq_work(struct work_struct *work)
2305{
2306 struct mapped_device *md = container_of(work, struct mapped_device, work);
2307 struct bio *bio;
2308
2309 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2310 spin_lock_irq(&md->deferred_lock);
2311 bio = bio_list_pop(&md->deferred);
2312 spin_unlock_irq(&md->deferred_lock);
2313
2314 if (!bio)
2315 break;
2316
2317 submit_bio_noacct(bio);
2318 }
2319}
2320
2321static void dm_queue_flush(struct mapped_device *md)
2322{
2323 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2324 smp_mb__after_atomic();
2325 queue_work(md->wq, &md->work);
2326}
2327
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2331struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2332{
2333 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2334 struct queue_limits limits;
2335 int r;
2336
2337 mutex_lock(&md->suspend_lock);
2338
2339
2340 if (!dm_suspended_md(md))
2341 goto out;
2342
2343
2344
2345
2346
2347
2348
2349 if (dm_table_has_no_data_devices(table)) {
2350 live_map = dm_get_live_table_fast(md);
2351 if (live_map)
2352 limits = md->queue->limits;
2353 dm_put_live_table_fast(md);
2354 }
2355
2356 if (!live_map) {
2357 r = dm_calculate_queue_limits(table, &limits);
2358 if (r) {
2359 map = ERR_PTR(r);
2360 goto out;
2361 }
2362 }
2363
2364 map = __bind(md, table, &limits);
2365 dm_issue_global_event();
2366
2367out:
2368 mutex_unlock(&md->suspend_lock);
2369 return map;
2370}
2371
2372
2373
2374
2375
2376static int lock_fs(struct mapped_device *md)
2377{
2378 int r;
2379
2380 WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2381
2382 r = freeze_bdev(md->disk->part0);
2383 if (!r)
2384 set_bit(DMF_FROZEN, &md->flags);
2385 return r;
2386}
2387
2388static void unlock_fs(struct mapped_device *md)
2389{
2390 if (!test_bit(DMF_FROZEN, &md->flags))
2391 return;
2392 thaw_bdev(md->disk->part0);
2393 clear_bit(DMF_FROZEN, &md->flags);
2394}
2395
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 */
2405static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2406 unsigned suspend_flags, long task_state,
2407 int dmf_suspended_flag)
2408{
2409 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2410 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2411 int r;
2412
2413 lockdep_assert_held(&md->suspend_lock);
2414
	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
2419 if (noflush)
2420 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2421 else
2422 DMDEBUG("%s: suspending with flush", dm_device_name(md));
2423
2424
2425
2426
2427
2428 dm_table_presuspend_targets(map);
2429
2430
2431
2432
2433
2434
2435
2436 if (!noflush && do_lockfs) {
2437 r = lock_fs(md);
2438 if (r) {
2439 dm_table_presuspend_undo_targets(map);
2440 return r;
2441 }
2442 }
2443
	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio from dm_submit_bio.
	 *
	 * To get all processes out of __split_and_process_bio in dm_submit_bio,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_submit_bio and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
2455 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2456 if (map)
2457 synchronize_srcu(&md->io_barrier);
2458
2459
2460
2461
2462
2463 if (dm_request_based(md))
2464 dm_stop_queue(md->queue);
2465
2466 flush_workqueue(md->wq);
2467
2468
2469
2470
2471
2472
2473 r = dm_wait_for_completion(md, task_state);
2474 if (!r)
2475 set_bit(dmf_suspended_flag, &md->flags);
2476
2477 if (noflush)
2478 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2479 if (map)
2480 synchronize_srcu(&md->io_barrier);
2481
2482
2483 if (r < 0) {
2484 dm_queue_flush(md);
2485
2486 if (dm_request_based(md))
2487 dm_start_queue(md->queue);
2488
2489 unlock_fs(md);
2490 dm_table_presuspend_undo_targets(map);
2491
2492 }
2493
2494 return r;
2495}
2496
/*
 * Suspend the device so that a new table can be bound with dm_swap_table().
 *
 * Suspend mechanism in request-based dm:
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
2513int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2514{
2515 struct dm_table *map = NULL;
2516 int r = 0;
2517
2518retry:
2519 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2520
2521 if (dm_suspended_md(md)) {
2522 r = -EINVAL;
2523 goto out_unlock;
2524 }
2525
2526 if (dm_suspended_internally_md(md)) {
2527
2528 mutex_unlock(&md->suspend_lock);
2529 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2530 if (r)
2531 return r;
2532 goto retry;
2533 }
2534
2535 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2536
2537 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2538 if (r)
2539 goto out_unlock;
2540
2541 set_bit(DMF_POST_SUSPENDING, &md->flags);
2542 dm_table_postsuspend_targets(map);
2543 clear_bit(DMF_POST_SUSPENDING, &md->flags);
2544
2545out_unlock:
2546 mutex_unlock(&md->suspend_lock);
2547 return r;
2548}
2549
2550static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2551{
2552 if (map) {
2553 int r = dm_table_resume_targets(map);
2554 if (r)
2555 return r;
2556 }
2557
2558 dm_queue_flush(md);
2559
2560
2561
2562
2563
2564
2565 if (dm_request_based(md))
2566 dm_start_queue(md->queue);
2567
2568 unlock_fs(md);
2569
2570 return 0;
2571}
2572
2573int dm_resume(struct mapped_device *md)
2574{
2575 int r;
2576 struct dm_table *map = NULL;
2577
2578retry:
2579 r = -EINVAL;
2580 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2581
2582 if (!dm_suspended_md(md))
2583 goto out;
2584
2585 if (dm_suspended_internally_md(md)) {
2586
2587 mutex_unlock(&md->suspend_lock);
2588 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2589 if (r)
2590 return r;
2591 goto retry;
2592 }
2593
2594 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2595 if (!map || !dm_table_get_size(map))
2596 goto out;
2597
2598 r = __dm_resume(md, map);
2599 if (r)
2600 goto out;
2601
2602 clear_bit(DMF_SUSPENDED, &md->flags);
2603out:
2604 mutex_unlock(&md->suspend_lock);
2605
2606 return r;
2607}
2608
/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */
2615static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2616{
2617 struct dm_table *map = NULL;
2618
2619 lockdep_assert_held(&md->suspend_lock);
2620
2621 if (md->internal_suspend_count++)
2622 return;
2623
2624 if (dm_suspended_md(md)) {
2625 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2626 return;
2627 }
2628
2629 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2630
	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
	 * would require changing .presuspend to return an error -- avoid this
	 * until there is a need for more elaborate variants of internal suspend.
	 */
2637 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2638 DMF_SUSPENDED_INTERNALLY);
2639
2640 set_bit(DMF_POST_SUSPENDING, &md->flags);
2641 dm_table_postsuspend_targets(map);
2642 clear_bit(DMF_POST_SUSPENDING, &md->flags);
2643}
2644
2645static void __dm_internal_resume(struct mapped_device *md)
2646{
2647 BUG_ON(!md->internal_suspend_count);
2648
2649 if (--md->internal_suspend_count)
2650 return;
2651
2652 if (dm_suspended_md(md))
2653 goto done;
2654
	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
2659 (void) __dm_resume(md, NULL);
2660
2661done:
2662 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2663 smp_mb__after_atomic();
2664 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2665}
2666
2667void dm_internal_suspend_noflush(struct mapped_device *md)
2668{
2669 mutex_lock(&md->suspend_lock);
2670 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2671 mutex_unlock(&md->suspend_lock);
2672}
2673EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2674
2675void dm_internal_resume(struct mapped_device *md)
2676{
2677 mutex_lock(&md->suspend_lock);
2678 __dm_internal_resume(md);
2679 mutex_unlock(&md->suspend_lock);
2680}
2681EXPORT_SYMBOL_GPL(dm_internal_resume);
2682
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
2688void dm_internal_suspend_fast(struct mapped_device *md)
2689{
	mutex_lock(&md->suspend_lock);
	/* suspend_lock stays held until dm_internal_resume_fast() releases it */
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;
2693
2694 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2695 synchronize_srcu(&md->io_barrier);
2696 flush_workqueue(md->wq);
2697 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2698}
2699EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2700
2701void dm_internal_resume_fast(struct mapped_device *md)
2702{
2703 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2704 goto done;
2705
2706 dm_queue_flush(md);
2707
2708done:
2709 mutex_unlock(&md->suspend_lock);
2710}
2711EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2712
/*
 * Event notification.
 */
2716int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2717 unsigned cookie)
2718{
2719 int r;
2720 unsigned noio_flag;
2721 char udev_cookie[DM_COOKIE_LENGTH];
2722 char *envp[] = { udev_cookie, NULL };
2723
2724 noio_flag = memalloc_noio_save();
2725
2726 if (!cookie)
2727 r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2728 else {
2729 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2730 DM_COOKIE_ENV_VAR_NAME, cookie);
2731 r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2732 action, envp);
2733 }
2734
2735 memalloc_noio_restore(noio_flag);
2736
2737 return r;
2738}
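
/*
 * For reference: a non-zero cookie turns into a single uevent environment
 * entry of the form "DM_COOKIE=<decimal value>" (e.g. "DM_COOKIE=4112232"),
 * which udev/libdevmapper-style consumers can use to synchronise on device
 * node creation.  A typical caller looks roughly like:
 *
 *	dm_kobject_uevent(md, KOBJ_CHANGE, cookie);
 *
 * This is an illustrative note, not a description of any particular caller.
 */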
2739
2740uint32_t dm_next_uevent_seq(struct mapped_device *md)
2741{
2742 return atomic_add_return(1, &md->uevent_seq);
2743}
2744
2745uint32_t dm_get_event_nr(struct mapped_device *md)
2746{
2747 return atomic_read(&md->event_nr);
2748}
2749
2750int dm_wait_event(struct mapped_device *md, int event_nr)
2751{
2752 return wait_event_interruptible(md->eventq,
2753 (event_nr != atomic_read(&md->event_nr)));
2754}
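
/*
 * Hedged sketch of the expected wait pattern (roughly how the DM_DEV_WAIT
 * ioctl path uses it): sample the event counter first, then sleep until it
 * moves.  Returns -ERESTARTSYS if interrupted by a signal.
 *
 *	uint32_t ev = dm_get_event_nr(md);
 *	...				// arm whatever generates the event
 *	if (dm_wait_event(md, ev))
 *		return -ERESTARTSYS;
 */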
2755
2756void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2757{
2758 unsigned long flags;
2759
2760 spin_lock_irqsave(&md->uevent_lock, flags);
2761 list_add(elist, &md->uevent_list);
2762 spin_unlock_irqrestore(&md->uevent_lock, flags);
2763}
2764
2765
/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
2769struct gendisk *dm_disk(struct mapped_device *md)
2770{
2771 return md->disk;
2772}
2773EXPORT_SYMBOL_GPL(dm_disk);
2774
2775struct kobject *dm_kobject(struct mapped_device *md)
2776{
2777 return &md->kobj_holder.kobj;
2778}
2779
2780struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2781{
2782 struct mapped_device *md;
2783
2784 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2785
2786 spin_lock(&_minor_lock);
2787 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2788 md = NULL;
2789 goto out;
2790 }
2791 dm_get(md);
2792out:
2793 spin_unlock(&_minor_lock);
2794
2795 return md;
2796}
2797
2798int dm_suspended_md(struct mapped_device *md)
2799{
2800 return test_bit(DMF_SUSPENDED, &md->flags);
2801}
2802
2803static int dm_post_suspending_md(struct mapped_device *md)
2804{
2805 return test_bit(DMF_POST_SUSPENDING, &md->flags);
2806}
2807
2808int dm_suspended_internally_md(struct mapped_device *md)
2809{
2810 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2811}
2812
2813int dm_test_deferred_remove_flag(struct mapped_device *md)
2814{
2815 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2816}
2817
2818int dm_suspended(struct dm_target *ti)
2819{
2820 return dm_suspended_md(ti->table->md);
2821}
2822EXPORT_SYMBOL_GPL(dm_suspended);
2823
2824int dm_post_suspending(struct dm_target *ti)
2825{
2826 return dm_post_suspending_md(ti->table->md);
2827}
2828EXPORT_SYMBOL_GPL(dm_post_suspending);
2829
2830int dm_noflush_suspending(struct dm_target *ti)
2831{
2832 return __noflush_suspending(ti->table->md);
2833}
2834EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2835
2836struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2837 unsigned integrity, unsigned per_io_data_size,
2838 unsigned min_pool_size)
2839{
2840 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2841 unsigned int pool_size = 0;
2842 unsigned int front_pad, io_front_pad;
2843 int ret;
2844
2845 if (!pools)
2846 return NULL;
2847
2848 switch (type) {
2849 case DM_TYPE_BIO_BASED:
2850 case DM_TYPE_DAX_BIO_BASED:
2851 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2852 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2853 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
2854 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2855 if (ret)
2856 goto out;
2857 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2858 goto out;
2859 break;
2860 case DM_TYPE_REQUEST_BASED:
2861 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2862 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
2864 break;
2865 default:
2866 BUG();
2867 }
2868
2869 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2870 if (ret)
2871 goto out;
2872
2873 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
2874 goto out;
2875
2876 return pools;
2877
2878out:
2879 dm_free_md_mempools(pools);
2880
2881 return NULL;
2882}
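
/*
 * Rough sketch (hedged, alignment rounding omitted) of the front padding set
 * up above for the bio-based pools: the bio handed to a target sits at the
 * end of the padded area, with the per-bio data in front of it.
 *
 *	pools->bs:    [ per-bio data | dm_target_io ... | bio (tio->clone) ]
 *	pools->io_bs: [ per-bio data | dm_io ... | bio (io->tio.clone) ]
 */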
2883
2884void dm_free_md_mempools(struct dm_md_mempools *pools)
2885{
2886 if (!pools)
2887 return;
2888
2889 bioset_exit(&pools->bs);
2890 bioset_exit(&pools->io_bs);
2891
2892 kfree(pools);
2893}
2894
2895struct dm_pr {
2896 u64 old_key;
2897 u64 new_key;
2898 u32 flags;
2899 bool fail_early;
2900};
2901
2902static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2903 void *data)
2904{
2905 struct mapped_device *md = bdev->bd_disk->private_data;
2906 struct dm_table *table;
2907 struct dm_target *ti;
2908 int ret = -ENOTTY, srcu_idx;
2909
2910 table = dm_get_live_table(md, &srcu_idx);
2911 if (!table || !dm_table_get_size(table))
2912 goto out;
2913
	/* We only support devices that have a single target */
2915 if (dm_table_get_num_targets(table) != 1)
2916 goto out;
2917 ti = dm_table_get_target(table, 0);
2918
2919 ret = -EINVAL;
2920 if (!ti->type->iterate_devices)
2921 goto out;
2922
2923 ret = ti->type->iterate_devices(ti, fn, data);
2924out:
2925 dm_put_live_table(md, srcu_idx);
2926 return ret;
2927}
2928
/*
 * For register / unregister we need to manually call out to every path.
 */
2932static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2933 sector_t start, sector_t len, void *data)
2934{
2935 struct dm_pr *pr = data;
2936 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2937
2938 if (!ops || !ops->pr_register)
2939 return -EOPNOTSUPP;
2940 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2941}
2942
2943static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2944 u32 flags)
2945{
2946 struct dm_pr pr = {
2947 .old_key = old_key,
2948 .new_key = new_key,
2949 .flags = flags,
2950 .fail_early = true,
2951 };
2952 int ret;
2953
2954 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2955 if (ret && new_key) {
		/* unregister the device if registration failed */
2957 pr.old_key = new_key;
2958 pr.new_key = 0;
2959 pr.flags = 0;
2960 pr.fail_early = false;
2961 dm_call_pr(bdev, __dm_pr_register, &pr);
2962 }
2963
2964 return ret;
2965}
2966
2967static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2968 u32 flags)
2969{
2970 struct mapped_device *md = bdev->bd_disk->private_data;
2971 const struct pr_ops *ops;
2972 int r, srcu_idx;
2973
2974 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
2975 if (r < 0)
2976 goto out;
2977
2978 ops = bdev->bd_disk->fops->pr_ops;
2979 if (ops && ops->pr_reserve)
2980 r = ops->pr_reserve(bdev, key, type, flags);
2981 else
2982 r = -EOPNOTSUPP;
2983out:
2984 dm_unprepare_ioctl(md, srcu_idx);
2985 return r;
2986}
2987
2988static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2989{
2990 struct mapped_device *md = bdev->bd_disk->private_data;
2991 const struct pr_ops *ops;
2992 int r, srcu_idx;
2993
2994 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
2995 if (r < 0)
2996 goto out;
2997
2998 ops = bdev->bd_disk->fops->pr_ops;
2999 if (ops && ops->pr_release)
3000 r = ops->pr_release(bdev, key, type);
3001 else
3002 r = -EOPNOTSUPP;
3003out:
3004 dm_unprepare_ioctl(md, srcu_idx);
3005 return r;
3006}
3007
3008static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3009 enum pr_type type, bool abort)
3010{
3011 struct mapped_device *md = bdev->bd_disk->private_data;
3012 const struct pr_ops *ops;
3013 int r, srcu_idx;
3014
3015 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3016 if (r < 0)
3017 goto out;
3018
3019 ops = bdev->bd_disk->fops->pr_ops;
3020 if (ops && ops->pr_preempt)
3021 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3022 else
3023 r = -EOPNOTSUPP;
3024out:
3025 dm_unprepare_ioctl(md, srcu_idx);
3026 return r;
3027}
3028
3029static int dm_pr_clear(struct block_device *bdev, u64 key)
3030{
3031 struct mapped_device *md = bdev->bd_disk->private_data;
3032 const struct pr_ops *ops;
3033 int r, srcu_idx;
3034
3035 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3036 if (r < 0)
3037 goto out;
3038
3039 ops = bdev->bd_disk->fops->pr_ops;
3040 if (ops && ops->pr_clear)
3041 r = ops->pr_clear(bdev, key);
3042 else
3043 r = -EOPNOTSUPP;
3044out:
3045 dm_unprepare_ioctl(md, srcu_idx);
3046 return r;
3047}
3048
3049static const struct pr_ops dm_pr_ops = {
3050 .pr_register = dm_pr_register,
3051 .pr_reserve = dm_pr_reserve,
3052 .pr_release = dm_pr_release,
3053 .pr_preempt = dm_pr_preempt,
3054 .pr_clear = dm_pr_clear,
3055};
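
/*
 * Hedged sketch of the userspace side: these handlers back the generic
 * block-layer persistent-reservation ioctls from <linux/pr.h> when issued
 * against a dm device node.  Something along these lines (includes and error
 * handling omitted; not a complete program):
 *
 *	struct pr_registration reg = { .new_key = 0x1234 };
 *	struct pr_reservation  rsv = { .key = 0x1234, .type = PR_WRITE_EXCLUSIVE };
 *
 *	ioctl(fd, IOC_PR_REGISTER, &reg);	// ends up in dm_pr_register()
 *	ioctl(fd, IOC_PR_RESERVE, &rsv);	// ends up in dm_pr_reserve()
 */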
3056
3057static const struct block_device_operations dm_blk_dops = {
3058 .submit_bio = dm_submit_bio,
3059 .open = dm_blk_open,
3060 .release = dm_blk_close,
3061 .ioctl = dm_blk_ioctl,
3062 .getgeo = dm_blk_getgeo,
3063 .report_zones = dm_blk_report_zones,
3064 .pr_ops = &dm_pr_ops,
3065 .owner = THIS_MODULE
3066};
3067
3068static const struct block_device_operations dm_rq_blk_dops = {
3069 .open = dm_blk_open,
3070 .release = dm_blk_close,
3071 .ioctl = dm_blk_ioctl,
3072 .getgeo = dm_blk_getgeo,
3073 .pr_ops = &dm_pr_ops,
3074 .owner = THIS_MODULE
3075};
3076
3077static const struct dax_operations dm_dax_ops = {
3078 .direct_access = dm_dax_direct_access,
3079 .dax_supported = dm_dax_supported,
3080 .copy_from_iter = dm_dax_copy_from_iter,
3081 .copy_to_iter = dm_dax_copy_to_iter,
3082 .zero_page_range = dm_dax_zero_page_range,
3083};
3084
/*
 * module hooks
 */
3088module_init(dm_init);
3089module_exit(dm_exit);
3090
3091module_param(major, uint, 0);
3092MODULE_PARM_DESC(major, "The major number of the device mapper");
3093
3094module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3095MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3096
3097module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3098MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3099
3100MODULE_DESCRIPTION(DM_NAME " driver");
3101MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3102MODULE_LICENSE("GPL");
3103