/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>

#define DM_MSG_PREFIX "core"

/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24

static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}

/*
 * One of these is allocated (on-stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};

/*
 * One of these is allocated per clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned magic;
	struct dm_io *io;
	struct dm_target *ti;
	unsigned target_bio_nr;
	unsigned *len_ptr;
	bool inside_dm_io;
	struct bio clone;
};

/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned magic;
	struct mapped_device *md;
	blk_status_t status;
	atomic_t io_count;
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};

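/*
 * The per-bio-data for a target is allocated as bioset front_pad directly
 * in front of the embedded clone bio, so the helpers below can translate
 * between the data pointer and the bio with plain pointer arithmetic.
 */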
void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	if (!tio->inside_dm_io)
		return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
	return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);

#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;

/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set *bs;
	struct bio_set *io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

static struct kmem_cache *_rq_tio_cache;
static struct kmem_cache *_rq_cache;

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;

static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

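/*
 * Clamp a module parameter to its valid range: a zero value is replaced
 * with the given default and an oversized value with the maximum, and the
 * clamped value is written back with cmpxchg() so racing readers converge.
 */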
unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

static unsigned dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}

static int __init local_init(void)
{
	int r = -ENOMEM;

	_rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
	if (!_rq_tio_cache)
		return r;

	_rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
				      __alignof__(struct request), 0, NULL);
	if (!_rq_cache)
		goto out_free_rq_tio_cache;

	r = dm_uevent_init();
	if (r)
		goto out_free_rq_cache;

	deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
	if (!deferred_remove_workqueue) {
		r = -ENOMEM;
		goto out_uevent_exit;
	}

	_major = major;
	r = register_blkdev(_major, _name);
	if (r < 0)
		goto out_free_workqueue;

	if (!_major)
		_major = r;

	return 0;

out_free_workqueue:
	destroy_workqueue(deferred_remove_workqueue);
out_uevent_exit:
	dm_uevent_exit();
out_free_rq_cache:
	kmem_cache_destroy(_rq_cache);
out_free_rq_tio_cache:
	kmem_cache_destroy(_rq_tio_cache);

	return r;
}

static void local_exit(void)
{
	flush_scheduled_work();
	destroy_workqueue(deferred_remove_workqueue);

	kmem_cache_destroy(_rq_cache);
	kmem_cache_destroy(_rq_tio_cache);
	unregister_blkdev(_major, _name);
	dm_uevent_exit();

	_major = 0;

	DMINFO("cleaned up");
}

static int (*_inits[])(void) __initdata = {
	local_init,
	dm_target_init,
	dm_linear_init,
	dm_stripe_init,
	dm_io_init,
	dm_kcopyd_init,
	dm_interface_init,
	dm_statistics_init,
};

static void (*_exits[])(void) = {
	local_exit,
	dm_target_exit,
	dm_linear_exit,
	dm_stripe_exit,
	dm_io_exit,
	dm_kcopyd_exit,
	dm_interface_exit,
	dm_statistics_exit,
};

static int __init dm_init(void)
{
	const int count = ARRAY_SIZE(_inits);

	int r, i;

	for (i = 0; i < count; i++) {
		r = _inits[i]();
		if (r)
			goto bad;
	}

	return 0;

      bad:
	while (i--)
		_exits[i]();

	return r;
}

static void __exit dm_exit(void)
{
	int i = ARRAY_SIZE(_exits);

	while (i--)
		_exits[i]();

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
}

/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}

static int dm_blk_open(struct block_device *bdev, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = bdev->bd_disk->private_data;
	if (!md)
		goto out;

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}

	dm_get(md);
	atomic_inc(&md->open_count);
out:
	spin_unlock(&_minor_lock);

	return md ? 0 : -ENXIO;
}

static void dm_blk_close(struct gendisk *disk, fmode_t mode)
{
	struct mapped_device *md;

	spin_lock(&_minor_lock);

	md = disk->private_data;
	if (WARN_ON(!md))
		goto out;

	if (atomic_dec_and_test(&md->open_count) &&
	    (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
		queue_work(deferred_remove_workqueue, &deferred_remove_work);

	dm_put(md);
out:
	spin_unlock(&_minor_lock);
}

int dm_open_count(struct mapped_device *md)
{
	return atomic_read(&md->open_count);
}

/*
 * Guarantees nothing is using the device before it's deleted.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (dm_open_count(md)) {
		r = -EBUSY;
		if (mark_deferred)
			set_bit(DMF_DEFERRED_REMOVE, &md->flags);
	} else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
		r = -EEXIST;
	else
		set_bit(DMF_DELETING, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

int dm_cancel_deferred_remove(struct mapped_device *md)
{
	int r = 0;

	spin_lock(&_minor_lock);

	if (test_bit(DMF_DELETING, &md->flags))
		r = -EBUSY;
	else
		clear_bit(DMF_DEFERRED_REMOVE, &md->flags);

	spin_unlock(&_minor_lock);

	return r;
}

static void do_deferred_remove(struct work_struct *w)
{
	dm_deferred_remove();
}

sector_t dm_get_size(struct mapped_device *md)
{
	return get_capacity(md->disk);
}

struct request_queue *dm_get_md_queue(struct mapped_device *md)
{
	return md->queue;
}

struct dm_stats *dm_get_stats(struct mapped_device *md)
{
	return &md->stats;
}

static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mapped_device *md = bdev->bd_disk->private_data;

	return dm_get_geometry(md, geo);
}

static char *_dm_claim_ptr = "I belong to device-mapper";

static int dm_get_bdev_for_ioctl(struct mapped_device *md,
				 struct block_device **bdev,
				 fmode_t *mode)
{
	struct dm_target *tgt;
	struct dm_table *map;
	int srcu_idx, r, r2;

retry:
	r = -ENOTTY;
	map = dm_get_live_table(md, &srcu_idx);
	if (!map || !dm_table_get_size(map))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(map) != 1)
		goto out;

	tgt = dm_table_get_target(map, 0);
	if (!tgt->type->prepare_ioctl)
		goto out;

	if (dm_suspended_md(md)) {
		r = -EAGAIN;
		goto out;
	}

	r = tgt->type->prepare_ioctl(tgt, bdev, mode);
	if (r < 0)
		goto out;

	bdgrab(*bdev);
	r2 = blkdev_get(*bdev, *mode, _dm_claim_ptr);
	if (r2 < 0) {
		r = r2;
		goto out;
	}

	dm_put_live_table(md, srcu_idx);
	return r;

out:
	dm_put_live_table(md, srcu_idx);
	if (r == -ENOTCONN && !fatal_signal_pending(current)) {
		msleep(10);
		goto retry;
	}
	return r;
}

static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	int r;

	r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMWARN_LIMIT(
	"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}

	r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
out:
	blkdev_put(bdev, mode);
	return r;
}

static void start_io_acct(struct dm_io *io);

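/*
 * A dm_io is obtained by allocating a zero-payload clone bio from
 * md->io_bs; the dm_io (with its embedded dm_target_io) lives in the
 * bioset's front_pad in front of that clone.
 */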
static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
{
	struct dm_io *io;
	struct dm_target_io *tio;
	struct bio *clone;

	clone = bio_alloc_bioset(GFP_NOIO, 0, md->io_bs);
	if (!clone)
		return NULL;

	tio = container_of(clone, struct dm_target_io, clone);
	tio->inside_dm_io = true;
	tio->io = NULL;

	io = container_of(tio, struct dm_io, tio);
	io->magic = DM_IO_MAGIC;
	io->status = 0;
	atomic_set(&io->io_count, 1);
	io->orig_bio = bio;
	io->md = md;
	spin_lock_init(&io->endio_lock);

	start_io_acct(io);

	return io;
}

static void free_io(struct mapped_device *md, struct dm_io *io)
{
	bio_put(&io->tio.clone);
}

static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
				      unsigned target_bio_nr, gfp_t gfp_mask)
{
	struct dm_target_io *tio;

	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
	} else {
		struct bio *clone = bio_alloc_bioset(gfp_mask, 0, ci->io->md->bs);
		if (!clone)
			return NULL;

		tio = container_of(clone, struct dm_target_io, clone);
		tio->inside_dm_io = false;
	}

	tio->magic = DM_TIO_MAGIC;
	tio->io = ci->io;
	tio->ti = ti;
	tio->target_bio_nr = target_bio_nr;

	return tio;
}

static void free_tio(struct dm_target_io *tio)
{
	if (tio->inside_dm_io)
		return;
	bio_put(&tio->clone);
}

int md_in_flight(struct mapped_device *md)
{
	return atomic_read(&md->pending[READ]) +
	       atomic_read(&md->pending[WRITE]);
}

static void start_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;
	int rw = bio_data_dir(bio);

	io->start_time = jiffies;

	generic_start_io_acct(md->queue, rw, bio_sectors(bio), &dm_disk(md)->part0);

	atomic_set(&dm_disk(md)->part0.in_flight[rw],
		   atomic_inc_return(&md->pending[rw]));

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    false, 0, &io->stats_aux);
}

static void end_io_acct(struct dm_io *io)
{
	struct mapped_device *md = io->md;
	struct bio *bio = io->orig_bio;
	unsigned long duration = jiffies - io->start_time;
	int pending;
	int rw = bio_data_dir(bio);

	generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);

	if (unlikely(dm_stats_used(&md->stats)))
		dm_stats_account_io(&md->stats, bio_data_dir(bio),
				    bio->bi_iter.bi_sector, bio_sectors(bio),
				    true, duration, &io->stats_aux);

	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
	pending = atomic_dec_return(&md->pending[rw]);
	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
	pending += atomic_read(&md->pending[rw^0x1]);

	/* nudge anyone waiting on suspend queue */
	if (!pending)
		wake_up(&md->wait);
}

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
{
	unsigned long flags;

	spin_lock_irqsave(&md->deferred_lock, flags);
	bio_list_add(&md->deferred, bio);
	spin_unlock_irqrestore(&md->deferred_lock, flags);
	queue_work(md->wq, &md->work);
}

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
{
	*srcu_idx = srcu_read_lock(&md->io_barrier);

	return srcu_dereference(md->map, &md->io_barrier);
}

void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
{
	srcu_read_unlock(&md->io_barrier, srcu_idx);
}

void dm_sync_table(struct mapped_device *md)
{
	synchronize_srcu(&md->io_barrier);
	synchronize_rcu_expedited();
}

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
{
	rcu_read_lock();
	return rcu_dereference(md->map);
}

static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
{
	rcu_read_unlock();
}

/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
{
	struct block_device *bdev;

	int r;

	BUG_ON(td->dm_dev.bdev);

	bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
	if (IS_ERR(bdev))
		return PTR_ERR(bdev);

	r = bd_link_disk_holder(bdev, dm_disk(md));
	if (r) {
		blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
		return r;
	}

	td->dm_dev.bdev = bdev;
	td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	return 0;
}

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
	if (!td->dm_dev.bdev)
		return;

	bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
	blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
	put_dax(td->dm_dev.dax_dev);
	td->dm_dev.bdev = NULL;
	td->dm_dev.dax_dev = NULL;
}

static struct table_device *find_table_device(struct list_head *l, dev_t dev,
					      fmode_t mode) {
	struct table_device *td;

	list_for_each_entry(td, l, list)
		if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
			return td;

	return NULL;
}

int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
			struct dm_dev **result) {
	int r;
	struct table_device *td;

	mutex_lock(&md->table_devices_lock);
	td = find_table_device(&md->table_devices, dev, mode);
	if (!td) {
		td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
		if (!td) {
			mutex_unlock(&md->table_devices_lock);
			return -ENOMEM;
		}

		td->dm_dev.mode = mode;
		td->dm_dev.bdev = NULL;

		if ((r = open_table_device(td, dev, md))) {
			mutex_unlock(&md->table_devices_lock);
			kfree(td);
			return r;
		}

		format_dev_t(td->dm_dev.name, dev);

		refcount_set(&td->count, 1);
		list_add(&td->list, &md->table_devices);
	} else {
		refcount_inc(&td->count);
	}
	mutex_unlock(&md->table_devices_lock);

	*result = &td->dm_dev;
	return 0;
}
EXPORT_SYMBOL_GPL(dm_get_table_device);

void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
{
	struct table_device *td = container_of(d, struct table_device, dm_dev);

	mutex_lock(&md->table_devices_lock);
	if (refcount_dec_and_test(&td->count)) {
		close_table_device(td, md);
		list_del(&td->list);
		kfree(td);
	}
	mutex_unlock(&md->table_devices_lock);
}
EXPORT_SYMBOL(dm_put_table_device);

static void free_table_devices(struct list_head *devices)
{
	struct list_head *tmp, *next;

	list_for_each_safe(tmp, next, devices) {
		struct table_device *td = list_entry(tmp, struct table_device, list);

		DMWARN("dm_destroy: %s still exists with %d references",
		       td->dm_dev.name, refcount_read(&td->count));
		kfree(td);
	}
}

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	*geo = md->geometry;

	return 0;
}

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		DMWARN("Start sector is beyond the geometry limits.");
		return -EINVAL;
	}

	md->geometry = *geo;

	return 0;
}

static int __noflush_suspending(struct mapped_device *md)
{
	return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
}

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necc.
 */
static void dec_pending(struct dm_io *io, blk_status_t error)
{
	unsigned long flags;
	blk_status_t io_error;
	struct bio *bio;
	struct mapped_device *md = io->md;

	/* Push-back supersedes any I/O errors */
	if (unlikely(error)) {
		spin_lock_irqsave(&io->endio_lock, flags);
		if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
			io->status = error;
		spin_unlock_irqrestore(&io->endio_lock, flags);
	}

	if (atomic_dec_and_test(&io->io_count)) {
		if (io->status == BLK_STS_DM_REQUEUE) {
			/*
			 * Target requested pushing back the I/O.
			 */
			spin_lock_irqsave(&md->deferred_lock, flags);
			if (__noflush_suspending(md))
				/* NOTE early return due to BLK_STS_DM_REQUEUE below */
				bio_list_add_head(&md->deferred, io->orig_bio);
			else
				/* noflush suspend was interrupted. */
				io->status = BLK_STS_IOERR;
			spin_unlock_irqrestore(&md->deferred_lock, flags);
		}

		io_error = io->status;
		bio = io->orig_bio;
		end_io_acct(io);
		free_io(md, io);

		if (io_error == BLK_STS_DM_REQUEUE)
			return;

		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */
			bio->bi_opf &= ~REQ_PREFLUSH;
			queue_io(md, bio);
		} else {
			/* done with normal IO or empty flush */
			if (io_error)
				bio->bi_status = io_error;
			bio_endio(bio);
		}
	}
}

void disable_write_same(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE SAME, disable it */
	limits->max_write_same_sectors = 0;
}

void disable_write_zeroes(struct mapped_device *md)
{
	struct queue_limits *limits = dm_get_queue_limits(md);

	/* device doesn't really support WRITE ZEROES, disable it */
	limits->max_write_zeroes_sectors = 0;
}

static void clone_endio(struct bio *bio)
{
	blk_status_t error = bio->bi_status;
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct dm_io *io = tio->io;
	struct mapped_device *md = tio->io->md;
	dm_endio_fn endio = tio->ti->type->end_io;

	if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
		if (bio_op(bio) == REQ_OP_WRITE_SAME &&
		    !bio->bi_disk->queue->limits.max_write_same_sectors)
			disable_write_same(md);
		if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
		    !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
			disable_write_zeroes(md);
	}

	if (endio) {
		int r = endio(tio->ti, bio, &error);
		switch (r) {
		case DM_ENDIO_REQUEUE:
			error = BLK_STS_DM_REQUEUE;
			/*FALLTHRU*/
		case DM_ENDIO_DONE:
			break;
		case DM_ENDIO_INCOMPLETE:
			/* The target will handle the io */
			return;
		default:
			DMWARN("unimplemented target endio return value: %d", r);
			BUG();
		}
	}

	free_tio(tio);
	dec_pending(io, error);
}

/*
 * Return maximum size of I/O possible at the supplied sector up to the current
 * target boundary.
 */
static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
{
	sector_t target_offset = dm_target_offset(ti, sector);

	return ti->len - target_offset;
}

static sector_t max_io_len(sector_t sector, struct dm_target *ti)
{
	sector_t len = max_io_len_target_boundary(sector, ti);
	sector_t offset, max_len;

	/*
	 * Does the target need to split even further?
	 */
	if (ti->max_io_len) {
		offset = dm_target_offset(ti, sector);
		if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
			max_len = sector_div(offset, ti->max_io_len);
		else
			max_len = offset & (ti->max_io_len - 1);
		max_len = ti->max_io_len - max_len;

		if (len > max_len)
			len = max_len;
	}

	return len;
}

int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
{
	if (len > UINT_MAX) {
		DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
		      (unsigned long long)len, UINT_MAX);
		ti->error = "Maximum size of target IO is too large";
		return -EINVAL;
	}

	/*
	 * BIO based queue uses its own splitting. When multipage bvecs
	 * is switched on, size of the incoming bio may be too big to
	 * be handled in some targets, such as crypt.
	 *
	 * When these targets are ready for the big bio, we can remove
	 * the limit.
	 */
	ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);

static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
						sector_t sector, int *srcu_idx)
{
	struct dm_table *map;
	struct dm_target *ti;

	map = dm_get_live_table(md, srcu_idx);
	if (!map)
		return NULL;

	ti = dm_table_find_target(map, sector);
	if (!dm_target_is_valid(ti))
		return NULL;

	return ti;
}

static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
				 long nr_pages, void **kaddr, pfn_t *pfn)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long len, ret = -EIO;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->direct_access)
		goto out;
	len = max_io_len(sector, ti) / PAGE_SECTORS;
	if (len < 1)
		goto out;
	nr_pages = min(len, nr_pages);
	if (ti->type->direct_access)
		ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);

 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
				    void *addr, size_t bytes, struct iov_iter *i)
{
	struct mapped_device *md = dax_get_private(dax_dev);
	sector_t sector = pgoff * PAGE_SECTORS;
	struct dm_target *ti;
	long ret = 0;
	int srcu_idx;

	ti = dm_dax_get_live_target(md, sector, &srcu_idx);

	if (!ti)
		goto out;
	if (!ti->type->dax_copy_from_iter) {
		ret = copy_from_iter(addr, bytes, i);
		goto out;
	}
	ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
 out:
	dm_put_live_table(md, srcu_idx);

	return ret;
}

/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
 * the partially processed part (the sum of regions 1+2) must be the same for all
 * copies of the bio.
 */
void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
	BUG_ON(bio->bi_opf & REQ_PREFLUSH);
	BUG_ON(bi_size > *tio->len_ptr);
	BUG_ON(n_sectors > bi_size);
	*tio->len_ptr -= bi_size - n_sectors;
	bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(dm_accept_partial_bio);

/*
 * The zone descriptors obtained with a zone report indicate zone positions
 * within the target device. The zone descriptors must be remapped to match
 * their position within the dm device. A target may call
 * dm_remap_zone_report() after completion of a REQ_OP_ZONE_REPORT bio to
 * remap the zone descriptors obtained from the target device mapping to
 * the dm device.
 */
void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
{
#ifdef CONFIG_BLK_DEV_ZONED
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	struct bio *report_bio = tio->io->orig_bio;
	struct blk_zone_report_hdr *hdr = NULL;
	struct blk_zone *zone;
	unsigned int nr_rep = 0;
	unsigned int ofst;
	struct bio_vec bvec;
	struct bvec_iter iter;
	void *addr;

	if (bio->bi_status)
		return;

	/*
	 * Remap the start sector of the reported zones. For sequential zones,
	 * also remap the write pointer position.
	 */
	bio_for_each_segment(bvec, report_bio, iter) {
		addr = kmap_atomic(bvec.bv_page);

		/* Remember the report header in the first page */
		if (!hdr) {
			hdr = addr;
			ofst = sizeof(struct blk_zone_report_hdr);
		} else
			ofst = 0;

		/* Set zones start sector */
		while (hdr->nr_zones && ofst < bvec.bv_len) {
			zone = addr + ofst;
			if (zone->start >= start + ti->len) {
				hdr->nr_zones = 0;
				break;
			}
			zone->start = zone->start + ti->begin - start;
			if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
				if (zone->cond == BLK_ZONE_COND_FULL)
					zone->wp = zone->start + zone->len;
				else if (zone->cond == BLK_ZONE_COND_EMPTY)
					zone->wp = zone->start;
				else
					zone->wp = zone->wp + ti->begin - start;
			}
			ofst += sizeof(struct blk_zone);
			hdr->nr_zones--;
			nr_rep++;
		}

		if (addr != hdr)
			kunmap_atomic(addr);

		if (!hdr->nr_zones)
			break;
	}

	if (hdr) {
		hdr->nr_zones = nr_rep;
		kunmap_atomic(hdr);
	}

	bio_advance(report_bio, report_bio->bi_iter.bi_size);

#else /* !CONFIG_BLK_DEV_ZONED */
	bio->bi_status = BLK_STS_NOTSUPP;
#endif
}
EXPORT_SYMBOL_GPL(dm_remap_zone_report);

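/*
 * Hand a clone over to its target's ->map() method and dispatch it if the
 * target remapped it; DM_MAPIO_KILL and DM_MAPIO_REQUEUE instead complete
 * the io with the corresponding status.
 */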
static blk_qc_t __map_bio(struct dm_target_io *tio)
{
	int r;
	sector_t sector;
	struct bio *clone = &tio->clone;
	struct dm_io *io = tio->io;
	struct mapped_device *md = io->md;
	struct dm_target *ti = tio->ti;
	blk_qc_t ret = BLK_QC_T_NONE;

	clone->bi_end_io = clone_endio;

	/*
	 * Map the clone.  If r == 0 we don't need to do
	 * anything, the target has assumed ownership of
	 * this io.
	 */
	atomic_inc(&io->io_count);
	sector = clone->bi_iter.bi_sector;

	r = ti->type->map(ti, clone);
	switch (r) {
	case DM_MAPIO_SUBMITTED:
		break;
	case DM_MAPIO_REMAPPED:
		/* the bio has been remapped so dispatch it */
		trace_block_bio_remap(clone->bi_disk->queue, clone,
				      bio_dev(io->orig_bio), sector);
		if (md->type == DM_TYPE_NVME_BIO_BASED)
			ret = direct_make_request(clone);
		else
			ret = generic_make_request(clone);
		break;
	case DM_MAPIO_KILL:
		free_tio(tio);
		dec_pending(io, BLK_STS_IOERR);
		break;
	case DM_MAPIO_REQUEUE:
		free_tio(tio);
		dec_pending(io, BLK_STS_DM_REQUEUE);
		break;
	default:
		DMWARN("unimplemented target map return value: %d", r);
		BUG();
	}

	return ret;
}

static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
{
	bio->bi_iter.bi_sector = sector;
	bio->bi_iter.bi_size = to_bytes(len);
}

/*
 * Creates a bio that consists of range of complete bvecs.
 */
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
		     sector_t sector, unsigned len)
{
	struct bio *clone = &tio->clone;

	__bio_clone_fast(clone, bio);

	if (unlikely(bio_integrity(bio) != NULL)) {
		int r;

		if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
			     !dm_target_passes_integrity(tio->ti->type))) {
			DMWARN("%s: the target %s doesn't support integrity data.",
				dm_device_name(tio->io->md),
				tio->ti->type->name);
			return -EIO;
		}

		r = bio_integrity_clone(clone, bio, GFP_NOIO);
		if (r < 0)
			return r;
	}

	if (bio_op(bio) != REQ_OP_ZONE_REPORT)
		bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
	clone->bi_iter.bi_size = to_bytes(len);

	if (unlikely(bio_integrity(bio) != NULL))
		bio_integrity_trim(clone);

	return 0;
}

static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
				struct dm_target *ti, unsigned num_bios)
{
	struct dm_target_io *tio;
	int try;

	if (!num_bios)
		return;

	if (num_bios == 1) {
		tio = alloc_tio(ci, ti, 0, GFP_NOIO);
		bio_list_add(blist, &tio->clone);
		return;
	}

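	/*
	 * For multiple bios: first try non-blocking GFP_NOWAIT allocations;
	 * if any allocation fails, free what was obtained and retry with
	 * GFP_NOIO while holding table_devices_lock to serialize allocators.
	 */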
	for (try = 0; try < 2; try++) {
		int bio_nr;
		struct bio *bio;

		if (try)
			mutex_lock(&ci->io->md->table_devices_lock);
		for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
			tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
			if (!tio)
				break;

			bio_list_add(blist, &tio->clone);
		}
		if (try)
			mutex_unlock(&ci->io->md->table_devices_lock);
		if (bio_nr == num_bios)
			return;

		while ((bio = bio_list_pop(blist))) {
			tio = container_of(bio, struct dm_target_io, clone);
			free_tio(tio);
		}
	}
}

static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
					   struct dm_target_io *tio, unsigned *len)
{
	struct bio *clone = &tio->clone;

	tio->len_ptr = len;

	__bio_clone_fast(clone, ci->bio);
	if (len)
		bio_setup_sector(clone, ci->sector, *len);

	return __map_bio(tio);
}

static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
				  unsigned num_bios, unsigned *len)
{
	struct bio_list blist = BIO_EMPTY_LIST;
	struct bio *bio;
	struct dm_target_io *tio;

	alloc_multiple_bios(&blist, ci, ti, num_bios);

	while ((bio = bio_list_pop(&blist))) {
		tio = container_of(bio, struct dm_target_io, clone);
		(void) __clone_and_map_simple_bio(ci, tio, len);
	}
}

static int __send_empty_flush(struct clone_info *ci)
{
	unsigned target_nr = 0;
	struct dm_target *ti;

	BUG_ON(bio_has_data(ci->bio));
	while ((ti = dm_table_get_target(ci->map, target_nr++)))
		__send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);

	return 0;
}

static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
				    sector_t sector, unsigned *len)
{
	struct bio *bio = ci->bio;
	struct dm_target_io *tio;
	int r;

	tio = alloc_tio(ci, ti, 0, GFP_NOIO);
	tio->len_ptr = len;
	r = clone_bio(tio, bio, sector, *len);
	if (r < 0) {
		free_tio(tio);
		return r;
	}
	(void) __map_bio(tio);

	return 0;
}

typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);

static unsigned get_num_discard_bios(struct dm_target *ti)
{
	return ti->num_discard_bios;
}

static unsigned get_num_write_same_bios(struct dm_target *ti)
{
	return ti->num_write_same_bios;
}

static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
{
	return ti->num_write_zeroes_bios;
}

typedef bool (*is_split_required_fn)(struct dm_target *ti);

static bool is_split_required_for_discard(struct dm_target *ti)
{
	return ti->split_discard_bios;
}

static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
				       get_num_bios_fn get_num_bios,
				       is_split_required_fn is_split_required)
{
	unsigned len;
	unsigned num_bios;

	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
	num_bios = get_num_bios ? get_num_bios(ti) : 0;
	if (!num_bios)
		return -EOPNOTSUPP;

	if (is_split_required && !is_split_required(ti))
		len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
	else
		len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));

	__send_duplicate_bios(ci, ti, num_bios, &len);

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

static int __send_discard(struct clone_info *ci, struct dm_target *ti)
{
	return __send_changing_extent_only(ci, ti, get_num_discard_bios,
					   is_split_required_for_discard);
}

static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
{
	return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
}

static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
{
	return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
}

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
{
	struct bio *bio = ci->bio;
	struct dm_target *ti;
	unsigned len;
	int r;

	ti = dm_table_find_target(ci->map, ci->sector);
	if (!dm_target_is_valid(ti))
		return -EIO;

	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
		return __send_discard(ci, ti);
	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
		return __send_write_same(ci, ti);
	else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
		return __send_write_zeroes(ci, ti);

	if (bio_op(bio) == REQ_OP_ZONE_REPORT)
		len = ci->sector_count;
	else
		len = min_t(sector_t, max_io_len(ci->sector, ti),
			    ci->sector_count);

	r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
	if (r < 0)
		return r;

	ci->sector += len;
	ci->sector_count -= len;

	return 0;
}

static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
			    struct dm_table *map, struct bio *bio)
{
	ci->map = map;
	ci->io = alloc_io(md, bio);
	ci->sector = bio->bi_iter.bi_sector;
}

/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static blk_qc_t __split_and_process_bio(struct mapped_device *md,
					struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	blk_qc_t ret = BLK_QC_T_NONE;
	int error = 0;

	if (unlikely(!map)) {
		bio_io_error(bio);
		return ret;
	}

	init_clone_info(&ci, md, map, bio);

	if (bio->bi_opf & REQ_PREFLUSH) {
		ci.bio = &ci.io->md->flush_bio;
		ci.sector_count = 0;
		error = __send_empty_flush(&ci);
		/* dec_pending submits any data associated with flush */
	} else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
		ci.bio = bio;
		ci.sector_count = 0;
		error = __split_and_process_non_flush(&ci);
	} else {
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		while (ci.sector_count && !error) {
			error = __split_and_process_non_flush(&ci);
			if (current->bio_list && ci.sector_count && !error) {
				/*
				 * Remainder must be passed to generic_make_request()
				 * so that it gets handled *after* bios already submitted
				 * have been completely processed.
				 * We take a clone of the original to store in
				 * ci.io->orig_bio to be used by end_io_acct() and
				 * for dec_pending to use for completion handling.
				 * As this path is not used for REQ_OP_ZONE_REPORT,
				 * the usage of io->orig_bio in dm_remap_zone_report()
				 * won't be affected by this reassignment.
				 */
				struct bio *b = bio_clone_bioset(bio, GFP_NOIO,
								 md->queue->bio_split);
				ci.io->orig_bio = b;
				bio_advance(bio, (bio_sectors(bio) - ci.sector_count) << 9);
				bio_chain(b, bio);
				ret = generic_make_request(bio);
				break;
			}
		}
	}

	/* drop the extra reference count */
	dec_pending(ci.io, errno_to_blk_status(error));
	return ret;
}

/*
 * Optimized variant of __split_and_process_bio that leverages the
 * fact that targets that use it do _not_ have a need to split bios.
 */
static blk_qc_t __process_bio(struct mapped_device *md,
			      struct dm_table *map, struct bio *bio)
{
	struct clone_info ci;
	blk_qc_t ret = BLK_QC_T_NONE;
	int error = 0;

	if (unlikely(!map)) {
		bio_io_error(bio);
		return ret;
	}

	init_clone_info(&ci, md, map, bio);

	if (bio->bi_opf & REQ_PREFLUSH) {
		ci.bio = &ci.io->md->flush_bio;
		ci.sector_count = 0;
		error = __send_empty_flush(&ci);
		/* dec_pending submits any data associated with flush */
	} else {
		struct dm_target *ti = md->immutable_target;
		struct dm_target_io *tio;

		/*
		 * Defend against IO still getting in during teardown
		 * - as was seen for a time with nvme-fcloop
		 */
		if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
			error = -EIO;
			goto out;
		}

		tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
		ci.bio = bio;
		ci.sector_count = bio_sectors(bio);
		ret = __clone_and_map_simple_bio(&ci, tio, NULL);
	}
out:
	/* drop the extra reference count */
	dec_pending(ci.io, errno_to_blk_status(error));
	return ret;
}

typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);

static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
				  process_bio_fn process_bio)
{
	struct mapped_device *md = q->queuedata;
	blk_qc_t ret = BLK_QC_T_NONE;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	/* if we're suspended, we have to queue this io for later */
	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
		dm_put_live_table(md, srcu_idx);

		if (!(bio->bi_opf & REQ_RAHEAD))
			queue_io(md, bio);
		else
			bio_io_error(bio);
		return ret;
	}

	ret = process_bio(md, map, bio);

	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * The request function that remaps the bio to one target and
 * splits off any remainder.
 */
static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
{
	return __dm_make_request(q, bio, __split_and_process_bio);
}

static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
{
	return __dm_make_request(q, bio, __process_bio);
}

static int dm_any_congested(void *congested_data, int bdi_bits)
{
	int r = bdi_bits;
	struct mapped_device *md = congested_data;
	struct dm_table *map;

	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		if (dm_request_based(md)) {
			/*
			 * With request-based DM we only need to check the
			 * top-level queue for congestion.
			 */
			r = md->queue->backing_dev_info->wb.state & bdi_bits;
		} else {
			map = dm_get_live_table_fast(md);
			if (map)
				r = dm_table_any_congested(map, bdi_bits);
			dm_put_live_table_fast(md);
		}
	}

	return r;
}

/*-----------------------------------------------------------------
 * An IDR is used to keep track of allocated minor numbers.
 *---------------------------------------------------------------*/
static void free_minor(int minor)
{
	spin_lock(&_minor_lock);
	idr_remove(&_minor_idr, minor);
	spin_unlock(&_minor_lock);
}

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
{
	int r;

	if (minor >= (1 << MINORBITS))
		return -EINVAL;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r == -ENOSPC ? -EBUSY : r;
	return 0;
}

static int next_free_minor(int *minor)
{
	int r;

	idr_preload(GFP_KERNEL);
	spin_lock(&_minor_lock);

	r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);

	spin_unlock(&_minor_lock);
	idr_preload_end();
	if (r < 0)
		return r;
	*minor = r;
	return 0;
}

static const struct block_device_operations dm_blk_dops;
static const struct dax_operations dm_dax_ops;

static void dm_wq_work(struct work_struct *work);

static void dm_init_normal_md_queue(struct mapped_device *md)
{
	md->use_blk_mq = false;

	/*
	 * Initialize aspects of queue that aren't relevant for blk-mq
	 */
	md->queue->backing_dev_info->congested_fn = dm_any_congested;
}

static void cleanup_mapped_device(struct mapped_device *md)
{
	if (md->wq)
		destroy_workqueue(md->wq);
	if (md->kworker_task)
		kthread_stop(md->kworker_task);
	if (md->bs)
		bioset_free(md->bs);
	if (md->io_bs)
		bioset_free(md->io_bs);

	if (md->dax_dev) {
		kill_dax(md->dax_dev);
		put_dax(md->dax_dev);
		md->dax_dev = NULL;
	}

	if (md->disk) {
		spin_lock(&_minor_lock);
		md->disk->private_data = NULL;
		spin_unlock(&_minor_lock);
		del_gendisk(md->disk);
		put_disk(md->disk);
	}

	if (md->queue)
		blk_cleanup_queue(md->queue);

	cleanup_srcu_struct(&md->io_barrier);

	if (md->bdev) {
		bdput(md->bdev);
		md->bdev = NULL;
	}

	mutex_destroy(&md->suspend_lock);
	mutex_destroy(&md->type_lock);
	mutex_destroy(&md->table_devices_lock);

	dm_mq_cleanup_mapped_device(md);
}

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
{
	int r, numa_node_id = dm_get_numa_node();
	struct dax_device *dax_dev;
	struct mapped_device *md;
	void *old_md;

	md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
	if (!md) {
		DMWARN("unable to allocate device, out of memory.");
		return NULL;
	}

	if (!try_module_get(THIS_MODULE))
		goto bad_module_get;

	/* get a minor number for the dev */
	if (minor == DM_ANY_MINOR)
		r = next_free_minor(&minor);
	else
		r = specific_minor(minor);
	if (r < 0)
		goto bad_minor;

	r = init_srcu_struct(&md->io_barrier);
	if (r < 0)
		goto bad_io_barrier;

	md->numa_node_id = numa_node_id;
	md->use_blk_mq = dm_use_blk_mq_default();
	md->init_tio_pdu = false;
	md->type = DM_TYPE_NONE;
	mutex_init(&md->suspend_lock);
	mutex_init(&md->type_lock);
	mutex_init(&md->table_devices_lock);
	spin_lock_init(&md->deferred_lock);
	atomic_set(&md->holders, 1);
	atomic_set(&md->open_count, 0);
	atomic_set(&md->event_nr, 0);
	atomic_set(&md->uevent_seq, 0);
	INIT_LIST_HEAD(&md->uevent_list);
	INIT_LIST_HEAD(&md->table_devices);
	spin_lock_init(&md->uevent_lock);

	md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
	if (!md->queue)
		goto bad;
	md->queue->queuedata = md;
	md->queue->backing_dev_info->congested_data = md;

	md->disk = alloc_disk_node(1, md->numa_node_id);
	if (!md->disk)
		goto bad;

	atomic_set(&md->pending[0], 0);
	atomic_set(&md->pending[1], 0);
	init_waitqueue_head(&md->wait);
	INIT_WORK(&md->work, dm_wq_work);
	init_waitqueue_head(&md->eventq);
	init_completion(&md->kobj_holder.completion);
	md->kworker_task = NULL;

	md->disk->major = _major;
	md->disk->first_minor = minor;
	md->disk->fops = &dm_blk_dops;
	md->disk->queue = md->queue;
	md->disk->private_data = md;
	sprintf(md->disk->disk_name, "dm-%d", minor);

	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
	if (!dax_dev)
		goto bad;
	md->dax_dev = dax_dev;

	add_disk_no_queue_reg(md->disk);
	format_dev_t(md->name, MKDEV(_major, minor));

	md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
	if (!md->wq)
		goto bad;

	md->bdev = bdget_disk(md->disk, 0);
	if (!md->bdev)
		goto bad;

	bio_init(&md->flush_bio, NULL, 0);
	bio_set_dev(&md->flush_bio, md->bdev);
	md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;

	dm_stats_init(&md->stats);

	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
	old_md = idr_replace(&_minor_idr, md, minor);
	spin_unlock(&_minor_lock);

	BUG_ON(old_md != MINOR_ALLOCED);

	return md;

bad:
	cleanup_mapped_device(md);
bad_io_barrier:
	free_minor(minor);
bad_minor:
	module_put(THIS_MODULE);
bad_module_get:
	kvfree(md);
	return NULL;
}

static void unlock_fs(struct mapped_device *md);

static void free_dev(struct mapped_device *md)
{
	int minor = MINOR(disk_devt(md->disk));

	unlock_fs(md);

	cleanup_mapped_device(md);

	free_table_devices(&md->table_devices);
	dm_stats_cleanup(&md->stats);
	free_minor(minor);

	module_put(THIS_MODULE);
	kvfree(md);
}

static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
{
	struct dm_md_mempools *p = dm_table_get_md_mempools(t);

	if (dm_table_bio_based(t)) {
		/*
		 * The md may already have mempools that need changing.
		 * If so, reload bioset because front_pad may have changed
		 * because a different table was loaded.
		 */
		if (md->bs) {
			bioset_free(md->bs);
			md->bs = NULL;
		}
		if (md->io_bs) {
			bioset_free(md->io_bs);
			md->io_bs = NULL;
		}

	} else if (md->bs) {
		/*
		 * There's no need to reload with request-based dm
		 * because the size of front_pad doesn't change.
		 *
		 * Note for future: If you are to reload bioset,
		 * prep-ed requests in the queue may refer
		 * to bio from the old bioset, so you must walk
		 * through the queue to unprep.
		 */
		goto out;
	}

	BUG_ON(!p || md->bs || md->io_bs);

	md->bs = p->bs;
	p->bs = NULL;
	md->io_bs = p->io_bs;
	p->io_bs = NULL;
out:
	/* mempool bind completed, no longer need any mempools in the table */
	dm_table_free_md_mempools(t);
}

/*
 * Bind a table to the device.
 */
static void event_callback(void *context)
{
	unsigned long flags;
	LIST_HEAD(uevents);
	struct mapped_device *md = (struct mapped_device *) context;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_splice_init(&md->uevent_list, &uevents);
	spin_unlock_irqrestore(&md->uevent_lock, flags);

	dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);

	atomic_inc(&md->event_nr);
	wake_up(&md->eventq);
	dm_issue_global_event();
}

/*
 * Protected by md->suspend_lock obtained by dm_swap_table().
 */
static void __set_size(struct mapped_device *md, sector_t size)
{
	lockdep_assert_held(&md->suspend_lock);

	set_capacity(md->disk, size);

	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
}

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
{
	struct dm_table *old_map;
	struct request_queue *q = md->queue;
	bool request_based = dm_table_request_based(t);
	sector_t size;

	lockdep_assert_held(&md->suspend_lock);

	size = dm_table_get_size(t);

	/*
	 * Wipe any geometry if the size of the table changed.
	 */
	if (size != dm_get_size(md))
		memset(&md->geometry, 0, sizeof(md->geometry));

	__set_size(md, size);

	dm_table_event_callback(t, event_callback, md);

	/*
	 * The queue hasn't been stopped yet, if the old table type wasn't
	 * for request-based during suspension.  So stop it to prevent
	 * I/O mapping before resume.
	 * This must be done before setting the queue restrictions,
	 * because request-based dm may be run just after the setting.
	 */
	if (request_based)
		dm_stop_queue(q);

	if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
		/*
		 * Leverage the fact that request-based DM targets and
		 * NVMe bio based targets are immutable singletons
		 * - used to optimize both dm_request_fn and dm_mq_queue_rq;
		 *   and __process_bio.
		 */
		md->immutable_target = dm_table_get_immutable_target(t);
	}

	__bind_mempools(md, t);

	old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	rcu_assign_pointer(md->map, (void *)t);
	md->immutable_target_type = dm_table_get_immutable_target_type(t);

	dm_table_set_restrictions(t, q, limits);
	if (old_map)
		dm_sync_table(md);

	return old_map;
}

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
{
	struct dm_table *map = rcu_dereference_protected(md->map, 1);

	if (!map)
		return NULL;

	dm_table_event_callback(map, NULL, NULL);
	RCU_INIT_POINTER(md->map, NULL);
	dm_sync_table(md);

	return map;
}

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
{
	int r;
	struct mapped_device *md;

	md = alloc_dev(minor);
	if (!md)
		return -ENXIO;

	r = dm_sysfs_init(md);
	if (r) {
		free_dev(md);
		return r;
	}

	*result = md;
	return 0;
}

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
{
	mutex_lock(&md->type_lock);
}

void dm_unlock_md_type(struct mapped_device *md)
{
	mutex_unlock(&md->type_lock);
}

void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
{
	BUG_ON(!mutex_is_locked(&md->type_lock));
	md->type = type;
}

enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
{
	return md->type;
}

struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
{
	return md->immutable_target_type;
}

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
{
	BUG_ON(!atomic_read(&md->holders));
	return &md->queue->limits;
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
	int r;
	struct queue_limits limits;
	enum dm_queue_mode type = dm_get_md_type(md);

	switch (type) {
	case DM_TYPE_REQUEST_BASED:
		dm_init_normal_md_queue(md);
		r = dm_old_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based mapped device");
			return r;
		}
		break;
	case DM_TYPE_MQ_REQUEST_BASED:
		r = dm_mq_init_request_queue(md, t);
		if (r) {
			DMERR("Cannot initialize queue for request-based dm-mq mapped device");
			return r;
		}
		break;
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		dm_init_normal_md_queue(md);
		blk_queue_make_request(md->queue, dm_make_request);
		break;
	case DM_TYPE_NVME_BIO_BASED:
		dm_init_normal_md_queue(md);
		blk_queue_make_request(md->queue, dm_make_request_nvme);
		break;
	case DM_TYPE_NONE:
		WARN_ON_ONCE(true);
		break;
	}

	r = dm_calculate_queue_limits(t, &limits);
	if (r) {
		DMERR("Cannot calculate initial queue limits");
		return r;
	}
	dm_table_set_restrictions(t, md->queue, &limits);
	blk_register_queue(md->disk);

	return 0;
}

struct mapped_device *dm_get_md(dev_t dev)
{
	struct mapped_device *md;
	unsigned minor = MINOR(dev);

	if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
		return NULL;

	spin_lock(&_minor_lock);

	md = idr_find(&_minor_idr, minor);
	if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
	    test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}
EXPORT_SYMBOL_GPL(dm_get_md);

void *dm_get_mdptr(struct mapped_device *md)
{
	return md->interface_ptr;
}

void dm_set_mdptr(struct mapped_device *md, void *ptr)
{
	md->interface_ptr = ptr;
}

void dm_get(struct mapped_device *md)
{
	atomic_inc(&md->holders);
	BUG_ON(test_bit(DMF_FREEING, &md->flags));
}

int dm_hold(struct mapped_device *md)
{
	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags)) {
		spin_unlock(&_minor_lock);
		return -EBUSY;
	}
	dm_get(md);
	spin_unlock(&_minor_lock);
	return 0;
}
EXPORT_SYMBOL_GPL(dm_hold);

const char *dm_device_name(struct mapped_device *md)
{
	return md->name;
}
EXPORT_SYMBOL_GPL(dm_device_name);

static void __dm_destroy(struct mapped_device *md, bool wait)
{
	struct dm_table *map;
	int srcu_idx;

	might_sleep();

	spin_lock(&_minor_lock);
	idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
	set_bit(DMF_FREEING, &md->flags);
	spin_unlock(&_minor_lock);

	blk_set_queue_dying(md->queue);

	if (dm_request_based(md) && md->kworker_task)
		kthread_flush_worker(&md->kworker);

	/*
	 * Take suspend_lock so that presuspend and postsuspend methods
	 * do not race with internal suspend.
	 */
	mutex_lock(&md->suspend_lock);
	map = dm_get_live_table(md, &srcu_idx);
	if (!dm_suspended_md(md)) {
		dm_table_presuspend_targets(map);
		dm_table_postsuspend_targets(map);
	}
	/* dm_put_live_table must be before msleep, otherwise deadlock is possible */
	dm_put_live_table(md, srcu_idx);
	mutex_unlock(&md->suspend_lock);

	/*
	 * Rare, but there may be I/O requests still going to complete,
	 * for example.  Wait for all references to disappear.
	 * No one should increment the reference count of the mapped_device,
	 * after the mapped_device state becomes DMF_FREEING.
	 */
	if (wait)
		while (atomic_read(&md->holders))
			msleep(1);
	else if (atomic_read(&md->holders))
		DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
		       dm_device_name(md), atomic_read(&md->holders));

	dm_sysfs_exit(md);
	dm_table_destroy(__unbind(md));
	free_dev(md);
}

void dm_destroy(struct mapped_device *md)
{
	__dm_destroy(md, true);
}

void dm_destroy_immediate(struct mapped_device *md)
{
	__dm_destroy(md, false);
}

void dm_put(struct mapped_device *md)
{
	atomic_dec(&md->holders);
}
EXPORT_SYMBOL_GPL(dm_put);

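/*
 * Wait until there is no I/O in flight on this mapped device, or the task
 * is interrupted (returns -EINTR when task_state admits signals).
 */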
static int dm_wait_for_completion(struct mapped_device *md, long task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(&md->wait, &wait, task_state);

		if (!md_in_flight(md))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(&md->wait, &wait);

	return r;
}

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
{
	struct mapped_device *md = container_of(work, struct mapped_device,
						work);
	struct bio *c;
	int srcu_idx;
	struct dm_table *map;

	map = dm_get_live_table(md, &srcu_idx);

	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
		spin_lock_irq(&md->deferred_lock);
		c = bio_list_pop(&md->deferred);
		spin_unlock_irq(&md->deferred_lock);

		if (!c)
			break;

		if (dm_request_based(md))
			generic_make_request(c);
		else
			__split_and_process_bio(md, map, c);
	}

	dm_put_live_table(md, srcu_idx);
}

static void dm_queue_flush(struct mapped_device *md)
{
	clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	smp_mb__after_atomic();
	queue_work(md->wq, &md->work);
}

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
{
	struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
	struct queue_limits limits;
	int r;

	mutex_lock(&md->suspend_lock);

	/* device must be suspended */
	if (!dm_suspended_md(md))
		goto out;

	/*
	 * If the new table has no data devices, retain the existing limits.
	 * This helps multipath with queue_if_no_path if all paths disappear,
	 * then new I/O is queued based on these limits, and then some paths
	 * reappear.
	 */
	if (dm_table_has_no_data_devices(table)) {
		live_map = dm_get_live_table_fast(md);
		if (live_map)
			limits = md->queue->limits;
		dm_put_live_table_fast(md);
	}

	if (!live_map) {
		r = dm_calculate_queue_limits(table, &limits);
		if (r) {
			map = ERR_PTR(r);
			goto out;
		}
	}

	map = __bind(md, table, &limits);
	dm_issue_global_event();

out:
	mutex_unlock(&md->suspend_lock);
	return map;
}

/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 *
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * This flag is cleared before dm_suspend returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		pr_debug("%s: suspending with flush\n", dm_device_name(md));

	/*
	 * This gets reverted if there's an error later and the targets
	 * provide the .presuspend_undo hook.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Here we must make sure that no processes are submitting requests
	 * to target drivers i.e. no one may be executing
	 * __split_and_process_bio. This is called from dm_request and
	 * dm_wq_work.
	 *
	 * To get all processes out of __split_and_process_bio in dm_request,
	 * we take the write lock. To prevent any process from reentering
	 * __split_and_process_bio from dm_request and quiesce the thread
	 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
	 * flush_workqueue(md->wq).
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md)) {
		dm_stop_queue(md->queue);
		if (md->kworker_task)
			kthread_flush_worker(&md->kworker);
	}

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request routines.
	 * We call dm_wait_for_completion to wait for all existing requests
	 * to finish.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted ? */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
		/* pushback list is already flushed, so skip flush */
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}
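
/*
 * Flag semantics sketch (illustrative): the two suspend_flags bits map
 * onto the behaviour implemented by __dm_suspend() above.
 *
 *	// Flush I/O and freeze any filesystem before suspending:
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *
 *	// Do not flush: set DMF_NOFLUSH_SUSPENDING so targets defer I/O
 *	// instead (noflush takes precedence over lockfs):
 *	dm_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
 */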

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Flushing deferred I/Os must be done after targets are resumed
	 * so that mapping of targets can work correctly.
	 * Request-based dm is queueing the deferred I/Os in its request_queue.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to targets.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return; /* nest suspend */
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
	 * supported: the return value is deliberately discarded, and an
	 * interruptible wait would leave the device half-suspended with no
	 * way to report the error back to the caller.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* resume from nested suspend */

	/*
	 * NOTE: existing callers don't need to call dm_table_resume_targets
	 * (which may fail -- so best to avoid it for now by passing NULL map)
	 */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);
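
/*
 * Usage sketch (illustrative): a target such as dm-thin quiesces its own
 * mapped_device around maintenance work by pairing the noflush internal
 * suspend with an internal resume:
 *
 *	struct mapped_device *md = dm_table_get_md(ti->table);
 *
 *	dm_internal_suspend_noflush(md);
 *	// ... work while no bios reach the targets ...
 *	dm_internal_resume(md);
 *
 * Nesting is handled via md->internal_suspend_count, so independent
 * callers may safely overlap their suspend/resume pairs.
 */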

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
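
/*
 * Note the asymmetry above: dm_internal_suspend_fast() returns with
 * md->suspend_lock still held and dm_internal_resume_fast() releases it,
 * so the pair must bracket a short critical section, e.g. (sketch):
 *
 *	dm_internal_suspend_fast(md);
 *	// ... brief work with all I/O blocked ...
 *	dm_internal_resume_fast(md);
 */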

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}
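
/*
 * For example, cookie 1234 yields the environment string "DM_COOKIE=1234",
 * which udev/libdevmapper use to match the uevent back to the ioctl that
 * triggered it.
 */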

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
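
/*
 * Usage sketch (illustrative): callers poll for events by capturing the
 * current event number and sleeping until it changes:
 *
 *	int event_nr = dm_get_event_nr(md);
 *
 *	if (dm_wait_event(md, event_nr))
 *		return -ERESTARTSYS;	// interrupted by a signal
 *	// the counter moved: at least one event has occurred
 */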

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
	case DM_TYPE_NVME_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
		pools->io_bs = bioset_create(pool_size, io_front_pad, 0);
		if (!pools->io_bs)
			goto out;
		if (integrity && bioset_integrity_create(pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
	case DM_TYPE_MQ_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	pools->bs = bioset_create(pool_size, front_pad, 0);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}
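
/*
 * Per-bio-data layout implied by the front_pad arithmetic above (a sketch;
 * actual sizes depend on the target's per_io_data_size):
 *
 *	bs:	[per-bio data][struct dm_target_io][struct bio "clone"]
 *	io_bs:	[per-bio data][struct dm_io [struct dm_target_io]][bio "clone"]
 *
 * dm_per_bio_data() recovers the data pointer by walking backwards from the
 * bio through these same offsets, which is why the roundup() must use the
 * __alignof__() of each embedded struct.
 */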

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	if (pools->bs)
		bioset_free(pools->bs);
	if (pools->io_bs)
		bioset_free(pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;

	blkdev_put(bdev, mode);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;

	blkdev_put(bdev, mode);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;

	blkdev_put(bdev, mode);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;

	blkdev_put(bdev, mode);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.copy_from_iter = dm_dax_copy_from_iter,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");