/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/signal.h>
16#include <linux/blkpg.h>
17#include <linux/bio.h>
18#include <linux/mempool.h>
19#include <linux/dax.h>
20#include <linux/slab.h>
21#include <linux/idr.h>
22#include <linux/uio.h>
23#include <linux/hdreg.h>
24#include <linux/delay.h>
25#include <linux/wait.h>
26#include <linux/pr.h>
27#include <linux/refcount.h>
28
29#define DM_MSG_PREFIX "core"
30
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
35#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
36#define DM_COOKIE_LENGTH 24
37
38static const char *_name = DM_NAME;
39
40static unsigned int major = 0;
41static unsigned int _major = 0;
42
43static DEFINE_IDR(_minor_idr);
44
45static DEFINE_SPINLOCK(_minor_lock);
46
47static void do_deferred_remove(struct work_struct *w);
48
49static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
50
51static struct workqueue_struct *deferred_remove_workqueue;
52
53atomic_t dm_global_event_nr = ATOMIC_INIT(0);
54DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
55
56void dm_issue_global_event(void)
57{
58 atomic_inc(&dm_global_event_nr);
59 wake_up(&dm_global_eventq);
60}
61
/*
 * One of these is allocated (on-stack) per original bio.
 */
65struct clone_info {
66 struct dm_table *map;
67 struct bio *bio;
68 struct dm_io *io;
69 sector_t sector;
70 unsigned sector_count;
71};
72
/*
 * One of these is allocated per clone bio.
 */
76#define DM_TIO_MAGIC 7282014
77struct dm_target_io {
78 unsigned magic;
79 struct dm_io *io;
80 struct dm_target *ti;
81 unsigned target_bio_nr;
82 unsigned *len_ptr;
83 bool inside_dm_io;
84 struct bio clone;
85};
86
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
91#define DM_IO_MAGIC 5191977
92struct dm_io {
93 unsigned magic;
94 struct mapped_device *md;
95 blk_status_t status;
96 atomic_t io_count;
97 struct bio *orig_bio;
98 unsigned long start_time;
99 spinlock_t endio_lock;
100 struct dm_stats_aux stats_aux;
101
102 struct dm_target_io tio;
103};
104
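/*
 * The bio_sets used for bio-based dm are created with enough front_pad that
 * every clone bio is embedded at the tail of a struct dm_target_io, and the
 * first clone's dm_target_io is itself embedded in a struct dm_io.  A
 * target's optional per-bio data sits immediately in front of those
 * structures, which is what the pointer arithmetic below relies on.
 */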
105void *dm_per_bio_data(struct bio *bio, size_t data_size)
106{
107 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
108 if (!tio->inside_dm_io)
109 return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
110 return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
111}
112EXPORT_SYMBOL_GPL(dm_per_bio_data);
113
114struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
115{
116 struct dm_io *io = (struct dm_io *)((char *)data + data_size);
117 if (io->magic == DM_IO_MAGIC)
118 return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
119 BUG_ON(io->magic != DM_TIO_MAGIC);
120 return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
121}
122EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
123
124unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
125{
126 return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
127}
128EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
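/*
 * Illustrative sketch: a target that set ti->per_io_data_size in its
 * constructor can convert between the clone bio it is handed and its private
 * per-bio state, e.g.:
 *
 *	struct my_per_bio_data *pb = dm_per_bio_data(bio, sizeof(*pb));
 *	...
 *	struct bio *clone = dm_bio_from_per_bio_data(pb, sizeof(*pb));
 *
 * "struct my_per_bio_data" is a hypothetical type used only for the example.
 */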
129
130#define MINOR_ALLOCED ((void *)-1)
131
/*
 * Bits for the md->flags field.
 */
135#define DMF_BLOCK_IO_FOR_SUSPEND 0
136#define DMF_SUSPENDED 1
137#define DMF_FROZEN 2
138#define DMF_FREEING 3
139#define DMF_DELETING 4
140#define DMF_NOFLUSH_SUSPENDING 5
141#define DMF_DEFERRED_REMOVE 6
142#define DMF_SUSPENDED_INTERNALLY 7
143
144#define DM_NUMA_NODE NUMA_NO_NODE
145static int dm_numa_node = DM_NUMA_NODE;
146
/*
 * For mempools pre-allocation at the table loading time.
 */
150struct dm_md_mempools {
151 struct bio_set bs;
152 struct bio_set io_bs;
153};
154
155struct table_device {
156 struct list_head list;
157 refcount_t count;
158 struct dm_dev dm_dev;
159};
160
161static struct kmem_cache *_rq_tio_cache;
162static struct kmem_cache *_rq_cache;
163
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
167#define RESERVED_BIO_BASED_IOS 16
168static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
169
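/*
 * Clamp a numeric module parameter to [min, max].  If the stored value is
 * out of range, the clamped value is also written back (best effort, via
 * cmpxchg) so that a subsequent read reflects what is actually used.
 */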
170static int __dm_get_module_param_int(int *module_param, int min, int max)
171{
172 int param = READ_ONCE(*module_param);
173 int modified_param = 0;
174 bool modified = true;
175
176 if (param < min)
177 modified_param = min;
178 else if (param > max)
179 modified_param = max;
180 else
181 modified = false;
182
183 if (modified) {
184 (void)cmpxchg(module_param, param, modified_param);
185 param = modified_param;
186 }
187
188 return param;
189}
190
191unsigned __dm_get_module_param(unsigned *module_param,
192 unsigned def, unsigned max)
193{
194 unsigned param = READ_ONCE(*module_param);
195 unsigned modified_param = 0;
196
197 if (!param)
198 modified_param = def;
199 else if (param > max)
200 modified_param = max;
201
202 if (modified_param) {
203 (void)cmpxchg(module_param, param, modified_param);
204 param = modified_param;
205 }
206
207 return param;
208}
209
210unsigned dm_get_reserved_bio_based_ios(void)
211{
212 return __dm_get_module_param(&reserved_bio_based_ios,
213 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
214}
215EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
216
217static unsigned dm_get_numa_node(void)
218{
219 return __dm_get_module_param_int(&dm_numa_node,
220 DM_NUMA_NODE, num_online_nodes() - 1);
221}
222
223static int __init local_init(void)
224{
225 int r = -ENOMEM;
226
227 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
228 if (!_rq_tio_cache)
229 return r;
230
231 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
232 __alignof__(struct request), 0, NULL);
233 if (!_rq_cache)
234 goto out_free_rq_tio_cache;
235
236 r = dm_uevent_init();
237 if (r)
238 goto out_free_rq_cache;
239
240 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
241 if (!deferred_remove_workqueue) {
242 r = -ENOMEM;
243 goto out_uevent_exit;
244 }
245
246 _major = major;
247 r = register_blkdev(_major, _name);
248 if (r < 0)
249 goto out_free_workqueue;
250
251 if (!_major)
252 _major = r;
253
254 return 0;
255
256out_free_workqueue:
257 destroy_workqueue(deferred_remove_workqueue);
258out_uevent_exit:
259 dm_uevent_exit();
260out_free_rq_cache:
261 kmem_cache_destroy(_rq_cache);
262out_free_rq_tio_cache:
263 kmem_cache_destroy(_rq_tio_cache);
264
265 return r;
266}
267
268static void local_exit(void)
269{
270 flush_scheduled_work();
271 destroy_workqueue(deferred_remove_workqueue);
272
273 kmem_cache_destroy(_rq_cache);
274 kmem_cache_destroy(_rq_tio_cache);
275 unregister_blkdev(_major, _name);
276 dm_uevent_exit();
277
278 _major = 0;
279
280 DMINFO("cleaned up");
281}
282
283static int (*_inits[])(void) __initdata = {
284 local_init,
285 dm_target_init,
286 dm_linear_init,
287 dm_stripe_init,
288 dm_io_init,
289 dm_kcopyd_init,
290 dm_interface_init,
291 dm_statistics_init,
292};
293
294static void (*_exits[])(void) = {
295 local_exit,
296 dm_target_exit,
297 dm_linear_exit,
298 dm_stripe_exit,
299 dm_io_exit,
300 dm_kcopyd_exit,
301 dm_interface_exit,
302 dm_statistics_exit,
303};
304
305static int __init dm_init(void)
306{
307 const int count = ARRAY_SIZE(_inits);
308
309 int r, i;
310
311 for (i = 0; i < count; i++) {
312 r = _inits[i]();
313 if (r)
314 goto bad;
315 }
316
317 return 0;
318
319 bad:
320 while (i--)
321 _exits[i]();
322
323 return r;
324}
325
326static void __exit dm_exit(void)
327{
328 int i = ARRAY_SIZE(_exits);
329
330 while (i--)
331 _exits[i]();
332
 /*
  * Should be empty by this point.
  */
336 idr_destroy(&_minor_idr);
337}
338
/*
 * Block device functions.
 */
342int dm_deleting_md(struct mapped_device *md)
343{
344 return test_bit(DMF_DELETING, &md->flags);
345}
346
347static int dm_blk_open(struct block_device *bdev, fmode_t mode)
348{
349 struct mapped_device *md;
350
351 spin_lock(&_minor_lock);
352
353 md = bdev->bd_disk->private_data;
354 if (!md)
355 goto out;
356
357 if (test_bit(DMF_FREEING, &md->flags) ||
358 dm_deleting_md(md)) {
359 md = NULL;
360 goto out;
361 }
362
363 dm_get(md);
364 atomic_inc(&md->open_count);
365out:
366 spin_unlock(&_minor_lock);
367
368 return md ? 0 : -ENXIO;
369}
370
371static void dm_blk_close(struct gendisk *disk, fmode_t mode)
372{
373 struct mapped_device *md;
374
375 spin_lock(&_minor_lock);
376
377 md = disk->private_data;
378 if (WARN_ON(!md))
379 goto out;
380
381 if (atomic_dec_and_test(&md->open_count) &&
382 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
383 queue_work(deferred_remove_workqueue, &deferred_remove_work);
384
385 dm_put(md);
386out:
387 spin_unlock(&_minor_lock);
388}
389
390int dm_open_count(struct mapped_device *md)
391{
392 return atomic_read(&md->open_count);
393}
394
/*
 * Guarantees nothing is using the device before it's deleted.
 */
398int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
399{
400 int r = 0;
401
402 spin_lock(&_minor_lock);
403
404 if (dm_open_count(md)) {
405 r = -EBUSY;
406 if (mark_deferred)
407 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
408 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
409 r = -EEXIST;
410 else
411 set_bit(DMF_DELETING, &md->flags);
412
413 spin_unlock(&_minor_lock);
414
415 return r;
416}
417
418int dm_cancel_deferred_remove(struct mapped_device *md)
419{
420 int r = 0;
421
422 spin_lock(&_minor_lock);
423
424 if (test_bit(DMF_DELETING, &md->flags))
425 r = -EBUSY;
426 else
427 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
428
429 spin_unlock(&_minor_lock);
430
431 return r;
432}
433
434static void do_deferred_remove(struct work_struct *w)
435{
436 dm_deferred_remove();
437}
438
439sector_t dm_get_size(struct mapped_device *md)
440{
441 return get_capacity(md->disk);
442}
443
444struct request_queue *dm_get_md_queue(struct mapped_device *md)
445{
446 return md->queue;
447}
448
449struct dm_stats *dm_get_stats(struct mapped_device *md)
450{
451 return &md->stats;
452}
453
454static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
455{
456 struct mapped_device *md = bdev->bd_disk->private_data;
457
458 return dm_get_geometry(md, geo);
459}
460
461static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
462 struct block_device **bdev)
463 __acquires(md->io_barrier)
464{
465 struct dm_target *tgt;
466 struct dm_table *map;
467 int r;
468
469retry:
470 r = -ENOTTY;
471 map = dm_get_live_table(md, srcu_idx);
472 if (!map || !dm_table_get_size(map))
473 return r;
474
 /* We only support devices that have a single target */
476 if (dm_table_get_num_targets(map) != 1)
477 return r;
478
479 tgt = dm_table_get_target(map, 0);
480 if (!tgt->type->prepare_ioctl)
481 return r;
482
483 if (dm_suspended_md(md))
484 return -EAGAIN;
485
486 r = tgt->type->prepare_ioctl(tgt, bdev);
487 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
488 dm_put_live_table(md, *srcu_idx);
489 msleep(10);
490 goto retry;
491 }
492
493 return r;
494}
495
496static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
497 __releases(md->io_barrier)
498{
499 dm_put_live_table(md, srcu_idx);
500}
501
502static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
503 unsigned int cmd, unsigned long arg)
504{
505 struct mapped_device *md = bdev->bd_disk->private_data;
506 int r, srcu_idx;
507
508 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
509 if (r < 0)
510 goto out;
511
512 if (r > 0) {
 /*
  * Target determined this ioctl is being issued against a
  * subset of the parent bdev; require extra privileges.
  */
517 if (!capable(CAP_SYS_RAWIO)) {
518 DMWARN_LIMIT(
519 "%s: sending ioctl %x to DM device without required privilege.",
520 current->comm, cmd);
521 r = -ENOIOCTLCMD;
522 goto out;
523 }
524 }
525
526 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
527out:
528 dm_unprepare_ioctl(md, srcu_idx);
529 return r;
530}
531
532static void start_io_acct(struct dm_io *io);
533
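/*
 * Allocate a dm_io for an original bio.  The dm_io, its embedded first
 * dm_target_io and that tio's embedded clone bio all live in the front_pad
 * of md->io_bs, so a single bio_alloc_bioset() provides all three.
 */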
534static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
535{
536 struct dm_io *io;
537 struct dm_target_io *tio;
538 struct bio *clone;
539
540 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
541 if (!clone)
542 return NULL;
543
544 tio = container_of(clone, struct dm_target_io, clone);
545 tio->inside_dm_io = true;
546 tio->io = NULL;
547
548 io = container_of(tio, struct dm_io, tio);
549 io->magic = DM_IO_MAGIC;
550 io->status = 0;
551 atomic_set(&io->io_count, 1);
552 io->orig_bio = bio;
553 io->md = md;
554 spin_lock_init(&io->endio_lock);
555
556 start_io_acct(io);
557
558 return io;
559}
560
561static void free_io(struct mapped_device *md, struct dm_io *io)
562{
563 bio_put(&io->tio.clone);
564}
565
566static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
567 unsigned target_bio_nr, gfp_t gfp_mask)
568{
569 struct dm_target_io *tio;
570
571 if (!ci->io->tio.io) {
 /* the dm_target_io embedded in ci->io is available */
573 tio = &ci->io->tio;
574 } else {
575 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
576 if (!clone)
577 return NULL;
578
579 tio = container_of(clone, struct dm_target_io, clone);
580 tio->inside_dm_io = false;
581 }
582
583 tio->magic = DM_TIO_MAGIC;
584 tio->io = ci->io;
585 tio->ti = ti;
586 tio->target_bio_nr = target_bio_nr;
587
588 return tio;
589}
590
591static void free_tio(struct dm_target_io *tio)
592{
593 if (tio->inside_dm_io)
594 return;
595 bio_put(&tio->clone);
596}
597
598int md_in_flight(struct mapped_device *md)
599{
600 return atomic_read(&md->pending[READ]) +
601 atomic_read(&md->pending[WRITE]);
602}
603
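/*
 * I/O accounting: start_io_acct()/end_io_acct() update the generic block
 * layer statistics, the optional dm-stats counters and md->pending[], which
 * md_in_flight() and the suspend path use to wait for outstanding I/O.
 */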
604static void start_io_acct(struct dm_io *io)
605{
606 struct mapped_device *md = io->md;
607 struct bio *bio = io->orig_bio;
608 int rw = bio_data_dir(bio);
609
610 io->start_time = jiffies;
611
612 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
613 &dm_disk(md)->part0);
614
615 atomic_set(&dm_disk(md)->part0.in_flight[rw],
616 atomic_inc_return(&md->pending[rw]));
617
618 if (unlikely(dm_stats_used(&md->stats)))
619 dm_stats_account_io(&md->stats, bio_data_dir(bio),
620 bio->bi_iter.bi_sector, bio_sectors(bio),
621 false, 0, &io->stats_aux);
622}
623
624static void end_io_acct(struct dm_io *io)
625{
626 struct mapped_device *md = io->md;
627 struct bio *bio = io->orig_bio;
628 unsigned long duration = jiffies - io->start_time;
629 int pending;
630 int rw = bio_data_dir(bio);
631
632 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
633 io->start_time);
634
635 if (unlikely(dm_stats_used(&md->stats)))
636 dm_stats_account_io(&md->stats, bio_data_dir(bio),
637 bio->bi_iter.bi_sector, bio_sectors(bio),
638 true, duration, &io->stats_aux);
639
 /*
  * After this is decremented the bio must not be touched if it is
  * a flush.
  */
644 pending = atomic_dec_return(&md->pending[rw]);
645 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
646 pending += atomic_read(&md->pending[rw^0x1]);
647
 /* nudge anyone waiting on suspend queue */
649 if (!pending)
650 wake_up(&md->wait);
651}
652
/*
 * Add the bio to the list of deferred io.
 */
656static void queue_io(struct mapped_device *md, struct bio *bio)
657{
658 unsigned long flags;
659
660 spin_lock_irqsave(&md->deferred_lock, flags);
661 bio_list_add(&md->deferred, bio);
662 spin_unlock_irqrestore(&md->deferred_lock, flags);
663 queue_work(md->wq, &md->work);
664}
665
/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
671struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
672{
673 *srcu_idx = srcu_read_lock(&md->io_barrier);
674
675 return srcu_dereference(md->map, &md->io_barrier);
676}
677
678void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
679{
680 srcu_read_unlock(&md->io_barrier, srcu_idx);
681}
682
683void dm_sync_table(struct mapped_device *md)
684{
685 synchronize_srcu(&md->io_barrier);
686 synchronize_rcu_expedited();
687}
688
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
693static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
694{
695 rcu_read_lock();
696 return rcu_dereference(md->map);
697}
698
699static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
700{
701 rcu_read_unlock();
702}
703
704static char *_dm_claim_ptr = "I belong to device-mapper";
705
/*
 * Open a table device so we can use it as a map destination.
 */
709static int open_table_device(struct table_device *td, dev_t dev,
710 struct mapped_device *md)
711{
712 struct block_device *bdev;
713
714 int r;
715
716 BUG_ON(td->dm_dev.bdev);
717
718 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
719 if (IS_ERR(bdev))
720 return PTR_ERR(bdev);
721
722 r = bd_link_disk_holder(bdev, dm_disk(md));
723 if (r) {
724 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
725 return r;
726 }
727
728 td->dm_dev.bdev = bdev;
729 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
730 return 0;
731}
732
/*
 * Close a table device that we've been using.
 */
736static void close_table_device(struct table_device *td, struct mapped_device *md)
737{
738 if (!td->dm_dev.bdev)
739 return;
740
741 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
742 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
743 put_dax(td->dm_dev.dax_dev);
744 td->dm_dev.bdev = NULL;
745 td->dm_dev.dax_dev = NULL;
746}
747
748static struct table_device *find_table_device(struct list_head *l, dev_t dev,
749 fmode_t mode) {
750 struct table_device *td;
751
752 list_for_each_entry(td, l, list)
753 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
754 return td;
755
756 return NULL;
757}
758
759int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
760 struct dm_dev **result) {
761 int r;
762 struct table_device *td;
763
764 mutex_lock(&md->table_devices_lock);
765 td = find_table_device(&md->table_devices, dev, mode);
766 if (!td) {
767 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
768 if (!td) {
769 mutex_unlock(&md->table_devices_lock);
770 return -ENOMEM;
771 }
772
773 td->dm_dev.mode = mode;
774 td->dm_dev.bdev = NULL;
775
776 if ((r = open_table_device(td, dev, md))) {
777 mutex_unlock(&md->table_devices_lock);
778 kfree(td);
779 return r;
780 }
781
782 format_dev_t(td->dm_dev.name, dev);
783
784 refcount_set(&td->count, 1);
785 list_add(&td->list, &md->table_devices);
786 } else {
787 refcount_inc(&td->count);
788 }
789 mutex_unlock(&md->table_devices_lock);
790
791 *result = &td->dm_dev;
792 return 0;
793}
794EXPORT_SYMBOL_GPL(dm_get_table_device);
795
796void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
797{
798 struct table_device *td = container_of(d, struct table_device, dm_dev);
799
800 mutex_lock(&md->table_devices_lock);
801 if (refcount_dec_and_test(&td->count)) {
802 close_table_device(td, md);
803 list_del(&td->list);
804 kfree(td);
805 }
806 mutex_unlock(&md->table_devices_lock);
807}
808EXPORT_SYMBOL(dm_put_table_device);
809
810static void free_table_devices(struct list_head *devices)
811{
812 struct list_head *tmp, *next;
813
814 list_for_each_safe(tmp, next, devices) {
815 struct table_device *td = list_entry(tmp, struct table_device, list);
816
817 DMWARN("dm_destroy: %s still exists with %d references",
818 td->dm_dev.name, refcount_read(&td->count));
819 kfree(td);
820 }
821}
822
/*
 * Get the geometry associated with a dm device.
 */
826int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
827{
828 *geo = md->geometry;
829
830 return 0;
831}
832
/*
 * Set the geometry of a device.
 */
836int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
837{
838 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
839
840 if (geo->start > sz) {
841 DMWARN("Start sector is beyond the geometry limits.");
842 return -EINVAL;
843 }
844
845 md->geometry = *geo;
846
847 return 0;
848}
849
850static int __noflush_suspending(struct mapped_device *md)
851{
852 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
853}
854
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
859static void dec_pending(struct dm_io *io, blk_status_t error)
860{
861 unsigned long flags;
862 blk_status_t io_error;
863 struct bio *bio;
864 struct mapped_device *md = io->md;
865
 /* Push-back supersedes any I/O errors */
867 if (unlikely(error)) {
868 spin_lock_irqsave(&io->endio_lock, flags);
869 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
870 io->status = error;
871 spin_unlock_irqrestore(&io->endio_lock, flags);
872 }
873
874 if (atomic_dec_and_test(&io->io_count)) {
875 if (io->status == BLK_STS_DM_REQUEUE) {
 /*
  * Target requested pushing back the I/O.
  */
879 spin_lock_irqsave(&md->deferred_lock, flags);
880 if (__noflush_suspending(md))
 /* NOTE: early return below due to BLK_STS_DM_REQUEUE */
882 bio_list_add_head(&md->deferred, io->orig_bio);
883 else
 /* noflush suspend was interrupted. */
885 io->status = BLK_STS_IOERR;
886 spin_unlock_irqrestore(&md->deferred_lock, flags);
887 }
888
889 io_error = io->status;
890 bio = io->orig_bio;
891 end_io_acct(io);
892 free_io(md, io);
893
894 if (io_error == BLK_STS_DM_REQUEUE)
895 return;
896
897 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 /*
  * Preflush done for flush with data, reissue
  * without REQ_PREFLUSH.
  */
902 bio->bi_opf &= ~REQ_PREFLUSH;
903 queue_io(md, bio);
904 } else {
905
906 if (io_error)
907 bio->bi_status = io_error;
908 bio_endio(bio);
909 }
910 }
911}
912
913void disable_write_same(struct mapped_device *md)
914{
915 struct queue_limits *limits = dm_get_queue_limits(md);
916
917
918 limits->max_write_same_sectors = 0;
919}
920
921void disable_write_zeroes(struct mapped_device *md)
922{
923 struct queue_limits *limits = dm_get_queue_limits(md);
924
925
926 limits->max_write_zeroes_sectors = 0;
927}
928
929static void clone_endio(struct bio *bio)
930{
931 blk_status_t error = bio->bi_status;
932 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
933 struct dm_io *io = tio->io;
934 struct mapped_device *md = tio->io->md;
935 dm_endio_fn endio = tio->ti->type->end_io;
936
937 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
938 if (bio_op(bio) == REQ_OP_WRITE_SAME &&
939 !bio->bi_disk->queue->limits.max_write_same_sectors)
940 disable_write_same(md);
941 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
942 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
943 disable_write_zeroes(md);
944 }
945
946 if (endio) {
947 int r = endio(tio->ti, bio, &error);
948 switch (r) {
949 case DM_ENDIO_REQUEUE:
950 error = BLK_STS_DM_REQUEUE;
 /* fall through */
952 case DM_ENDIO_DONE:
953 break;
954 case DM_ENDIO_INCOMPLETE:
955
956 return;
957 default:
958 DMWARN("unimplemented target endio return value: %d", r);
959 BUG();
960 }
961 }
962
963 free_tio(tio);
964 dec_pending(io, error);
965}
966
/*
 * Return maximum size of I/O possible at the supplied sector up to the
 * current target boundary.
 */
971static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
972{
973 sector_t target_offset = dm_target_offset(ti, sector);
974
975 return ti->len - target_offset;
976}
977
978static sector_t max_io_len(sector_t sector, struct dm_target *ti)
979{
980 sector_t len = max_io_len_target_boundary(sector, ti);
981 sector_t offset, max_len;
982
 /*
  * Does the target need to split even further?
  */
986 if (ti->max_io_len) {
987 offset = dm_target_offset(ti, sector);
988 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
989 max_len = sector_div(offset, ti->max_io_len);
990 else
991 max_len = offset & (ti->max_io_len - 1);
992 max_len = ti->max_io_len - max_len;
993
994 if (len > max_len)
995 len = max_len;
996 }
997
998 return len;
999}
1000
1001int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1002{
1003 if (len > UINT_MAX) {
1004 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1005 (unsigned long long)len, UINT_MAX);
1006 ti->error = "Maximum size of target IO is too large";
1007 return -EINVAL;
1008 }
1009
 /*
  * BIO based queue uses its own splitting. When multipage bvecs
  * is switched on, size of the incoming bio may be too big to
  * be handled in some targets, such as crypt.
  *
  * When these targets are ready for the big bio, we can remove
  * the limit.
  */
1018 ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
1019
1020 return 0;
1021}
1022EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
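/*
 * Illustrative sketch: a target whose ctr works in fixed-size chunks might
 * bound the I/O it is asked to map with
 *
 *	r = dm_set_target_max_io_len(ti, chunk_sectors);
 *
 * where "chunk_sectors" is a hypothetical, target-specific value.
 */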
1023
1024static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1025 sector_t sector, int *srcu_idx)
1026 __acquires(md->io_barrier)
1027{
1028 struct dm_table *map;
1029 struct dm_target *ti;
1030
1031 map = dm_get_live_table(md, srcu_idx);
1032 if (!map)
1033 return NULL;
1034
1035 ti = dm_table_find_target(map, sector);
1036 if (!dm_target_is_valid(ti))
1037 return NULL;
1038
1039 return ti;
1040}
1041
1042static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1043 long nr_pages, void **kaddr, pfn_t *pfn)
1044{
1045 struct mapped_device *md = dax_get_private(dax_dev);
1046 sector_t sector = pgoff * PAGE_SECTORS;
1047 struct dm_target *ti;
1048 long len, ret = -EIO;
1049 int srcu_idx;
1050
1051 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1052
1053 if (!ti)
1054 goto out;
1055 if (!ti->type->direct_access)
1056 goto out;
1057 len = max_io_len(sector, ti) / PAGE_SECTORS;
1058 if (len < 1)
1059 goto out;
1060 nr_pages = min(len, nr_pages);
1061 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1062
1063 out:
1064 dm_put_live_table(md, srcu_idx);
1065
1066 return ret;
1067}
1068
1069static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1070 void *addr, size_t bytes, struct iov_iter *i)
1071{
1072 struct mapped_device *md = dax_get_private(dax_dev);
1073 sector_t sector = pgoff * PAGE_SECTORS;
1074 struct dm_target *ti;
1075 long ret = 0;
1076 int srcu_idx;
1077
1078 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1079
1080 if (!ti)
1081 goto out;
1082 if (!ti->type->dax_copy_from_iter) {
1083 ret = copy_from_iter(addr, bytes, i);
1084 goto out;
1085 }
1086 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1087 out:
1088 dm_put_live_table(md, srcu_idx);
1089
1090 return ret;
1091}
1092
1093static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1094 void *addr, size_t bytes, struct iov_iter *i)
1095{
1096 struct mapped_device *md = dax_get_private(dax_dev);
1097 sector_t sector = pgoff * PAGE_SECTORS;
1098 struct dm_target *ti;
1099 long ret = 0;
1100 int srcu_idx;
1101
1102 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1103
1104 if (!ti)
1105 goto out;
1106 if (!ti->type->dax_copy_to_iter) {
1107 ret = copy_to_iter(addr, bytes, i);
1108 goto out;
1109 }
1110 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1111 out:
1112 dm_put_live_table(md, srcu_idx);
1113
1114 return ret;
1115}
1116
/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *   (it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *   (it may be empty if region 1 is non-empty, although there is no reason
 *    to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 */
1145void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1146{
1147 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1148 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1149 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1150 BUG_ON(bi_size > *tio->len_ptr);
1151 BUG_ON(n_sectors > bi_size);
1152 *tio->len_ptr -= bi_size - n_sectors;
1153 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1154}
1155EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
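/*
 * Illustrative sketch: a map routine that can only handle the part of a bio
 * up to some internal boundary may accept just that much,
 *
 *	dm_accept_partial_bio(bio, sectors_to_boundary);
 *
 * and return DM_MAPIO_REMAPPED; the core then sends the remainder in a
 * subsequent clone.  "sectors_to_boundary" is a hypothetical value computed
 * by the target.
 */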
1156
/*
 * The zone descriptors obtained with a zone report indicate zone positions
 * within the target backing device.  The zone descriptors must be remapped
 * to match their position within the dm device.  A target may call
 * dm_remap_zone_report() after completion of a REQ_OP_ZONE_REPORT bio to
 * remap the zone descriptors from the target device to the dm device.
 */
1167void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1168{
1169#ifdef CONFIG_BLK_DEV_ZONED
1170 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1171 struct bio *report_bio = tio->io->orig_bio;
1172 struct blk_zone_report_hdr *hdr = NULL;
1173 struct blk_zone *zone;
1174 unsigned int nr_rep = 0;
1175 unsigned int ofst;
1176 sector_t part_offset;
1177 struct bio_vec bvec;
1178 struct bvec_iter iter;
1179 void *addr;
1180
1181 if (bio->bi_status)
1182 return;
1183
1184
1185
1186
1187
1188
1189
1190
1191 part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1192
 /*
  * Remap the start sector of the reported zones.  For sequential zones,
  * also remap the write pointer position.
  */
1197 bio_for_each_segment(bvec, report_bio, iter) {
1198 addr = kmap_atomic(bvec.bv_page);
1199
1200
1201 if (!hdr) {
1202 hdr = addr;
1203 ofst = sizeof(struct blk_zone_report_hdr);
1204 } else
1205 ofst = 0;
1206
1207
1208 while (hdr->nr_zones && ofst < bvec.bv_len) {
1209 zone = addr + ofst;
1210 zone->start -= part_offset;
1211 if (zone->start >= start + ti->len) {
1212 hdr->nr_zones = 0;
1213 break;
1214 }
1215 zone->start = zone->start + ti->begin - start;
1216 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1217 if (zone->cond == BLK_ZONE_COND_FULL)
1218 zone->wp = zone->start + zone->len;
1219 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1220 zone->wp = zone->start;
1221 else
1222 zone->wp = zone->wp + ti->begin - start - part_offset;
1223 }
1224 ofst += sizeof(struct blk_zone);
1225 hdr->nr_zones--;
1226 nr_rep++;
1227 }
1228
1229 if (addr != hdr)
1230 kunmap_atomic(addr);
1231
1232 if (!hdr->nr_zones)
1233 break;
1234 }
1235
1236 if (hdr) {
1237 hdr->nr_zones = nr_rep;
1238 kunmap_atomic(hdr);
1239 }
1240
1241 bio_advance(report_bio, report_bio->bi_iter.bi_size);
1242
1243#else
1244 bio->bi_status = BLK_STS_NOTSUPP;
1245#endif
1246}
1247EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1248
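/*
 * Hand a clone over to its target's ->map method.  DM_MAPIO_REMAPPED means
 * the core must dispatch the clone itself (directly for NVMe bio-based
 * devices, via generic_make_request() otherwise); DM_MAPIO_SUBMITTED means
 * the target has taken ownership of the clone; KILL and REQUEUE complete the
 * original I/O with an error or a requeue status respectively.
 */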
1249static blk_qc_t __map_bio(struct dm_target_io *tio)
1250{
1251 int r;
1252 sector_t sector;
1253 struct bio *clone = &tio->clone;
1254 struct dm_io *io = tio->io;
1255 struct mapped_device *md = io->md;
1256 struct dm_target *ti = tio->ti;
1257 blk_qc_t ret = BLK_QC_T_NONE;
1258
1259 clone->bi_end_io = clone_endio;
1260
1261
1262
1263
1264
1265
1266 atomic_inc(&io->io_count);
1267 sector = clone->bi_iter.bi_sector;
1268
1269 r = ti->type->map(ti, clone);
1270 switch (r) {
1271 case DM_MAPIO_SUBMITTED:
1272 break;
1273 case DM_MAPIO_REMAPPED:
 /* the bio has been remapped so dispatch it */
1275 trace_block_bio_remap(clone->bi_disk->queue, clone,
1276 bio_dev(io->orig_bio), sector);
1277 if (md->type == DM_TYPE_NVME_BIO_BASED)
1278 ret = direct_make_request(clone);
1279 else
1280 ret = generic_make_request(clone);
1281 break;
1282 case DM_MAPIO_KILL:
1283 free_tio(tio);
1284 dec_pending(io, BLK_STS_IOERR);
1285 break;
1286 case DM_MAPIO_REQUEUE:
1287 free_tio(tio);
1288 dec_pending(io, BLK_STS_DM_REQUEUE);
1289 break;
1290 default:
1291 DMWARN("unimplemented target map return value: %d", r);
1292 BUG();
1293 }
1294
1295 return ret;
1296}
1297
1298static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1299{
1300 bio->bi_iter.bi_sector = sector;
1301 bio->bi_iter.bi_size = to_bytes(len);
1302}
1303
/*
 * Creates a bio that consists of range of complete bvecs.
 */
1307static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1308 sector_t sector, unsigned len)
1309{
1310 struct bio *clone = &tio->clone;
1311
1312 __bio_clone_fast(clone, bio);
1313
1314 if (unlikely(bio_integrity(bio) != NULL)) {
1315 int r;
1316
1317 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1318 !dm_target_passes_integrity(tio->ti->type))) {
1319 DMWARN("%s: the target %s doesn't support integrity data.",
1320 dm_device_name(tio->io->md),
1321 tio->ti->type->name);
1322 return -EIO;
1323 }
1324
1325 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1326 if (r < 0)
1327 return r;
1328 }
1329
1330 if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1331 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1332 clone->bi_iter.bi_size = to_bytes(len);
1333
1334 if (unlikely(bio_integrity(bio) != NULL))
1335 bio_integrity_trim(clone);
1336
1337 return 0;
1338}
1339
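/*
 * Allocate num_bios tios/clones for a target.  A first pass uses GFP_NOWAIT;
 * if that cannot provide all of them, everything allocated so far is freed
 * and a second pass retries with GFP_NOIO while holding
 * md->table_devices_lock.
 */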
1340static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1341 struct dm_target *ti, unsigned num_bios)
1342{
1343 struct dm_target_io *tio;
1344 int try;
1345
1346 if (!num_bios)
1347 return;
1348
1349 if (num_bios == 1) {
1350 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1351 bio_list_add(blist, &tio->clone);
1352 return;
1353 }
1354
1355 for (try = 0; try < 2; try++) {
1356 int bio_nr;
1357 struct bio *bio;
1358
1359 if (try)
1360 mutex_lock(&ci->io->md->table_devices_lock);
1361 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1362 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1363 if (!tio)
1364 break;
1365
1366 bio_list_add(blist, &tio->clone);
1367 }
1368 if (try)
1369 mutex_unlock(&ci->io->md->table_devices_lock);
1370 if (bio_nr == num_bios)
1371 return;
1372
1373 while ((bio = bio_list_pop(blist))) {
1374 tio = container_of(bio, struct dm_target_io, clone);
1375 free_tio(tio);
1376 }
1377 }
1378}
1379
1380static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1381 struct dm_target_io *tio, unsigned *len)
1382{
1383 struct bio *clone = &tio->clone;
1384
1385 tio->len_ptr = len;
1386
1387 __bio_clone_fast(clone, ci->bio);
1388 if (len)
1389 bio_setup_sector(clone, ci->sector, *len);
1390
1391 return __map_bio(tio);
1392}
1393
1394static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1395 unsigned num_bios, unsigned *len)
1396{
1397 struct bio_list blist = BIO_EMPTY_LIST;
1398 struct bio *bio;
1399 struct dm_target_io *tio;
1400
1401 alloc_multiple_bios(&blist, ci, ti, num_bios);
1402
1403 while ((bio = bio_list_pop(&blist))) {
1404 tio = container_of(bio, struct dm_target_io, clone);
1405 (void) __clone_and_map_simple_bio(ci, tio, len);
1406 }
1407}
1408
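/*
 * An empty flush carries no data and is not split: it is cloned to every
 * target in the table, ti->num_flush_bios times each.
 */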
1409static int __send_empty_flush(struct clone_info *ci)
1410{
1411 unsigned target_nr = 0;
1412 struct dm_target *ti;
1413
1414 BUG_ON(bio_has_data(ci->bio));
1415 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1416 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1417
1418 return 0;
1419}
1420
1421static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1422 sector_t sector, unsigned *len)
1423{
1424 struct bio *bio = ci->bio;
1425 struct dm_target_io *tio;
1426 int r;
1427
1428 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1429 tio->len_ptr = len;
1430 r = clone_bio(tio, bio, sector, *len);
1431 if (r < 0) {
1432 free_tio(tio);
1433 return r;
1434 }
1435 (void) __map_bio(tio);
1436
1437 return 0;
1438}
1439
1440typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1441
1442static unsigned get_num_discard_bios(struct dm_target *ti)
1443{
1444 return ti->num_discard_bios;
1445}
1446
1447static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1448{
1449 return ti->num_secure_erase_bios;
1450}
1451
1452static unsigned get_num_write_same_bios(struct dm_target *ti)
1453{
1454 return ti->num_write_same_bios;
1455}
1456
1457static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1458{
1459 return ti->num_write_zeroes_bios;
1460}
1461
1462typedef bool (*is_split_required_fn)(struct dm_target *ti);
1463
1464static bool is_split_required_for_discard(struct dm_target *ti)
1465{
1466 return ti->split_discard_bios;
1467}
1468
1469static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1470 get_num_bios_fn get_num_bios,
1471 is_split_required_fn is_split_required)
1472{
1473 unsigned len;
1474 unsigned num_bios;
1475
 /*
  * Even though the device advertised support for this type of
  * request, that does not mean every target supports it, and
  * reconfiguration might also have changed that since the
  * check was performed.
  */
1482 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1483 if (!num_bios)
1484 return -EOPNOTSUPP;
1485
1486 if (is_split_required && !is_split_required(ti))
1487 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1488 else
1489 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1490
1491 __send_duplicate_bios(ci, ti, num_bios, &len);
1492
1493 ci->sector += len;
1494 ci->sector_count -= len;
1495
1496 return 0;
1497}
1498
1499static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1500{
1501 return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1502 is_split_required_for_discard);
1503}
1504
1505static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1506{
1507 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1508}
1509
1510static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1511{
1512 return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1513}
1514
1515static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1516{
1517 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1518}
1519
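/*
 * Discard, secure erase, write same and write zeroes bios do not go through
 * normal cloning; they are duplicated per target according to the target's
 * num_*_bios counters.  Returns false for ordinary I/O so the caller can
 * fall through to the regular mapping path.
 */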
1520static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1521 int *result)
1522{
1523 struct bio *bio = ci->bio;
1524
1525 if (bio_op(bio) == REQ_OP_DISCARD)
1526 *result = __send_discard(ci, ti);
1527 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1528 *result = __send_secure_erase(ci, ti);
1529 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1530 *result = __send_write_same(ci, ti);
1531 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1532 *result = __send_write_zeroes(ci, ti);
1533 else
1534 return false;
1535
1536 return true;
1537}
1538
/*
 * Select the target for the bio and map as much of it as the target allows.
 */
1542static int __split_and_process_non_flush(struct clone_info *ci)
1543{
1544 struct bio *bio = ci->bio;
1545 struct dm_target *ti;
1546 unsigned len;
1547 int r;
1548
1549 ti = dm_table_find_target(ci->map, ci->sector);
1550 if (!dm_target_is_valid(ti))
1551 return -EIO;
1552
1553 if (unlikely(__process_abnormal_io(ci, ti, &r)))
1554 return r;
1555
1556 if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1557 len = ci->sector_count;
1558 else
1559 len = min_t(sector_t, max_io_len(ci->sector, ti),
1560 ci->sector_count);
1561
1562 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1563 if (r < 0)
1564 return r;
1565
1566 ci->sector += len;
1567 ci->sector_count -= len;
1568
1569 return 0;
1570}
1571
1572static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1573 struct dm_table *map, struct bio *bio)
1574{
1575 ci->map = map;
1576 ci->io = alloc_io(md, bio);
1577 ci->sector = bio->bi_iter.bi_sector;
1578}
1579
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1583static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1584 struct dm_table *map, struct bio *bio)
1585{
1586 struct clone_info ci;
1587 blk_qc_t ret = BLK_QC_T_NONE;
1588 int error = 0;
1589
1590 if (unlikely(!map)) {
1591 bio_io_error(bio);
1592 return ret;
1593 }
1594
1595 init_clone_info(&ci, md, map, bio);
1596
1597 if (bio->bi_opf & REQ_PREFLUSH) {
1598 ci.bio = &ci.io->md->flush_bio;
1599 ci.sector_count = 0;
1600 error = __send_empty_flush(&ci);
1601
1602 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1603 ci.bio = bio;
1604 ci.sector_count = 0;
1605 error = __split_and_process_non_flush(&ci);
1606 } else {
1607 ci.bio = bio;
1608 ci.sector_count = bio_sectors(bio);
1609 while (ci.sector_count && !error) {
1610 error = __split_and_process_non_flush(&ci);
1611 if (current->bio_list && ci.sector_count && !error) {
 /*
  * Remainder must be passed to generic_make_request()
  * so that it gets handled *after* bios already submitted
  * have been completely processed.
  * We take a clone of the original to store in
  * ci.io->orig_bio to be used by end_io_acct() and
  * for dec_pending to use for completion handling.
  */
1623 struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1624 GFP_NOIO, &md->queue->bio_split);
1625 ci.io->orig_bio = b;
1626 bio_chain(b, bio);
1627 ret = generic_make_request(bio);
1628 break;
1629 }
1630 }
1631 }
1632
1633
1634 dec_pending(ci.io, errno_to_blk_status(error));
1635 return ret;
1636}
1637
/*
 * Optimized variant of __split_and_process_bio that leverages the
 * fact that targets that use it do _not_ have a need to split bios.
 */
1642static blk_qc_t __process_bio(struct mapped_device *md,
1643 struct dm_table *map, struct bio *bio)
1644{
1645 struct clone_info ci;
1646 blk_qc_t ret = BLK_QC_T_NONE;
1647 int error = 0;
1648
1649 if (unlikely(!map)) {
1650 bio_io_error(bio);
1651 return ret;
1652 }
1653
1654 init_clone_info(&ci, md, map, bio);
1655
1656 if (bio->bi_opf & REQ_PREFLUSH) {
1657 ci.bio = &ci.io->md->flush_bio;
1658 ci.sector_count = 0;
1659 error = __send_empty_flush(&ci);
1660
1661 } else {
1662 struct dm_target *ti = md->immutable_target;
1663 struct dm_target_io *tio;
1664
1665
1666
1667
1668
1669 if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1670 error = -EIO;
1671 goto out;
1672 }
1673
1674 ci.bio = bio;
1675 ci.sector_count = bio_sectors(bio);
1676 if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1677 goto out;
1678
1679 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1680 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1681 }
1682out:
1683
1684 dec_pending(ci.io, errno_to_blk_status(error));
1685 return ret;
1686}
1687
1688typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1689
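/*
 * Common entry point for bio submission.  While the device is suspended
 * (DMF_BLOCK_IO_FOR_SUSPEND), readahead bios are failed and everything else
 * is deferred for dm_wq_work() to resubmit after resume; otherwise the bio
 * is handed to the supplied process_bio helper under the live-table SRCU
 * read lock.
 */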
1690static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1691 process_bio_fn process_bio)
1692{
1693 struct mapped_device *md = q->queuedata;
1694 blk_qc_t ret = BLK_QC_T_NONE;
1695 int srcu_idx;
1696 struct dm_table *map;
1697
1698 map = dm_get_live_table(md, &srcu_idx);
1699
 /* if we're suspended, we have to queue this io for later */
1701 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1702 dm_put_live_table(md, srcu_idx);
1703
1704 if (!(bio->bi_opf & REQ_RAHEAD))
1705 queue_io(md, bio);
1706 else
1707 bio_io_error(bio);
1708 return ret;
1709 }
1710
1711 ret = process_bio(md, map, bio);
1712
1713 dm_put_live_table(md, srcu_idx);
1714 return ret;
1715}
1716
/*
 * The request function that remaps the bio to one target and splits off
 * any needed remainder.
 */
1721static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1722{
1723 return __dm_make_request(q, bio, __split_and_process_bio);
1724}
1725
1726static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1727{
1728 return __dm_make_request(q, bio, __process_bio);
1729}
1730
1731static int dm_any_congested(void *congested_data, int bdi_bits)
1732{
1733 int r = bdi_bits;
1734 struct mapped_device *md = congested_data;
1735 struct dm_table *map;
1736
1737 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1738 if (dm_request_based(md)) {
 /*
  * With request-based DM we only need to check the
  * top-level queue for congestion.
  */
1743 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1744 } else {
1745 map = dm_get_live_table_fast(md);
1746 if (map)
1747 r = dm_table_any_congested(map, bdi_bits);
1748 dm_put_live_table_fast(md);
1749 }
1750 }
1751
1752 return r;
1753}
1754
/*
 * An IDR is used to keep track of allocated minor numbers.
 */
1758static void free_minor(int minor)
1759{
1760 spin_lock(&_minor_lock);
1761 idr_remove(&_minor_idr, minor);
1762 spin_unlock(&_minor_lock);
1763}
1764
/*
 * See if the device with a specific minor # is free.
 */
1768static int specific_minor(int minor)
1769{
1770 int r;
1771
1772 if (minor >= (1 << MINORBITS))
1773 return -EINVAL;
1774
1775 idr_preload(GFP_KERNEL);
1776 spin_lock(&_minor_lock);
1777
1778 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1779
1780 spin_unlock(&_minor_lock);
1781 idr_preload_end();
1782 if (r < 0)
1783 return r == -ENOSPC ? -EBUSY : r;
1784 return 0;
1785}
1786
1787static int next_free_minor(int *minor)
1788{
1789 int r;
1790
1791 idr_preload(GFP_KERNEL);
1792 spin_lock(&_minor_lock);
1793
1794 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1795
1796 spin_unlock(&_minor_lock);
1797 idr_preload_end();
1798 if (r < 0)
1799 return r;
1800 *minor = r;
1801 return 0;
1802}
1803
1804static const struct block_device_operations dm_blk_dops;
1805static const struct dax_operations dm_dax_ops;
1806
1807static void dm_wq_work(struct work_struct *work);
1808
1809static void dm_init_normal_md_queue(struct mapped_device *md)
1810{
1811 md->use_blk_mq = false;
1812
 /*
  * Initialize aspects of queue that aren't relevant for blk-mq
  */
1816 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1817}
1818
1819static void cleanup_mapped_device(struct mapped_device *md)
1820{
1821 if (md->wq)
1822 destroy_workqueue(md->wq);
1823 if (md->kworker_task)
1824 kthread_stop(md->kworker_task);
1825 bioset_exit(&md->bs);
1826 bioset_exit(&md->io_bs);
1827
1828 if (md->dax_dev) {
1829 kill_dax(md->dax_dev);
1830 put_dax(md->dax_dev);
1831 md->dax_dev = NULL;
1832 }
1833
1834 if (md->disk) {
1835 spin_lock(&_minor_lock);
1836 md->disk->private_data = NULL;
1837 spin_unlock(&_minor_lock);
1838 del_gendisk(md->disk);
1839 put_disk(md->disk);
1840 }
1841
1842 if (md->queue)
1843 blk_cleanup_queue(md->queue);
1844
1845 cleanup_srcu_struct(&md->io_barrier);
1846
1847 if (md->bdev) {
1848 bdput(md->bdev);
1849 md->bdev = NULL;
1850 }
1851
1852 mutex_destroy(&md->suspend_lock);
1853 mutex_destroy(&md->type_lock);
1854 mutex_destroy(&md->table_devices_lock);
1855
1856 dm_mq_cleanup_mapped_device(md);
1857}
1858
/*
 * Allocate and initialise a blank device with a given minor.
 */
1862static struct mapped_device *alloc_dev(int minor)
1863{
1864 int r, numa_node_id = dm_get_numa_node();
1865 struct dax_device *dax_dev = NULL;
1866 struct mapped_device *md;
1867 void *old_md;
1868
1869 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1870 if (!md) {
1871 DMWARN("unable to allocate device, out of memory.");
1872 return NULL;
1873 }
1874
1875 if (!try_module_get(THIS_MODULE))
1876 goto bad_module_get;
1877
1878
1879 if (minor == DM_ANY_MINOR)
1880 r = next_free_minor(&minor);
1881 else
1882 r = specific_minor(minor);
1883 if (r < 0)
1884 goto bad_minor;
1885
1886 r = init_srcu_struct(&md->io_barrier);
1887 if (r < 0)
1888 goto bad_io_barrier;
1889
1890 md->numa_node_id = numa_node_id;
1891 md->use_blk_mq = dm_use_blk_mq_default();
1892 md->init_tio_pdu = false;
1893 md->type = DM_TYPE_NONE;
1894 mutex_init(&md->suspend_lock);
1895 mutex_init(&md->type_lock);
1896 mutex_init(&md->table_devices_lock);
1897 spin_lock_init(&md->deferred_lock);
1898 atomic_set(&md->holders, 1);
1899 atomic_set(&md->open_count, 0);
1900 atomic_set(&md->event_nr, 0);
1901 atomic_set(&md->uevent_seq, 0);
1902 INIT_LIST_HEAD(&md->uevent_list);
1903 INIT_LIST_HEAD(&md->table_devices);
1904 spin_lock_init(&md->uevent_lock);
1905
1906 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
1907 if (!md->queue)
1908 goto bad;
1909 md->queue->queuedata = md;
1910 md->queue->backing_dev_info->congested_data = md;
1911
1912 md->disk = alloc_disk_node(1, md->numa_node_id);
1913 if (!md->disk)
1914 goto bad;
1915
1916 atomic_set(&md->pending[0], 0);
1917 atomic_set(&md->pending[1], 0);
1918 init_waitqueue_head(&md->wait);
1919 INIT_WORK(&md->work, dm_wq_work);
1920 init_waitqueue_head(&md->eventq);
1921 init_completion(&md->kobj_holder.completion);
1922 md->kworker_task = NULL;
1923
1924 md->disk->major = _major;
1925 md->disk->first_minor = minor;
1926 md->disk->fops = &dm_blk_dops;
1927 md->disk->queue = md->queue;
1928 md->disk->private_data = md;
1929 sprintf(md->disk->disk_name, "dm-%d", minor);
1930
1931 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1932 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1933 if (!dax_dev)
1934 goto bad;
1935 }
1936 md->dax_dev = dax_dev;
1937
1938 add_disk_no_queue_reg(md->disk);
1939 format_dev_t(md->name, MKDEV(_major, minor));
1940
1941 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1942 if (!md->wq)
1943 goto bad;
1944
1945 md->bdev = bdget_disk(md->disk, 0);
1946 if (!md->bdev)
1947 goto bad;
1948
1949 bio_init(&md->flush_bio, NULL, 0);
1950 bio_set_dev(&md->flush_bio, md->bdev);
1951 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1952
1953 dm_stats_init(&md->stats);
1954
1955
1956 spin_lock(&_minor_lock);
1957 old_md = idr_replace(&_minor_idr, md, minor);
1958 spin_unlock(&_minor_lock);
1959
1960 BUG_ON(old_md != MINOR_ALLOCED);
1961
1962 return md;
1963
1964bad:
1965 cleanup_mapped_device(md);
1966bad_io_barrier:
1967 free_minor(minor);
1968bad_minor:
1969 module_put(THIS_MODULE);
1970bad_module_get:
1971 kvfree(md);
1972 return NULL;
1973}
1974
1975static void unlock_fs(struct mapped_device *md);
1976
1977static void free_dev(struct mapped_device *md)
1978{
1979 int minor = MINOR(disk_devt(md->disk));
1980
1981 unlock_fs(md);
1982
1983 cleanup_mapped_device(md);
1984
1985 free_table_devices(&md->table_devices);
1986 dm_stats_cleanup(&md->stats);
1987 free_minor(minor);
1988
1989 module_put(THIS_MODULE);
1990 kvfree(md);
1991}
1992
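/*
 * Adopt the biosets that were pre-allocated for the inactive table into the
 * mapped_device.  Bio-based tables replace md's biosets because the required
 * front_pad may have changed; request-based tables keep what is already
 * there.
 */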
1993static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1994{
1995 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1996 int ret = 0;
1997
1998 if (dm_table_bio_based(t)) {
 /*
  * The md may already have mempools that need changing.
  * If so, reload bioset because front_pad may have changed
  * because a different table was loaded.
  */
2004 bioset_exit(&md->bs);
2005 bioset_exit(&md->io_bs);
2006
2007 } else if (bioset_initialized(&md->bs)) {
 /*
  * There's no need to reload with request-based dm because the
  * size of front_pad doesn't change.
  *
  * Note for future: if you are to reload bioset, prep-ed
  * requests in the queue may refer to bio from the old bioset,
  * so you must walk through the queue to unprep.
  */
2016 goto out;
2017 }
2018
2019 BUG_ON(!p ||
2020 bioset_initialized(&md->bs) ||
2021 bioset_initialized(&md->io_bs));
2022
2023 ret = bioset_init_from_src(&md->bs, &p->bs);
2024 if (ret)
2025 goto out;
2026 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2027 if (ret)
2028 bioset_exit(&md->bs);
2029out:
2030
2031 dm_table_free_md_mempools(t);
2032 return ret;
2033}
2034
/*
 * Bind a table to the device.
 */
2038static void event_callback(void *context)
2039{
2040 unsigned long flags;
2041 LIST_HEAD(uevents);
2042 struct mapped_device *md = (struct mapped_device *) context;
2043
2044 spin_lock_irqsave(&md->uevent_lock, flags);
2045 list_splice_init(&md->uevent_list, &uevents);
2046 spin_unlock_irqrestore(&md->uevent_lock, flags);
2047
2048 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2049
2050 atomic_inc(&md->event_nr);
2051 wake_up(&md->eventq);
2052 dm_issue_global_event();
2053}
2054
2055
2056
2057
2058static void __set_size(struct mapped_device *md, sector_t size)
2059{
2060 lockdep_assert_held(&md->suspend_lock);
2061
2062 set_capacity(md->disk, size);
2063
2064 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2065}
2066
/*
 * Returns old map, which caller must destroy.
 */
2070static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2071 struct queue_limits *limits)
2072{
2073 struct dm_table *old_map;
2074 struct request_queue *q = md->queue;
2075 bool request_based = dm_table_request_based(t);
2076 sector_t size;
2077 int ret;
2078
2079 lockdep_assert_held(&md->suspend_lock);
2080
2081 size = dm_table_get_size(t);
2082
 /*
  * Wipe any geometry if the size of the table changed.
  */
2086 if (size != dm_get_size(md))
2087 memset(&md->geometry, 0, sizeof(md->geometry));
2088
2089 __set_size(md, size);
2090
2091 dm_table_event_callback(t, event_callback, md);
2092
 /*
  * The queue hasn't been stopped yet, if the old table type wasn't
  * for request-based during suspension.  So stop it to prevent
  * I/O mapping before resume.
  * This must be done before setting the queue restrictions,
  * because request-based dm may be run just after the setting.
  */
2100 if (request_based)
2101 dm_stop_queue(q);
2102
2103 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
 /*
  * Leverage the fact that request-based DM targets and
  * NVMe bio-based targets are immutable singletons - used to
  * optimize both dm_mq_queue_rq and __process_bio.
  */
2110 md->immutable_target = dm_table_get_immutable_target(t);
2111 }
2112
2113 ret = __bind_mempools(md, t);
2114 if (ret) {
2115 old_map = ERR_PTR(ret);
2116 goto out;
2117 }
2118
2119 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2120 rcu_assign_pointer(md->map, (void *)t);
2121 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2122
2123 dm_table_set_restrictions(t, q, limits);
2124 if (old_map)
2125 dm_sync_table(md);
2126
2127out:
2128 return old_map;
2129}
2130
/*
 * Returns unbound table for the caller to free.
 */
2134static struct dm_table *__unbind(struct mapped_device *md)
2135{
2136 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2137
2138 if (!map)
2139 return NULL;
2140
2141 dm_table_event_callback(map, NULL, NULL);
2142 RCU_INIT_POINTER(md->map, NULL);
2143 dm_sync_table(md);
2144
2145 return map;
2146}
2147
/*
 * Constructor for a new device.
 */
2151int dm_create(int minor, struct mapped_device **result)
2152{
2153 int r;
2154 struct mapped_device *md;
2155
2156 md = alloc_dev(minor);
2157 if (!md)
2158 return -ENXIO;
2159
2160 r = dm_sysfs_init(md);
2161 if (r) {
2162 free_dev(md);
2163 return r;
2164 }
2165
2166 *result = md;
2167 return 0;
2168}
2169
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
2174void dm_lock_md_type(struct mapped_device *md)
2175{
2176 mutex_lock(&md->type_lock);
2177}
2178
2179void dm_unlock_md_type(struct mapped_device *md)
2180{
2181 mutex_unlock(&md->type_lock);
2182}
2183
2184void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2185{
2186 BUG_ON(!mutex_is_locked(&md->type_lock));
2187 md->type = type;
2188}
2189
2190enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2191{
2192 return md->type;
2193}
2194
2195struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2196{
2197 return md->immutable_target_type;
2198}
2199
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2204struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2205{
2206 BUG_ON(!atomic_read(&md->holders));
2207 return &md->queue->limits;
2208}
2209EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2210
/*
 * Setup the DM device's queue based on md's type.
 */
2214int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2215{
2216 int r;
2217 struct queue_limits limits;
2218 enum dm_queue_mode type = dm_get_md_type(md);
2219
2220 switch (type) {
2221 case DM_TYPE_REQUEST_BASED:
2222 dm_init_normal_md_queue(md);
2223 r = dm_old_init_request_queue(md, t);
2224 if (r) {
2225 DMERR("Cannot initialize queue for request-based mapped device");
2226 return r;
2227 }
2228 break;
2229 case DM_TYPE_MQ_REQUEST_BASED:
2230 r = dm_mq_init_request_queue(md, t);
2231 if (r) {
2232 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2233 return r;
2234 }
2235 break;
2236 case DM_TYPE_BIO_BASED:
2237 case DM_TYPE_DAX_BIO_BASED:
2238 dm_init_normal_md_queue(md);
2239 blk_queue_make_request(md->queue, dm_make_request);
2240 break;
2241 case DM_TYPE_NVME_BIO_BASED:
2242 dm_init_normal_md_queue(md);
2243 blk_queue_make_request(md->queue, dm_make_request_nvme);
2244 break;
2245 case DM_TYPE_NONE:
2246 WARN_ON_ONCE(true);
2247 break;
2248 }
2249
2250 r = dm_calculate_queue_limits(t, &limits);
2251 if (r) {
2252 DMERR("Cannot calculate initial queue limits");
2253 return r;
2254 }
2255 dm_table_set_restrictions(t, md->queue, &limits);
2256 blk_register_queue(md->disk);
2257
2258 return 0;
2259}
2260
2261struct mapped_device *dm_get_md(dev_t dev)
2262{
2263 struct mapped_device *md;
2264 unsigned minor = MINOR(dev);
2265
2266 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2267 return NULL;
2268
2269 spin_lock(&_minor_lock);
2270
2271 md = idr_find(&_minor_idr, minor);
2272 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2273 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2274 md = NULL;
2275 goto out;
2276 }
2277 dm_get(md);
2278out:
2279 spin_unlock(&_minor_lock);
2280
2281 return md;
2282}
2283EXPORT_SYMBOL_GPL(dm_get_md);
2284
2285void *dm_get_mdptr(struct mapped_device *md)
2286{
2287 return md->interface_ptr;
2288}
2289
2290void dm_set_mdptr(struct mapped_device *md, void *ptr)
2291{
2292 md->interface_ptr = ptr;
2293}
2294
2295void dm_get(struct mapped_device *md)
2296{
2297 atomic_inc(&md->holders);
2298 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2299}
2300
2301int dm_hold(struct mapped_device *md)
2302{
2303 spin_lock(&_minor_lock);
2304 if (test_bit(DMF_FREEING, &md->flags)) {
2305 spin_unlock(&_minor_lock);
2306 return -EBUSY;
2307 }
2308 dm_get(md);
2309 spin_unlock(&_minor_lock);
2310 return 0;
2311}
2312EXPORT_SYMBOL_GPL(dm_hold);
2313
2314const char *dm_device_name(struct mapped_device *md)
2315{
2316 return md->name;
2317}
2318EXPORT_SYMBOL_GPL(dm_device_name);
2319
2320static void __dm_destroy(struct mapped_device *md, bool wait)
2321{
2322 struct dm_table *map;
2323 int srcu_idx;
2324
2325 might_sleep();
2326
2327 spin_lock(&_minor_lock);
2328 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2329 set_bit(DMF_FREEING, &md->flags);
2330 spin_unlock(&_minor_lock);
2331
2332 blk_set_queue_dying(md->queue);
2333
2334 if (dm_request_based(md) && md->kworker_task)
2335 kthread_flush_worker(&md->kworker);
2336
 /*
  * Take suspend_lock so that presuspend and postsuspend methods
  * do not race with internal suspend.
  */
2341 mutex_lock(&md->suspend_lock);
2342 map = dm_get_live_table(md, &srcu_idx);
2343 if (!dm_suspended_md(md)) {
2344 dm_table_presuspend_targets(map);
2345 dm_table_postsuspend_targets(map);
2346 }
2347
2348 dm_put_live_table(md, srcu_idx);
2349 mutex_unlock(&md->suspend_lock);
2350
 /*
  * Rare, but there may be I/O requests still going to complete,
  * for example.  Drain them.  Wait for all references to disappear.
  * No one should increment the reference count of the mapped_device,
  * after the mapped_device state becomes DMF_FREEING.
  */
2357 if (wait)
2358 while (atomic_read(&md->holders))
2359 msleep(1);
2360 else if (atomic_read(&md->holders))
2361 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2362 dm_device_name(md), atomic_read(&md->holders));
2363
2364 dm_sysfs_exit(md);
2365 dm_table_destroy(__unbind(md));
2366 free_dev(md);
2367}
2368
2369void dm_destroy(struct mapped_device *md)
2370{
2371 __dm_destroy(md, true);
2372}
2373
2374void dm_destroy_immediate(struct mapped_device *md)
2375{
2376 __dm_destroy(md, false);
2377}
2378
2379void dm_put(struct mapped_device *md)
2380{
2381 atomic_dec(&md->holders);
2382}
2383EXPORT_SYMBOL_GPL(dm_put);
2384
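/*
 * Wait until there is no I/O in flight (md->pending[] drops to zero), or
 * until a signal arrives when task_state is interruptible, in which case
 * -EINTR is returned.
 */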
2385static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2386{
2387 int r = 0;
2388 DEFINE_WAIT(wait);
2389
2390 while (1) {
2391 prepare_to_wait(&md->wait, &wait, task_state);
2392
2393 if (!md_in_flight(md))
2394 break;
2395
2396 if (signal_pending_state(task_state, current)) {
2397 r = -EINTR;
2398 break;
2399 }
2400
2401 io_schedule();
2402 }
2403 finish_wait(&md->wait, &wait);
2404
2405 return r;
2406}
2407
/*
 * Process the deferred bios.
 */
2411static void dm_wq_work(struct work_struct *work)
2412{
2413 struct mapped_device *md = container_of(work, struct mapped_device,
2414 work);
2415 struct bio *c;
2416 int srcu_idx;
2417 struct dm_table *map;
2418
2419 map = dm_get_live_table(md, &srcu_idx);
2420
2421 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2422 spin_lock_irq(&md->deferred_lock);
2423 c = bio_list_pop(&md->deferred);
2424 spin_unlock_irq(&md->deferred_lock);
2425
2426 if (!c)
2427 break;
2428
2429 if (dm_request_based(md))
2430 generic_make_request(c);
2431 else
2432 __split_and_process_bio(md, map, c);
2433 }
2434
2435 dm_put_live_table(md, srcu_idx);
2436}
2437
2438static void dm_queue_flush(struct mapped_device *md)
2439{
2440 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2441 smp_mb__after_atomic();
2442 queue_work(md->wq, &md->work);
2443}
2444
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2448struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2449{
2450 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2451 struct queue_limits limits;
2452 int r;
2453
2454 mutex_lock(&md->suspend_lock);
2455
 /* device must be suspended */
2457 if (!dm_suspended_md(md))
2458 goto out;
2459
 /*
  * If the new table has no data devices, retain the existing limits.
  * This helps multipath with queue_if_no_path if all paths disappear,
  * then new I/O is queued based on these limits, and then some paths
  * reappear.
  */
2466 if (dm_table_has_no_data_devices(table)) {
2467 live_map = dm_get_live_table_fast(md);
2468 if (live_map)
2469 limits = md->queue->limits;
2470 dm_put_live_table_fast(md);
2471 }
2472
2473 if (!live_map) {
2474 r = dm_calculate_queue_limits(table, &limits);
2475 if (r) {
2476 map = ERR_PTR(r);
2477 goto out;
2478 }
2479 }
2480
2481 map = __bind(md, table, &limits);
2482 dm_issue_global_event();
2483
2484out:
2485 mutex_unlock(&md->suspend_lock);
2486 return map;
2487}
2488
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
2493static int lock_fs(struct mapped_device *md)
2494{
2495 int r;
2496
2497 WARN_ON(md->frozen_sb);
2498
2499 md->frozen_sb = freeze_bdev(md->bdev);
2500 if (IS_ERR(md->frozen_sb)) {
2501 r = PTR_ERR(md->frozen_sb);
2502 md->frozen_sb = NULL;
2503 return r;
2504 }
2505
2506 set_bit(DMF_FROZEN, &md->flags);
2507
2508 return 0;
2509}
2510
2511static void unlock_fs(struct mapped_device *md)
2512{
2513 if (!test_bit(DMF_FROZEN, &md->flags))
2514 return;
2515
2516 thaw_bdev(md->bdev, md->frozen_sb);
2517 md->frozen_sb = NULL;
2518 clear_bit(DMF_FROZEN, &md->flags);
2519}
2520
2521
2522
2523
2524
2525
2526
2527
2528
2529
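/*
 * Core suspend path. If this returns 0 the device is quiescent: no
 * request-processing activity remains and new bios are added to the
 * md->deferred list. Callers must hold md->suspend_lock.
 */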
2530static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2531 unsigned suspend_flags, long task_state,
2532 int dmf_suspended_flag)
2533{
2534 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2535 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2536 int r;
2537
2538 lockdep_assert_held(&md->suspend_lock);
2539
2540
2541
2542
2543
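 /*
  * DMF_NOFLUSH_SUSPENDING must be set before presuspend so targets can
  * check it; the flag is cleared again before this suspend attempt
  * returns.
  */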
2544 if (noflush)
2545 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2546 else
2547 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2548
2549
2550
2551
2552
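 /* Let the targets start quiescing; undone via presuspend_undo on error. */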
2553 dm_table_presuspend_targets(map);
2554
2555
2556
2557
2558
2559
2560
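 /*
  * Flush I/O to the device by freezing the filesystem, if requested.
  * noflush takes precedence over do_lockfs, since lock_fs() would flush
  * outstanding I/O.
  */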
2561 if (!noflush && do_lockfs) {
2562 r = lock_fs(md);
2563 if (r) {
2564 dm_table_presuspend_undo_targets(map);
2565 return r;
2566 }
2567 }
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
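 /*
  * Stop new bios from reaching the targets: set DMF_BLOCK_IO_FOR_SUSPEND
  * so the submission and workqueue paths defer instead of mapping, then
  * synchronize SRCU so anyone already inside the I/O path has finished
  * before we proceed.
  */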
2581 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2582 if (map)
2583 synchronize_srcu(&md->io_barrier);
2584
2585
2586
2587
2588
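 /*
  * Stop the request queue before flushing md->wq, in case request-based
  * dm defers work to md->wq from the queue.
  */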
2589 if (dm_request_based(md)) {
2590 dm_stop_queue(md->queue);
2591 if (md->kworker_task)
2592 kthread_flush_worker(&md->kworker);
2593 }
2594
2595 flush_workqueue(md->wq);
2596
2597
2598
2599
2600
2601
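 /*
  * No more I/O is entering the device now; wait for everything already
  * in flight to complete (interruptible if task_state allows it).
  */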
2602 r = dm_wait_for_completion(md, task_state);
2603 if (!r)
2604 set_bit(dmf_suspended_flag, &md->flags);
2605
2606 if (noflush)
2607 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2608 if (map)
2609 synchronize_srcu(&md->io_barrier);
2610
2611
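 /* The wait was interrupted or failed: undo the partial suspend. */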
2612 if (r < 0) {
2613 dm_queue_flush(md);
2614
2615 if (dm_request_based(md))
2616 dm_start_queue(md->queue);
2617
2618 unlock_fs(md);
2619 dm_table_presuspend_undo_targets(map);
2620
2621 }
2622
2623 return r;
2624}
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
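/*
 * Suspend the device so its mapping table can be swapped: flush any
 * in-flight bios and make sure further I/O is deferred until resume.
 *
 * suspend_flags:
 *   DM_SUSPEND_LOCKFS_FLAG  - freeze the filesystem on the device first
 *   DM_SUSPEND_NOFLUSH_FLAG - suspend without flushing outstanding I/O
 */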
2642int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2643{
2644 struct dm_table *map = NULL;
2645 int r = 0;
2646
2647retry:
2648 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2649
2650 if (dm_suspended_md(md)) {
2651 r = -EINVAL;
2652 goto out_unlock;
2653 }
2654
2655 if (dm_suspended_internally_md(md)) {
2656
2657 mutex_unlock(&md->suspend_lock);
2658 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2659 if (r)
2660 return r;
2661 goto retry;
2662 }
2663
2664 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2665
2666 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2667 if (r)
2668 goto out_unlock;
2669
2670 dm_table_postsuspend_targets(map);
2671
2672out_unlock:
2673 mutex_unlock(&md->suspend_lock);
2674 return r;
2675}
2676
2677static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2678{
2679 if (map) {
2680 int r = dm_table_resume_targets(map);
2681 if (r)
2682 return r;
2683 }
2684
2685 dm_queue_flush(md);
2686
2687
2688
2689
2690
2691
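 /*
  * Deferred bios have been requeued via dm_queue_flush() above; only now
  * is it safe to restart the request queue for request-based devices.
  */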
2692 if (dm_request_based(md))
2693 dm_start_queue(md->queue);
2694
2695 unlock_fs(md);
2696
2697 return 0;
2698}
2699
2700int dm_resume(struct mapped_device *md)
2701{
2702 int r;
2703 struct dm_table *map = NULL;
2704
2705retry:
2706 r = -EINVAL;
2707 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2708
2709 if (!dm_suspended_md(md))
2710 goto out;
2711
2712 if (dm_suspended_internally_md(md)) {
2713
2714 mutex_unlock(&md->suspend_lock);
2715 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2716 if (r)
2717 return r;
2718 goto retry;
2719 }
2720
2721 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2722 if (!map || !dm_table_get_size(map))
2723 goto out;
2724
2725 r = __dm_resume(md, map);
2726 if (r)
2727 goto out;
2728
2729 clear_bit(DMF_SUSPENDED, &md->flags);
2730out:
2731 mutex_unlock(&md->suspend_lock);
2732
2733 return r;
2734}
2735
2736
2737
2738
2739
2740
2741
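/*
 * Internal suspend/resume: used from within the kernel (e.g. by targets
 * that need to quiesce a device). It behaves like a userspace-driven
 * suspend but is reference counted, so nested internal suspends are
 * allowed.
 */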
2742static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2743{
2744 struct dm_table *map = NULL;
2745
2746 lockdep_assert_held(&md->suspend_lock);
2747
2748 if (md->internal_suspend_count++)
2749 return; /* nested internal suspend */
2750
2751 if (dm_suspended_md(md)) {
 /* already suspended externally: just flag the internal suspend */
2752 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2753 return; /* nest suspend */
2754 }
2755
2756 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2757
2758
2759
2760
2761
2762
2763
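 /*
  * Suspend with TASK_UNINTERRUPTIBLE: only the code path that initiated
  * the internal suspend can resume it, so there is no point letting
  * signals abort the wait. The return value is intentionally ignored.
  */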
2764 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2765 DMF_SUSPENDED_INTERNALLY);
2766
2767 dm_table_postsuspend_targets(map);
2768}
2769
2770static void __dm_internal_resume(struct mapped_device *md)
2771{
2772 BUG_ON(!md->internal_suspend_count);
2773
2774 if (--md->internal_suspend_count)
2775 return;
2776
2777 if (dm_suspended_md(md))
2778 goto done;
2779
2780
2781
2782
2783
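 /*
  * Resume with a NULL map: target resume hooks are not re-run here, only
  * deferred I/O is flushed and the queue restarted.
  */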
2784 (void) __dm_resume(md, NULL);
2785
2786done:
2787 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2788 smp_mb__after_atomic();
2789 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2790}
2791
2792void dm_internal_suspend_noflush(struct mapped_device *md)
2793{
2794 mutex_lock(&md->suspend_lock);
2795 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2796 mutex_unlock(&md->suspend_lock);
2797}
2798EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2799
2800void dm_internal_resume(struct mapped_device *md)
2801{
2802 mutex_lock(&md->suspend_lock);
2803 __dm_internal_resume(md);
2804 mutex_unlock(&md->suspend_lock);
2805}
2806EXPORT_SYMBOL_GPL(dm_internal_resume);
2807
2808
2809
2810
2811
2812
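/*
 * Fast variants of internal suspend/resume: dm_internal_suspend_fast()
 * takes md->suspend_lock and deliberately keeps holding it (even on the
 * early return) until dm_internal_resume_fast() releases it, preventing
 * a userspace-driven suspend from interfering in between.
 */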
2813void dm_internal_suspend_fast(struct mapped_device *md)
2814{
2815 mutex_lock(&md->suspend_lock);
2816 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2817 return;
2818
2819 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2820 synchronize_srcu(&md->io_barrier);
2821 flush_workqueue(md->wq);
2822 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2823}
2824EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2825
2826void dm_internal_resume_fast(struct mapped_device *md)
2827{
2828 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2829 goto done;
2830
2831 dm_queue_flush(md);
2832
2833done:
2834 mutex_unlock(&md->suspend_lock);
2835}
2836EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2837
2838
2839
2840
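/*
 * Event notification.
 */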
2841int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2842 unsigned cookie)
2843{
2844 char udev_cookie[DM_COOKIE_LENGTH];
2845 char *envp[] = { udev_cookie, NULL };
2846
2847 if (!cookie)
2848 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2849 else {
2850 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2851 DM_COOKIE_ENV_VAR_NAME, cookie);
2852 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2853 action, envp);
2854 }
2855}
2856
2857uint32_t dm_next_uevent_seq(struct mapped_device *md)
2858{
2859 return atomic_add_return(1, &md->uevent_seq);
2860}
2861
2862uint32_t dm_get_event_nr(struct mapped_device *md)
2863{
2864 return atomic_read(&md->event_nr);
2865}
2866
2867int dm_wait_event(struct mapped_device *md, int event_nr)
2868{
2869 return wait_event_interruptible(md->eventq,
2870 (event_nr != atomic_read(&md->event_nr)));
2871}
2872
2873void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2874{
2875 unsigned long flags;
2876
2877 spin_lock_irqsave(&md->uevent_lock, flags);
2878 list_add(elist, &md->uevent_list);
2879 spin_unlock_irqrestore(&md->uevent_lock, flags);
2880}
2881
2882
2883
2884
2885
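/*
 * The gendisk is only valid as long as you have a reference count
 * on 'md'.
 */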
2886struct gendisk *dm_disk(struct mapped_device *md)
2887{
2888 return md->disk;
2889}
2890EXPORT_SYMBOL_GPL(dm_disk);
2891
2892struct kobject *dm_kobject(struct mapped_device *md)
2893{
2894 return &md->kobj_holder.kobj;
2895}
2896
2897struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2898{
2899 struct mapped_device *md;
2900
2901 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2902
2903 spin_lock(&_minor_lock);
2904 if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2905 md = NULL;
2906 goto out;
2907 }
2908 dm_get(md);
2909out:
2910 spin_unlock(&_minor_lock);
2911
2912 return md;
2913}
2914
2915int dm_suspended_md(struct mapped_device *md)
2916{
2917 return test_bit(DMF_SUSPENDED, &md->flags);
2918}
2919
2920int dm_suspended_internally_md(struct mapped_device *md)
2921{
2922 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2923}
2924
2925int dm_test_deferred_remove_flag(struct mapped_device *md)
2926{
2927 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2928}
2929
2930int dm_suspended(struct dm_target *ti)
2931{
2932 return dm_suspended_md(dm_table_get_md(ti->table));
2933}
2934EXPORT_SYMBOL_GPL(dm_suspended);
2935
2936int dm_noflush_suspending(struct dm_target *ti)
2937{
2938 return __noflush_suspending(dm_table_get_md(ti->table));
2939}
2940EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2941
2942struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2943 unsigned integrity, unsigned per_io_data_size,
2944 unsigned min_pool_size)
2945{
2946 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2947 unsigned int pool_size = 0;
2948 unsigned int front_pad, io_front_pad;
2949 int ret;
2950
2951 if (!pools)
2952 return NULL;
2953
2954 switch (type) {
2955 case DM_TYPE_BIO_BASED:
2956 case DM_TYPE_DAX_BIO_BASED:
2957 case DM_TYPE_NVME_BIO_BASED:
2958 pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2959 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2960 io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
2961 ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2962 if (ret)
2963 goto out;
2964 if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2965 goto out;
2966 break;
2967 case DM_TYPE_REQUEST_BASED:
2968 case DM_TYPE_MQ_REQUEST_BASED:
2969 pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2970 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
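 /* per_io_data_size is not folded into front_pad for request-based dm */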
2971
2972 break;
2973 default:
2974 BUG();
2975 }
2976
2977 ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2978 if (ret)
2979 goto out;
2980
2981 if (integrity && bioset_integrity_create(&pools->bs, pool_size))
2982 goto out;
2983
2984 return pools;
2985
2986out:
2987 dm_free_md_mempools(pools);
2988
2989 return NULL;
2990}
2991
2992void dm_free_md_mempools(struct dm_md_mempools *pools)
2993{
2994 if (!pools)
2995 return;
2996
2997 bioset_exit(&pools->bs);
2998 bioset_exit(&pools->io_bs);
2999
3000 kfree(pools);
3001}
3002
3003struct dm_pr {
3004 u64 old_key;
3005 u64 new_key;
3006 u32 flags;
3007 bool fail_early;
3008};
3009
3010static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3011 void *data)
3012{
3013 struct mapped_device *md = bdev->bd_disk->private_data;
3014 struct dm_table *table;
3015 struct dm_target *ti;
3016 int ret = -ENOTTY, srcu_idx;
3017
3018 table = dm_get_live_table(md, &srcu_idx);
3019 if (!table || !dm_table_get_size(table))
3020 goto out;
3021
3022
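 /* We only support devices that have a single target */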
3023 if (dm_table_get_num_targets(table) != 1)
3024 goto out;
3025 ti = dm_table_get_target(table, 0);
3026
3027 ret = -EINVAL;
3028 if (!ti->type->iterate_devices)
3029 goto out;
3030
3031 ret = ti->type->iterate_devices(ti, fn, data);
3032out:
3033 dm_put_live_table(md, srcu_idx);
3034 return ret;
3035}
3036
3037
3038
3039
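/*
 * iterate_devices callout used by dm_pr_register() to forward the
 * registration to every underlying device.
 */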
3040static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3041 sector_t start, sector_t len, void *data)
3042{
3043 struct dm_pr *pr = data;
3044 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3045
3046 if (!ops || !ops->pr_register)
3047 return -EOPNOTSUPP;
3048 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3049}
3050
3051static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3052 u32 flags)
3053{
3054 struct dm_pr pr = {
3055 .old_key = old_key,
3056 .new_key = new_key,
3057 .flags = flags,
3058 .fail_early = true,
3059 };
3060 int ret;
3061
3062 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3063 if (ret && new_key) {
3064
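 /* Registration failed on some path: unregister from all paths. */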
3065 pr.old_key = new_key;
3066 pr.new_key = 0;
3067 pr.flags = 0;
3068 pr.fail_early = false;
3069 dm_call_pr(bdev, __dm_pr_register, &pr);
3070 }
3071
3072 return ret;
3073}
3074
3075static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3076 u32 flags)
3077{
3078 struct mapped_device *md = bdev->bd_disk->private_data;
3079 const struct pr_ops *ops;
3080 int r, srcu_idx;
3081
3082 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3083 if (r < 0)
3084 goto out;
3085
3086 ops = bdev->bd_disk->fops->pr_ops;
3087 if (ops && ops->pr_reserve)
3088 r = ops->pr_reserve(bdev, key, type, flags);
3089 else
3090 r = -EOPNOTSUPP;
3091out:
3092 dm_unprepare_ioctl(md, srcu_idx);
3093 return r;
3094}
3095
3096static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3097{
3098 struct mapped_device *md = bdev->bd_disk->private_data;
3099 const struct pr_ops *ops;
3100 int r, srcu_idx;
3101
3102 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3103 if (r < 0)
3104 goto out;
3105
3106 ops = bdev->bd_disk->fops->pr_ops;
3107 if (ops && ops->pr_release)
3108 r = ops->pr_release(bdev, key, type);
3109 else
3110 r = -EOPNOTSUPP;
3111out:
3112 dm_unprepare_ioctl(md, srcu_idx);
3113 return r;
3114}
3115
3116static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3117 enum pr_type type, bool abort)
3118{
3119 struct mapped_device *md = bdev->bd_disk->private_data;
3120 const struct pr_ops *ops;
3121 int r, srcu_idx;
3122
3123 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3124 if (r < 0)
3125 goto out;
3126
3127 ops = bdev->bd_disk->fops->pr_ops;
3128 if (ops && ops->pr_preempt)
3129 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3130 else
3131 r = -EOPNOTSUPP;
3132out:
3133 dm_unprepare_ioctl(md, srcu_idx);
3134 return r;
3135}
3136
3137static int dm_pr_clear(struct block_device *bdev, u64 key)
3138{
3139 struct mapped_device *md = bdev->bd_disk->private_data;
3140 const struct pr_ops *ops;
3141 int r, srcu_idx;
3142
3143 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3144 if (r < 0)
3145 goto out;
3146
3147 ops = bdev->bd_disk->fops->pr_ops;
3148 if (ops && ops->pr_clear)
3149 r = ops->pr_clear(bdev, key);
3150 else
3151 r = -EOPNOTSUPP;
3152out:
3153 dm_unprepare_ioctl(md, srcu_idx);
3154 return r;
3155}
3156
3157static const struct pr_ops dm_pr_ops = {
3158 .pr_register = dm_pr_register,
3159 .pr_reserve = dm_pr_reserve,
3160 .pr_release = dm_pr_release,
3161 .pr_preempt = dm_pr_preempt,
3162 .pr_clear = dm_pr_clear,
3163};
3164
3165static const struct block_device_operations dm_blk_dops = {
3166 .open = dm_blk_open,
3167 .release = dm_blk_close,
3168 .ioctl = dm_blk_ioctl,
3169 .getgeo = dm_blk_getgeo,
3170 .pr_ops = &dm_pr_ops,
3171 .owner = THIS_MODULE
3172};
3173
3174static const struct dax_operations dm_dax_ops = {
3175 .direct_access = dm_dax_direct_access,
3176 .copy_from_iter = dm_dax_copy_from_iter,
3177 .copy_to_iter = dm_dax_copy_to_iter,
3178};
3179
3180
3181
3182
3183module_init(dm_init);
3184module_exit(dm_exit);
3185
3186module_param(major, uint, 0);
3187MODULE_PARM_DESC(major, "The major number of the device mapper");
3188
3189module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3190MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3191
3192module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3193MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3194
3195MODULE_DESCRIPTION(DM_NAME " driver");
3196MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3197MODULE_LICENSE("GPL");
3198