1
2
3
4
5
6
7
8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/signal.h>
16#include <linux/blkpg.h>
17#include <linux/bio.h>
18#include <linux/mempool.h>
19#include <linux/dax.h>
20#include <linux/slab.h>
21#include <linux/idr.h>
22#include <linux/uio.h>
23#include <linux/hdreg.h>
24#include <linux/delay.h>
25#include <linux/wait.h>
26#include <linux/pr.h>
27
28#define DM_MSG_PREFIX "core"
29
30
31
32
33
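/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */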
34#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
35#define DM_COOKIE_LENGTH 24
36
37static const char *_name = DM_NAME;
38
39static unsigned int major = 0;
40static unsigned int _major = 0;
41
42static DEFINE_IDR(_minor_idr);
43
44static DEFINE_SPINLOCK(_minor_lock);
45
46static void do_deferred_remove(struct work_struct *w);
47
48static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
49
50static struct workqueue_struct *deferred_remove_workqueue;
51
52atomic_t dm_global_event_nr = ATOMIC_INIT(0);
53DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
54
55void dm_issue_global_event(void)
56{
57 atomic_inc(&dm_global_event_nr);
58 wake_up(&dm_global_eventq);
59}
60
61
62
63
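/*
 * One of these is allocated per original bio submitted to the device.
 * It tracks the clones created for that bio and carries the completion
 * status back to the original bio.
 */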
64struct dm_io {
65 struct mapped_device *md;
66 blk_status_t status;
67 atomic_t io_count;
68 struct bio *bio;
69 unsigned long start_time;
70 spinlock_t endio_lock;
71 struct dm_stats_aux stats_aux;
72};
73
74#define MINOR_ALLOCED ((void *)-1)
75
76
77
78
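/*
 * Bits for the md->flags field.
 */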
79#define DMF_BLOCK_IO_FOR_SUSPEND 0
80#define DMF_SUSPENDED 1
81#define DMF_FROZEN 2
82#define DMF_FREEING 3
83#define DMF_DELETING 4
84#define DMF_NOFLUSH_SUSPENDING 5
85#define DMF_DEFERRED_REMOVE 6
86#define DMF_SUSPENDED_INTERNALLY 7
87
88#define DM_NUMA_NODE NUMA_NO_NODE
89static int dm_numa_node = DM_NUMA_NODE;
90
91
92
93
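/*
 * For mempools pre-allocation at the table loading time.
 */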
94struct dm_md_mempools {
95 mempool_t *io_pool;
96 struct bio_set *bs;
97};
98
99struct table_device {
100 struct list_head list;
101 atomic_t count;
102 struct dm_dev dm_dev;
103};
104
105static struct kmem_cache *_io_cache;
106static struct kmem_cache *_rq_tio_cache;
107static struct kmem_cache *_rq_cache;
108
109
110
111
112#define RESERVED_BIO_BASED_IOS 16
113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
114
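/*
 * Clamp a module parameter to the [min, max] range, write the clamped
 * value back, and return the value actually used.
 */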
115static int __dm_get_module_param_int(int *module_param, int min, int max)
116{
117 int param = ACCESS_ONCE(*module_param);
118 int modified_param = 0;
119 bool modified = true;
120
121 if (param < min)
122 modified_param = min;
123 else if (param > max)
124 modified_param = max;
125 else
126 modified = false;
127
128 if (modified) {
129 (void)cmpxchg(module_param, param, modified_param);
130 param = modified_param;
131 }
132
133 return param;
134}
135
136unsigned __dm_get_module_param(unsigned *module_param,
137 unsigned def, unsigned max)
138{
139 unsigned param = ACCESS_ONCE(*module_param);
140 unsigned modified_param = 0;
141
142 if (!param)
143 modified_param = def;
144 else if (param > max)
145 modified_param = max;
146
147 if (modified_param) {
148 (void)cmpxchg(module_param, param, modified_param);
149 param = modified_param;
150 }
151
152 return param;
153}
154
155unsigned dm_get_reserved_bio_based_ios(void)
156{
157 return __dm_get_module_param(&reserved_bio_based_ios,
158 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
159}
160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
161
162static unsigned dm_get_numa_node(void)
163{
164 return __dm_get_module_param_int(&dm_numa_node,
165 DM_NUMA_NODE, num_online_nodes() - 1);
166}
167
168static int __init local_init(void)
169{
170 int r = -ENOMEM;
171
172
173 _io_cache = KMEM_CACHE(dm_io, 0);
174 if (!_io_cache)
175 return r;
176
177 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
178 if (!_rq_tio_cache)
179 goto out_free_io_cache;
180
181 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
182 __alignof__(struct request), 0, NULL);
183 if (!_rq_cache)
184 goto out_free_rq_tio_cache;
185
186 r = dm_uevent_init();
187 if (r)
188 goto out_free_rq_cache;
189
190 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
191 if (!deferred_remove_workqueue) {
192 r = -ENOMEM;
193 goto out_uevent_exit;
194 }
195
196 _major = major;
197 r = register_blkdev(_major, _name);
198 if (r < 0)
199 goto out_free_workqueue;
200
201 if (!_major)
202 _major = r;
203
204 return 0;
205
206out_free_workqueue:
207 destroy_workqueue(deferred_remove_workqueue);
208out_uevent_exit:
209 dm_uevent_exit();
210out_free_rq_cache:
211 kmem_cache_destroy(_rq_cache);
212out_free_rq_tio_cache:
213 kmem_cache_destroy(_rq_tio_cache);
214out_free_io_cache:
215 kmem_cache_destroy(_io_cache);
216
217 return r;
218}
219
220static void local_exit(void)
221{
222 flush_scheduled_work();
223 destroy_workqueue(deferred_remove_workqueue);
224
225 kmem_cache_destroy(_rq_cache);
226 kmem_cache_destroy(_rq_tio_cache);
227 kmem_cache_destroy(_io_cache);
228 unregister_blkdev(_major, _name);
229 dm_uevent_exit();
230
231 _major = 0;
232
233 DMINFO("cleaned up");
234}
235
236static int (*_inits[])(void) __initdata = {
237 local_init,
238 dm_target_init,
239 dm_linear_init,
240 dm_stripe_init,
241 dm_io_init,
242 dm_kcopyd_init,
243 dm_interface_init,
244 dm_statistics_init,
245};
246
247static void (*_exits[])(void) = {
248 local_exit,
249 dm_target_exit,
250 dm_linear_exit,
251 dm_stripe_exit,
252 dm_io_exit,
253 dm_kcopyd_exit,
254 dm_interface_exit,
255 dm_statistics_exit,
256};
257
258static int __init dm_init(void)
259{
260 const int count = ARRAY_SIZE(_inits);
261
262 int r, i;
263
264 for (i = 0; i < count; i++) {
265 r = _inits[i]();
266 if (r)
267 goto bad;
268 }
269
270 return 0;
271
272 bad:
273 while (i--)
274 _exits[i]();
275
276 return r;
277}
278
279static void __exit dm_exit(void)
280{
281 int i = ARRAY_SIZE(_exits);
282
283 while (i--)
284 _exits[i]();
285
286
287
288
289 idr_destroy(&_minor_idr);
290}
291
292
293
294
295int dm_deleting_md(struct mapped_device *md)
296{
297 return test_bit(DMF_DELETING, &md->flags);
298}
299
300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
301{
302 struct mapped_device *md;
303
304 spin_lock(&_minor_lock);
305
306 md = bdev->bd_disk->private_data;
307 if (!md)
308 goto out;
309
310 if (test_bit(DMF_FREEING, &md->flags) ||
311 dm_deleting_md(md)) {
312 md = NULL;
313 goto out;
314 }
315
316 dm_get(md);
317 atomic_inc(&md->open_count);
318out:
319 spin_unlock(&_minor_lock);
320
321 return md ? 0 : -ENXIO;
322}
323
324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
325{
326 struct mapped_device *md;
327
328 spin_lock(&_minor_lock);
329
330 md = disk->private_data;
331 if (WARN_ON(!md))
332 goto out;
333
334 if (atomic_dec_and_test(&md->open_count) &&
335 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
336 queue_work(deferred_remove_workqueue, &deferred_remove_work);
337
338 dm_put(md);
339out:
340 spin_unlock(&_minor_lock);
341}
342
343int dm_open_count(struct mapped_device *md)
344{
345 return atomic_read(&md->open_count);
346}
347
348
349
350
351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
352{
353 int r = 0;
354
355 spin_lock(&_minor_lock);
356
357 if (dm_open_count(md)) {
358 r = -EBUSY;
359 if (mark_deferred)
360 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
361 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
362 r = -EEXIST;
363 else
364 set_bit(DMF_DELETING, &md->flags);
365
366 spin_unlock(&_minor_lock);
367
368 return r;
369}
370
371int dm_cancel_deferred_remove(struct mapped_device *md)
372{
373 int r = 0;
374
375 spin_lock(&_minor_lock);
376
377 if (test_bit(DMF_DELETING, &md->flags))
378 r = -EBUSY;
379 else
380 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
381
382 spin_unlock(&_minor_lock);
383
384 return r;
385}
386
387static void do_deferred_remove(struct work_struct *w)
388{
389 dm_deferred_remove();
390}
391
392sector_t dm_get_size(struct mapped_device *md)
393{
394 return get_capacity(md->disk);
395}
396
397struct request_queue *dm_get_md_queue(struct mapped_device *md)
398{
399 return md->queue;
400}
401
402struct dm_stats *dm_get_stats(struct mapped_device *md)
403{
404 return &md->stats;
405}
406
407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
408{
409 struct mapped_device *md = bdev->bd_disk->private_data;
410
411 return dm_get_geometry(md, geo);
412}
413
414static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
415 struct block_device **bdev,
416 fmode_t *mode)
417{
418 struct dm_target *tgt;
419 struct dm_table *map;
420 int srcu_idx, r;
421
422retry:
423 r = -ENOTTY;
424 map = dm_get_live_table(md, &srcu_idx);
425 if (!map || !dm_table_get_size(map))
426 goto out;
427
428
429 if (dm_table_get_num_targets(map) != 1)
430 goto out;
431
432 tgt = dm_table_get_target(map, 0);
433 if (!tgt->type->prepare_ioctl)
434 goto out;
435
436 if (dm_suspended_md(md)) {
437 r = -EAGAIN;
438 goto out;
439 }
440
441 r = tgt->type->prepare_ioctl(tgt, bdev, mode);
442 if (r < 0)
443 goto out;
444
445 bdgrab(*bdev);
446 dm_put_live_table(md, srcu_idx);
447 return r;
448
449out:
450 dm_put_live_table(md, srcu_idx);
451 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
452 msleep(10);
453 goto retry;
454 }
455 return r;
456}
457
458static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
459 unsigned int cmd, unsigned long arg)
460{
461 struct mapped_device *md = bdev->bd_disk->private_data;
462 int r;
463
464 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
465 if (r < 0)
466 return r;
467
468 if (r > 0) {
469
470
471
472
473 if (!capable(CAP_SYS_RAWIO)) {
474 DMWARN_LIMIT(
475 "%s: sending ioctl %x to DM device without required privilege.",
476 current->comm, cmd);
477 r = -ENOIOCTLCMD;
478 goto out;
479 }
480 }
481
482 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
483out:
484 bdput(bdev);
485 return r;
486}
487
488static struct dm_io *alloc_io(struct mapped_device *md)
489{
490 return mempool_alloc(md->io_pool, GFP_NOIO);
491}
492
493static void free_io(struct mapped_device *md, struct dm_io *io)
494{
495 mempool_free(io, md->io_pool);
496}
497
498static void free_tio(struct dm_target_io *tio)
499{
500 bio_put(&tio->clone);
501}
502
503int md_in_flight(struct mapped_device *md)
504{
505 return atomic_read(&md->pending[READ]) +
506 atomic_read(&md->pending[WRITE]);
507}
508
509static void start_io_acct(struct dm_io *io)
510{
511 struct mapped_device *md = io->md;
512 struct bio *bio = io->bio;
513 int cpu;
514 int rw = bio_data_dir(bio);
515
516 io->start_time = jiffies;
517
518 cpu = part_stat_lock();
519 part_round_stats(md->queue, cpu, &dm_disk(md)->part0);
520 part_stat_unlock();
521 atomic_set(&dm_disk(md)->part0.in_flight[rw],
522 atomic_inc_return(&md->pending[rw]));
523
524 if (unlikely(dm_stats_used(&md->stats)))
525 dm_stats_account_io(&md->stats, bio_data_dir(bio),
526 bio->bi_iter.bi_sector, bio_sectors(bio),
527 false, 0, &io->stats_aux);
528}
529
530static void end_io_acct(struct dm_io *io)
531{
532 struct mapped_device *md = io->md;
533 struct bio *bio = io->bio;
534 unsigned long duration = jiffies - io->start_time;
535 int pending;
536 int rw = bio_data_dir(bio);
537
538 generic_end_io_acct(md->queue, rw, &dm_disk(md)->part0, io->start_time);
539
540 if (unlikely(dm_stats_used(&md->stats)))
541 dm_stats_account_io(&md->stats, bio_data_dir(bio),
542 bio->bi_iter.bi_sector, bio_sectors(bio),
543 true, duration, &io->stats_aux);
544
545
546
547
548
549 pending = atomic_dec_return(&md->pending[rw]);
550 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
551 pending += atomic_read(&md->pending[rw^0x1]);
552
553
554 if (!pending)
555 wake_up(&md->wait);
556}
557
558
559
560
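/*
 * Add the bio to the list of deferred io.
 */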
561static void queue_io(struct mapped_device *md, struct bio *bio)
562{
563 unsigned long flags;
564
565 spin_lock_irqsave(&md->deferred_lock, flags);
566 bio_list_add(&md->deferred, bio);
567 spin_unlock_irqrestore(&md->deferred_lock, flags);
568 queue_work(md->wq, &md->work);
569}
570
571
572
573
574
575
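/*
 * Everyone (including functions in this file) should use this function to
 * access the md->map field, and make sure they call dm_put_live_table()
 * when finished.
 */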
576struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
577{
578 *srcu_idx = srcu_read_lock(&md->io_barrier);
579
580 return srcu_dereference(md->map, &md->io_barrier);
581}
582
583void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
584{
585 srcu_read_unlock(&md->io_barrier, srcu_idx);
586}
587
588void dm_sync_table(struct mapped_device *md)
589{
590 synchronize_srcu(&md->io_barrier);
591 synchronize_rcu_expedited();
592}
593
594
595
596
597
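/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */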
598static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
599{
600 rcu_read_lock();
601 return rcu_dereference(md->map);
602}
603
604static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
605{
606 rcu_read_unlock();
607}
608
609
610
611
612static int open_table_device(struct table_device *td, dev_t dev,
613 struct mapped_device *md)
614{
615 static char *_claim_ptr = "I belong to device-mapper";
616 struct block_device *bdev;
617
618 int r;
619
620 BUG_ON(td->dm_dev.bdev);
621
622 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
623 if (IS_ERR(bdev))
624 return PTR_ERR(bdev);
625
626 r = bd_link_disk_holder(bdev, dm_disk(md));
627 if (r) {
628 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
629 return r;
630 }
631
632 td->dm_dev.bdev = bdev;
633 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
634 return 0;
635}
636
637
638
639
640static void close_table_device(struct table_device *td, struct mapped_device *md)
641{
642 if (!td->dm_dev.bdev)
643 return;
644
645 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
646 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
647 put_dax(td->dm_dev.dax_dev);
648 td->dm_dev.bdev = NULL;
649 td->dm_dev.dax_dev = NULL;
650}
651
652static struct table_device *find_table_device(struct list_head *l, dev_t dev,
653 fmode_t mode) {
654 struct table_device *td;
655
656 list_for_each_entry(td, l, list)
657 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
658 return td;
659
660 return NULL;
661}
662
663int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
664 struct dm_dev **result) {
665 int r;
666 struct table_device *td;
667
668 mutex_lock(&md->table_devices_lock);
669 td = find_table_device(&md->table_devices, dev, mode);
670 if (!td) {
671 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
672 if (!td) {
673 mutex_unlock(&md->table_devices_lock);
674 return -ENOMEM;
675 }
676
677 td->dm_dev.mode = mode;
678 td->dm_dev.bdev = NULL;
679
680 if ((r = open_table_device(td, dev, md))) {
681 mutex_unlock(&md->table_devices_lock);
682 kfree(td);
683 return r;
684 }
685
686 format_dev_t(td->dm_dev.name, dev);
687
688 atomic_set(&td->count, 0);
689 list_add(&td->list, &md->table_devices);
690 }
691 atomic_inc(&td->count);
692 mutex_unlock(&md->table_devices_lock);
693
694 *result = &td->dm_dev;
695 return 0;
696}
697EXPORT_SYMBOL_GPL(dm_get_table_device);
698
699void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
700{
701 struct table_device *td = container_of(d, struct table_device, dm_dev);
702
703 mutex_lock(&md->table_devices_lock);
704 if (atomic_dec_and_test(&td->count)) {
705 close_table_device(td, md);
706 list_del(&td->list);
707 kfree(td);
708 }
709 mutex_unlock(&md->table_devices_lock);
710}
711EXPORT_SYMBOL(dm_put_table_device);
712
713static void free_table_devices(struct list_head *devices)
714{
715 struct list_head *tmp, *next;
716
717 list_for_each_safe(tmp, next, devices) {
718 struct table_device *td = list_entry(tmp, struct table_device, list);
719
720 DMWARN("dm_destroy: %s still exists with %d references",
721 td->dm_dev.name, atomic_read(&td->count));
722 kfree(td);
723 }
724}
725
726
727
728
729int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
730{
731 *geo = md->geometry;
732
733 return 0;
734}
735
736
737
738
739int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
740{
741 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
742
743 if (geo->start > sz) {
744 DMWARN("Start sector is beyond the geometry limits.");
745 return -EINVAL;
746 }
747
748 md->geometry = *geo;
749
750 return 0;
751}
752
753
754
755
756
757
758
759
760
761
762static int __noflush_suspending(struct mapped_device *md)
763{
764 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
765}
766
767
768
769
770
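/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */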
771static void dec_pending(struct dm_io *io, blk_status_t error)
772{
773 unsigned long flags;
774 blk_status_t io_error;
775 struct bio *bio;
776 struct mapped_device *md = io->md;
777
778
779 if (unlikely(error)) {
780 spin_lock_irqsave(&io->endio_lock, flags);
781 if (!(io->status == BLK_STS_DM_REQUEUE &&
782 __noflush_suspending(md)))
783 io->status = error;
784 spin_unlock_irqrestore(&io->endio_lock, flags);
785 }
786
787 if (atomic_dec_and_test(&io->io_count)) {
788 if (io->status == BLK_STS_DM_REQUEUE) {
789
790
791
792 spin_lock_irqsave(&md->deferred_lock, flags);
793 if (__noflush_suspending(md))
794 bio_list_add_head(&md->deferred, io->bio);
795 else
796
797 io->status = BLK_STS_IOERR;
798 spin_unlock_irqrestore(&md->deferred_lock, flags);
799 }
800
801 io_error = io->status;
802 bio = io->bio;
803 end_io_acct(io);
804 free_io(md, io);
805
806 if (io_error == BLK_STS_DM_REQUEUE)
807 return;
808
809 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
810
811
812
813
814 bio->bi_opf &= ~REQ_PREFLUSH;
815 queue_io(md, bio);
816 } else {
817
818 bio->bi_status = io_error;
819 bio_endio(bio);
820 }
821 }
822}
823
824void disable_write_same(struct mapped_device *md)
825{
826 struct queue_limits *limits = dm_get_queue_limits(md);
827
828
829 limits->max_write_same_sectors = 0;
830}
831
832void disable_write_zeroes(struct mapped_device *md)
833{
834 struct queue_limits *limits = dm_get_queue_limits(md);
835
836
837 limits->max_write_zeroes_sectors = 0;
838}
839
840static void clone_endio(struct bio *bio)
841{
842 blk_status_t error = bio->bi_status;
843 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
844 struct dm_io *io = tio->io;
845 struct mapped_device *md = tio->io->md;
846 dm_endio_fn endio = tio->ti->type->end_io;
847
848 if (unlikely(error == BLK_STS_TARGET)) {
849 if (bio_op(bio) == REQ_OP_WRITE_SAME &&
850 !bio->bi_disk->queue->limits.max_write_same_sectors)
851 disable_write_same(md);
852 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
853 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
854 disable_write_zeroes(md);
855 }
856
857 if (endio) {
858 int r = endio(tio->ti, bio, &error);
859 switch (r) {
860 case DM_ENDIO_REQUEUE:
861 error = BLK_STS_DM_REQUEUE;
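		/* fall through */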
862
863 case DM_ENDIO_DONE:
864 break;
865 case DM_ENDIO_INCOMPLETE:
866
867 return;
868 default:
869 DMWARN("unimplemented target endio return value: %d", r);
870 BUG();
871 }
872 }
873
874 free_tio(tio);
875 dec_pending(io, error);
876}
877
878
879
880
881
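/*
 * Return the maximum number of sectors that can be processed at the
 * supplied sector without crossing the current target's boundary.
 */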
882static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
883{
884 sector_t target_offset = dm_target_offset(ti, sector);
885
886 return ti->len - target_offset;
887}
888
889static sector_t max_io_len(sector_t sector, struct dm_target *ti)
890{
891 sector_t len = max_io_len_target_boundary(sector, ti);
892 sector_t offset, max_len;
893
894
895
896
897 if (ti->max_io_len) {
898 offset = dm_target_offset(ti, sector);
899 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
900 max_len = sector_div(offset, ti->max_io_len);
901 else
902 max_len = offset & (ti->max_io_len - 1);
903 max_len = ti->max_io_len - max_len;
904
905 if (len > max_len)
906 len = max_len;
907 }
908
909 return len;
910}
911
912int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
913{
914 if (len > UINT_MAX) {
915 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
916 (unsigned long long)len, UINT_MAX);
917 ti->error = "Maximum size of target IO is too large";
918 return -EINVAL;
919 }
920
921 ti->max_io_len = (uint32_t) len;
922
923 return 0;
924}
925EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
926
927static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
928 sector_t sector, int *srcu_idx)
929{
930 struct dm_table *map;
931 struct dm_target *ti;
932
933 map = dm_get_live_table(md, srcu_idx);
934 if (!map)
935 return NULL;
936
937 ti = dm_table_find_target(map, sector);
938 if (!dm_target_is_valid(ti))
939 return NULL;
940
941 return ti;
942}
943
944static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
945 long nr_pages, void **kaddr, pfn_t *pfn)
946{
947 struct mapped_device *md = dax_get_private(dax_dev);
948 sector_t sector = pgoff * PAGE_SECTORS;
949 struct dm_target *ti;
950 long len, ret = -EIO;
951 int srcu_idx;
952
953 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
954
955 if (!ti)
956 goto out;
957 if (!ti->type->direct_access)
958 goto out;
959 len = max_io_len(sector, ti) / PAGE_SECTORS;
960 if (len < 1)
961 goto out;
962 nr_pages = min(len, nr_pages);
963 if (ti->type->direct_access)
964 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
965
966 out:
967 dm_put_live_table(md, srcu_idx);
968
969 return ret;
970}
971
972static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
973 void *addr, size_t bytes, struct iov_iter *i)
974{
975 struct mapped_device *md = dax_get_private(dax_dev);
976 sector_t sector = pgoff * PAGE_SECTORS;
977 struct dm_target *ti;
978 long ret = 0;
979 int srcu_idx;
980
981 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
982
983 if (!ti)
984 goto out;
985 if (!ti->type->dax_copy_from_iter) {
986 ret = copy_from_iter(addr, bytes, i);
987 goto out;
988 }
989 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
990 out:
991 dm_put_live_table(md, srcu_idx);
992
993 return ret;
994}
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
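/*
 * dm_accept_partial_bio informs dm that the target only wants to process
 * the first n_sectors of the bio: the clone is trimmed to n_sectors and
 * *tio->len_ptr is reduced accordingly, so the remainder of the original
 * bio is sent to the target in one or more subsequent clones.
 * Not allowed for REQ_PREFLUSH bios.
 */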
1024void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1025{
1026 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1027 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1028 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1029 BUG_ON(bi_size > *tio->len_ptr);
1030 BUG_ON(n_sectors > bi_size);
1031 *tio->len_ptr -= bi_size - n_sectors;
1032 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1033}
1034EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1035
1036
1037
1038
1039
1040
1041
1042
1043
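/*
 * The zone descriptors obtained with a zone report indicate zone positions
 * within the target's underlying device. They must be remapped here to
 * match their position within the dm device.
 */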
1044void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1045{
1046#ifdef CONFIG_BLK_DEV_ZONED
1047 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1048 struct bio *report_bio = tio->io->bio;
1049 struct blk_zone_report_hdr *hdr = NULL;
1050 struct blk_zone *zone;
1051 unsigned int nr_rep = 0;
1052 unsigned int ofst;
1053 struct bio_vec bvec;
1054 struct bvec_iter iter;
1055 void *addr;
1056
1057 if (bio->bi_status)
1058 return;
1059
1060
1061
1062
1063
1064 bio_for_each_segment(bvec, report_bio, iter) {
1065 addr = kmap_atomic(bvec.bv_page);
1066
1067
1068 if (!hdr) {
1069 hdr = addr;
1070 ofst = sizeof(struct blk_zone_report_hdr);
1071 } else
1072 ofst = 0;
1073
1074
1075 while (hdr->nr_zones && ofst < bvec.bv_len) {
1076 zone = addr + ofst;
1077 if (zone->start >= start + ti->len) {
1078 hdr->nr_zones = 0;
1079 break;
1080 }
1081 zone->start = zone->start + ti->begin - start;
1082 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1083 if (zone->cond == BLK_ZONE_COND_FULL)
1084 zone->wp = zone->start + zone->len;
1085 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1086 zone->wp = zone->start;
1087 else
1088 zone->wp = zone->wp + ti->begin - start;
1089 }
1090 ofst += sizeof(struct blk_zone);
1091 hdr->nr_zones--;
1092 nr_rep++;
1093 }
1094
1095 if (addr != hdr)
1096 kunmap_atomic(addr);
1097
1098 if (!hdr->nr_zones)
1099 break;
1100 }
1101
1102 if (hdr) {
1103 hdr->nr_zones = nr_rep;
1104 kunmap_atomic(hdr);
1105 }
1106
1107 bio_advance(report_bio, report_bio->bi_iter.bi_size);
1108
1109#else
1110 bio->bi_status = BLK_STS_NOTSUPP;
1111#endif
1112}
1113EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1114
1115
1116
1117
1118
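/*
 * Flush current->bio_list when the target map method blocks.
 * This fixes deadlocks in snapshot and possibly in other targets.
 */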
1119struct dm_offload {
1120 struct blk_plug plug;
1121 struct blk_plug_cb cb;
1122};
1123
1124static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
1125{
1126 struct dm_offload *o = container_of(cb, struct dm_offload, cb);
1127 struct bio_list list;
1128 struct bio *bio;
1129 int i;
1130
1131 INIT_LIST_HEAD(&o->cb.list);
1132
1133 if (unlikely(!current->bio_list))
1134 return;
1135
1136 for (i = 0; i < 2; i++) {
1137 list = current->bio_list[i];
		bio_list_init(&current->bio_list[i]);
1139
1140 while ((bio = bio_list_pop(&list))) {
1141 struct bio_set *bs = bio->bi_pool;
1142 if (unlikely(!bs) || bs == fs_bio_set ||
1143 !bs->rescue_workqueue) {
				bio_list_add(&current->bio_list[i], bio);
1145 continue;
1146 }
1147
1148 spin_lock(&bs->rescue_lock);
1149 bio_list_add(&bs->rescue_list, bio);
1150 queue_work(bs->rescue_workqueue, &bs->rescue_work);
1151 spin_unlock(&bs->rescue_lock);
1152 }
1153 }
1154}
1155
1156static void dm_offload_start(struct dm_offload *o)
1157{
1158 blk_start_plug(&o->plug);
1159 o->cb.callback = flush_current_bio_list;
	list_add(&o->cb.list, &current->plug->cb_list);
1161}
1162
1163static void dm_offload_end(struct dm_offload *o)
1164{
1165 list_del(&o->cb.list);
1166 blk_finish_plug(&o->plug);
1167}
1168
1169static void __map_bio(struct dm_target_io *tio)
1170{
1171 int r;
1172 sector_t sector;
1173 struct dm_offload o;
1174 struct bio *clone = &tio->clone;
1175 struct dm_target *ti = tio->ti;
1176
1177 clone->bi_end_io = clone_endio;
1178
1179
1180
1181
1182
1183
1184 atomic_inc(&tio->io->io_count);
1185 sector = clone->bi_iter.bi_sector;
1186
1187 dm_offload_start(&o);
1188 r = ti->type->map(ti, clone);
1189 dm_offload_end(&o);
1190
1191 switch (r) {
1192 case DM_MAPIO_SUBMITTED:
1193 break;
1194 case DM_MAPIO_REMAPPED:
1195
1196 trace_block_bio_remap(clone->bi_disk->queue, clone,
1197 bio_dev(tio->io->bio), sector);
1198 generic_make_request(clone);
1199 break;
1200 case DM_MAPIO_KILL:
1201 dec_pending(tio->io, BLK_STS_IOERR);
1202 free_tio(tio);
1203 break;
1204 case DM_MAPIO_REQUEUE:
1205 dec_pending(tio->io, BLK_STS_DM_REQUEUE);
1206 free_tio(tio);
1207 break;
1208 default:
1209 DMWARN("unimplemented target map return value: %d", r);
1210 BUG();
1211 }
1212}
1213
1214struct clone_info {
1215 struct mapped_device *md;
1216 struct dm_table *map;
1217 struct bio *bio;
1218 struct dm_io *io;
1219 sector_t sector;
1220 unsigned sector_count;
1221};
1222
1223static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1224{
1225 bio->bi_iter.bi_sector = sector;
1226 bio->bi_iter.bi_size = to_bytes(len);
1227}
1228
1229
1230
1231
1232static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1233 sector_t sector, unsigned len)
1234{
1235 struct bio *clone = &tio->clone;
1236
1237 __bio_clone_fast(clone, bio);
1238
1239 if (unlikely(bio_integrity(bio) != NULL)) {
1240 int r;
1241
1242 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1243 !dm_target_passes_integrity(tio->ti->type))) {
1244 DMWARN("%s: the target %s doesn't support integrity data.",
1245 dm_device_name(tio->io->md),
1246 tio->ti->type->name);
1247 return -EIO;
1248 }
1249
1250 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1251 if (r < 0)
1252 return r;
1253 }
1254
1255 if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1256 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1257 clone->bi_iter.bi_size = to_bytes(len);
1258
1259 if (unlikely(bio_integrity(bio) != NULL))
1260 bio_integrity_trim(clone);
1261
1262 return 0;
1263}
1264
1265static struct dm_target_io *alloc_tio(struct clone_info *ci,
1266 struct dm_target *ti,
1267 unsigned target_bio_nr)
1268{
1269 struct dm_target_io *tio;
1270 struct bio *clone;
1271
1272 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1273 tio = container_of(clone, struct dm_target_io, clone);
1274
1275 tio->io = ci->io;
1276 tio->ti = ti;
1277 tio->target_bio_nr = target_bio_nr;
1278
1279 return tio;
1280}
1281
1282static void __clone_and_map_simple_bio(struct clone_info *ci,
1283 struct dm_target *ti,
1284 unsigned target_bio_nr, unsigned *len)
1285{
1286 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1287 struct bio *clone = &tio->clone;
1288
1289 tio->len_ptr = len;
1290
1291 __bio_clone_fast(clone, ci->bio);
1292 if (len)
1293 bio_setup_sector(clone, ci->sector, *len);
1294
1295 __map_bio(tio);
1296}
1297
1298static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1299 unsigned num_bios, unsigned *len)
1300{
1301 unsigned target_bio_nr;
1302
1303 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1304 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1305}
1306
1307static int __send_empty_flush(struct clone_info *ci)
1308{
1309 unsigned target_nr = 0;
1310 struct dm_target *ti;
1311
1312 BUG_ON(bio_has_data(ci->bio));
1313 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1314 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1315
1316 return 0;
1317}
1318
1319static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1320 sector_t sector, unsigned *len)
1321{
1322 struct bio *bio = ci->bio;
1323 struct dm_target_io *tio;
1324 unsigned target_bio_nr;
1325 unsigned num_target_bios = 1;
1326 int r = 0;
1327
1328
1329
1330
1331 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1332 num_target_bios = ti->num_write_bios(ti, bio);
1333
1334 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1335 tio = alloc_tio(ci, ti, target_bio_nr);
1336 tio->len_ptr = len;
1337 r = clone_bio(tio, bio, sector, *len);
1338 if (r < 0) {
1339 free_tio(tio);
1340 break;
1341 }
1342 __map_bio(tio);
1343 }
1344
1345 return r;
1346}
1347
1348typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1349
1350static unsigned get_num_discard_bios(struct dm_target *ti)
1351{
1352 return ti->num_discard_bios;
1353}
1354
1355static unsigned get_num_write_same_bios(struct dm_target *ti)
1356{
1357 return ti->num_write_same_bios;
1358}
1359
1360static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1361{
1362 return ti->num_write_zeroes_bios;
1363}
1364
1365typedef bool (*is_split_required_fn)(struct dm_target *ti);
1366
1367static bool is_split_required_for_discard(struct dm_target *ti)
1368{
1369 return ti->split_discard_bios;
1370}
1371
1372static int __send_changing_extent_only(struct clone_info *ci,
1373 get_num_bios_fn get_num_bios,
1374 is_split_required_fn is_split_required)
1375{
1376 struct dm_target *ti;
1377 unsigned len;
1378 unsigned num_bios;
1379
1380 do {
1381 ti = dm_table_find_target(ci->map, ci->sector);
1382 if (!dm_target_is_valid(ti))
1383 return -EIO;
1384
1385
1386
1387
1388
1389
1390
1391 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1392 if (!num_bios)
1393 return -EOPNOTSUPP;
1394
1395 if (is_split_required && !is_split_required(ti))
1396 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1397 else
1398 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1399
1400 __send_duplicate_bios(ci, ti, num_bios, &len);
1401
1402 ci->sector += len;
1403 } while (ci->sector_count -= len);
1404
1405 return 0;
1406}
1407
1408static int __send_discard(struct clone_info *ci)
1409{
1410 return __send_changing_extent_only(ci, get_num_discard_bios,
1411 is_split_required_for_discard);
1412}
1413
1414static int __send_write_same(struct clone_info *ci)
1415{
1416 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1417}
1418
1419static int __send_write_zeroes(struct clone_info *ci)
1420{
1421 return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
1422}
1423
1424
1425
1426
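/*
 * Select the correct strategy for processing a non-flush bio.
 */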
1427static int __split_and_process_non_flush(struct clone_info *ci)
1428{
1429 struct bio *bio = ci->bio;
1430 struct dm_target *ti;
1431 unsigned len;
1432 int r;
1433
1434 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1435 return __send_discard(ci);
1436 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
1437 return __send_write_same(ci);
1438 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
1439 return __send_write_zeroes(ci);
1440
1441 ti = dm_table_find_target(ci->map, ci->sector);
1442 if (!dm_target_is_valid(ti))
1443 return -EIO;
1444
1445 if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1446 len = ci->sector_count;
1447 else
1448 len = min_t(sector_t, max_io_len(ci->sector, ti),
1449 ci->sector_count);
1450
1451 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1452 if (r < 0)
1453 return r;
1454
1455 ci->sector += len;
1456 ci->sector_count -= len;
1457
1458 return 0;
1459}
1460
1461
1462
1463
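/*
 * Entry point for splitting a bio into clones and submitting them to the
 * targets of the live table.
 */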
1464static void __split_and_process_bio(struct mapped_device *md,
1465 struct dm_table *map, struct bio *bio)
1466{
1467 struct clone_info ci;
1468 int error = 0;
1469
1470 if (unlikely(!map)) {
1471 bio_io_error(bio);
1472 return;
1473 }
1474
1475 ci.map = map;
1476 ci.md = md;
1477 ci.io = alloc_io(md);
1478 ci.io->status = 0;
1479 atomic_set(&ci.io->io_count, 1);
1480 ci.io->bio = bio;
1481 ci.io->md = md;
1482 spin_lock_init(&ci.io->endio_lock);
1483 ci.sector = bio->bi_iter.bi_sector;
1484
1485 start_io_acct(ci.io);
1486
1487 if (bio->bi_opf & REQ_PREFLUSH) {
1488 ci.bio = &ci.md->flush_bio;
1489 ci.sector_count = 0;
1490 error = __send_empty_flush(&ci);
1491
1492 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1493 ci.bio = bio;
1494 ci.sector_count = 0;
1495 error = __split_and_process_non_flush(&ci);
1496 } else {
1497 ci.bio = bio;
1498 ci.sector_count = bio_sectors(bio);
1499 while (ci.sector_count && !error)
1500 error = __split_and_process_non_flush(&ci);
1501 }
1502
1503
1504 dec_pending(ci.io, errno_to_blk_status(error));
1505}
1506
1507
1508
1509
1510
1511
1512
1513
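/*
 * The request function: queues the bio for deferred handling while the
 * device is suspended, otherwise splits it and maps it against the
 * live table.
 */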
1514static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1515{
1516 int rw = bio_data_dir(bio);
1517 struct mapped_device *md = q->queuedata;
1518 int srcu_idx;
1519 struct dm_table *map;
1520
1521 map = dm_get_live_table(md, &srcu_idx);
1522
1523 generic_start_io_acct(q, rw, bio_sectors(bio), &dm_disk(md)->part0);
1524
1525
1526 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1527 dm_put_live_table(md, srcu_idx);
1528
1529 if (!(bio->bi_opf & REQ_RAHEAD))
1530 queue_io(md, bio);
1531 else
1532 bio_io_error(bio);
1533 return BLK_QC_T_NONE;
1534 }
1535
1536 __split_and_process_bio(md, map, bio);
1537 dm_put_live_table(md, srcu_idx);
1538 return BLK_QC_T_NONE;
1539}
1540
1541static int dm_any_congested(void *congested_data, int bdi_bits)
1542{
1543 int r = bdi_bits;
1544 struct mapped_device *md = congested_data;
1545 struct dm_table *map;
1546
1547 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1548 if (dm_request_based(md)) {
1549
1550
1551
1552
1553 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1554 } else {
1555 map = dm_get_live_table_fast(md);
1556 if (map)
1557 r = dm_table_any_congested(map, bdi_bits);
1558 dm_put_live_table_fast(md);
1559 }
1560 }
1561
1562 return r;
1563}
1564
1565
1566
1567
1568static void free_minor(int minor)
1569{
1570 spin_lock(&_minor_lock);
1571 idr_remove(&_minor_idr, minor);
1572 spin_unlock(&_minor_lock);
1573}
1574
1575
1576
1577
1578static int specific_minor(int minor)
1579{
1580 int r;
1581
1582 if (minor >= (1 << MINORBITS))
1583 return -EINVAL;
1584
1585 idr_preload(GFP_KERNEL);
1586 spin_lock(&_minor_lock);
1587
1588 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1589
1590 spin_unlock(&_minor_lock);
1591 idr_preload_end();
1592 if (r < 0)
1593 return r == -ENOSPC ? -EBUSY : r;
1594 return 0;
1595}
1596
1597static int next_free_minor(int *minor)
1598{
1599 int r;
1600
1601 idr_preload(GFP_KERNEL);
1602 spin_lock(&_minor_lock);
1603
1604 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1605
1606 spin_unlock(&_minor_lock);
1607 idr_preload_end();
1608 if (r < 0)
1609 return r;
1610 *minor = r;
1611 return 0;
1612}
1613
1614static const struct block_device_operations dm_blk_dops;
1615static const struct dax_operations dm_dax_ops;
1616
1617static void dm_wq_work(struct work_struct *work);
1618
1619void dm_init_md_queue(struct mapped_device *md)
1620{
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1631
1632
1633
1634
1635
1636 md->queue->queuedata = md;
1637 md->queue->backing_dev_info->congested_data = md;
1638}
1639
1640void dm_init_normal_md_queue(struct mapped_device *md)
1641{
1642 md->use_blk_mq = false;
1643 dm_init_md_queue(md);
1644
1645
1646
1647
1648 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1649}
1650
1651static void cleanup_mapped_device(struct mapped_device *md)
1652{
1653 if (md->wq)
1654 destroy_workqueue(md->wq);
1655 if (md->kworker_task)
1656 kthread_stop(md->kworker_task);
1657 mempool_destroy(md->io_pool);
1658 if (md->bs)
1659 bioset_free(md->bs);
1660
1661 if (md->dax_dev) {
1662 kill_dax(md->dax_dev);
1663 put_dax(md->dax_dev);
1664 md->dax_dev = NULL;
1665 }
1666
1667 if (md->disk) {
1668 spin_lock(&_minor_lock);
1669 md->disk->private_data = NULL;
1670 spin_unlock(&_minor_lock);
1671 del_gendisk(md->disk);
1672 put_disk(md->disk);
1673 }
1674
1675 if (md->queue)
1676 blk_cleanup_queue(md->queue);
1677
1678 cleanup_srcu_struct(&md->io_barrier);
1679
1680 if (md->bdev) {
1681 bdput(md->bdev);
1682 md->bdev = NULL;
1683 }
1684
1685 dm_mq_cleanup_mapped_device(md);
1686}
1687
1688
1689
1690
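/*
 * Allocate and initialise a blank device with a given minor.
 */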
1691static struct mapped_device *alloc_dev(int minor)
1692{
1693 int r, numa_node_id = dm_get_numa_node();
1694 struct dax_device *dax_dev;
1695 struct mapped_device *md;
1696 void *old_md;
1697
1698 md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1699 if (!md) {
1700 DMWARN("unable to allocate device, out of memory.");
1701 return NULL;
1702 }
1703
1704 if (!try_module_get(THIS_MODULE))
1705 goto bad_module_get;
1706
1707
1708 if (minor == DM_ANY_MINOR)
1709 r = next_free_minor(&minor);
1710 else
1711 r = specific_minor(minor);
1712 if (r < 0)
1713 goto bad_minor;
1714
1715 r = init_srcu_struct(&md->io_barrier);
1716 if (r < 0)
1717 goto bad_io_barrier;
1718
1719 md->numa_node_id = numa_node_id;
1720 md->use_blk_mq = dm_use_blk_mq_default();
1721 md->init_tio_pdu = false;
1722 md->type = DM_TYPE_NONE;
1723 mutex_init(&md->suspend_lock);
1724 mutex_init(&md->type_lock);
1725 mutex_init(&md->table_devices_lock);
1726 spin_lock_init(&md->deferred_lock);
1727 atomic_set(&md->holders, 1);
1728 atomic_set(&md->open_count, 0);
1729 atomic_set(&md->event_nr, 0);
1730 atomic_set(&md->uevent_seq, 0);
1731 INIT_LIST_HEAD(&md->uevent_list);
1732 INIT_LIST_HEAD(&md->table_devices);
1733 spin_lock_init(&md->uevent_lock);
1734
1735 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1736 if (!md->queue)
1737 goto bad;
1738
1739 dm_init_md_queue(md);
1740
1741 md->disk = alloc_disk_node(1, numa_node_id);
1742 if (!md->disk)
1743 goto bad;
1744
1745 atomic_set(&md->pending[0], 0);
1746 atomic_set(&md->pending[1], 0);
1747 init_waitqueue_head(&md->wait);
1748 INIT_WORK(&md->work, dm_wq_work);
1749 init_waitqueue_head(&md->eventq);
1750 init_completion(&md->kobj_holder.completion);
1751 md->kworker_task = NULL;
1752
1753 md->disk->major = _major;
1754 md->disk->first_minor = minor;
1755 md->disk->fops = &dm_blk_dops;
1756 md->disk->queue = md->queue;
1757 md->disk->private_data = md;
1758 sprintf(md->disk->disk_name, "dm-%d", minor);
1759
1760 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1761 if (!dax_dev)
1762 goto bad;
1763 md->dax_dev = dax_dev;
1764
1765 add_disk(md->disk);
1766 format_dev_t(md->name, MKDEV(_major, minor));
1767
1768 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1769 if (!md->wq)
1770 goto bad;
1771
1772 md->bdev = bdget_disk(md->disk, 0);
1773 if (!md->bdev)
1774 goto bad;
1775
1776 bio_init(&md->flush_bio, NULL, 0);
1777 bio_set_dev(&md->flush_bio, md->bdev);
1778 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1779
1780 dm_stats_init(&md->stats);
1781
1782
1783 spin_lock(&_minor_lock);
1784 old_md = idr_replace(&_minor_idr, md, minor);
1785 spin_unlock(&_minor_lock);
1786
1787 BUG_ON(old_md != MINOR_ALLOCED);
1788
1789 return md;
1790
1791bad:
1792 cleanup_mapped_device(md);
1793bad_io_barrier:
1794 free_minor(minor);
1795bad_minor:
1796 module_put(THIS_MODULE);
1797bad_module_get:
1798 kfree(md);
1799 return NULL;
1800}
1801
1802static void unlock_fs(struct mapped_device *md);
1803
1804static void free_dev(struct mapped_device *md)
1805{
1806 int minor = MINOR(disk_devt(md->disk));
1807
1808 unlock_fs(md);
1809
1810 cleanup_mapped_device(md);
1811
1812 free_table_devices(&md->table_devices);
1813 dm_stats_cleanup(&md->stats);
1814 free_minor(minor);
1815
1816 module_put(THIS_MODULE);
1817 kfree(md);
1818}
1819
1820static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1821{
1822 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1823
1824 if (md->bs) {
1825
1826 if (dm_table_bio_based(t)) {
1827
1828
1829
1830
1831 bioset_free(md->bs);
1832 md->bs = p->bs;
1833 p->bs = NULL;
1834 }
1835
1836
1837
1838
1839
1840
1841
1842
1843 goto out;
1844 }
1845
1846 BUG_ON(!p || md->io_pool || md->bs);
1847
1848 md->io_pool = p->io_pool;
1849 p->io_pool = NULL;
1850 md->bs = p->bs;
1851 p->bs = NULL;
1852
1853out:
1854
1855 dm_table_free_md_mempools(t);
1856}
1857
1858
1859
1860
1861static void event_callback(void *context)
1862{
1863 unsigned long flags;
1864 LIST_HEAD(uevents);
1865 struct mapped_device *md = (struct mapped_device *) context;
1866
1867 spin_lock_irqsave(&md->uevent_lock, flags);
1868 list_splice_init(&md->uevent_list, &uevents);
1869 spin_unlock_irqrestore(&md->uevent_lock, flags);
1870
1871 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1872
1873 atomic_inc(&md->event_nr);
1874 wake_up(&md->eventq);
1875 dm_issue_global_event();
1876}
1877
1878
1879
1880
1881static void __set_size(struct mapped_device *md, sector_t size)
1882{
1883 lockdep_assert_held(&md->suspend_lock);
1884
1885 set_capacity(md->disk, size);
1886
1887 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1888}
1889
1890
1891
1892
1893static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1894 struct queue_limits *limits)
1895{
1896 struct dm_table *old_map;
1897 struct request_queue *q = md->queue;
1898 sector_t size;
1899
1900 lockdep_assert_held(&md->suspend_lock);
1901
1902 size = dm_table_get_size(t);
1903
1904
1905
1906
1907 if (size != dm_get_size(md))
1908 memset(&md->geometry, 0, sizeof(md->geometry));
1909
1910 __set_size(md, size);
1911
1912 dm_table_event_callback(t, event_callback, md);
1913
1914
1915
1916
1917
1918
1919
1920
1921 if (dm_table_request_based(t)) {
1922 dm_stop_queue(q);
1923
1924
1925
1926
1927
1928 md->immutable_target = dm_table_get_immutable_target(t);
1929 }
1930
1931 __bind_mempools(md, t);
1932
1933 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1934 rcu_assign_pointer(md->map, (void *)t);
1935 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1936
1937 dm_table_set_restrictions(t, q, limits);
1938 if (old_map)
1939 dm_sync_table(md);
1940
1941 return old_map;
1942}
1943
1944
1945
1946
1947static struct dm_table *__unbind(struct mapped_device *md)
1948{
1949 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1950
1951 if (!map)
1952 return NULL;
1953
1954 dm_table_event_callback(map, NULL, NULL);
1955 RCU_INIT_POINTER(md->map, NULL);
1956 dm_sync_table(md);
1957
1958 return map;
1959}
1960
1961
1962
1963
1964int dm_create(int minor, struct mapped_device **result)
1965{
1966 struct mapped_device *md;
1967
1968 md = alloc_dev(minor);
1969 if (!md)
1970 return -ENXIO;
1971
1972 dm_sysfs_init(md);
1973
1974 *result = md;
1975 return 0;
1976}
1977
1978
1979
1980
1981
1982void dm_lock_md_type(struct mapped_device *md)
1983{
1984 mutex_lock(&md->type_lock);
1985}
1986
1987void dm_unlock_md_type(struct mapped_device *md)
1988{
1989 mutex_unlock(&md->type_lock);
1990}
1991
1992void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
1993{
1994 BUG_ON(!mutex_is_locked(&md->type_lock));
1995 md->type = type;
1996}
1997
1998enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
1999{
2000 return md->type;
2001}
2002
2003struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2004{
2005 return md->immutable_target_type;
2006}
2007
2008
2009
2010
2011
2012struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2013{
2014 BUG_ON(!atomic_read(&md->holders));
2015 return &md->queue->limits;
2016}
2017EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2018
2019
2020
2021
2022int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2023{
2024 int r;
2025 enum dm_queue_mode type = dm_get_md_type(md);
2026
2027 switch (type) {
2028 case DM_TYPE_REQUEST_BASED:
2029 r = dm_old_init_request_queue(md, t);
2030 if (r) {
2031 DMERR("Cannot initialize queue for request-based mapped device");
2032 return r;
2033 }
2034 break;
2035 case DM_TYPE_MQ_REQUEST_BASED:
2036 r = dm_mq_init_request_queue(md, t);
2037 if (r) {
2038 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2039 return r;
2040 }
2041 break;
2042 case DM_TYPE_BIO_BASED:
2043 case DM_TYPE_DAX_BIO_BASED:
2044 dm_init_normal_md_queue(md);
2045 blk_queue_make_request(md->queue, dm_make_request);
2046
2047
2048
2049
2050 bioset_free(md->queue->bio_split);
2051 md->queue->bio_split = NULL;
2052
2053 if (type == DM_TYPE_DAX_BIO_BASED)
2054 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2055 break;
2056 case DM_TYPE_NONE:
2057 WARN_ON_ONCE(true);
2058 break;
2059 }
2060
2061 return 0;
2062}
2063
2064struct mapped_device *dm_get_md(dev_t dev)
2065{
2066 struct mapped_device *md;
2067 unsigned minor = MINOR(dev);
2068
2069 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2070 return NULL;
2071
2072 spin_lock(&_minor_lock);
2073
2074 md = idr_find(&_minor_idr, minor);
2075 if (md) {
2076 if ((md == MINOR_ALLOCED ||
2077 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2078 dm_deleting_md(md) ||
2079 test_bit(DMF_FREEING, &md->flags))) {
2080 md = NULL;
2081 goto out;
2082 }
2083 dm_get(md);
2084 }
2085
2086out:
2087 spin_unlock(&_minor_lock);
2088
2089 return md;
2090}
2091EXPORT_SYMBOL_GPL(dm_get_md);
2092
2093void *dm_get_mdptr(struct mapped_device *md)
2094{
2095 return md->interface_ptr;
2096}
2097
2098void dm_set_mdptr(struct mapped_device *md, void *ptr)
2099{
2100 md->interface_ptr = ptr;
2101}
2102
2103void dm_get(struct mapped_device *md)
2104{
2105 atomic_inc(&md->holders);
2106 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2107}
2108
2109int dm_hold(struct mapped_device *md)
2110{
2111 spin_lock(&_minor_lock);
2112 if (test_bit(DMF_FREEING, &md->flags)) {
2113 spin_unlock(&_minor_lock);
2114 return -EBUSY;
2115 }
2116 dm_get(md);
2117 spin_unlock(&_minor_lock);
2118 return 0;
2119}
2120EXPORT_SYMBOL_GPL(dm_hold);
2121
2122const char *dm_device_name(struct mapped_device *md)
2123{
2124 return md->name;
2125}
2126EXPORT_SYMBOL_GPL(dm_device_name);
2127
2128static void __dm_destroy(struct mapped_device *md, bool wait)
2129{
2130 struct request_queue *q = dm_get_md_queue(md);
2131 struct dm_table *map;
2132 int srcu_idx;
2133
2134 might_sleep();
2135
2136 spin_lock(&_minor_lock);
2137 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2138 set_bit(DMF_FREEING, &md->flags);
2139 spin_unlock(&_minor_lock);
2140
2141 blk_set_queue_dying(q);
2142
2143 if (dm_request_based(md) && md->kworker_task)
2144 kthread_flush_worker(&md->kworker);
2145
2146
2147
2148
2149
2150 mutex_lock(&md->suspend_lock);
2151 map = dm_get_live_table(md, &srcu_idx);
2152 if (!dm_suspended_md(md)) {
2153 dm_table_presuspend_targets(map);
2154 dm_table_postsuspend_targets(map);
2155 }
2156
2157 dm_put_live_table(md, srcu_idx);
2158 mutex_unlock(&md->suspend_lock);
2159
2160
2161
2162
2163
2164
2165
2166 if (wait)
2167 while (atomic_read(&md->holders))
2168 msleep(1);
2169 else if (atomic_read(&md->holders))
2170 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2171 dm_device_name(md), atomic_read(&md->holders));
2172
2173 dm_sysfs_exit(md);
2174 dm_table_destroy(__unbind(md));
2175 free_dev(md);
2176}
2177
2178void dm_destroy(struct mapped_device *md)
2179{
2180 __dm_destroy(md, true);
2181}
2182
2183void dm_destroy_immediate(struct mapped_device *md)
2184{
2185 __dm_destroy(md, false);
2186}
2187
2188void dm_put(struct mapped_device *md)
2189{
2190 atomic_dec(&md->holders);
2191}
2192EXPORT_SYMBOL_GPL(dm_put);
2193
2194static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2195{
2196 int r = 0;
2197 DEFINE_WAIT(wait);
2198
2199 while (1) {
2200 prepare_to_wait(&md->wait, &wait, task_state);
2201
2202 if (!md_in_flight(md))
2203 break;
2204
2205 if (signal_pending_state(task_state, current)) {
2206 r = -EINTR;
2207 break;
2208 }
2209
2210 io_schedule();
2211 }
2212 finish_wait(&md->wait, &wait);
2213
2214 return r;
2215}
2216
2217
2218
2219
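/*
 * Process the deferred bios while the device is not suspended.
 */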
2220static void dm_wq_work(struct work_struct *work)
2221{
2222 struct mapped_device *md = container_of(work, struct mapped_device,
2223 work);
2224 struct bio *c;
2225 int srcu_idx;
2226 struct dm_table *map;
2227
2228 map = dm_get_live_table(md, &srcu_idx);
2229
2230 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2231 spin_lock_irq(&md->deferred_lock);
2232 c = bio_list_pop(&md->deferred);
2233 spin_unlock_irq(&md->deferred_lock);
2234
2235 if (!c)
2236 break;
2237
2238 if (dm_request_based(md))
2239 generic_make_request(c);
2240 else
2241 __split_and_process_bio(md, map, c);
2242 }
2243
2244 dm_put_live_table(md, srcu_idx);
2245}
2246
2247static void dm_queue_flush(struct mapped_device *md)
2248{
2249 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2250 smp_mb__after_atomic();
2251 queue_work(md->wq, &md->work);
2252}
2253
2254
2255
2256
2257struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2258{
2259 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2260 struct queue_limits limits;
2261 int r;
2262
2263 mutex_lock(&md->suspend_lock);
2264
2265
2266 if (!dm_suspended_md(md))
2267 goto out;
2268
2269
2270
2271
2272
2273
2274
2275 if (dm_table_has_no_data_devices(table)) {
2276 live_map = dm_get_live_table_fast(md);
2277 if (live_map)
2278 limits = md->queue->limits;
2279 dm_put_live_table_fast(md);
2280 }
2281
2282 if (!live_map) {
2283 r = dm_calculate_queue_limits(table, &limits);
2284 if (r) {
2285 map = ERR_PTR(r);
2286 goto out;
2287 }
2288 }
2289
2290 map = __bind(md, table, &limits);
2291 dm_issue_global_event();
2292
2293out:
2294 mutex_unlock(&md->suspend_lock);
2295 return map;
2296}
2297
2298
2299
2300
2301
2302static int lock_fs(struct mapped_device *md)
2303{
2304 int r;
2305
2306 WARN_ON(md->frozen_sb);
2307
2308 md->frozen_sb = freeze_bdev(md->bdev);
2309 if (IS_ERR(md->frozen_sb)) {
2310 r = PTR_ERR(md->frozen_sb);
2311 md->frozen_sb = NULL;
2312 return r;
2313 }
2314
2315 set_bit(DMF_FROZEN, &md->flags);
2316
2317 return 0;
2318}
2319
2320static void unlock_fs(struct mapped_device *md)
2321{
2322 if (!test_bit(DMF_FROZEN, &md->flags))
2323 return;
2324
2325 thaw_bdev(md->bdev, md->frozen_sb);
2326 md->frozen_sb = NULL;
2327 clear_bit(DMF_FROZEN, &md->flags);
2328}
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
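/*
 * If __dm_suspend returns 0, the device is completely quiescent now:
 * there is no request-processing activity and all new requests are
 * being added to md->deferred. On error the suspend is rolled back.
 *
 * Caller must hold md->suspend_lock.
 */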
2339static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2340 unsigned suspend_flags, long task_state,
2341 int dmf_suspended_flag)
2342{
2343 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2344 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2345 int r;
2346
2347 lockdep_assert_held(&md->suspend_lock);
2348
2349
2350
2351
2352
2353 if (noflush)
2354 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2355 else
2356 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2357
2358
2359
2360
2361
2362 dm_table_presuspend_targets(map);
2363
2364
2365
2366
2367
2368
2369
2370 if (!noflush && do_lockfs) {
2371 r = lock_fs(md);
2372 if (r) {
2373 dm_table_presuspend_undo_targets(map);
2374 return r;
2375 }
2376 }
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2391 if (map)
2392 synchronize_srcu(&md->io_barrier);
2393
2394
2395
2396
2397
2398 if (dm_request_based(md)) {
2399 dm_stop_queue(md->queue);
2400 if (md->kworker_task)
2401 kthread_flush_worker(&md->kworker);
2402 }
2403
2404 flush_workqueue(md->wq);
2405
2406
2407
2408
2409
2410
2411 r = dm_wait_for_completion(md, task_state);
2412 if (!r)
2413 set_bit(dmf_suspended_flag, &md->flags);
2414
2415 if (noflush)
2416 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2417 if (map)
2418 synchronize_srcu(&md->io_barrier);
2419
2420
2421 if (r < 0) {
2422 dm_queue_flush(md);
2423
2424 if (dm_request_based(md))
2425 dm_start_queue(md->queue);
2426
2427 unlock_fs(md);
2428 dm_table_presuspend_undo_targets(map);
2429
2430 }
2431
2432 return r;
2433}
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
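/*
 * Suspend the device so the table can be swapped: any in-flight bios are
 * flushed (optionally freezing the filesystem first) and all further I/O
 * is deferred until dm_resume() is called.
 */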
2451int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2452{
2453 struct dm_table *map = NULL;
2454 int r = 0;
2455
2456retry:
2457 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2458
2459 if (dm_suspended_md(md)) {
2460 r = -EINVAL;
2461 goto out_unlock;
2462 }
2463
2464 if (dm_suspended_internally_md(md)) {
2465
2466 mutex_unlock(&md->suspend_lock);
2467 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2468 if (r)
2469 return r;
2470 goto retry;
2471 }
2472
2473 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2474
2475 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2476 if (r)
2477 goto out_unlock;
2478
2479 dm_table_postsuspend_targets(map);
2480
2481out_unlock:
2482 mutex_unlock(&md->suspend_lock);
2483 return r;
2484}
2485
2486static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2487{
2488 if (map) {
2489 int r = dm_table_resume_targets(map);
2490 if (r)
2491 return r;
2492 }
2493
2494 dm_queue_flush(md);
2495
2496
2497
2498
2499
2500
2501 if (dm_request_based(md))
2502 dm_start_queue(md->queue);
2503
2504 unlock_fs(md);
2505
2506 return 0;
2507}
2508
2509int dm_resume(struct mapped_device *md)
2510{
2511 int r;
2512 struct dm_table *map = NULL;
2513
2514retry:
2515 r = -EINVAL;
2516 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2517
2518 if (!dm_suspended_md(md))
2519 goto out;
2520
2521 if (dm_suspended_internally_md(md)) {
2522
2523 mutex_unlock(&md->suspend_lock);
2524 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2525 if (r)
2526 return r;
2527 goto retry;
2528 }
2529
2530 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2531 if (!map || !dm_table_get_size(map))
2532 goto out;
2533
2534 r = __dm_resume(md, map);
2535 if (r)
2536 goto out;
2537
2538 clear_bit(DMF_SUSPENDED, &md->flags);
2539out:
2540 mutex_unlock(&md->suspend_lock);
2541
2542 return r;
2543}
2544
2545
2546
2547
2548
2549
2550
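/*
 * Internal suspend/resume works like userspace-driven suspend: it waits
 * until all in-flight bios finish and prevents new bios from being issued
 * to the targets. Internal suspends nest, tracked by
 * md->internal_suspend_count, and may only be used from the kernel.
 */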
2551static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2552{
2553 struct dm_table *map = NULL;
2554
2555 lockdep_assert_held(&md->suspend_lock);
2556
2557 if (md->internal_suspend_count++)
2558 return;
2559
2560 if (dm_suspended_md(md)) {
2561 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2562 return;
2563 }
2564
2565 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2566
2567
2568
2569
2570
2571
2572
2573 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2574 DMF_SUSPENDED_INTERNALLY);
2575
2576 dm_table_postsuspend_targets(map);
2577}
2578
2579static void __dm_internal_resume(struct mapped_device *md)
2580{
2581 BUG_ON(!md->internal_suspend_count);
2582
2583 if (--md->internal_suspend_count)
2584 return;
2585
2586 if (dm_suspended_md(md))
2587 goto done;
2588
2589
2590
2591
2592
2593 (void) __dm_resume(md, NULL);
2594
2595done:
2596 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2597 smp_mb__after_atomic();
2598 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2599}
2600
2601void dm_internal_suspend_noflush(struct mapped_device *md)
2602{
2603 mutex_lock(&md->suspend_lock);
2604 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2605 mutex_unlock(&md->suspend_lock);
2606}
2607EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2608
2609void dm_internal_resume(struct mapped_device *md)
2610{
2611 mutex_lock(&md->suspend_lock);
2612 __dm_internal_resume(md);
2613 mutex_unlock(&md->suspend_lock);
2614}
2615EXPORT_SYMBOL_GPL(dm_internal_resume);
2616
/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return; /* suspend_lock stays held until dm_internal_resume_fast() */

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);

	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
		 DM_COOKIE_ENV_VAR_NAME, cookie);
	return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
				  action, envp);
}

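/*
 * Event counters consumed by the dm ioctl and uevent code: dm_wait_event()
 * blocks until md->event_nr moves past the caller's snapshot.
 */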
uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

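/*
 * Exported suspend-state queries for targets, which only hold a dm_target
 * and must reach the owning mapped_device through their table.
 */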
int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

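/*
 * Allocate the per-device mempools.  Bio-based tables get an io_pool plus a
 * bioset whose front_pad makes room for the enclosing dm_target_io and the
 * target's per_io_data_size ahead of each clone bio; request-based tables
 * only need the bioset, sized by the reserved request-based I/O count.
 */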
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);

		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
		if (!pools->io_pool)
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
	case DM_TYPE_MQ_REQUEST_BASED:
		pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is used for blk-mq pdu at queue allocation */
		break;
	default:
		BUG();
	}

	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	mempool_destroy(pools->io_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}

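/*
 * Argument bundle handed to the persistent-reservation register callout so
 * a single iterate_devices callback signature can carry the PR parameters.
 */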
struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

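/*
 * Run @fn against the underlying devices of a single-target table;
 * persistent reservations are not supported on multi-target tables.
 */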
static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

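/*
 * The remaining PR operations are forwarded to the single underlying block
 * device: dm_grab_bdev_for_ioctl() resolves it (taking a reference), the
 * device's own pr_ops handler is called if present, and the reference is
 * dropped with bdput().
 */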
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

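/*
 * Method tables wired up for every mapped device: the block_device_operations
 * used by the block layer and the dax_operations used for DAX-capable tables.
 */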
static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.copy_from_iter = dm_dax_copy_from_iter,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");