/*
 * Device-mapper core: creation and teardown of mapped devices, cloning
 * and dispatch of bios to targets, suspend/resume handling and live
 * table swapping.
 *
 * This file is released under the GPL.
 */

8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/sched/signal.h>
16#include <linux/blkpg.h>
17#include <linux/bio.h>
18#include <linux/mempool.h>
19#include <linux/dax.h>
20#include <linux/slab.h>
21#include <linux/idr.h>
22#include <linux/uio.h>
23#include <linux/hdreg.h>
24#include <linux/delay.h>
25#include <linux/wait.h>
26#include <linux/pr.h>
27
28#define DM_MSG_PREFIX "core"
29
/*
 * Cookies are numeric values sent with CHANGE and REMOVE
 * uevents while resuming, removing or renaming the device.
 */
34#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
35#define DM_COOKIE_LENGTH 24
36
37static const char *_name = DM_NAME;
38
39static unsigned int major = 0;
40static unsigned int _major = 0;
41
42static DEFINE_IDR(_minor_idr);
43
44static DEFINE_SPINLOCK(_minor_lock);
45
46static void do_deferred_remove(struct work_struct *w);
47
48static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
49
50static struct workqueue_struct *deferred_remove_workqueue;
51
52atomic_t dm_global_event_nr = ATOMIC_INIT(0);
53DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
54
/*
 * One of these is allocated per original bio; it holds the state needed
 * to complete the bio once all of its clones have finished.
 */
58struct dm_io {
59 struct mapped_device *md;
60 blk_status_t status;
61 atomic_t io_count;
62 struct bio *bio;
63 unsigned long start_time;
64 spinlock_t endio_lock;
65 struct dm_stats_aux stats_aux;
66};
67
68#define MINOR_ALLOCED ((void *)-1)
69
/*
 * Bits for the md->flags field.
 */
73#define DMF_BLOCK_IO_FOR_SUSPEND 0
74#define DMF_SUSPENDED 1
75#define DMF_FROZEN 2
76#define DMF_FREEING 3
77#define DMF_DELETING 4
78#define DMF_NOFLUSH_SUSPENDING 5
79#define DMF_DEFERRED_REMOVE 6
80#define DMF_SUSPENDED_INTERNALLY 7
81
82#define DM_NUMA_NODE NUMA_NO_NODE
83static int dm_numa_node = DM_NUMA_NODE;
84
/*
 * For mempools
 */
88struct dm_md_mempools {
89 mempool_t *io_pool;
90 struct bio_set *bs;
91};
92
93struct table_device {
94 struct list_head list;
95 atomic_t count;
96 struct dm_dev dm_dev;
97};
98
99static struct kmem_cache *_io_cache;
100static struct kmem_cache *_rq_tio_cache;
101static struct kmem_cache *_rq_cache;
102
/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
106#define RESERVED_BIO_BASED_IOS 16
107static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
108
109static int __dm_get_module_param_int(int *module_param, int min, int max)
110{
111 int param = ACCESS_ONCE(*module_param);
112 int modified_param = 0;
113 bool modified = true;
114
115 if (param < min)
116 modified_param = min;
117 else if (param > max)
118 modified_param = max;
119 else
120 modified = false;
121
122 if (modified) {
123 (void)cmpxchg(module_param, param, modified_param);
124 param = modified_param;
125 }
126
127 return param;
128}
129
130unsigned __dm_get_module_param(unsigned *module_param,
131 unsigned def, unsigned max)
132{
133 unsigned param = ACCESS_ONCE(*module_param);
134 unsigned modified_param = 0;
135
136 if (!param)
137 modified_param = def;
138 else if (param > max)
139 modified_param = max;
140
141 if (modified_param) {
142 (void)cmpxchg(module_param, param, modified_param);
143 param = modified_param;
144 }
145
146 return param;
147}
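/*
 * Both helpers above clamp a writable module parameter without taking a
 * lock: the read is intentionally racy, and cmpxchg() only writes the
 * clamped value back if the parameter has not changed in the meantime;
 * the locally clamped copy is returned either way.
 *
 * Illustrative sketch of how such a parameter is typically exposed
 * (hypothetical wiring, shown for clarity only):
 *
 *	module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
 *	MODULE_PARM_DESC(reserved_bio_based_ios,
 *			 "Reserved IOs in bio-based mempools");
 */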
148
149unsigned dm_get_reserved_bio_based_ios(void)
150{
151 return __dm_get_module_param(&reserved_bio_based_ios,
152 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
153}
154EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
155
156static unsigned dm_get_numa_node(void)
157{
158 return __dm_get_module_param_int(&dm_numa_node,
159 DM_NUMA_NODE, num_online_nodes() - 1);
160}
161
162static int __init local_init(void)
163{
164 int r = -ENOMEM;
165
166
167 _io_cache = KMEM_CACHE(dm_io, 0);
168 if (!_io_cache)
169 return r;
170
171 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
172 if (!_rq_tio_cache)
173 goto out_free_io_cache;
174
175 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
176 __alignof__(struct request), 0, NULL);
177 if (!_rq_cache)
178 goto out_free_rq_tio_cache;
179
180 r = dm_uevent_init();
181 if (r)
182 goto out_free_rq_cache;
183
184 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
185 if (!deferred_remove_workqueue) {
186 r = -ENOMEM;
187 goto out_uevent_exit;
188 }
189
190 _major = major;
191 r = register_blkdev(_major, _name);
192 if (r < 0)
193 goto out_free_workqueue;
194
195 if (!_major)
196 _major = r;
197
198 return 0;
199
200out_free_workqueue:
201 destroy_workqueue(deferred_remove_workqueue);
202out_uevent_exit:
203 dm_uevent_exit();
204out_free_rq_cache:
205 kmem_cache_destroy(_rq_cache);
206out_free_rq_tio_cache:
207 kmem_cache_destroy(_rq_tio_cache);
208out_free_io_cache:
209 kmem_cache_destroy(_io_cache);
210
211 return r;
212}
213
214static void local_exit(void)
215{
216 flush_scheduled_work();
217 destroy_workqueue(deferred_remove_workqueue);
218
219 kmem_cache_destroy(_rq_cache);
220 kmem_cache_destroy(_rq_tio_cache);
221 kmem_cache_destroy(_io_cache);
222 unregister_blkdev(_major, _name);
223 dm_uevent_exit();
224
225 _major = 0;
226
227 DMINFO("cleaned up");
228}
229
230static int (*_inits[])(void) __initdata = {
231 local_init,
232 dm_target_init,
233 dm_linear_init,
234 dm_stripe_init,
235 dm_io_init,
236 dm_kcopyd_init,
237 dm_interface_init,
238 dm_statistics_init,
239};
240
241static void (*_exits[])(void) = {
242 local_exit,
243 dm_target_exit,
244 dm_linear_exit,
245 dm_stripe_exit,
246 dm_io_exit,
247 dm_kcopyd_exit,
248 dm_interface_exit,
249 dm_statistics_exit,
250};
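/*
 * _inits[] and _exits[] must stay in matching order: dm_init() runs the
 * constructors front to back and, on failure, unwinds by calling the
 * corresponding destructors in reverse, so _exits[i] must undo _inits[i].
 */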
251
252static int __init dm_init(void)
253{
254 const int count = ARRAY_SIZE(_inits);
255
256 int r, i;
257
258 for (i = 0; i < count; i++) {
259 r = _inits[i]();
260 if (r)
261 goto bad;
262 }
263
264 return 0;
265
266 bad:
267 while (i--)
268 _exits[i]();
269
270 return r;
271}
272
273static void __exit dm_exit(void)
274{
275 int i = ARRAY_SIZE(_exits);
276
277 while (i--)
278 _exits[i]();
279
280
281
282
283 idr_destroy(&_minor_idr);
284}
285
/*
 * Block device functions
 */
289int dm_deleting_md(struct mapped_device *md)
290{
291 return test_bit(DMF_DELETING, &md->flags);
292}
293
294static int dm_blk_open(struct block_device *bdev, fmode_t mode)
295{
296 struct mapped_device *md;
297
298 spin_lock(&_minor_lock);
299
300 md = bdev->bd_disk->private_data;
301 if (!md)
302 goto out;
303
304 if (test_bit(DMF_FREEING, &md->flags) ||
305 dm_deleting_md(md)) {
306 md = NULL;
307 goto out;
308 }
309
310 dm_get(md);
311 atomic_inc(&md->open_count);
312out:
313 spin_unlock(&_minor_lock);
314
315 return md ? 0 : -ENXIO;
316}
317
318static void dm_blk_close(struct gendisk *disk, fmode_t mode)
319{
320 struct mapped_device *md;
321
322 spin_lock(&_minor_lock);
323
324 md = disk->private_data;
325 if (WARN_ON(!md))
326 goto out;
327
328 if (atomic_dec_and_test(&md->open_count) &&
329 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
330 queue_work(deferred_remove_workqueue, &deferred_remove_work);
331
332 dm_put(md);
333out:
334 spin_unlock(&_minor_lock);
335}
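/*
 * md->open_count tracks opens of the block device under _minor_lock.
 * dm_blk_open() refuses devices that are being freed or deleted, and the
 * last dm_blk_close() kicks the deferred-remove worker so a device whose
 * removal was deferred (DMF_DEFERRED_REMOVE) finally goes away once the
 * last opener is gone.
 */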
336
337int dm_open_count(struct mapped_device *md)
338{
339 return atomic_read(&md->open_count);
340}
341
/*
 * Guarantees nothing is using the device
 * before it's deleted.
 */
345int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
346{
347 int r = 0;
348
349 spin_lock(&_minor_lock);
350
351 if (dm_open_count(md)) {
352 r = -EBUSY;
353 if (mark_deferred)
354 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
355 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
356 r = -EEXIST;
357 else
358 set_bit(DMF_DELETING, &md->flags);
359
360 spin_unlock(&_minor_lock);
361
362 return r;
363}
364
365int dm_cancel_deferred_remove(struct mapped_device *md)
366{
367 int r = 0;
368
369 spin_lock(&_minor_lock);
370
371 if (test_bit(DMF_DELETING, &md->flags))
372 r = -EBUSY;
373 else
374 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
375
376 spin_unlock(&_minor_lock);
377
378 return r;
379}
380
381static void do_deferred_remove(struct work_struct *w)
382{
383 dm_deferred_remove();
384}
385
386sector_t dm_get_size(struct mapped_device *md)
387{
388 return get_capacity(md->disk);
389}
390
391struct request_queue *dm_get_md_queue(struct mapped_device *md)
392{
393 return md->queue;
394}
395
396struct dm_stats *dm_get_stats(struct mapped_device *md)
397{
398 return &md->stats;
399}
400
401static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
402{
403 struct mapped_device *md = bdev->bd_disk->private_data;
404
405 return dm_get_geometry(md, geo);
406}
407
408static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
409 struct block_device **bdev,
410 fmode_t *mode)
411{
412 struct dm_target *tgt;
413 struct dm_table *map;
414 int srcu_idx, r;
415
416retry:
417 r = -ENOTTY;
418 map = dm_get_live_table(md, &srcu_idx);
419 if (!map || !dm_table_get_size(map))
420 goto out;
421
	/* We only support devices that have a single target */
423 if (dm_table_get_num_targets(map) != 1)
424 goto out;
425
426 tgt = dm_table_get_target(map, 0);
427 if (!tgt->type->prepare_ioctl)
428 goto out;
429
430 if (dm_suspended_md(md)) {
431 r = -EAGAIN;
432 goto out;
433 }
434
435 r = tgt->type->prepare_ioctl(tgt, bdev, mode);
436 if (r < 0)
437 goto out;
438
439 bdgrab(*bdev);
440 dm_put_live_table(md, srcu_idx);
441 return r;
442
443out:
444 dm_put_live_table(md, srcu_idx);
445 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
446 msleep(10);
447 goto retry;
448 }
449 return r;
450}
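/*
 * On success the caller owns a reference on *bdev (taken with bdgrab())
 * and must drop it with bdput() once the ioctl has been forwarded; a
 * positive return value means the target remapped to only part of the
 * device, so extra privilege checks may be needed.  Minimal usage
 * sketch, mirroring dm_blk_ioctl() below:
 *
 *	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 *	if (r < 0)
 *		return r;
 *	... forward the ioctl to bdev ...
 *	bdput(bdev);
 */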
451
452static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
453 unsigned int cmd, unsigned long arg)
454{
455 struct mapped_device *md = bdev->bd_disk->private_data;
456 int r;
457
458 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
459 if (r < 0)
460 return r;
461
462 if (r > 0) {
		/*
		 * Target determined that the ioctl should be issued against
		 * a subset of the parent bdev; require extra privileges.
		 */
467 if (!capable(CAP_SYS_RAWIO)) {
468 DMWARN_LIMIT(
469 "%s: sending ioctl %x to DM device without required privilege.",
470 current->comm, cmd);
471 r = -ENOIOCTLCMD;
472 goto out;
473 }
474 }
475
476 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
477out:
478 bdput(bdev);
479 return r;
480}
481
482static struct dm_io *alloc_io(struct mapped_device *md)
483{
484 return mempool_alloc(md->io_pool, GFP_NOIO);
485}
486
487static void free_io(struct mapped_device *md, struct dm_io *io)
488{
489 mempool_free(io, md->io_pool);
490}
491
492static void free_tio(struct dm_target_io *tio)
493{
494 bio_put(&tio->clone);
495}
496
497int md_in_flight(struct mapped_device *md)
498{
499 return atomic_read(&md->pending[READ]) +
500 atomic_read(&md->pending[WRITE]);
501}
502
503static void start_io_acct(struct dm_io *io)
504{
505 struct mapped_device *md = io->md;
506 struct bio *bio = io->bio;
507 int cpu;
508 int rw = bio_data_dir(bio);
509
510 io->start_time = jiffies;
511
512 cpu = part_stat_lock();
513 part_round_stats(cpu, &dm_disk(md)->part0);
514 part_stat_unlock();
515 atomic_set(&dm_disk(md)->part0.in_flight[rw],
516 atomic_inc_return(&md->pending[rw]));
517
518 if (unlikely(dm_stats_used(&md->stats)))
519 dm_stats_account_io(&md->stats, bio_data_dir(bio),
520 bio->bi_iter.bi_sector, bio_sectors(bio),
521 false, 0, &io->stats_aux);
522}
523
524static void end_io_acct(struct dm_io *io)
525{
526 struct mapped_device *md = io->md;
527 struct bio *bio = io->bio;
528 unsigned long duration = jiffies - io->start_time;
529 int pending;
530 int rw = bio_data_dir(bio);
531
532 generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
533
534 if (unlikely(dm_stats_used(&md->stats)))
535 dm_stats_account_io(&md->stats, bio_data_dir(bio),
536 bio->bi_iter.bi_sector, bio_sectors(bio),
537 true, duration, &io->stats_aux);
538
	/*
	 * After this is decremented the bio must not be touched if it is
	 * a flush.
	 */
543 pending = atomic_dec_return(&md->pending[rw]);
544 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
545 pending += atomic_read(&md->pending[rw^0x1]);
546
	/* nudge anyone waiting on suspend queue */
548 if (!pending)
549 wake_up(&md->wait);
550}
551
552
553
554
555static void queue_io(struct mapped_device *md, struct bio *bio)
556{
557 unsigned long flags;
558
559 spin_lock_irqsave(&md->deferred_lock, flags);
560 bio_list_add(&md->deferred, bio);
561 spin_unlock_irqrestore(&md->deferred_lock, flags);
562 queue_work(md->wq, &md->work);
563}
564
/*
 * Everyone (including functions in this file) should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
570struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
571{
572 *srcu_idx = srcu_read_lock(&md->io_barrier);
573
574 return srcu_dereference(md->map, &md->io_barrier);
575}
576
577void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
578{
579 srcu_read_unlock(&md->io_barrier, srcu_idx);
580}
581
582void dm_sync_table(struct mapped_device *md)
583{
584 synchronize_srcu(&md->io_barrier);
585 synchronize_rcu_expedited();
586}
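/*
 * Typical pattern for accessing the live table under SRCU (a minimal
 * sketch; every reader in this file follows it):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... use map, may sleep ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 *
 * Writers publish a new table with rcu_assign_pointer() and then call
 * dm_sync_table() before freeing the old one.
 */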
587
/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
592static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
593{
594 rcu_read_lock();
595 return rcu_dereference(md->map);
596}
597
598static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
599{
600 rcu_read_unlock();
601}
602
/*
 * Open a table device so we can use it as a map destination.
 */
606static int open_table_device(struct table_device *td, dev_t dev,
607 struct mapped_device *md)
608{
609 static char *_claim_ptr = "I belong to device-mapper";
610 struct block_device *bdev;
611
612 int r;
613
614 BUG_ON(td->dm_dev.bdev);
615
616 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
617 if (IS_ERR(bdev))
618 return PTR_ERR(bdev);
619
620 r = bd_link_disk_holder(bdev, dm_disk(md));
621 if (r) {
622 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
623 return r;
624 }
625
626 td->dm_dev.bdev = bdev;
627 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
628 return 0;
629}
630
/*
 * Close a table device that we've been using.
 */
634static void close_table_device(struct table_device *td, struct mapped_device *md)
635{
636 if (!td->dm_dev.bdev)
637 return;
638
639 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
640 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
641 put_dax(td->dm_dev.dax_dev);
642 td->dm_dev.bdev = NULL;
643 td->dm_dev.dax_dev = NULL;
644}
645
646static struct table_device *find_table_device(struct list_head *l, dev_t dev,
647 fmode_t mode) {
648 struct table_device *td;
649
650 list_for_each_entry(td, l, list)
651 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
652 return td;
653
654 return NULL;
655}
656
657int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
658 struct dm_dev **result) {
659 int r;
660 struct table_device *td;
661
662 mutex_lock(&md->table_devices_lock);
663 td = find_table_device(&md->table_devices, dev, mode);
664 if (!td) {
665 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
666 if (!td) {
667 mutex_unlock(&md->table_devices_lock);
668 return -ENOMEM;
669 }
670
671 td->dm_dev.mode = mode;
672 td->dm_dev.bdev = NULL;
673
674 if ((r = open_table_device(td, dev, md))) {
675 mutex_unlock(&md->table_devices_lock);
676 kfree(td);
677 return r;
678 }
679
680 format_dev_t(td->dm_dev.name, dev);
681
682 atomic_set(&td->count, 0);
683 list_add(&td->list, &md->table_devices);
684 }
685 atomic_inc(&td->count);
686 mutex_unlock(&md->table_devices_lock);
687
688 *result = &td->dm_dev;
689 return 0;
690}
691EXPORT_SYMBOL_GPL(dm_get_table_device);
692
693void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
694{
695 struct table_device *td = container_of(d, struct table_device, dm_dev);
696
697 mutex_lock(&md->table_devices_lock);
698 if (atomic_dec_and_test(&td->count)) {
699 close_table_device(td, md);
700 list_del(&td->list);
701 kfree(td);
702 }
703 mutex_unlock(&md->table_devices_lock);
704}
705EXPORT_SYMBOL(dm_put_table_device);
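/*
 * Table devices are kept on md->table_devices and shared by reference
 * count: dm_get_table_device() reuses an existing entry with the same
 * dev_t and mode, and dm_put_table_device() closes and frees the entry
 * only when the last user drops it.  Both take md->table_devices_lock.
 */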
706
707static void free_table_devices(struct list_head *devices)
708{
709 struct list_head *tmp, *next;
710
711 list_for_each_safe(tmp, next, devices) {
712 struct table_device *td = list_entry(tmp, struct table_device, list);
713
714 DMWARN("dm_destroy: %s still exists with %d references",
715 td->dm_dev.name, atomic_read(&td->count));
716 kfree(td);
717 }
718}
719
720
721
722
723int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
724{
725 *geo = md->geometry;
726
727 return 0;
728}
729
730
731
732
733int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
734{
735 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
736
737 if (geo->start > sz) {
738 DMWARN("Start sector is beyond the geometry limits.");
739 return -EINVAL;
740 }
741
742 md->geometry = *geo;
743
744 return 0;
745}
746
747
748
749
750
751
752
753
754
755
756static int __noflush_suspending(struct mapped_device *md)
757{
758 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
759}
760
/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
765static void dec_pending(struct dm_io *io, blk_status_t error)
766{
767 unsigned long flags;
768 blk_status_t io_error;
769 struct bio *bio;
770 struct mapped_device *md = io->md;
771
772
773 if (unlikely(error)) {
774 spin_lock_irqsave(&io->endio_lock, flags);
775 if (!(io->status == BLK_STS_DM_REQUEUE &&
776 __noflush_suspending(md)))
777 io->status = error;
778 spin_unlock_irqrestore(&io->endio_lock, flags);
779 }
780
781 if (atomic_dec_and_test(&io->io_count)) {
782 if (io->status == BLK_STS_DM_REQUEUE) {
783
784
785
786 spin_lock_irqsave(&md->deferred_lock, flags);
787 if (__noflush_suspending(md))
788 bio_list_add_head(&md->deferred, io->bio);
789 else
790
791 io->status = BLK_STS_IOERR;
792 spin_unlock_irqrestore(&md->deferred_lock, flags);
793 }
794
795 io_error = io->status;
796 bio = io->bio;
797 end_io_acct(io);
798 free_io(md, io);
799
800 if (io_error == BLK_STS_DM_REQUEUE)
801 return;
802
803 if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
804
805
806
807
808 bio->bi_opf &= ~REQ_PREFLUSH;
809 queue_io(md, bio);
810 } else {
811
812 bio->bi_status = io_error;
813 bio_endio(bio);
814 }
815 }
816}
817
818void disable_write_same(struct mapped_device *md)
819{
820 struct queue_limits *limits = dm_get_queue_limits(md);
821
822
823 limits->max_write_same_sectors = 0;
824}
825
826void disable_write_zeroes(struct mapped_device *md)
827{
828 struct queue_limits *limits = dm_get_queue_limits(md);
829
830
831 limits->max_write_zeroes_sectors = 0;
832}
833
834static void clone_endio(struct bio *bio)
835{
836 blk_status_t error = bio->bi_status;
837 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
838 struct dm_io *io = tio->io;
839 struct mapped_device *md = tio->io->md;
840 dm_endio_fn endio = tio->ti->type->end_io;
841
842 if (unlikely(error == BLK_STS_TARGET)) {
843 if (bio_op(bio) == REQ_OP_WRITE_SAME &&
844 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)
845 disable_write_same(md);
846 if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
847 !bdev_get_queue(bio->bi_bdev)->limits.max_write_zeroes_sectors)
848 disable_write_zeroes(md);
849 }
850
851 if (endio) {
852 int r = endio(tio->ti, bio, &error);
853 switch (r) {
854 case DM_ENDIO_REQUEUE:
855 error = BLK_STS_DM_REQUEUE;
			/*FALLTHRU*/
857 case DM_ENDIO_DONE:
858 break;
859 case DM_ENDIO_INCOMPLETE:
860
861 return;
862 default:
863 DMWARN("unimplemented target endio return value: %d", r);
864 BUG();
865 }
866 }
867
868 free_tio(tio);
869 dec_pending(io, error);
870}
871
872
873
874
875
876static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
877{
878 sector_t target_offset = dm_target_offset(ti, sector);
879
880 return ti->len - target_offset;
881}
882
883static sector_t max_io_len(sector_t sector, struct dm_target *ti)
884{
885 sector_t len = max_io_len_target_boundary(sector, ti);
886 sector_t offset, max_len;
887
888
889
890
891 if (ti->max_io_len) {
892 offset = dm_target_offset(ti, sector);
893 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
894 max_len = sector_div(offset, ti->max_io_len);
895 else
896 max_len = offset & (ti->max_io_len - 1);
897 max_len = ti->max_io_len - max_len;
898
899 if (len > max_len)
900 len = max_len;
901 }
902
903 return len;
904}
905
906int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
907{
908 if (len > UINT_MAX) {
909 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
910 (unsigned long long)len, UINT_MAX);
911 ti->error = "Maximum size of target IO is too large";
912 return -EINVAL;
913 }
914
915 ti->max_io_len = (uint32_t) len;
916
917 return 0;
918}
919EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
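/*
 * Targets that can only handle I/O in fixed-size chunks call
 * dm_set_target_max_io_len() from their constructor so that max_io_len()
 * splits clones on chunk boundaries.  A minimal, hypothetical ctr sketch:
 *
 *	static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
 *	{
 *		...
 *		int r = dm_set_target_max_io_len(ti, chunk_sectors);
 *		if (r)
 *			return r;
 *		...
 *	}
 */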
920
921static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
922 sector_t sector, int *srcu_idx)
923{
924 struct dm_table *map;
925 struct dm_target *ti;
926
927 map = dm_get_live_table(md, srcu_idx);
928 if (!map)
929 return NULL;
930
931 ti = dm_table_find_target(map, sector);
932 if (!dm_target_is_valid(ti))
933 return NULL;
934
935 return ti;
936}
937
938static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
939 long nr_pages, void **kaddr, pfn_t *pfn)
940{
941 struct mapped_device *md = dax_get_private(dax_dev);
942 sector_t sector = pgoff * PAGE_SECTORS;
943 struct dm_target *ti;
944 long len, ret = -EIO;
945 int srcu_idx;
946
947 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
948
949 if (!ti)
950 goto out;
951 if (!ti->type->direct_access)
952 goto out;
953 len = max_io_len(sector, ti) / PAGE_SECTORS;
954 if (len < 1)
955 goto out;
956 nr_pages = min(len, nr_pages);
957 if (ti->type->direct_access)
958 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
959
960 out:
961 dm_put_live_table(md, srcu_idx);
962
963 return ret;
964}
965
966static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
967 void *addr, size_t bytes, struct iov_iter *i)
968{
969 struct mapped_device *md = dax_get_private(dax_dev);
970 sector_t sector = pgoff * PAGE_SECTORS;
971 struct dm_target *ti;
972 long ret = 0;
973 int srcu_idx;
974
975 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
976
977 if (!ti)
978 goto out;
979 if (!ti->type->dax_copy_from_iter) {
980 ret = copy_from_iter(addr, bytes, i);
981 goto out;
982 }
983 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
984 out:
985 dm_put_live_table(md, srcu_idx);
986
987 return ret;
988}
989
990static void dm_dax_flush(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
991 size_t size)
992{
993 struct mapped_device *md = dax_get_private(dax_dev);
994 sector_t sector = pgoff * PAGE_SECTORS;
995 struct dm_target *ti;
996 int srcu_idx;
997
998 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
999
1000 if (!ti)
1001 goto out;
1002 if (ti->type->dax_flush)
1003 ti->type->dax_flush(ti, pgoff, addr, size);
1004 out:
1005 dm_put_live_table(md, srcu_idx);
1006}
1007
/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH.
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function
 *	(it may be empty if the target doesn't use bio_advance).
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty).
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios,
 * etc.), the partially processed part (the sum of regions 1+2) must be the
 * same for all copies of the bio.
 */
1036void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1037{
1038 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1039 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1040 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1041 BUG_ON(bi_size > *tio->len_ptr);
1042 BUG_ON(n_sectors > bi_size);
1043 *tio->len_ptr -= bi_size - n_sectors;
1044 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1045}
1046EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
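/*
 * A hypothetical map function that only wants to handle the first
 * max_sectors of a clone could use it like this (sketch only):
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		if (bio_sectors(bio) > max_sectors)
 *			dm_accept_partial_bio(bio, max_sectors);
 *		... remap and return DM_MAPIO_REMAPPED ...
 *	}
 *
 * The core then resubmits the remainder of the original bio as further
 * clones, so the target never sees more than it accepted.
 */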
1047
/*
 * The zone descriptors obtained with a zone report indicate zone positions
 * within the underlying device of the target.  They must be remapped to
 * match their position within the dm device.  A target may call
 * dm_remap_zone_report() after completion of a REQ_OP_ZONE_REPORT bio to
 * remap the zone descriptors from the target device mapping to the dm
 * device.
 */
1056void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1057{
1058#ifdef CONFIG_BLK_DEV_ZONED
1059 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1060 struct bio *report_bio = tio->io->bio;
1061 struct blk_zone_report_hdr *hdr = NULL;
1062 struct blk_zone *zone;
1063 unsigned int nr_rep = 0;
1064 unsigned int ofst;
1065 struct bio_vec bvec;
1066 struct bvec_iter iter;
1067 void *addr;
1068
1069 if (bio->bi_status)
1070 return;
1071
1072
1073
1074
1075
1076 bio_for_each_segment(bvec, report_bio, iter) {
1077 addr = kmap_atomic(bvec.bv_page);
1078
1079
1080 if (!hdr) {
1081 hdr = addr;
1082 ofst = sizeof(struct blk_zone_report_hdr);
1083 } else
1084 ofst = 0;
1085
1086
1087 while (hdr->nr_zones && ofst < bvec.bv_len) {
1088 zone = addr + ofst;
1089 if (zone->start >= start + ti->len) {
1090 hdr->nr_zones = 0;
1091 break;
1092 }
1093 zone->start = zone->start + ti->begin - start;
1094 if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1095 if (zone->cond == BLK_ZONE_COND_FULL)
1096 zone->wp = zone->start + zone->len;
1097 else if (zone->cond == BLK_ZONE_COND_EMPTY)
1098 zone->wp = zone->start;
1099 else
1100 zone->wp = zone->wp + ti->begin - start;
1101 }
1102 ofst += sizeof(struct blk_zone);
1103 hdr->nr_zones--;
1104 nr_rep++;
1105 }
1106
1107 if (addr != hdr)
1108 kunmap_atomic(addr);
1109
1110 if (!hdr->nr_zones)
1111 break;
1112 }
1113
1114 if (hdr) {
1115 hdr->nr_zones = nr_rep;
1116 kunmap_atomic(hdr);
1117 }
1118
1119 bio_advance(report_bio, report_bio->bi_iter.bi_size);
1120
1121#else
1122 bio->bi_status = BLK_STS_NOTSUPP;
1123#endif
1124}
1125EXPORT_SYMBOL_GPL(dm_remap_zone_report);
1126
/*
 * Flush current->bio_list when the target map method blocks.
 * This fixes deadlocks in snapshot and possibly in other targets.
 */
1131struct dm_offload {
1132 struct blk_plug plug;
1133 struct blk_plug_cb cb;
1134};
1135
1136static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
1137{
1138 struct dm_offload *o = container_of(cb, struct dm_offload, cb);
1139 struct bio_list list;
1140 struct bio *bio;
1141 int i;
1142
1143 INIT_LIST_HEAD(&o->cb.list);
1144
1145 if (unlikely(!current->bio_list))
1146 return;
1147
1148 for (i = 0; i < 2; i++) {
1149 list = current->bio_list[i];
		bio_list_init(&current->bio_list[i]);
1151
1152 while ((bio = bio_list_pop(&list))) {
1153 struct bio_set *bs = bio->bi_pool;
1154 if (unlikely(!bs) || bs == fs_bio_set ||
1155 !bs->rescue_workqueue) {
				bio_list_add(&current->bio_list[i], bio);
1157 continue;
1158 }
1159
1160 spin_lock(&bs->rescue_lock);
1161 bio_list_add(&bs->rescue_list, bio);
1162 queue_work(bs->rescue_workqueue, &bs->rescue_work);
1163 spin_unlock(&bs->rescue_lock);
1164 }
1165 }
1166}
1167
1168static void dm_offload_start(struct dm_offload *o)
1169{
1170 blk_start_plug(&o->plug);
1171 o->cb.callback = flush_current_bio_list;
	list_add(&o->cb.list, &current->plug->cb_list);
1173}
1174
1175static void dm_offload_end(struct dm_offload *o)
1176{
1177 list_del(&o->cb.list);
1178 blk_finish_plug(&o->plug);
1179}
1180
1181static void __map_bio(struct dm_target_io *tio)
1182{
1183 int r;
1184 sector_t sector;
1185 struct dm_offload o;
1186 struct bio *clone = &tio->clone;
1187 struct dm_target *ti = tio->ti;
1188
1189 clone->bi_end_io = clone_endio;
1190
1191
1192
1193
1194
1195
1196 atomic_inc(&tio->io->io_count);
1197 sector = clone->bi_iter.bi_sector;
1198
1199 dm_offload_start(&o);
1200 r = ti->type->map(ti, clone);
1201 dm_offload_end(&o);
1202
1203 switch (r) {
1204 case DM_MAPIO_SUBMITTED:
1205 break;
1206 case DM_MAPIO_REMAPPED:
1207
1208 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1209 tio->io->bio->bi_bdev->bd_dev, sector);
1210 generic_make_request(clone);
1211 break;
1212 case DM_MAPIO_KILL:
1213 dec_pending(tio->io, BLK_STS_IOERR);
1214 free_tio(tio);
1215 break;
1216 case DM_MAPIO_REQUEUE:
1217 dec_pending(tio->io, BLK_STS_DM_REQUEUE);
1218 free_tio(tio);
1219 break;
1220 default:
1221 DMWARN("unimplemented target map return value: %d", r);
1222 BUG();
1223 }
1224}
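/*
 * The return codes handled above define the bio-based target map contract:
 * DM_MAPIO_SUBMITTED means the target owns the clone (it may complete it or
 * queue it internally), DM_MAPIO_REMAPPED means the core should submit the
 * redirected clone, and DM_MAPIO_KILL/DM_MAPIO_REQUEUE make the core
 * complete the original io with an error or requeue it.
 */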
1225
1226struct clone_info {
1227 struct mapped_device *md;
1228 struct dm_table *map;
1229 struct bio *bio;
1230 struct dm_io *io;
1231 sector_t sector;
1232 unsigned sector_count;
1233};
1234
1235static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1236{
1237 bio->bi_iter.bi_sector = sector;
1238 bio->bi_iter.bi_size = to_bytes(len);
1239}
1240
1241
1242
1243
1244static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1245 sector_t sector, unsigned len)
1246{
1247 struct bio *clone = &tio->clone;
1248
1249 __bio_clone_fast(clone, bio);
1250
1251 if (unlikely(bio_integrity(bio) != NULL)) {
1252 int r;
1253
1254 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1255 !dm_target_passes_integrity(tio->ti->type))) {
1256 DMWARN("%s: the target %s doesn't support integrity data.",
1257 dm_device_name(tio->io->md),
1258 tio->ti->type->name);
1259 return -EIO;
1260 }
1261
1262 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1263 if (r < 0)
1264 return r;
1265 }
1266
1267 if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1268 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1269 clone->bi_iter.bi_size = to_bytes(len);
1270
1271 if (unlikely(bio_integrity(bio) != NULL))
1272 bio_integrity_trim(clone);
1273
1274 return 0;
1275}
1276
1277static struct dm_target_io *alloc_tio(struct clone_info *ci,
1278 struct dm_target *ti,
1279 unsigned target_bio_nr)
1280{
1281 struct dm_target_io *tio;
1282 struct bio *clone;
1283
1284 clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1285 tio = container_of(clone, struct dm_target_io, clone);
1286
1287 tio->io = ci->io;
1288 tio->ti = ti;
1289 tio->target_bio_nr = target_bio_nr;
1290
1291 return tio;
1292}
1293
1294static void __clone_and_map_simple_bio(struct clone_info *ci,
1295 struct dm_target *ti,
1296 unsigned target_bio_nr, unsigned *len)
1297{
1298 struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1299 struct bio *clone = &tio->clone;
1300
1301 tio->len_ptr = len;
1302
1303 __bio_clone_fast(clone, ci->bio);
1304 if (len)
1305 bio_setup_sector(clone, ci->sector, *len);
1306
1307 __map_bio(tio);
1308}
1309
1310static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1311 unsigned num_bios, unsigned *len)
1312{
1313 unsigned target_bio_nr;
1314
1315 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1316 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1317}
1318
1319static int __send_empty_flush(struct clone_info *ci)
1320{
1321 unsigned target_nr = 0;
1322 struct dm_target *ti;
1323
1324 BUG_ON(bio_has_data(ci->bio));
1325 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1326 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1327
1328 return 0;
1329}
1330
1331static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1332 sector_t sector, unsigned *len)
1333{
1334 struct bio *bio = ci->bio;
1335 struct dm_target_io *tio;
1336 unsigned target_bio_nr;
1337 unsigned num_target_bios = 1;
1338 int r = 0;
1339
1340
1341
1342
1343 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1344 num_target_bios = ti->num_write_bios(ti, bio);
1345
1346 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1347 tio = alloc_tio(ci, ti, target_bio_nr);
1348 tio->len_ptr = len;
1349 r = clone_bio(tio, bio, sector, *len);
1350 if (r < 0) {
1351 free_tio(tio);
1352 break;
1353 }
1354 __map_bio(tio);
1355 }
1356
1357 return r;
1358}
1359
1360typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1361
1362static unsigned get_num_discard_bios(struct dm_target *ti)
1363{
1364 return ti->num_discard_bios;
1365}
1366
1367static unsigned get_num_write_same_bios(struct dm_target *ti)
1368{
1369 return ti->num_write_same_bios;
1370}
1371
1372static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1373{
1374 return ti->num_write_zeroes_bios;
1375}
1376
1377typedef bool (*is_split_required_fn)(struct dm_target *ti);
1378
1379static bool is_split_required_for_discard(struct dm_target *ti)
1380{
1381 return ti->split_discard_bios;
1382}
1383
1384static int __send_changing_extent_only(struct clone_info *ci,
1385 get_num_bios_fn get_num_bios,
1386 is_split_required_fn is_split_required)
1387{
1388 struct dm_target *ti;
1389 unsigned len;
1390 unsigned num_bios;
1391
1392 do {
1393 ti = dm_table_find_target(ci->map, ci->sector);
1394 if (!dm_target_is_valid(ti))
1395 return -EIO;
1396
1397
1398
1399
1400
1401
1402
1403 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1404 if (!num_bios)
1405 return -EOPNOTSUPP;
1406
1407 if (is_split_required && !is_split_required(ti))
1408 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1409 else
1410 len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1411
1412 __send_duplicate_bios(ci, ti, num_bios, &len);
1413
1414 ci->sector += len;
1415 } while (ci->sector_count -= len);
1416
1417 return 0;
1418}
1419
1420static int __send_discard(struct clone_info *ci)
1421{
1422 return __send_changing_extent_only(ci, get_num_discard_bios,
1423 is_split_required_for_discard);
1424}
1425
1426static int __send_write_same(struct clone_info *ci)
1427{
1428 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1429}
1430
1431static int __send_write_zeroes(struct clone_info *ci)
1432{
1433 return __send_changing_extent_only(ci, get_num_write_zeroes_bios, NULL);
1434}
1435
/*
 * Select the correct strategy for processing a non-flush bio.
 */
1439static int __split_and_process_non_flush(struct clone_info *ci)
1440{
1441 struct bio *bio = ci->bio;
1442 struct dm_target *ti;
1443 unsigned len;
1444 int r;
1445
1446 if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
1447 return __send_discard(ci);
1448 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
1449 return __send_write_same(ci);
1450 else if (unlikely(bio_op(bio) == REQ_OP_WRITE_ZEROES))
1451 return __send_write_zeroes(ci);
1452
1453 ti = dm_table_find_target(ci->map, ci->sector);
1454 if (!dm_target_is_valid(ti))
1455 return -EIO;
1456
1457 if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1458 len = ci->sector_count;
1459 else
1460 len = min_t(sector_t, max_io_len(ci->sector, ti),
1461 ci->sector_count);
1462
1463 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1464 if (r < 0)
1465 return r;
1466
1467 ci->sector += len;
1468 ci->sector_count -= len;
1469
1470 return 0;
1471}
1472
/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
1476static void __split_and_process_bio(struct mapped_device *md,
1477 struct dm_table *map, struct bio *bio)
1478{
1479 struct clone_info ci;
1480 int error = 0;
1481
1482 if (unlikely(!map)) {
1483 bio_io_error(bio);
1484 return;
1485 }
1486
1487 ci.map = map;
1488 ci.md = md;
1489 ci.io = alloc_io(md);
1490 ci.io->status = 0;
1491 atomic_set(&ci.io->io_count, 1);
1492 ci.io->bio = bio;
1493 ci.io->md = md;
1494 spin_lock_init(&ci.io->endio_lock);
1495 ci.sector = bio->bi_iter.bi_sector;
1496
1497 start_io_acct(ci.io);
1498
1499 if (bio->bi_opf & REQ_PREFLUSH) {
1500 ci.bio = &ci.md->flush_bio;
1501 ci.sector_count = 0;
1502 error = __send_empty_flush(&ci);
1503
1504 } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1505 ci.bio = bio;
1506 ci.sector_count = 0;
1507 error = __split_and_process_non_flush(&ci);
1508 } else {
1509 ci.bio = bio;
1510 ci.sector_count = bio_sectors(bio);
1511 while (ci.sector_count && !error)
1512 error = __split_and_process_non_flush(&ci);
1513 }
1514
1515
1516 dec_pending(ci.io, errno_to_blk_status(error));
1517}
1518
/*
 * The request function that remaps the bio to one or more targets and
 * submits the resulting clones to the underlying devices.  If the device
 * is suspended the bio is deferred instead (unless it is a readahead,
 * which is simply failed).
 */
1526static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1527{
1528 int rw = bio_data_dir(bio);
1529 struct mapped_device *md = q->queuedata;
1530 int srcu_idx;
1531 struct dm_table *map;
1532
1533 map = dm_get_live_table(md, &srcu_idx);
1534
1535 generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1536
1537
1538 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1539 dm_put_live_table(md, srcu_idx);
1540
1541 if (!(bio->bi_opf & REQ_RAHEAD))
1542 queue_io(md, bio);
1543 else
1544 bio_io_error(bio);
1545 return BLK_QC_T_NONE;
1546 }
1547
1548 __split_and_process_bio(md, map, bio);
1549 dm_put_live_table(md, srcu_idx);
1550 return BLK_QC_T_NONE;
1551}
1552
1553static int dm_any_congested(void *congested_data, int bdi_bits)
1554{
1555 int r = bdi_bits;
1556 struct mapped_device *md = congested_data;
1557 struct dm_table *map;
1558
1559 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1560 if (dm_request_based(md)) {
1561
1562
1563
1564
1565 r = md->queue->backing_dev_info->wb.state & bdi_bits;
1566 } else {
1567 map = dm_get_live_table_fast(md);
1568 if (map)
1569 r = dm_table_any_congested(map, bdi_bits);
1570 dm_put_live_table_fast(md);
1571 }
1572 }
1573
1574 return r;
1575}
1576
1577
1578
1579
1580static void free_minor(int minor)
1581{
1582 spin_lock(&_minor_lock);
1583 idr_remove(&_minor_idr, minor);
1584 spin_unlock(&_minor_lock);
1585}
1586
1587
1588
1589
1590static int specific_minor(int minor)
1591{
1592 int r;
1593
1594 if (minor >= (1 << MINORBITS))
1595 return -EINVAL;
1596
1597 idr_preload(GFP_KERNEL);
1598 spin_lock(&_minor_lock);
1599
1600 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1601
1602 spin_unlock(&_minor_lock);
1603 idr_preload_end();
1604 if (r < 0)
1605 return r == -ENOSPC ? -EBUSY : r;
1606 return 0;
1607}
1608
1609static int next_free_minor(int *minor)
1610{
1611 int r;
1612
1613 idr_preload(GFP_KERNEL);
1614 spin_lock(&_minor_lock);
1615
1616 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1617
1618 spin_unlock(&_minor_lock);
1619 idr_preload_end();
1620 if (r < 0)
1621 return r;
1622 *minor = r;
1623 return 0;
1624}
1625
1626static const struct block_device_operations dm_blk_dops;
1627static const struct dax_operations dm_dax_ops;
1628
1629static void dm_wq_work(struct work_struct *work);
1630
1631void dm_init_md_queue(struct mapped_device *md)
1632{
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1643
1644
1645
1646
1647
1648 md->queue->queuedata = md;
1649 md->queue->backing_dev_info->congested_data = md;
1650}
1651
1652void dm_init_normal_md_queue(struct mapped_device *md)
1653{
1654 md->use_blk_mq = false;
1655 dm_init_md_queue(md);
1656
1657
1658
1659
1660 md->queue->backing_dev_info->congested_fn = dm_any_congested;
1661}
1662
1663static void cleanup_mapped_device(struct mapped_device *md)
1664{
1665 if (md->wq)
1666 destroy_workqueue(md->wq);
1667 if (md->kworker_task)
1668 kthread_stop(md->kworker_task);
1669 mempool_destroy(md->io_pool);
1670 if (md->bs)
1671 bioset_free(md->bs);
1672
1673 if (md->dax_dev) {
1674 kill_dax(md->dax_dev);
1675 put_dax(md->dax_dev);
1676 md->dax_dev = NULL;
1677 }
1678
1679 if (md->disk) {
1680 spin_lock(&_minor_lock);
1681 md->disk->private_data = NULL;
1682 spin_unlock(&_minor_lock);
1683 del_gendisk(md->disk);
1684 put_disk(md->disk);
1685 }
1686
1687 if (md->queue)
1688 blk_cleanup_queue(md->queue);
1689
1690 cleanup_srcu_struct(&md->io_barrier);
1691
1692 if (md->bdev) {
1693 bdput(md->bdev);
1694 md->bdev = NULL;
1695 }
1696
1697 dm_mq_cleanup_mapped_device(md);
1698}
1699
/*
 * Allocate and initialise a blank device with a given minor.
 */
1703static struct mapped_device *alloc_dev(int minor)
1704{
1705 int r, numa_node_id = dm_get_numa_node();
1706 struct dax_device *dax_dev;
1707 struct mapped_device *md;
1708 void *old_md;
1709
1710 md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1711 if (!md) {
1712 DMWARN("unable to allocate device, out of memory.");
1713 return NULL;
1714 }
1715
1716 if (!try_module_get(THIS_MODULE))
1717 goto bad_module_get;
1718
1719
1720 if (minor == DM_ANY_MINOR)
1721 r = next_free_minor(&minor);
1722 else
1723 r = specific_minor(minor);
1724 if (r < 0)
1725 goto bad_minor;
1726
1727 r = init_srcu_struct(&md->io_barrier);
1728 if (r < 0)
1729 goto bad_io_barrier;
1730
1731 md->numa_node_id = numa_node_id;
1732 md->use_blk_mq = dm_use_blk_mq_default();
1733 md->init_tio_pdu = false;
1734 md->type = DM_TYPE_NONE;
1735 mutex_init(&md->suspend_lock);
1736 mutex_init(&md->type_lock);
1737 mutex_init(&md->table_devices_lock);
1738 spin_lock_init(&md->deferred_lock);
1739 atomic_set(&md->holders, 1);
1740 atomic_set(&md->open_count, 0);
1741 atomic_set(&md->event_nr, 0);
1742 atomic_set(&md->uevent_seq, 0);
1743 INIT_LIST_HEAD(&md->uevent_list);
1744 INIT_LIST_HEAD(&md->table_devices);
1745 spin_lock_init(&md->uevent_lock);
1746
1747 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1748 if (!md->queue)
1749 goto bad;
1750
1751 dm_init_md_queue(md);
1752
1753 md->disk = alloc_disk_node(1, numa_node_id);
1754 if (!md->disk)
1755 goto bad;
1756
1757 atomic_set(&md->pending[0], 0);
1758 atomic_set(&md->pending[1], 0);
1759 init_waitqueue_head(&md->wait);
1760 INIT_WORK(&md->work, dm_wq_work);
1761 init_waitqueue_head(&md->eventq);
1762 init_completion(&md->kobj_holder.completion);
1763 md->kworker_task = NULL;
1764
1765 md->disk->major = _major;
1766 md->disk->first_minor = minor;
1767 md->disk->fops = &dm_blk_dops;
1768 md->disk->queue = md->queue;
1769 md->disk->private_data = md;
1770 sprintf(md->disk->disk_name, "dm-%d", minor);
1771
1772 dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1773 if (!dax_dev)
1774 goto bad;
1775 md->dax_dev = dax_dev;
1776
1777 add_disk(md->disk);
1778 format_dev_t(md->name, MKDEV(_major, minor));
1779
1780 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1781 if (!md->wq)
1782 goto bad;
1783
1784 md->bdev = bdget_disk(md->disk, 0);
1785 if (!md->bdev)
1786 goto bad;
1787
1788 bio_init(&md->flush_bio, NULL, 0);
1789 md->flush_bio.bi_bdev = md->bdev;
1790 md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1791
1792 dm_stats_init(&md->stats);
1793
1794
1795 spin_lock(&_minor_lock);
1796 old_md = idr_replace(&_minor_idr, md, minor);
1797 spin_unlock(&_minor_lock);
1798
1799 BUG_ON(old_md != MINOR_ALLOCED);
1800
1801 return md;
1802
1803bad:
1804 cleanup_mapped_device(md);
1805bad_io_barrier:
1806 free_minor(minor);
1807bad_minor:
1808 module_put(THIS_MODULE);
1809bad_module_get:
1810 kfree(md);
1811 return NULL;
1812}
1813
1814static void unlock_fs(struct mapped_device *md);
1815
1816static void free_dev(struct mapped_device *md)
1817{
1818 int minor = MINOR(disk_devt(md->disk));
1819
1820 unlock_fs(md);
1821
1822 cleanup_mapped_device(md);
1823
1824 free_table_devices(&md->table_devices);
1825 dm_stats_cleanup(&md->stats);
1826 free_minor(minor);
1827
1828 module_put(THIS_MODULE);
1829 kfree(md);
1830}
1831
1832static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1833{
1834 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1835
1836 if (md->bs) {
1837
1838 if (dm_table_bio_based(t)) {
1839
1840
1841
1842
1843 bioset_free(md->bs);
1844 md->bs = p->bs;
1845 p->bs = NULL;
1846 }
1847
1848
1849
1850
1851
1852
1853
1854
1855 goto out;
1856 }
1857
1858 BUG_ON(!p || md->io_pool || md->bs);
1859
1860 md->io_pool = p->io_pool;
1861 p->io_pool = NULL;
1862 md->bs = p->bs;
1863 p->bs = NULL;
1864
1865out:
1866
1867 dm_table_free_md_mempools(t);
1868}
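/*
 * Once md->bs exists it is only replaced for bio-based tables, where the
 * required bio front_pad may change from one table to the next; request-
 * based tables keep the existing bioset.  Either way the table's remaining
 * mempools are released at the end, because the mapped_device now owns
 * everything it needs.
 */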
1869
1870
1871
1872
1873static void event_callback(void *context)
1874{
1875 unsigned long flags;
1876 LIST_HEAD(uevents);
1877 struct mapped_device *md = (struct mapped_device *) context;
1878
1879 spin_lock_irqsave(&md->uevent_lock, flags);
1880 list_splice_init(&md->uevent_list, &uevents);
1881 spin_unlock_irqrestore(&md->uevent_lock, flags);
1882
1883 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1884
1885 atomic_inc(&md->event_nr);
1886 atomic_inc(&dm_global_event_nr);
1887 wake_up(&md->eventq);
1888 wake_up(&dm_global_eventq);
1889}
1890
1891
1892
1893
1894static void __set_size(struct mapped_device *md, sector_t size)
1895{
1896 lockdep_assert_held(&md->suspend_lock);
1897
1898 set_capacity(md->disk, size);
1899
1900 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1901}
1902
/*
 * Returns old map, which caller must destroy.
 */
1906static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1907 struct queue_limits *limits)
1908{
1909 struct dm_table *old_map;
1910 struct request_queue *q = md->queue;
1911 sector_t size;
1912
1913 lockdep_assert_held(&md->suspend_lock);
1914
1915 size = dm_table_get_size(t);
1916
1917
1918
1919
1920 if (size != dm_get_size(md))
1921 memset(&md->geometry, 0, sizeof(md->geometry));
1922
1923 __set_size(md, size);
1924
1925 dm_table_event_callback(t, event_callback, md);
1926
1927
1928
1929
1930
1931
1932
1933
1934 if (dm_table_request_based(t)) {
1935 dm_stop_queue(q);
1936
1937
1938
1939
1940
1941 md->immutable_target = dm_table_get_immutable_target(t);
1942 }
1943
1944 __bind_mempools(md, t);
1945
1946 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1947 rcu_assign_pointer(md->map, (void *)t);
1948 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1949
1950 dm_table_set_restrictions(t, q, limits);
1951 if (old_map)
1952 dm_sync_table(md);
1953
1954 return old_map;
1955}
1956
/*
 * Returns unbound table for the caller to free.
 */
1960static struct dm_table *__unbind(struct mapped_device *md)
1961{
1962 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1963
1964 if (!map)
1965 return NULL;
1966
1967 dm_table_event_callback(map, NULL, NULL);
1968 RCU_INIT_POINTER(md->map, NULL);
1969 dm_sync_table(md);
1970
1971 return map;
1972}
1973
/*
 * Constructor for a new device.
 */
1977int dm_create(int minor, struct mapped_device **result)
1978{
1979 struct mapped_device *md;
1980
1981 md = alloc_dev(minor);
1982 if (!md)
1983 return -ENXIO;
1984
1985 dm_sysfs_init(md);
1986
1987 *result = md;
1988 return 0;
1989}
1990
/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
1995void dm_lock_md_type(struct mapped_device *md)
1996{
1997 mutex_lock(&md->type_lock);
1998}
1999
2000void dm_unlock_md_type(struct mapped_device *md)
2001{
2002 mutex_unlock(&md->type_lock);
2003}
2004
2005void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2006{
2007 BUG_ON(!mutex_is_locked(&md->type_lock));
2008 md->type = type;
2009}
2010
2011enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2012{
2013 return md->type;
2014}
2015
2016struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2017{
2018 return md->immutable_target_type;
2019}
2020
/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
2025struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2026{
2027 BUG_ON(!atomic_read(&md->holders));
2028 return &md->queue->limits;
2029}
2030EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2031
/*
 * Setup the DM device's queue based on md's type.
 */
2035int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2036{
2037 int r;
2038 enum dm_queue_mode type = dm_get_md_type(md);
2039
2040 switch (type) {
2041 case DM_TYPE_REQUEST_BASED:
2042 r = dm_old_init_request_queue(md, t);
2043 if (r) {
2044 DMERR("Cannot initialize queue for request-based mapped device");
2045 return r;
2046 }
2047 break;
2048 case DM_TYPE_MQ_REQUEST_BASED:
2049 r = dm_mq_init_request_queue(md, t);
2050 if (r) {
2051 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2052 return r;
2053 }
2054 break;
2055 case DM_TYPE_BIO_BASED:
2056 case DM_TYPE_DAX_BIO_BASED:
2057 dm_init_normal_md_queue(md);
2058 blk_queue_make_request(md->queue, dm_make_request);
2059
2060
2061
2062
2063 bioset_free(md->queue->bio_split);
2064 md->queue->bio_split = NULL;
2065
2066 if (type == DM_TYPE_DAX_BIO_BASED)
2067 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2068 break;
2069 case DM_TYPE_NONE:
2070 WARN_ON_ONCE(true);
2071 break;
2072 }
2073
2074 return 0;
2075}
2076
2077struct mapped_device *dm_get_md(dev_t dev)
2078{
2079 struct mapped_device *md;
2080 unsigned minor = MINOR(dev);
2081
2082 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2083 return NULL;
2084
2085 spin_lock(&_minor_lock);
2086
2087 md = idr_find(&_minor_idr, minor);
2088 if (md) {
2089 if ((md == MINOR_ALLOCED ||
2090 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2091 dm_deleting_md(md) ||
2092 test_bit(DMF_FREEING, &md->flags))) {
2093 md = NULL;
2094 goto out;
2095 }
2096 dm_get(md);
2097 }
2098
2099out:
2100 spin_unlock(&_minor_lock);
2101
2102 return md;
2103}
2104EXPORT_SYMBOL_GPL(dm_get_md);
2105
2106void *dm_get_mdptr(struct mapped_device *md)
2107{
2108 return md->interface_ptr;
2109}
2110
2111void dm_set_mdptr(struct mapped_device *md, void *ptr)
2112{
2113 md->interface_ptr = ptr;
2114}
2115
2116void dm_get(struct mapped_device *md)
2117{
2118 atomic_inc(&md->holders);
2119 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2120}
2121
2122int dm_hold(struct mapped_device *md)
2123{
2124 spin_lock(&_minor_lock);
2125 if (test_bit(DMF_FREEING, &md->flags)) {
2126 spin_unlock(&_minor_lock);
2127 return -EBUSY;
2128 }
2129 dm_get(md);
2130 spin_unlock(&_minor_lock);
2131 return 0;
2132}
2133EXPORT_SYMBOL_GPL(dm_hold);
2134
2135const char *dm_device_name(struct mapped_device *md)
2136{
2137 return md->name;
2138}
2139EXPORT_SYMBOL_GPL(dm_device_name);
2140
2141static void __dm_destroy(struct mapped_device *md, bool wait)
2142{
2143 struct request_queue *q = dm_get_md_queue(md);
2144 struct dm_table *map;
2145 int srcu_idx;
2146
2147 might_sleep();
2148
2149 spin_lock(&_minor_lock);
2150 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2151 set_bit(DMF_FREEING, &md->flags);
2152 spin_unlock(&_minor_lock);
2153
2154 blk_set_queue_dying(q);
2155
2156 if (dm_request_based(md) && md->kworker_task)
2157 kthread_flush_worker(&md->kworker);
2158
2159
2160
2161
2162
2163 mutex_lock(&md->suspend_lock);
2164 map = dm_get_live_table(md, &srcu_idx);
2165 if (!dm_suspended_md(md)) {
2166 dm_table_presuspend_targets(map);
2167 dm_table_postsuspend_targets(map);
2168 }
2169
2170 dm_put_live_table(md, srcu_idx);
2171 mutex_unlock(&md->suspend_lock);
2172
2173
2174
2175
2176
2177
2178
2179 if (wait)
2180 while (atomic_read(&md->holders))
2181 msleep(1);
2182 else if (atomic_read(&md->holders))
2183 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2184 dm_device_name(md), atomic_read(&md->holders));
2185
2186 dm_sysfs_exit(md);
2187 dm_table_destroy(__unbind(md));
2188 free_dev(md);
2189}
2190
2191void dm_destroy(struct mapped_device *md)
2192{
2193 __dm_destroy(md, true);
2194}
2195
2196void dm_destroy_immediate(struct mapped_device *md)
2197{
2198 __dm_destroy(md, false);
2199}
2200
2201void dm_put(struct mapped_device *md)
2202{
2203 atomic_dec(&md->holders);
2204}
2205EXPORT_SYMBOL_GPL(dm_put);
2206
2207static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2208{
2209 int r = 0;
2210 DEFINE_WAIT(wait);
2211
2212 while (1) {
2213 prepare_to_wait(&md->wait, &wait, task_state);
2214
2215 if (!md_in_flight(md))
2216 break;
2217
2218 if (signal_pending_state(task_state, current)) {
2219 r = -EINTR;
2220 break;
2221 }
2222
2223 io_schedule();
2224 }
2225 finish_wait(&md->wait, &wait);
2226
2227 return r;
2228}
2229
/*
 * Process the deferred bios.
 */
2233static void dm_wq_work(struct work_struct *work)
2234{
2235 struct mapped_device *md = container_of(work, struct mapped_device,
2236 work);
2237 struct bio *c;
2238 int srcu_idx;
2239 struct dm_table *map;
2240
2241 map = dm_get_live_table(md, &srcu_idx);
2242
2243 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2244 spin_lock_irq(&md->deferred_lock);
2245 c = bio_list_pop(&md->deferred);
2246 spin_unlock_irq(&md->deferred_lock);
2247
2248 if (!c)
2249 break;
2250
2251 if (dm_request_based(md))
2252 generic_make_request(c);
2253 else
2254 __split_and_process_bio(md, map, c);
2255 }
2256
2257 dm_put_live_table(md, srcu_idx);
2258}
2259
2260static void dm_queue_flush(struct mapped_device *md)
2261{
2262 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2263 smp_mb__after_atomic();
2264 queue_work(md->wq, &md->work);
2265}
2266
/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
2270struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2271{
2272 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2273 struct queue_limits limits;
2274 int r;
2275
2276 mutex_lock(&md->suspend_lock);
2277
2278
2279 if (!dm_suspended_md(md))
2280 goto out;
2281
2282
2283
2284
2285
2286
2287
2288 if (dm_table_has_no_data_devices(table)) {
2289 live_map = dm_get_live_table_fast(md);
2290 if (live_map)
2291 limits = md->queue->limits;
2292 dm_put_live_table_fast(md);
2293 }
2294
2295 if (!live_map) {
2296 r = dm_calculate_queue_limits(table, &limits);
2297 if (r) {
2298 map = ERR_PTR(r);
2299 goto out;
2300 }
2301 }
2302
2303 map = __bind(md, table, &limits);
2304
2305out:
2306 mutex_unlock(&md->suspend_lock);
2307 return map;
2308}
2309
/*
 * Functions to lock and unlock any filesystem running on the
 * device.
 */
2314static int lock_fs(struct mapped_device *md)
2315{
2316 int r;
2317
2318 WARN_ON(md->frozen_sb);
2319
2320 md->frozen_sb = freeze_bdev(md->bdev);
2321 if (IS_ERR(md->frozen_sb)) {
2322 r = PTR_ERR(md->frozen_sb);
2323 md->frozen_sb = NULL;
2324 return r;
2325 }
2326
2327 set_bit(DMF_FROZEN, &md->flags);
2328
2329 return 0;
2330}
2331
2332static void unlock_fs(struct mapped_device *md)
2333{
2334 if (!test_bit(DMF_FROZEN, &md->flags))
2335 return;
2336
2337 thaw_bdev(md->bdev, md->frozen_sb);
2338 md->frozen_sb = NULL;
2339 clear_bit(DMF_FROZEN, &md->flags);
2340}
2341
/*
 * If __dm_suspend returns 0, the device is completely quiescent
 * now. There is no request-processing activity. All new requests
 * are being added to md->deferred list.
 *
 * Caller must hold md->suspend_lock.
 */
2351static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2352 unsigned suspend_flags, long task_state,
2353 int dmf_suspended_flag)
2354{
2355 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2356 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2357 int r;
2358
2359 lockdep_assert_held(&md->suspend_lock);
2360
2361
2362
2363
2364
2365 if (noflush)
2366 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2367 else
2368 pr_debug("%s: suspending with flush\n", dm_device_name(md));
2369
2370
2371
2372
2373
2374 dm_table_presuspend_targets(map);
2375
2376
2377
2378
2379
2380
2381
2382 if (!noflush && do_lockfs) {
2383 r = lock_fs(md);
2384 if (r) {
2385 dm_table_presuspend_undo_targets(map);
2386 return r;
2387 }
2388 }
2389
2390
2391
2392
2393
2394
2395
2396
2397
2398
2399
2400
2401
2402 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2403 if (map)
2404 synchronize_srcu(&md->io_barrier);
2405
2406
2407
2408
2409
2410 if (dm_request_based(md)) {
2411 dm_stop_queue(md->queue);
2412 if (md->kworker_task)
2413 kthread_flush_worker(&md->kworker);
2414 }
2415
2416 flush_workqueue(md->wq);
2417
2418
2419
2420
2421
2422
2423 r = dm_wait_for_completion(md, task_state);
2424 if (!r)
2425 set_bit(dmf_suspended_flag, &md->flags);
2426
2427 if (noflush)
2428 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2429 if (map)
2430 synchronize_srcu(&md->io_barrier);
2431
2432
2433 if (r < 0) {
2434 dm_queue_flush(md);
2435
2436 if (dm_request_based(md))
2437 dm_start_queue(md->queue);
2438
2439 unlock_fs(md);
2440 dm_table_presuspend_undo_targets(map);
2441
2442 }
2443
2444 return r;
2445}
2446
/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_bind_table, dm_suspend must be called to flush any in
 * flight bios and ensure that any further io gets deferred.
 */
/*
 * Suspend mechanism in request-based dm.
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by interrupting md->queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
2463int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2464{
2465 struct dm_table *map = NULL;
2466 int r = 0;
2467
2468retry:
2469 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2470
2471 if (dm_suspended_md(md)) {
2472 r = -EINVAL;
2473 goto out_unlock;
2474 }
2475
2476 if (dm_suspended_internally_md(md)) {
2477
2478 mutex_unlock(&md->suspend_lock);
2479 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2480 if (r)
2481 return r;
2482 goto retry;
2483 }
2484
2485 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2486
2487 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2488 if (r)
2489 goto out_unlock;
2490
2491 dm_table_postsuspend_targets(map);
2492
2493out_unlock:
2494 mutex_unlock(&md->suspend_lock);
2495 return r;
2496}
2497
2498static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2499{
2500 if (map) {
2501 int r = dm_table_resume_targets(map);
2502 if (r)
2503 return r;
2504 }
2505
2506 dm_queue_flush(md);
2507
2508
2509
2510
2511
2512
2513 if (dm_request_based(md))
2514 dm_start_queue(md->queue);
2515
2516 unlock_fs(md);
2517
2518 return 0;
2519}
2520
2521int dm_resume(struct mapped_device *md)
2522{
2523 int r;
2524 struct dm_table *map = NULL;
2525
2526retry:
2527 r = -EINVAL;
2528 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2529
2530 if (!dm_suspended_md(md))
2531 goto out;
2532
2533 if (dm_suspended_internally_md(md)) {
2534
2535 mutex_unlock(&md->suspend_lock);
2536 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2537 if (r)
2538 return r;
2539 goto retry;
2540 }
2541
2542 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2543 if (!map || !dm_table_get_size(map))
2544 goto out;
2545
2546 r = __dm_resume(md, map);
2547 if (r)
2548 goto out;
2549
2550 clear_bit(DMF_SUSPENDED, &md->flags);
2551out:
2552 mutex_unlock(&md->suspend_lock);
2553
2554 return r;
2555}
2556
/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */
2563static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2564{
2565 struct dm_table *map = NULL;
2566
2567 lockdep_assert_held(&md->suspend_lock);
2568
2569 if (md->internal_suspend_count++)
2570 return;
2571
2572 if (dm_suspended_md(md)) {
2573 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2574 return;
2575 }
2576
2577 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2578
2579
2580
2581
2582
2583
2584
2585 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2586 DMF_SUSPENDED_INTERNALLY);
2587
2588 dm_table_postsuspend_targets(map);
2589}
2590
2591static void __dm_internal_resume(struct mapped_device *md)
2592{
2593 BUG_ON(!md->internal_suspend_count);
2594
2595 if (--md->internal_suspend_count)
2596 return;
2597
2598 if (dm_suspended_md(md))
2599 goto done;
2600
2601
2602
2603
2604
2605 (void) __dm_resume(md, NULL);
2606
2607done:
2608 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2609 smp_mb__after_atomic();
2610 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2611}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return; /* nest suspend */

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done; /* resume from nested suspend */

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
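
/*
 * Illustrative only: dm_internal_suspend_fast() returns with
 * md->suspend_lock still held, and dm_internal_resume_fast() drops it, so
 * a kernel-side caller always uses them as a strict bracket around the
 * critical section (hypothetical sketch):
 *
 *	dm_internal_suspend_fast(md);
 *	... manipulate state that must not race with in-flight I/O ...
 *	dm_internal_resume_fast(md);
 */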

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}
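
/*
 * Illustrative only (hypothetical call site): the ioctl layer passes the
 * userspace-supplied cookie so udev can match the event, e.g.
 *
 *	dm_kobject_uevent(md, KOBJ_CHANGE, cookie_from_userspace);
 *
 * With a zero cookie a plain uevent is emitted; otherwise
 * DM_COOKIE=<value> is added to the uevent environment.
 */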

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}
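
/*
 * Illustrative only (hypothetical sketch): a caller records the current
 * event number and later blocks until it changes:
 *
 *	uint32_t seen = dm_get_event_nr(md);
 *	...
 *	if (dm_wait_event(md, seen))
 *		return -ERESTARTSYS;
 *
 * dm_wait_event() returns 0 once at least one new event has been raised,
 * or -ERESTARTSYS if the wait was interrupted by a signal.
 */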

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	if (test_bit(DMF_FREEING, &md->flags) ||
	    dm_deleting_md(md))
		return NULL;

	dm_get(md);
	return md;
}
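
/*
 * Illustrative only (hypothetical sketch): a sysfs attribute show method
 * resolves its kobject back to the mapped_device and must drop the
 * reference dm_get_from_kobject() took:
 *
 *	static ssize_t example_show(struct kobject *kobj, char *buf)
 *	{
 *		struct mapped_device *md = dm_get_from_kobject(kobj);
 *		ssize_t r;
 *
 *		if (!md)
 *			return -EINVAL;
 *		r = sprintf(buf, "%s\n", dm_device_name(md));
 *		dm_put(md);
 *		return r;
 *	}
 */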

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
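
/*
 * Illustrative only (hypothetical sketch): a target's completion hook can
 * use dm_noflush_suspending() to decide whether a failed bio should be
 * pushed back and retried after resume instead of erroring out:
 *
 *	static int example_end_io(struct dm_target *ti, struct bio *bio,
 *				  blk_status_t *error)
 *	{
 *		if (*error && dm_noflush_suspending(ti))
 *			return DM_ENDIO_REQUEUE;
 *		return DM_ENDIO_DONE;
 *	}
 *
 * DM_ENDIO_REQUEUE makes the core place the bio on the deferred list so it
 * is reissued once the device is resumed, rather than completing with an
 * error in the middle of a noflush suspend.
 */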

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
		pool_size = dm_get_reserved_bio_based_ios();
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);

		pools->io_pool = mempool_create_slab_pool(pool_size, _io_cache);
		if (!pools->io_pool)
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
	case DM_TYPE_MQ_REQUEST_BASED:
		pool_size = dm_get_reserved_rq_based_ios();
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per-io data is carried in the request pdu, so no io_pool here */
		break;
	default:
		BUG();
	}

	pools->bs = bioset_create(pool_size, front_pad, BIOSET_NEED_RESCUER);
	if (!pools->bs)
		goto out;

	if (integrity && bioset_integrity_create(pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	mempool_destroy(pools->io_pool);

	if (pools->bs)
		bioset_free(pools->bs);

	kfree(pools);
}
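
/*
 * Illustrative only (hypothetical sketch): the table-load path sizes the
 * pools from the table's type and largest per-io data requirement, and
 * frees them again if the table is never bound:
 *
 *	pools = dm_alloc_md_mempools(md, type, integrity_supported,
 *				     per_io_data_size);
 *	if (!pools)
 *		return -ENOMEM;
 *	...
 *	dm_free_md_mempools(pools);
 */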

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key	= old_key,
		.new_key	= new_key,
		.flags		= flags,
		.fail_early	= true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}

static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	fmode_t mode;
	int r;

	r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
	if (r < 0)
		return r;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;

	bdput(bdev);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};
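
/*
 * Illustrative only (hypothetical userspace sketch): these pr_ops are
 * reached through the generic persistent-reservation ioctls issued against
 * the dm block device, e.g.
 *
 *	#include <linux/pr.h>
 *
 *	struct pr_registration reg = {
 *		.old_key = 0,
 *		.new_key = 0x1234,
 *		.flags	 = 0,
 *	};
 *	int r = ioctl(fd, IOC_PR_REGISTER, &reg);
 *
 * where fd is an open dm device node.  The register path fans the request
 * out to every underlying device via dm_call_pr(); the remaining
 * operations are forwarded to the single underlying device returned by
 * dm_grab_bdev_for_ioctl().
 */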

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.copy_from_iter = dm_dax_copy_from_iter,
	.flush = dm_dax_flush,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");