1
2
3
4
5
6
7
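/*
 * Device-mapper core: manages mapped devices, their tables and the
 * splitting/cloning of bios (and dispatch of requests) to targets.
 */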
8#include "dm-core.h"
9#include "dm-rq.h"
10#include "dm-uevent.h"
11
12#include <linux/init.h>
13#include <linux/module.h>
14#include <linux/mutex.h>
15#include <linux/blkpg.h>
16#include <linux/bio.h>
17#include <linux/mempool.h>
18#include <linux/slab.h>
19#include <linux/idr.h>
20#include <linux/hdreg.h>
21#include <linux/delay.h>
22#include <linux/wait.h>
23#include <linux/pr.h>
24
25#define DM_MSG_PREFIX "core"
26
27#ifdef CONFIG_PRINTK
28
29
30
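/*
 * Rate-limit state shared by the DMXXX_LIMIT() message macros.
 */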
31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
32 DEFAULT_RATELIMIT_INTERVAL,
33 DEFAULT_RATELIMIT_BURST);
34EXPORT_SYMBOL(dm_ratelimit_state);
35#endif
36
37
38
39
40
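/*
 * Cookies are numeric values passed to userspace in the uevent
 * environment (DM_COOKIE=<value>) so that udev can match uevents to
 * the operations that triggered them.
 */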
41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
42#define DM_COOKIE_LENGTH 24
43
44static const char *_name = DM_NAME;
45
46static unsigned int major = 0;
47static unsigned int _major = 0;
48
49static DEFINE_IDR(_minor_idr);
50
51static DEFINE_SPINLOCK(_minor_lock);
52
53static void do_deferred_remove(struct work_struct *w);
54
55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
56
57static struct workqueue_struct *deferred_remove_workqueue;
58
59
60
61
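/*
 * One of these is allocated per original bio; it tracks the number of
 * outstanding clones, the error to report and the I/O accounting state.
 */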
62struct dm_io {
63 struct mapped_device *md;
64 int error;
65 atomic_t io_count;
66 struct bio *bio;
67 unsigned long start_time;
68 spinlock_t endio_lock;
69 struct dm_stats_aux stats_aux;
70};
71
72#define MINOR_ALLOCED ((void *)-1)
73
74
75
76
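/*
 * Bits for the md->flags field.
 */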
77#define DMF_BLOCK_IO_FOR_SUSPEND 0
78#define DMF_SUSPENDED 1
79#define DMF_FROZEN 2
80#define DMF_FREEING 3
81#define DMF_DELETING 4
82#define DMF_NOFLUSH_SUSPENDING 5
83#define DMF_MERGE_IS_OPTIONAL 6
84#define DMF_DEFERRED_REMOVE 7
85#define DMF_SUSPENDED_INTERNALLY 8
86
87#define DM_NUMA_NODE NUMA_NO_NODE
88static int dm_numa_node = DM_NUMA_NODE;
89
90
91
92
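/*
 * Memory pools reserved per mapped device for its I/O structures.
 */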
93struct dm_md_mempools {
94 mempool_t *io_pool;
95 mempool_t *rq_pool;
96 struct bio_set *bs;
97};
98
99struct table_device {
100 struct list_head list;
101 atomic_t count;
102 struct dm_dev dm_dev;
103};
104
105static struct kmem_cache *_io_cache;
106static struct kmem_cache *_rq_tio_cache;
107static struct kmem_cache *_rq_cache;
108
109
110
111
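/*
 * Default number of bio-based I/O structures reserved in each mapped
 * device's mempool.
 */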
112#define RESERVED_BIO_BASED_IOS 16
113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
114
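/*
 * Read a module parameter, clamping it to the [min, max] range and
 * writing the clamped value back so later readers see it.
 */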
115static int __dm_get_module_param_int(int *module_param, int min, int max)
116{
117 int param = ACCESS_ONCE(*module_param);
118 int modified_param = 0;
119 bool modified = true;
120
121 if (param < min)
122 modified_param = min;
123 else if (param > max)
124 modified_param = max;
125 else
126 modified = false;
127
128 if (modified) {
129 (void)cmpxchg(module_param, param, modified_param);
130 param = modified_param;
131 }
132
133 return param;
134}
135
136unsigned __dm_get_module_param(unsigned *module_param,
137 unsigned def, unsigned max)
138{
139 unsigned param = ACCESS_ONCE(*module_param);
140 unsigned modified_param = 0;
141
142 if (!param)
143 modified_param = def;
144 else if (param > max)
145 modified_param = max;
146
147 if (modified_param) {
148 (void)cmpxchg(module_param, param, modified_param);
149 param = modified_param;
150 }
151
152 return param;
153}
154
155unsigned dm_get_reserved_bio_based_ios(void)
156{
157 return __dm_get_module_param(&reserved_bio_based_ios,
158 RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
159}
160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
161
162static unsigned dm_get_numa_node(void)
163{
164 return __dm_get_module_param_int(&dm_numa_node,
165 DM_NUMA_NODE, num_online_nodes() - 1);
166}
167
168static int __init local_init(void)
169{
170 int r = -ENOMEM;
171
172
173 _io_cache = KMEM_CACHE(dm_io, 0);
174 if (!_io_cache)
175 return r;
176
177 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
178 if (!_rq_tio_cache)
179 goto out_free_io_cache;
180
181 _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
182 __alignof__(struct request), 0, NULL);
183 if (!_rq_cache)
184 goto out_free_rq_tio_cache;
185
186 r = dm_uevent_init();
187 if (r)
188 goto out_free_rq_cache;
189
190 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
191 if (!deferred_remove_workqueue) {
192 r = -ENOMEM;
193 goto out_uevent_exit;
194 }
195
196 _major = major;
197 r = register_blkdev(_major, _name);
198 if (r < 0)
199 goto out_free_workqueue;
200
201 if (!_major)
202 _major = r;
203
204 return 0;
205
206out_free_workqueue:
207 destroy_workqueue(deferred_remove_workqueue);
208out_uevent_exit:
209 dm_uevent_exit();
210out_free_rq_cache:
211 kmem_cache_destroy(_rq_cache);
212out_free_rq_tio_cache:
213 kmem_cache_destroy(_rq_tio_cache);
214out_free_io_cache:
215 kmem_cache_destroy(_io_cache);
216
217 return r;
218}
219
220static void local_exit(void)
221{
222 flush_scheduled_work();
223 destroy_workqueue(deferred_remove_workqueue);
224
225 kmem_cache_destroy(_rq_cache);
226 kmem_cache_destroy(_rq_tio_cache);
227 kmem_cache_destroy(_io_cache);
228 unregister_blkdev(_major, _name);
229 dm_uevent_exit();
230
231 _major = 0;
232
233 DMINFO("cleaned up");
234}
235
236static int (*_inits[])(void) __initdata = {
237 local_init,
238 dm_target_init,
239 dm_linear_init,
240 dm_stripe_init,
241 dm_io_init,
242 dm_kcopyd_init,
243 dm_interface_init,
244 dm_statistics_init,
245};
246
247static void (*_exits[])(void) = {
248 local_exit,
249 dm_target_exit,
250 dm_linear_exit,
251 dm_stripe_exit,
252 dm_io_exit,
253 dm_kcopyd_exit,
254 dm_interface_exit,
255 dm_statistics_exit,
256};
257
258static int __init dm_init(void)
259{
260 const int count = ARRAY_SIZE(_inits);
261
262 int r, i;
263
264 for (i = 0; i < count; i++) {
265 r = _inits[i]();
266 if (r)
267 goto bad;
268 }
269
270 return 0;
271
272 bad:
273 while (i--)
274 _exits[i]();
275
276 return r;
277}
278
279static void __exit dm_exit(void)
280{
281 int i = ARRAY_SIZE(_exits);
282
283 while (i--)
284 _exits[i]();
285
286
287
288
289 idr_destroy(&_minor_idr);
290}
291
292
293
294
295int dm_deleting_md(struct mapped_device *md)
296{
297 return test_bit(DMF_DELETING, &md->flags);
298}
299
300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
301{
302 struct mapped_device *md;
303
304 spin_lock(&_minor_lock);
305
306 md = bdev->bd_disk->private_data;
307 if (!md)
308 goto out;
309
310 if (test_bit(DMF_FREEING, &md->flags) ||
311 dm_deleting_md(md)) {
312 md = NULL;
313 goto out;
314 }
315
316 dm_get(md);
317 atomic_inc(&md->open_count);
318out:
319 spin_unlock(&_minor_lock);
320
321 return md ? 0 : -ENXIO;
322}
323
324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
325{
326 struct mapped_device *md;
327
328 spin_lock(&_minor_lock);
329
330 md = disk->private_data;
331 if (WARN_ON(!md))
332 goto out;
333
334 if (atomic_dec_and_test(&md->open_count) &&
335 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
336 queue_work(deferred_remove_workqueue, &deferred_remove_work);
337
338 dm_put(md);
339out:
340 spin_unlock(&_minor_lock);
341}
342
343int dm_open_count(struct mapped_device *md)
344{
345 return atomic_read(&md->open_count);
346}
347
348
349
350
351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
352{
353 int r = 0;
354
355 spin_lock(&_minor_lock);
356
357 if (dm_open_count(md)) {
358 r = -EBUSY;
359 if (mark_deferred)
360 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
361 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
362 r = -EEXIST;
363 else
364 set_bit(DMF_DELETING, &md->flags);
365
366 spin_unlock(&_minor_lock);
367
368 return r;
369}
370
371int dm_cancel_deferred_remove(struct mapped_device *md)
372{
373 int r = 0;
374
375 spin_lock(&_minor_lock);
376
377 if (test_bit(DMF_DELETING, &md->flags))
378 r = -EBUSY;
379 else
380 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
381
382 spin_unlock(&_minor_lock);
383
384 return r;
385}
386
387static void do_deferred_remove(struct work_struct *w)
388{
389 dm_deferred_remove();
390}
391
392sector_t dm_get_size(struct mapped_device *md)
393{
394 return get_capacity(md->disk);
395}
396
397struct request_queue *dm_get_md_queue(struct mapped_device *md)
398{
399 return md->queue;
400}
401
402struct dm_stats *dm_get_stats(struct mapped_device *md)
403{
404 return &md->stats;
405}
406
407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
408{
409 struct mapped_device *md = bdev->bd_disk->private_data;
410
411 return dm_get_geometry(md, geo);
412}
413
414static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
415 struct block_device **bdev,
416 fmode_t *mode)
417{
418 struct dm_target *tgt;
419 struct dm_table *map;
420 int srcu_idx, r;
421
422retry:
423 r = -ENOTTY;
424 map = dm_get_live_table(md, &srcu_idx);
425 if (!map || !dm_table_get_size(map))
426 goto out;
427
428
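	/* We only support devices that have a single target. */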
429 if (dm_table_get_num_targets(map) != 1)
430 goto out;
431
432 tgt = dm_table_get_target(map, 0);
433 if (!tgt->type->prepare_ioctl)
434 goto out;
435
436 if (dm_suspended_md(md)) {
437 r = -EAGAIN;
438 goto out;
439 }
440
441 r = tgt->type->prepare_ioctl(tgt, bdev, mode);
442 if (r < 0)
443 goto out;
444
445 bdgrab(*bdev);
446 dm_put_live_table(md, srcu_idx);
447 return r;
448
449out:
450 dm_put_live_table(md, srcu_idx);
451 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
452 msleep(10);
453 goto retry;
454 }
455 return r;
456}
457
458static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
459 unsigned int cmd, unsigned long arg)
460{
461 struct mapped_device *md = bdev->bd_disk->private_data;
462 int r;
463
464 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
465 if (r < 0)
466 return r;
467
468 if (r > 0) {
469
470
471
472
473
474 r = scsi_verify_blk_ioctl(NULL, cmd);
475 if (r)
476 goto out;
477 }
478
479 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
480out:
481 bdput(bdev);
482 return r;
483}
484
485static struct dm_io *alloc_io(struct mapped_device *md)
486{
487 return mempool_alloc(md->io_pool, GFP_NOIO);
488}
489
490static void free_io(struct mapped_device *md, struct dm_io *io)
491{
492 mempool_free(io, md->io_pool);
493}
494
495static void free_tio(struct dm_target_io *tio)
496{
497 bio_put(&tio->clone);
498}
499
500int md_in_flight(struct mapped_device *md)
501{
502 return atomic_read(&md->pending[READ]) +
503 atomic_read(&md->pending[WRITE]);
504}
505
506static void start_io_acct(struct dm_io *io)
507{
508 struct mapped_device *md = io->md;
509 struct bio *bio = io->bio;
510 int cpu;
511 int rw = bio_data_dir(bio);
512
513 io->start_time = jiffies;
514
515 cpu = part_stat_lock();
516 part_round_stats(cpu, &dm_disk(md)->part0);
517 part_stat_unlock();
518 atomic_set(&dm_disk(md)->part0.in_flight[rw],
519 atomic_inc_return(&md->pending[rw]));
520
521 if (unlikely(dm_stats_used(&md->stats)))
522 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
523 bio_sectors(bio), false, 0, &io->stats_aux);
524}
525
526static void end_io_acct(struct dm_io *io)
527{
528 struct mapped_device *md = io->md;
529 struct bio *bio = io->bio;
530 unsigned long duration = jiffies - io->start_time;
531 int pending, cpu;
532 int rw = bio_data_dir(bio);
533
534 cpu = part_stat_lock();
535 part_round_stats(cpu, &dm_disk(md)->part0);
536 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
537 part_stat_unlock();
538
539 if (unlikely(dm_stats_used(&md->stats)))
540 dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
541 bio_sectors(bio), true, duration, &io->stats_aux);
542
543
544
545
546
547 pending = atomic_dec_return(&md->pending[rw]);
548 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
549 pending += atomic_read(&md->pending[rw^0x1]);
550
551
552 if (!pending)
553 wake_up(&md->wait);
554}
555
556
557
558
559static void queue_io(struct mapped_device *md, struct bio *bio)
560{
561 unsigned long flags;
562
563 spin_lock_irqsave(&md->deferred_lock, flags);
564 bio_list_add(&md->deferred, bio);
565 spin_unlock_irqrestore(&md->deferred_lock, flags);
566 queue_work(md->wq, &md->work);
567}
568
569
570
571
572
573
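/*
 * Return the live table for this device.  Must be paired with
 * dm_put_live_table(), which drops the SRCU read lock taken here.
 */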
574struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
575{
576 *srcu_idx = srcu_read_lock(&md->io_barrier);
577
578 return srcu_dereference(md->map, &md->io_barrier);
579}
580
581void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
582{
583 srcu_read_unlock(&md->io_barrier, srcu_idx);
584}
585
586void dm_sync_table(struct mapped_device *md)
587{
588 synchronize_srcu(&md->io_barrier);
589 synchronize_rcu_expedited();
590}
591
592
593
594
595
596static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
597{
598 rcu_read_lock();
599 return rcu_dereference(md->map);
600}
601
602static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
603{
604 rcu_read_unlock();
605}
606
607
608
609
610static int open_table_device(struct table_device *td, dev_t dev,
611 struct mapped_device *md)
612{
613 static char *_claim_ptr = "I belong to device-mapper";
614 struct block_device *bdev;
615
616 int r;
617
618 BUG_ON(td->dm_dev.bdev);
619
620 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
621 if (IS_ERR(bdev))
622 return PTR_ERR(bdev);
623
624 r = bd_link_disk_holder(bdev, dm_disk(md));
625 if (r) {
626 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
627 return r;
628 }
629
630 td->dm_dev.bdev = bdev;
631 return 0;
632}
633
634
635
636
637static void close_table_device(struct table_device *td, struct mapped_device *md)
638{
639 if (!td->dm_dev.bdev)
640 return;
641
642 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
643 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
644 td->dm_dev.bdev = NULL;
645}
646
647static struct table_device *find_table_device(struct list_head *l, dev_t dev,
648 fmode_t mode) {
649 struct table_device *td;
650
651 list_for_each_entry(td, l, list)
652 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
653 return td;
654
655 return NULL;
656}
657
658int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
659 struct dm_dev **result) {
660 int r;
661 struct table_device *td;
662
663 mutex_lock(&md->table_devices_lock);
664 td = find_table_device(&md->table_devices, dev, mode);
665 if (!td) {
666 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
667 if (!td) {
668 mutex_unlock(&md->table_devices_lock);
669 return -ENOMEM;
670 }
671
672 td->dm_dev.mode = mode;
673 td->dm_dev.bdev = NULL;
674
675 if ((r = open_table_device(td, dev, md))) {
676 mutex_unlock(&md->table_devices_lock);
677 kfree(td);
678 return r;
679 }
680
681 format_dev_t(td->dm_dev.name, dev);
682
683 atomic_set(&td->count, 0);
684 list_add(&td->list, &md->table_devices);
685 }
686 atomic_inc(&td->count);
687 mutex_unlock(&md->table_devices_lock);
688
689 *result = &td->dm_dev;
690 return 0;
691}
692EXPORT_SYMBOL_GPL(dm_get_table_device);
693
694void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
695{
696 struct table_device *td = container_of(d, struct table_device, dm_dev);
697
698 mutex_lock(&md->table_devices_lock);
699 if (atomic_dec_and_test(&td->count)) {
700 close_table_device(td, md);
701 list_del(&td->list);
702 kfree(td);
703 }
704 mutex_unlock(&md->table_devices_lock);
705}
706EXPORT_SYMBOL(dm_put_table_device);
707
708static void free_table_devices(struct list_head *devices)
709{
710 struct list_head *tmp, *next;
711
712 list_for_each_safe(tmp, next, devices) {
713 struct table_device *td = list_entry(tmp, struct table_device, list);
714
715 DMWARN("dm_destroy: %s still exists with %d references",
716 td->dm_dev.name, atomic_read(&td->count));
717 kfree(td);
718 }
719}
720
721
722
723
724int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
725{
726 *geo = md->geometry;
727
728 return 0;
729}
730
731
732
733
734int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
735{
736 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
737
738 if (geo->start > sz) {
739 DMWARN("Start sector is beyond the geometry limits.");
740 return -EINVAL;
741 }
742
743 md->geometry = *geo;
744
745 return 0;
746}
747
748
749
750
751
752
753
754
755
756
757static int __noflush_suspending(struct mapped_device *md)
758{
759 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
760}
761
762
763
764
765
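/*
 * Decrement the count of outstanding clones for this io.  Once it
 * reaches zero, record any error, finish the accounting and complete
 * (or requeue) the original bio.
 */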
766static void dec_pending(struct dm_io *io, int error)
767{
768 unsigned long flags;
769 int io_error;
770 struct bio *bio;
771 struct mapped_device *md = io->md;
772
773
774 if (unlikely(error)) {
775 spin_lock_irqsave(&io->endio_lock, flags);
776 if (!(io->error > 0 && __noflush_suspending(md)))
777 io->error = error;
778 spin_unlock_irqrestore(&io->endio_lock, flags);
779 }
780
781 if (atomic_dec_and_test(&io->io_count)) {
782 if (io->error == DM_ENDIO_REQUEUE) {
783
784
785
786 spin_lock_irqsave(&md->deferred_lock, flags);
787 if (__noflush_suspending(md))
788 bio_list_add_head(&md->deferred, io->bio);
789 else
790
791 io->error = -EIO;
792 spin_unlock_irqrestore(&md->deferred_lock, flags);
793 }
794
795 io_error = io->error;
796 bio = io->bio;
797 end_io_acct(io);
798 free_io(md, io);
799
800 if (io_error == DM_ENDIO_REQUEUE)
801 return;
802
803 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
804
805
806
807
808 bio->bi_rw &= ~REQ_FLUSH;
809 queue_io(md, bio);
810 } else {
811
812 trace_block_bio_complete(md->queue, bio, io_error);
813 bio_endio(bio, io_error);
814 }
815 }
816}
817
818void disable_write_same(struct mapped_device *md)
819{
820 struct queue_limits *limits = dm_get_queue_limits(md);
821
822
823 limits->max_write_same_sectors = 0;
824}
825
826static void clone_endio(struct bio *bio, int error)
827{
828 int r = error;
829 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
830 struct dm_io *io = tio->io;
831 struct mapped_device *md = tio->io->md;
832 dm_endio_fn endio = tio->ti->type->end_io;
833
834 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
835 error = -EIO;
836
837 if (endio) {
838 r = endio(tio->ti, bio, error);
839 if (r < 0 || r == DM_ENDIO_REQUEUE)
840
841
842
843
844 error = r;
845 else if (r == DM_ENDIO_INCOMPLETE)
846
847 return;
848 else if (r) {
849 DMWARN("unimplemented target endio return value: %d", r);
850 BUG();
851 }
852 }
853
854 if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
855 !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
856 disable_write_same(md);
857
858 free_tio(tio);
859 dec_pending(io, error);
860}
861
862
863
864
865
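/*
 * Return the maximum number of sectors that can be processed at the
 * supplied sector without crossing the target boundary.
 */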
866static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
867{
868 sector_t target_offset = dm_target_offset(ti, sector);
869
870 return ti->len - target_offset;
871}
872
873static sector_t max_io_len(sector_t sector, struct dm_target *ti)
874{
875 sector_t len = max_io_len_target_boundary(sector, ti);
876 sector_t offset, max_len;
877
878
879
880
881 if (ti->max_io_len) {
882 offset = dm_target_offset(ti, sector);
883 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
884 max_len = sector_div(offset, ti->max_io_len);
885 else
886 max_len = offset & (ti->max_io_len - 1);
887 max_len = ti->max_io_len - max_len;
888
889 if (len > max_len)
890 len = max_len;
891 }
892
893 return len;
894}
895
896int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
897{
898 if (len > UINT_MAX) {
899 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
900 (unsigned long long)len, UINT_MAX);
901 ti->error = "Maximum size of target IO is too large";
902 return -EINVAL;
903 }
904
905 ti->max_io_len = (uint32_t) len;
906
907 return 0;
908}
909EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
910
911static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
912 void **kaddr, pfn_t *pfn, long size)
913{
914 struct mapped_device *md = bdev->bd_disk->private_data;
915 struct dm_table *map;
916 struct dm_target *ti;
917 int srcu_idx;
918 long len, ret = -EIO;
919
920 map = dm_get_live_table(md, &srcu_idx);
921 if (!map)
922 goto out;
923
924 ti = dm_table_find_target(map, sector);
925 if (!dm_target_is_valid(ti))
926 goto out;
927
928 len = max_io_len(sector, ti) << SECTOR_SHIFT;
929 size = min(len, size);
930
931 if (ti->type->direct_access)
932 ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
933out:
934 dm_put_live_table(md, srcu_idx);
935 return min(ret, size);
936}
937
938
939
940
941
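/*
 * dm_offload is a blk_plug callback used while a target's map method
 * may block: bios queued on current->bio_list are punted to their
 * bio_set's rescue workqueue to avoid deadlocks.
 */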
942struct dm_offload {
943 struct blk_plug plug;
944 struct blk_plug_cb cb;
945};
946
947static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
948{
949 struct dm_offload *o = container_of(cb, struct dm_offload, cb);
950 struct bio_list list;
951 struct bio *bio;
952 int i;
953
954 INIT_LIST_HEAD(&o->cb.list);
955
956 if (unlikely(!current->bio_list))
957 return;
958
959 for (i = 0; i < 2; i++) {
960 list = current->bio_list[i];
		bio_list_init(&current->bio_list[i]);
962
963 while ((bio = bio_list_pop(&list))) {
964 struct bio_set *bs = bio->bi_pool;
965 if (unlikely(!bs) || bs == fs_bio_set) {
				bio_list_add(&current->bio_list[i], bio);
967 continue;
968 }
969
970 spin_lock(&bs->rescue_lock);
971 bio_list_add(&bs->rescue_list, bio);
972 queue_work(bs->rescue_workqueue, &bs->rescue_work);
973 spin_unlock(&bs->rescue_lock);
974 }
975 }
976}
977
978static void dm_offload_start(struct dm_offload *o)
979{
980 blk_start_plug(&o->plug);
981 o->cb.callback = flush_current_bio_list;
	list_add(&o->cb.list, &current->plug->cb_list);
983}
984
985static void dm_offload_end(struct dm_offload *o)
986{
987 list_del(&o->cb.list);
988 blk_finish_plug(&o->plug);
989}
990
991static void __map_bio(struct dm_target_io *tio)
992{
993 int r;
994 sector_t sector;
995 struct dm_offload o;
996 struct bio *clone = &tio->clone;
997 struct dm_target *ti = tio->ti;
998
999 clone->bi_end_io = clone_endio;
1000
1001
1002
1003
1004
1005
1006 atomic_inc(&tio->io->io_count);
1007 sector = clone->bi_sector;
1008
1009 dm_offload_start(&o);
1010 r = ti->type->map(ti, clone);
1011 dm_offload_end(&o);
1012
1013 if (r == DM_MAPIO_REMAPPED) {
1014
1015
1016 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1017 tio->io->bio->bi_bdev->bd_dev, sector);
1018
1019 generic_make_request(clone);
1020 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1021
1022 dec_pending(tio->io, r);
1023 free_tio(tio);
1024 } else if (r != DM_MAPIO_SUBMITTED) {
1025 DMWARN("unimplemented target map return value: %d", r);
1026 BUG();
1027 }
1028}
1029
1030struct clone_info {
1031 struct mapped_device *md;
1032 struct dm_table *map;
1033 struct bio *bio;
1034 struct dm_io *io;
1035 sector_t sector;
1036 sector_t sector_count;
1037 unsigned short idx;
1038};
1039
1040static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1041{
1042 bio->bi_sector = sector;
1043 bio->bi_size = to_bytes(len);
1044}
1045
1046static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1047{
1048 bio->bi_idx = idx;
1049 bio->bi_vcnt = idx + bv_count;
1050 bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1051}
1052
1053static int clone_bio_integrity(struct bio *bio, struct bio *clone,
1054 unsigned short idx, unsigned len, unsigned offset,
1055 bool trim)
1056{
1057 int r;
1058
1059 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1060 if (r < 0)
1061 return r;
1062
1063 if (trim)
1064 bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1065
1066 return 0;
1067}
1068
1069
1070
1071
1072static int clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1073 sector_t sector, unsigned short idx,
1074 unsigned offset, unsigned len)
1075{
1076 struct bio *clone = &tio->clone;
1077 struct bio_vec *bv = bio->bi_io_vec + idx;
1078
1079 *clone->bi_io_vec = *bv;
1080
1081 bio_setup_sector(clone, sector, len);
1082
1083 clone->bi_bdev = bio->bi_bdev;
1084 clone->bi_rw = bio->bi_rw;
1085 clone->bi_vcnt = 1;
1086 clone->bi_io_vec->bv_offset = offset;
1087 clone->bi_io_vec->bv_len = clone->bi_size;
1088 clone->bi_flags |= 1 << BIO_CLONED;
1089
1090 if (bio_integrity(bio)) {
1091 int r = clone_bio_integrity(bio, clone, idx, len, offset, true);
1092 if (r < 0)
1093 return r;
1094 }
1095
1096 return 0;
1097}
1098
1099
1100
1101
1102static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1103 sector_t sector, unsigned short idx,
1104 unsigned short bv_count, unsigned len)
1105{
1106 struct bio *clone = &tio->clone;
1107
1108 __bio_clone(clone, bio);
1109 bio_setup_sector(clone, sector, len);
1110 bio_setup_bv(clone, idx, bv_count);
1111
1112 if (bio_integrity(bio)) {
1113 int r;
1114 bool trim = false;
1115
1116 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1117 trim = true;
1118 r = clone_bio_integrity(bio, clone, idx, len, 0, trim);
1119 if (r < 0)
1120 return r;
1121 }
1122
1123 return 0;
1124}
1125
1126static struct dm_target_io *alloc_tio(struct clone_info *ci,
1127 struct dm_target *ti, int nr_iovecs,
1128 unsigned target_bio_nr)
1129{
1130 struct dm_target_io *tio;
1131 struct bio *clone;
1132
1133 clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1134 tio = container_of(clone, struct dm_target_io, clone);
1135
1136 tio->io = ci->io;
1137 tio->ti = ti;
1138 tio->target_bio_nr = target_bio_nr;
1139
1140 return tio;
1141}
1142
1143static void __clone_and_map_simple_bio(struct clone_info *ci,
1144 struct dm_target *ti,
1145 unsigned target_bio_nr, sector_t len)
1146{
1147 struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
1148 struct bio *clone = &tio->clone;
1149
1150
1151
1152
1153
1154
1155 __bio_clone(clone, ci->bio);
1156 if (len)
1157 bio_setup_sector(clone, ci->sector, len);
1158
1159 __map_bio(tio);
1160}
1161
1162static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1163 unsigned num_bios, sector_t len)
1164{
1165 unsigned target_bio_nr;
1166
1167 for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1168 __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1169}
1170
1171static int __send_empty_flush(struct clone_info *ci)
1172{
1173 unsigned target_nr = 0;
1174 struct dm_target *ti;
1175
1176 BUG_ON(bio_has_data(ci->bio));
1177 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1178 __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
1179
1180 return 0;
1181}
1182
1183static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1184 sector_t sector, int nr_iovecs,
1185 unsigned short idx, unsigned short bv_count,
1186 unsigned offset, unsigned len,
1187 bool split_bvec)
1188{
1189 struct bio *bio = ci->bio;
1190 struct dm_target_io *tio;
1191 unsigned target_bio_nr;
1192 unsigned num_target_bios = 1;
1193 int r = 0;
1194
1195
1196
1197
1198 if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1199 num_target_bios = ti->num_write_bios(ti, bio);
1200
1201 for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1202 tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1203 if (split_bvec)
1204 r = clone_split_bio(tio, bio, sector, idx, offset, len);
1205 else
1206 r = clone_bio(tio, bio, sector, idx, bv_count, len);
1207 if (r < 0) {
1208 free_tio(tio);
1209 break;
1210 }
1211 __map_bio(tio);
1212 }
1213
1214 return r;
1215}
1216
1217typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1218
1219static unsigned get_num_discard_bios(struct dm_target *ti)
1220{
1221 return ti->num_discard_bios;
1222}
1223
1224static unsigned get_num_write_same_bios(struct dm_target *ti)
1225{
1226 return ti->num_write_same_bios;
1227}
1228
1229typedef bool (*is_split_required_fn)(struct dm_target *ti);
1230
1231static bool is_split_required_for_discard(struct dm_target *ti)
1232{
1233 return ti->split_discard_bios;
1234}
1235
1236static int __send_changing_extent_only(struct clone_info *ci,
1237 get_num_bios_fn get_num_bios,
1238 is_split_required_fn is_split_required)
1239{
1240 struct dm_target *ti;
1241 sector_t len;
1242 unsigned num_bios;
1243
1244 do {
1245 ti = dm_table_find_target(ci->map, ci->sector);
1246 if (!dm_target_is_valid(ti))
1247 return -EIO;
1248
1249
1250
1251
1252
1253
1254
1255 num_bios = get_num_bios ? get_num_bios(ti) : 0;
1256 if (!num_bios)
1257 return -EOPNOTSUPP;
1258
1259 if (is_split_required && !is_split_required(ti))
1260 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1261 else
1262 len = min(ci->sector_count, max_io_len(ci->sector, ti));
1263
1264 __send_duplicate_bios(ci, ti, num_bios, len);
1265
1266 ci->sector += len;
1267 } while (ci->sector_count -= len);
1268
1269 return 0;
1270}
1271
1272static int __send_discard(struct clone_info *ci)
1273{
1274 return __send_changing_extent_only(ci, get_num_discard_bios,
1275 is_split_required_for_discard);
1276}
1277
1278static int __send_write_same(struct clone_info *ci)
1279{
1280 return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1281}
1282
1283
1284
1285
1286static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1287{
1288 struct bio *bio = ci->bio;
1289 sector_t bv_len, total_len = 0;
1290
1291 for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1292 bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1293
1294 if (bv_len > max)
1295 break;
1296
1297 max -= bv_len;
1298 total_len += bv_len;
1299 }
1300
1301 return total_len;
1302}
1303
1304static int __split_bvec_across_targets(struct clone_info *ci,
1305 struct dm_target *ti, sector_t max)
1306{
1307 struct bio *bio = ci->bio;
1308 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1309 sector_t remaining = to_sector(bv->bv_len);
1310 unsigned offset = 0;
1311 sector_t len;
1312 int r;
1313
1314 do {
1315 if (offset) {
1316 ti = dm_table_find_target(ci->map, ci->sector);
1317 if (!dm_target_is_valid(ti))
1318 return -EIO;
1319
1320 max = max_io_len(ci->sector, ti);
1321 }
1322
1323 len = min(remaining, max);
1324
1325 r = __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1326 bv->bv_offset + offset, len, true);
1327 if (r < 0)
1328 return r;
1329
1330 ci->sector += len;
1331 ci->sector_count -= len;
1332 offset += to_bytes(len);
1333 } while (remaining -= len);
1334
1335 ci->idx++;
1336
1337 return 0;
1338}
1339
1340
1341
1342
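/*
 * Select the target for ci->sector and clone/map as much of the
 * remaining bio as that target (and its max_io_len) allows.
 */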
1343static int __split_and_process_non_flush(struct clone_info *ci)
1344{
1345 struct bio *bio = ci->bio;
1346 struct dm_target *ti;
1347 sector_t len, max;
1348 int idx;
1349 int r;
1350
1351 if (unlikely(bio->bi_rw & REQ_DISCARD))
1352 return __send_discard(ci);
1353 else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1354 return __send_write_same(ci);
1355
1356 ti = dm_table_find_target(ci->map, ci->sector);
1357 if (!dm_target_is_valid(ti))
1358 return -EIO;
1359
1360 max = max_io_len(ci->sector, ti);
1361
1362
1363
1364
1365
1366 if (ci->sector_count <= max) {
1367 r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1368 ci->idx, bio->bi_vcnt - ci->idx, 0,
1369 ci->sector_count, false);
1370 if (r < 0)
1371 return r;
1372
1373 ci->sector_count = 0;
1374 return 0;
1375 }
1376
1377
1378
1379
1380
1381 if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1382 len = __len_within_target(ci, max, &idx);
1383
1384 r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1385 ci->idx, idx - ci->idx, 0, len, false);
1386 if (r < 0)
1387 return r;
1388
1389 ci->sector += len;
1390 ci->sector_count -= len;
1391 ci->idx = idx;
1392
1393 return 0;
1394 }
1395
1396
1397
1398
1399 return __split_bvec_across_targets(ci, ti, max);
1400}
1401
1402
1403
1404
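/*
 * Entry point for bio-based I/O: split the bio into clones and map
 * them to the targets of the live table.
 */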
1405static void __split_and_process_bio(struct mapped_device *md,
1406 struct dm_table *map, struct bio *bio)
1407{
1408 struct clone_info ci;
1409 int error = 0;
1410
1411 if (unlikely(!map)) {
1412 bio_io_error(bio);
1413 return;
1414 }
1415
1416 ci.map = map;
1417 ci.md = md;
1418 ci.io = alloc_io(md);
1419 ci.io->error = 0;
1420 atomic_set(&ci.io->io_count, 1);
1421 ci.io->bio = bio;
1422 ci.io->md = md;
1423 spin_lock_init(&ci.io->endio_lock);
1424 ci.sector = bio->bi_sector;
1425 ci.idx = bio->bi_idx;
1426
1427 start_io_acct(ci.io);
1428
1429 if (bio->bi_rw & REQ_FLUSH) {
1430 ci.bio = &ci.md->flush_bio;
1431 ci.sector_count = 0;
1432 error = __send_empty_flush(&ci);
1433
1434 } else {
1435 ci.bio = bio;
1436 ci.sector_count = bio_sectors(bio);
1437 while (ci.sector_count && !error)
1438 error = __split_and_process_non_flush(&ci);
1439 }
1440
1441
1442 dec_pending(ci.io, error);
1443}
1444
1445
1446
1447
1448static int dm_merge_bvec(struct request_queue *q,
1449 struct bvec_merge_data *bvm,
1450 struct bio_vec *biovec)
1451{
1452 struct mapped_device *md = q->queuedata;
1453 struct dm_table *map = dm_get_live_table_fast(md);
1454 struct dm_target *ti;
1455 sector_t max_sectors;
1456 int max_size = 0;
1457
1458 if (unlikely(!map))
1459 goto out;
1460
1461 ti = dm_table_find_target(map, bvm->bi_sector);
1462 if (!dm_target_is_valid(ti))
1463 goto out;
1464
1465
1466
1467
1468 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1469 (sector_t) BIO_MAX_SECTORS);
1470 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1471 if (max_size < 0)
1472 max_size = 0;
1473
1474
1475
1476
1477
1478
1479 if (max_size && ti->type->merge)
1480 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1481
1482
1483
1484
1485
1486
1487
1488 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1489 max_size = 0;
1490
1491out:
1492 dm_put_live_table_fast(md);
1493
1494
1495
1496 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1497 max_size = biovec->bv_len;
1498
1499 return max_size;
1500}
1501
1502
1503
1504
1505
1506static void dm_make_request(struct request_queue *q, struct bio *bio)
1507{
1508 int rw = bio_data_dir(bio);
1509 struct mapped_device *md = q->queuedata;
1510 int cpu;
1511 int srcu_idx;
1512 struct dm_table *map;
1513
1514 map = dm_get_live_table(md, &srcu_idx);
1515
1516 cpu = part_stat_lock();
1517 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1518 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1519 part_stat_unlock();
1520
1521
1522 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1523 dm_put_live_table(md, srcu_idx);
1524
1525 if (bio_rw(bio) != READA)
1526 queue_io(md, bio);
1527 else
1528 bio_io_error(bio);
1529 return;
1530 }
1531
1532 __split_and_process_bio(md, map, bio);
1533 dm_put_live_table(md, srcu_idx);
1534 return;
1535}
1536
1537static int dm_any_congested(void *congested_data, int bdi_bits)
1538{
1539 int r = bdi_bits;
1540 struct mapped_device *md = congested_data;
1541 struct dm_table *map;
1542
1543 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1544 if (dm_request_based(md)) {
1545
1546
1547
1548
1549 r = md->queue->backing_dev_info.state & bdi_bits;
1550 } else {
1551 map = dm_get_live_table_fast(md);
1552 if (map)
1553 r = dm_table_any_congested(map, bdi_bits);
1554 dm_put_live_table_fast(md);
1555 }
1556 }
1557
1558 return r;
1559}
1560
1561
1562
1563
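/*
 * An IDR is used to keep track of allocated minor numbers.
 */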
1564static void free_minor(int minor)
1565{
1566 spin_lock(&_minor_lock);
1567 idr_remove(&_minor_idr, minor);
1568 spin_unlock(&_minor_lock);
1569}
1570
1571
1572
1573
1574static int specific_minor(int minor)
1575{
1576 int r;
1577
1578 if (minor >= (1 << MINORBITS))
1579 return -EINVAL;
1580
1581 idr_preload(GFP_KERNEL);
1582 spin_lock(&_minor_lock);
1583
1584 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1585
1586 spin_unlock(&_minor_lock);
1587 idr_preload_end();
1588 if (r < 0)
1589 return r == -ENOSPC ? -EBUSY : r;
1590 return 0;
1591}
1592
1593static int next_free_minor(int *minor)
1594{
1595 int r;
1596
1597 idr_preload(GFP_KERNEL);
1598 spin_lock(&_minor_lock);
1599
1600 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1601
1602 spin_unlock(&_minor_lock);
1603 idr_preload_end();
1604 if (r < 0)
1605 return r;
1606 *minor = r;
1607 return 0;
1608}
1609
1610static const struct block_device_operations dm_blk_dops;
1611
1612static void dm_wq_work(struct work_struct *work);
1613
1614void dm_init_md_queue(struct mapped_device *md)
1615{
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1626
1627
1628
1629
1630
1631 md->queue->queuedata = md;
1632 md->queue->backing_dev_info.congested_data = md;
1633}
1634
1635void dm_init_normal_md_queue(struct mapped_device *md)
1636{
1637 md->use_blk_mq = false;
1638 dm_init_md_queue(md);
1639
1640
1641
1642
1643 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1644 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1645}
1646
1647
1648
1649
1650static struct mapped_device *alloc_dev(int minor)
1651{
1652 int r, numa_node_id = dm_get_numa_node();
1653 struct mapped_device *md;
1654 void *old_md;
1655
1656 md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1657 if (!md) {
1658 DMWARN("unable to allocate device, out of memory.");
1659 return NULL;
1660 }
1661
1662 if (!try_module_get(THIS_MODULE))
1663 goto bad_module_get;
1664
1665
1666 if (minor == DM_ANY_MINOR)
1667 r = next_free_minor(&minor);
1668 else
1669 r = specific_minor(minor);
1670 if (r < 0)
1671 goto bad_minor;
1672
1673 r = init_srcu_struct(&md->io_barrier);
1674 if (r < 0)
1675 goto bad_io_barrier;
1676
1677 md->numa_node_id = numa_node_id;
1678 md->use_blk_mq = dm_use_blk_mq_default();
1679 md->init_tio_pdu = false;
1680 md->type = DM_TYPE_NONE;
1681 mutex_init(&md->suspend_lock);
1682 mutex_init(&md->type_lock);
1683 mutex_init(&md->table_devices_lock);
1684 spin_lock_init(&md->deferred_lock);
1685 atomic_set(&md->holders, 1);
1686 atomic_set(&md->open_count, 0);
1687 atomic_set(&md->event_nr, 0);
1688 atomic_set(&md->uevent_seq, 0);
1689 INIT_LIST_HEAD(&md->uevent_list);
1690 INIT_LIST_HEAD(&md->table_devices);
1691 spin_lock_init(&md->uevent_lock);
1692
1693 md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1694 if (!md->queue)
1695 goto bad_queue;
1696
1697 dm_init_md_queue(md);
1698
1699 md->disk = alloc_disk_node(1, numa_node_id);
1700 if (!md->disk)
1701 goto bad_disk;
1702
1703 atomic_set(&md->pending[0], 0);
1704 atomic_set(&md->pending[1], 0);
1705 init_waitqueue_head(&md->wait);
1706 INIT_WORK(&md->work, dm_wq_work);
1707 init_waitqueue_head(&md->eventq);
1708 init_completion(&md->kobj_holder.completion);
1709 md->kworker_task = NULL;
1710
1711 md->disk->major = _major;
1712 md->disk->first_minor = minor;
1713 md->disk->fops = &dm_blk_dops;
1714 md->disk->queue = md->queue;
1715 md->disk->private_data = md;
1716 sprintf(md->disk->disk_name, "dm-%d", minor);
1717 add_disk(md->disk);
1718 format_dev_t(md->name, MKDEV(_major, minor));
1719
1720 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1721 if (!md->wq)
1722 goto bad_thread;
1723
1724 md->bdev = bdget_disk(md->disk, 0);
1725 if (!md->bdev)
1726 goto bad_bdev;
1727
1728 bio_init(&md->flush_bio);
1729 md->flush_bio.bi_bdev = md->bdev;
1730 md->flush_bio.bi_rw = WRITE_FLUSH;
1731
1732 dm_stats_init(&md->stats);
1733
1734
1735 spin_lock(&_minor_lock);
1736 old_md = idr_replace(&_minor_idr, md, minor);
1737 spin_unlock(&_minor_lock);
1738
1739 BUG_ON(old_md != MINOR_ALLOCED);
1740
1741 return md;
1742
1743bad_bdev:
1744 destroy_workqueue(md->wq);
1745bad_thread:
1746 del_gendisk(md->disk);
1747 put_disk(md->disk);
1748bad_disk:
1749 blk_cleanup_queue(md->queue);
1750bad_queue:
1751 cleanup_srcu_struct(&md->io_barrier);
1752bad_io_barrier:
1753 free_minor(minor);
1754bad_minor:
1755 module_put(THIS_MODULE);
1756bad_module_get:
1757 kfree(md);
1758 return NULL;
1759}
1760
1761static void unlock_fs(struct mapped_device *md);
1762
1763static void free_dev(struct mapped_device *md)
1764{
1765 int minor = MINOR(disk_devt(md->disk));
1766
1767 unlock_fs(md);
1768 destroy_workqueue(md->wq);
1769
1770 if (md->kworker_task)
1771 kthread_stop(md->kworker_task);
1772 mempool_destroy(md->io_pool);
1773 mempool_destroy(md->rq_pool);
1774 if (md->bs)
1775 bioset_free(md->bs);
1776
1777 spin_lock(&_minor_lock);
1778 md->disk->private_data = NULL;
1779 spin_unlock(&_minor_lock);
1780 if (blk_get_integrity(md->disk))
1781 blk_integrity_unregister(md->disk);
1782 del_gendisk(md->disk);
1783 put_disk(md->disk);
1784 blk_cleanup_queue(md->queue);
1785
1786 cleanup_srcu_struct(&md->io_barrier);
1787 free_table_devices(&md->table_devices);
1788 dm_stats_cleanup(&md->stats);
1789
1790 dm_mq_cleanup_mapped_device(md);
1791 bdput(md->bdev);
1792 free_minor(minor);
1793
1794 module_put(THIS_MODULE);
1795 kfree(md);
1796}
1797
1798static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1799{
1800 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1801
1802 if (md->bs) {
1803
1804 if (dm_table_bio_based(t)) {
1805
1806
1807
1808
1809 bioset_free(md->bs);
1810 md->bs = p->bs;
1811 p->bs = NULL;
1812 }
1813
1814
1815
1816
1817
1818
1819
1820
1821 goto out;
1822 }
1823
1824 BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
1825
1826 md->io_pool = p->io_pool;
1827 p->io_pool = NULL;
1828 md->rq_pool = p->rq_pool;
1829 p->rq_pool = NULL;
1830 md->bs = p->bs;
1831 p->bs = NULL;
1832
1833out:
1834
1835 dm_table_free_md_mempools(t);
1836}
1837
1838
1839
1840
1841static void event_callback(void *context)
1842{
1843 unsigned long flags;
1844 LIST_HEAD(uevents);
1845 struct mapped_device *md = (struct mapped_device *) context;
1846
1847 spin_lock_irqsave(&md->uevent_lock, flags);
1848 list_splice_init(&md->uevent_list, &uevents);
1849 spin_unlock_irqrestore(&md->uevent_lock, flags);
1850
1851 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1852
1853 atomic_inc(&md->event_nr);
1854 wake_up(&md->eventq);
1855}
1856
1857
1858
1859
1860static void __set_size(struct mapped_device *md, sector_t size)
1861{
1862 set_capacity(md->disk, size);
1863
1864 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1865}
1866
1867
1868
1869
1870
1871
1872
1873
1874int dm_queue_merge_is_compulsory(struct request_queue *q)
1875{
1876 struct mapped_device *dev_md;
1877
1878 if (!q->merge_bvec_fn)
1879 return 0;
1880
1881 if (q->make_request_fn == dm_make_request) {
1882 dev_md = q->queuedata;
1883 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
1884 return 0;
1885 }
1886
1887 return 1;
1888}
1889
1890static int dm_device_merge_is_compulsory(struct dm_target *ti,
1891 struct dm_dev *dev, sector_t start,
1892 sector_t len, void *data)
1893{
1894 struct block_device *bdev = dev->bdev;
1895 struct request_queue *q = bdev_get_queue(bdev);
1896
1897 return dm_queue_merge_is_compulsory(q);
1898}
1899
1900
1901
1902
1903
1904static int dm_table_merge_is_optional(struct dm_table *table)
1905{
1906 unsigned i = 0;
1907 struct dm_target *ti;
1908
1909 while (i < dm_table_get_num_targets(table)) {
1910 ti = dm_table_get_target(table, i++);
1911
1912 if (ti->type->iterate_devices &&
1913 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
1914 return 0;
1915 }
1916
1917 return 1;
1918}
1919
1920
1921
1922
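/*
 * Bind a new table to the device; returns the old map, which the
 * caller must destroy.  Called with md->suspend_lock held.
 */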
1923static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1924 struct queue_limits *limits)
1925{
1926 struct dm_table *old_map;
1927 struct request_queue *q = md->queue;
1928 sector_t size;
1929 int merge_is_optional;
1930
1931 lockdep_assert_held(&md->suspend_lock);
1932
1933 size = dm_table_get_size(t);
1934
1935
1936
1937
1938 if (size != dm_get_size(md))
1939 memset(&md->geometry, 0, sizeof(md->geometry));
1940
1941 __set_size(md, size);
1942
1943 dm_table_event_callback(t, event_callback, md);
1944
1945
1946
1947
1948
1949
1950
1951
1952 if (dm_table_request_based(t)) {
1953 dm_stop_queue(q);
1954
1955
1956
1957
1958
1959 md->immutable_target = dm_table_get_immutable_target(t);
1960 }
1961
1962 __bind_mempools(md, t);
1963
1964 merge_is_optional = dm_table_merge_is_optional(t);
1965
1966 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1967 rcu_assign_pointer(md->map, (void *)t);
1968 md->immutable_target_type = dm_table_get_immutable_target_type(t);
1969
1970 dm_table_set_restrictions(t, q, limits);
1971 if (merge_is_optional)
1972 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
1973 else
1974 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
1975 if (old_map)
1976 dm_sync_table(md);
1977
1978 return old_map;
1979}
1980
1981
1982
1983
1984static struct dm_table *__unbind(struct mapped_device *md)
1985{
1986 struct dm_table *map = rcu_dereference_protected(md->map, 1);
1987
1988 if (!map)
1989 return NULL;
1990
1991 dm_table_event_callback(map, NULL, NULL);
1992 RCU_INIT_POINTER(md->map, NULL);
1993 dm_sync_table(md);
1994
1995 return map;
1996}
1997
1998
1999
2000
2001int dm_create(int minor, struct mapped_device **result)
2002{
2003 struct mapped_device *md;
2004
2005 md = alloc_dev(minor);
2006 if (!md)
2007 return -ENXIO;
2008
2009 dm_sysfs_init(md);
2010
2011 *result = md;
2012 return 0;
2013}
2014
2015
2016
2017
2018
2019void dm_lock_md_type(struct mapped_device *md)
2020{
2021 mutex_lock(&md->type_lock);
2022}
2023
2024void dm_unlock_md_type(struct mapped_device *md)
2025{
2026 mutex_unlock(&md->type_lock);
2027}
2028
2029void dm_set_md_type(struct mapped_device *md, unsigned type)
2030{
2031 BUG_ON(!mutex_is_locked(&md->type_lock));
2032 md->type = type;
2033}
2034
2035unsigned dm_get_md_type(struct mapped_device *md)
2036{
2037 return md->type;
2038}
2039
2040struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2041{
2042 return md->immutable_target_type;
2043}
2044
2045
2046
2047
2048
2049struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2050{
2051 BUG_ON(!atomic_read(&md->holders));
2052 return &md->queue->limits;
2053}
2054EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2055
2056
2057
2058
2059int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2060{
2061 int r;
2062 unsigned type = dm_get_md_type(md);
2063
2064 switch (type) {
2065 case DM_TYPE_REQUEST_BASED:
2066 r = dm_old_init_request_queue(md);
2067 if (r) {
2068 DMERR("Cannot initialize queue for request-based mapped device");
2069 return r;
2070 }
2071 break;
2072 case DM_TYPE_MQ_REQUEST_BASED:
2073 r = dm_mq_init_request_queue(md, t);
2074 if (r) {
2075 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2076 return r;
2077 }
2078 break;
2079 case DM_TYPE_BIO_BASED:
2080 case DM_TYPE_DAX_BIO_BASED:
2081 dm_init_normal_md_queue(md);
2082 blk_queue_make_request(md->queue, dm_make_request);
2083 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2084
2085 if (type == DM_TYPE_DAX_BIO_BASED)
2086 queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2087 break;
2088 }
2089
2090 return 0;
2091}
2092
2093struct mapped_device *dm_get_md(dev_t dev)
2094{
2095 struct mapped_device *md;
2096 unsigned minor = MINOR(dev);
2097
2098 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2099 return NULL;
2100
2101 spin_lock(&_minor_lock);
2102
2103 md = idr_find(&_minor_idr, minor);
2104 if (md) {
2105 if ((md == MINOR_ALLOCED ||
2106 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2107 dm_deleting_md(md) ||
2108 test_bit(DMF_FREEING, &md->flags))) {
2109 md = NULL;
2110 goto out;
2111 }
2112 dm_get(md);
2113 }
2114
2115out:
2116 spin_unlock(&_minor_lock);
2117
2118 return md;
2119}
2120EXPORT_SYMBOL_GPL(dm_get_md);
2121
2122void *dm_get_mdptr(struct mapped_device *md)
2123{
2124 return md->interface_ptr;
2125}
2126
2127void dm_set_mdptr(struct mapped_device *md, void *ptr)
2128{
2129 md->interface_ptr = ptr;
2130}
2131
2132void dm_get(struct mapped_device *md)
2133{
2134 atomic_inc(&md->holders);
2135 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2136}
2137
2138int dm_hold(struct mapped_device *md)
2139{
2140 spin_lock(&_minor_lock);
2141 if (test_bit(DMF_FREEING, &md->flags)) {
2142 spin_unlock(&_minor_lock);
2143 return -EBUSY;
2144 }
2145 dm_get(md);
2146 spin_unlock(&_minor_lock);
2147 return 0;
2148}
2149EXPORT_SYMBOL_GPL(dm_hold);
2150
2151const char *dm_device_name(struct mapped_device *md)
2152{
2153 return md->name;
2154}
2155EXPORT_SYMBOL_GPL(dm_device_name);
2156
2157static void __dm_destroy(struct mapped_device *md, bool wait)
2158{
2159 struct request_queue *q = dm_get_md_queue(md);
2160 struct dm_table *map;
2161 int srcu_idx;
2162
2163 might_sleep();
2164
2165 spin_lock(&_minor_lock);
2166 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2167 set_bit(DMF_FREEING, &md->flags);
2168 spin_unlock(&_minor_lock);
2169
2170 blk_set_queue_dying(q);
2171
2172 if (dm_request_based(md) && md->kworker_task)
2173 flush_kthread_worker(&md->kworker);
2174
2175
2176
2177
2178
2179 mutex_lock(&md->suspend_lock);
2180 map = dm_get_live_table(md, &srcu_idx);
2181 if (!dm_suspended_md(md)) {
2182 dm_table_presuspend_targets(map);
2183 dm_table_postsuspend_targets(map);
2184 }
2185
2186 dm_put_live_table(md, srcu_idx);
2187 mutex_unlock(&md->suspend_lock);
2188
2189
2190
2191
2192
2193
2194
2195 if (wait)
2196 while (atomic_read(&md->holders))
2197 msleep(1);
2198 else if (atomic_read(&md->holders))
2199 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2200 dm_device_name(md), atomic_read(&md->holders));
2201
2202 dm_sysfs_exit(md);
2203 dm_table_destroy(__unbind(md));
2204 free_dev(md);
2205}
2206
2207void dm_destroy(struct mapped_device *md)
2208{
2209 __dm_destroy(md, true);
2210}
2211
2212void dm_destroy_immediate(struct mapped_device *md)
2213{
2214 __dm_destroy(md, false);
2215}
2216
2217void dm_put(struct mapped_device *md)
2218{
2219 atomic_dec(&md->holders);
2220}
2221EXPORT_SYMBOL_GPL(dm_put);
2222
2223static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2224{
2225 int r = 0;
2226 DEFINE_WAIT(wait);
2227
2228 while (1) {
2229 prepare_to_wait(&md->wait, &wait, task_state);
2230
2231 if (!md_in_flight(md))
2232 break;
2233
2234 if (signal_pending_state(task_state, current)) {
2235 r = -EINTR;
2236 break;
2237 }
2238
2239 io_schedule();
2240 }
2241 finish_wait(&md->wait, &wait);
2242
2243 return r;
2244}
2245
2246
2247
2248
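/*
 * Work function that processes bios deferred while the device was
 * blocked for suspend.
 */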
2249static void dm_wq_work(struct work_struct *work)
2250{
2251 struct mapped_device *md = container_of(work, struct mapped_device,
2252 work);
2253 struct bio *c;
2254 int srcu_idx;
2255 struct dm_table *map;
2256
2257 map = dm_get_live_table(md, &srcu_idx);
2258
2259 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2260 spin_lock_irq(&md->deferred_lock);
2261 c = bio_list_pop(&md->deferred);
2262 spin_unlock_irq(&md->deferred_lock);
2263
2264 if (!c)
2265 break;
2266
2267 if (dm_request_based(md))
2268 generic_make_request(c);
2269 else
2270 __split_and_process_bio(md, map, c);
2271 }
2272
2273 dm_put_live_table(md, srcu_idx);
2274}
2275
2276static void dm_queue_flush(struct mapped_device *md)
2277{
2278 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2279 smp_mb__after_clear_bit();
2280 queue_work(md->wq, &md->work);
2281}
2282
2283
2284
2285
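/*
 * Swap in a new table while the device is suspended; returns the old
 * table (or an ERR_PTR) for the caller to destroy.
 */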
2286struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2287{
2288 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2289 struct queue_limits limits;
2290 struct queue_limits_aux limits_aux;
2291 int r;
2292
2293 mutex_lock(&md->suspend_lock);
2294
2295
2296 if (!dm_suspended_md(md))
2297 goto out;
2298
2299
2300
2301
2302
2303 limits.limits_aux = &limits_aux;
2304
2305
2306
2307
2308
2309
2310
2311 if (dm_table_has_no_data_devices(table)) {
2312 live_map = dm_get_live_table_fast(md);
2313 if (live_map)
2314 limits = md->queue->limits;
2315 dm_put_live_table_fast(md);
2316 }
2317
2318 if (!live_map) {
2319 r = dm_calculate_queue_limits(table, &limits);
2320 if (r) {
2321 map = ERR_PTR(r);
2322 goto out;
2323 }
2324 }
2325
2326 map = __bind(md, table, &limits);
2327
2328out:
2329 mutex_unlock(&md->suspend_lock);
2330 return map;
2331}
2332
2333
2334
2335
2336
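/*
 * Functions to lock and unlock any filesystem running on the device,
 * by freezing/thawing its backing block device.
 */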
2337static int lock_fs(struct mapped_device *md)
2338{
2339 int r;
2340
2341 WARN_ON(md->frozen_sb);
2342
2343 md->frozen_sb = freeze_bdev(md->bdev);
2344 if (IS_ERR(md->frozen_sb)) {
2345 r = PTR_ERR(md->frozen_sb);
2346 md->frozen_sb = NULL;
2347 return r;
2348 }
2349
2350 set_bit(DMF_FROZEN, &md->flags);
2351
2352 return 0;
2353}
2354
2355static void unlock_fs(struct mapped_device *md)
2356{
2357 if (!test_bit(DMF_FROZEN, &md->flags))
2358 return;
2359
2360 thaw_bdev(md->bdev, md->frozen_sb);
2361 md->frozen_sb = NULL;
2362 clear_bit(DMF_FROZEN, &md->flags);
2363}
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
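/*
 * Helper that performs the actual suspend sequence: presuspend the
 * targets, optionally freeze the filesystem, block further I/O, flush
 * queued work and wait for in-flight I/O to complete.  On failure the
 * steps are rolled back.
 */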
2376static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2377 unsigned suspend_flags, long task_state,
2378 int dmf_suspended_flag)
2379{
2380 bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2381 bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2382 int r;
2383
2384 lockdep_assert_held(&md->suspend_lock);
2385
2386
2387
2388
2389
2390 if (noflush)
2391 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2392
2393
2394
2395
2396
2397 dm_table_presuspend_targets(map);
2398
2399
2400
2401
2402
2403
2404
2405 if (!noflush && do_lockfs) {
2406 r = lock_fs(md);
2407 if (r) {
2408 dm_table_presuspend_undo_targets(map);
2409 return r;
2410 }
2411 }
2412
2413
2414
2415
2416
2417
2418
2419
2420
2421
2422
2423
2424
2425 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2426 if (map)
2427 synchronize_srcu(&md->io_barrier);
2428
2429
2430
2431
2432
2433 if (dm_request_based(md)) {
2434 dm_stop_queue(md->queue);
2435 if (md->kworker_task)
2436 flush_kthread_worker(&md->kworker);
2437 }
2438
2439 flush_workqueue(md->wq);
2440
2441
2442
2443
2444
2445
2446 r = dm_wait_for_completion(md, task_state);
2447 if (!r)
2448 set_bit(dmf_suspended_flag, &md->flags);
2449
2450 if (noflush)
2451 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2452 if (map)
2453 synchronize_srcu(&md->io_barrier);
2454
2455
2456 if (r < 0) {
2457 dm_queue_flush(md);
2458
2459 if (dm_request_based(md))
2460 dm_start_queue(md->queue);
2461
2462 unlock_fs(md);
2463 dm_table_presuspend_undo_targets(map);
2464
2465 }
2466
2467 return r;
2468}
2469
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2487{
2488 struct dm_table *map = NULL;
2489 int r = 0;
2490
2491retry:
2492 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2493
2494 if (dm_suspended_md(md)) {
2495 r = -EINVAL;
2496 goto out_unlock;
2497 }
2498
2499 if (dm_suspended_internally_md(md)) {
2500
2501 mutex_unlock(&md->suspend_lock);
2502 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2503 if (r)
2504 return r;
2505 goto retry;
2506 }
2507
2508 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2509
2510 r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2511 if (r)
2512 goto out_unlock;
2513
2514 dm_table_postsuspend_targets(map);
2515
2516out_unlock:
2517 mutex_unlock(&md->suspend_lock);
2518 return r;
2519}
2520
2521static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2522{
2523 if (map) {
2524 int r = dm_table_resume_targets(map);
2525 if (r)
2526 return r;
2527 }
2528
2529 dm_queue_flush(md);
2530
2531
2532
2533
2534
2535
2536 if (dm_request_based(md))
2537 dm_start_queue(md->queue);
2538
2539 unlock_fs(md);
2540
2541 return 0;
2542}
2543
2544int dm_resume(struct mapped_device *md)
2545{
2546 int r;
2547 struct dm_table *map = NULL;
2548
2549retry:
2550 r = -EINVAL;
2551 mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2552
2553 if (!dm_suspended_md(md))
2554 goto out;
2555
2556 if (dm_suspended_internally_md(md)) {
2557
2558 mutex_unlock(&md->suspend_lock);
2559 r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2560 if (r)
2561 return r;
2562 goto retry;
2563 }
2564
2565 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2566 if (!map || !dm_table_get_size(map))
2567 goto out;
2568
2569 r = __dm_resume(md, map);
2570 if (r)
2571 goto out;
2572
2573 clear_bit(DMF_SUSPENDED, &md->flags);
2574out:
2575 mutex_unlock(&md->suspend_lock);
2576
2577 return r;
2578}
2579
2580
2581
2582
2583
2584
2585
2586static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2587{
2588 struct dm_table *map = NULL;
2589
2590 if (md->internal_suspend_count++)
2591 return;
2592
2593 if (dm_suspended_md(md)) {
2594 set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2595 return;
2596 }
2597
2598 map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2599
2600
2601
2602
2603
2604
2605
2606 (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2607 DMF_SUSPENDED_INTERNALLY);
2608
2609 dm_table_postsuspend_targets(map);
2610}
2611
2612static void __dm_internal_resume(struct mapped_device *md)
2613{
2614 BUG_ON(!md->internal_suspend_count);
2615
2616 if (--md->internal_suspend_count)
2617 return;
2618
2619 if (dm_suspended_md(md))
2620 goto done;
2621
2622
2623
2624
2625
2626 (void) __dm_resume(md, NULL);
2627
2628done:
2629 clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2630 smp_mb__after_atomic();
2631 wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2632}
2633
2634void dm_internal_suspend_noflush(struct mapped_device *md)
2635{
2636 mutex_lock(&md->suspend_lock);
2637 __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2638 mutex_unlock(&md->suspend_lock);
2639}
2640EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2641
2642void dm_internal_resume(struct mapped_device *md)
2643{
2644 mutex_lock(&md->suspend_lock);
2645 __dm_internal_resume(md);
2646 mutex_unlock(&md->suspend_lock);
2647}
2648EXPORT_SYMBOL_GPL(dm_internal_resume);
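/*
 * Sketch of the nesting behaviour implemented above (illustrative only):
 * internal suspends are counted, so only the outermost pair does real work.
 *
 *	dm_internal_suspend_noflush(md);	// count 0 -> 1: really suspends
 *	dm_internal_suspend_noflush(md);	// count 1 -> 2: no-op
 *	dm_internal_resume(md);			// count 2 -> 1: no-op
 *	dm_internal_resume(md);			// count 1 -> 0: really resumes
 *
 * If the device was already suspended by userspace, only the
 * DMF_SUSPENDED_INTERNALLY flag is set/cleared and the device stays
 * suspended throughout.
 */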
2649
2650/*
2651 * Fast variants of internal suspend/resume hold md->suspend_lock,
2652 * which prevents interaction with userspace-driven suspend
2653 * (the lock is taken in _suspend_fast and released in _resume_fast).
2654 */
2655void dm_internal_suspend_fast(struct mapped_device *md)
2656{
2657 mutex_lock(&md->suspend_lock);
2658 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2659 return;
2660
2661 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2662 synchronize_srcu(&md->io_barrier);
2663 flush_workqueue(md->wq);
2664 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2665}
2666EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2667
2668void dm_internal_resume_fast(struct mapped_device *md)
2669{
2670 if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2671 goto done;
2672
2673 dm_queue_flush(md);
2674
2675done:
2676 mutex_unlock(&md->suspend_lock);
2677}
2678EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
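/*
 * Note on the "fast" variants above: dm_internal_suspend_fast() returns
 * with md->suspend_lock still held (even on its early return) and
 * dm_internal_resume_fast() is what drops it, so the two must always be
 * used as a strictly paired bracket from the same context, e.g.:
 *
 *	dm_internal_suspend_fast(md);
 *	... briefly inspect or modify state that must not race with I/O ...
 *	dm_internal_resume_fast(md);
 */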
2679
2680/*-----------------------------------------------------------------
2681 * Event notification.
2682 *---------------------------------------------------------------*/
2683int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2684 unsigned cookie)
2685{
2686 char udev_cookie[DM_COOKIE_LENGTH];
2687 char *envp[] = { udev_cookie, NULL };
2688
2689 if (!cookie)
2690 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2691 else {
2692 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2693 DM_COOKIE_ENV_VAR_NAME, cookie);
2694 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2695 action, envp);
2696 }
2699}
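/*
 * Illustrative only: when a cookie is supplied, the uevent carries one
 * extra environment variable that udev rules / libdevmapper use to
 * synchronize on the event.  For cookie 3735928559 the event environment
 * would contain:
 *
 *	DM_COOKIE=3735928559
 *
 * DM_COOKIE_LENGTH (24) is large enough for "DM_COOKIE=" plus the decimal
 * representation of any 32-bit cookie and the trailing NUL.
 */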
2700
2701uint32_t dm_next_uevent_seq(struct mapped_device *md)
2702{
2703 return atomic_add_return(1, &md->uevent_seq);
2704}
2705
2706uint32_t dm_get_event_nr(struct mapped_device *md)
2707{
2708 return atomic_read(&md->event_nr);
2709}
2710
2711int dm_wait_event(struct mapped_device *md, int event_nr)
2712{
2713 return wait_event_interruptible(md->eventq,
2714 (event_nr != atomic_read(&md->event_nr)));
2715}
2716
2717void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2718{
2719 unsigned long flags;
2720
2721 spin_lock_irqsave(&md->uevent_lock, flags);
2722 list_add(elist, &md->uevent_list);
2723 spin_unlock_irqrestore(&md->uevent_lock, flags);
2724}
2725
2726/*
2727 * The gendisk is only valid as long as you have a reference
2728 * count on 'md'.
2729 */
2730struct gendisk *dm_disk(struct mapped_device *md)
2731{
2732 return md->disk;
2733}
2734EXPORT_SYMBOL_GPL(dm_disk);
2735
2736struct kobject *dm_kobject(struct mapped_device *md)
2737{
2738 return &md->kobj_holder.kobj;
2739}
2740
2741struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2742{
2743 struct mapped_device *md;
2744
2745 md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2746
2747 if (test_bit(DMF_FREEING, &md->flags) ||
2748 dm_deleting_md(md))
2749 return NULL;
2750
2751 dm_get(md);
2752 return md;
2753}
2754
2755int dm_suspended_md(struct mapped_device *md)
2756{
2757 return test_bit(DMF_SUSPENDED, &md->flags);
2758}
2759
2760int dm_suspended_internally_md(struct mapped_device *md)
2761{
2762 return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2763}
2764
2765int dm_test_deferred_remove_flag(struct mapped_device *md)
2766{
2767 return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2768}
2769
2770int dm_suspended(struct dm_target *ti)
2771{
2772 return dm_suspended_md(dm_table_get_md(ti->table));
2773}
2774EXPORT_SYMBOL_GPL(dm_suspended);
2775
2776int dm_noflush_suspending(struct dm_target *ti)
2777{
2778 return __noflush_suspending(dm_table_get_md(ti->table));
2779}
2780EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2781
2782struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
2783 unsigned integrity, unsigned per_io_data_size)
2784{
2785 struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2786 struct kmem_cache *cachep = NULL;
2787 unsigned int pool_size = 0;
2788 unsigned int front_pad;
2789
2790 if (!pools)
2791 return NULL;
2792
2793 switch (type) {
2794 case DM_TYPE_BIO_BASED:
2795 case DM_TYPE_DAX_BIO_BASED:
2796 cachep = _io_cache;
2797 pool_size = dm_get_reserved_bio_based_ios();
2798 front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2799 break;
2800 case DM_TYPE_REQUEST_BASED:
2801 cachep = _rq_tio_cache;
2802 pool_size = dm_get_reserved_rq_based_ios();
2803 pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
2804 if (!pools->rq_pool)
2805 goto out;
2806 /* fall through to DM_TYPE_MQ_REQUEST_BASED to set up front_pad */
2807 case DM_TYPE_MQ_REQUEST_BASED:
2808 if (!pool_size)
2809 pool_size = dm_get_reserved_rq_based_ios();
2810 front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2811 /* per_io_data_size is handled by blk-mq pdu allocation at queue setup */
2812 break;
2813 default:
2814 BUG();
2815 }
2816
2817 if (cachep) {
2818 pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
2819 if (!pools->io_pool)
2820 goto out;
2821 }
2822
2823 pools->bs = bioset_create(pool_size, front_pad);
2824 if (!pools->bs)
2825 goto out;
2826
2827 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2828 goto out;
2829
2830 return pools;
2831
2832out:
2833 dm_free_md_mempools(pools);
2834
2835 return NULL;
2836}
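/*
 * Layout sketch for the bio-based front_pad computed above (illustrative
 * only).  bioset_create() reserves front_pad bytes immediately in front of
 * every bio allocated from pools->bs, which dm uses to co-locate the
 * per-bio dm_target_io and the target's per-io data:
 *
 *	<--------------------- front_pad --------------------->
 *	| per_io_data (rounded up) | dm_target_io fields up to | clone
 *	|                          | .clone                    | (struct bio)
 *
 * The bio handed out by the bioset is tio->clone, the last member of
 * struct dm_target_io, which is what lets targets reach their per-io data
 * via dm_per_bio_data(bio, size) with plain pointer arithmetic and no
 * extra allocation per bio.
 */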
2837
2838void dm_free_md_mempools(struct dm_md_mempools *pools)
2839{
2840 if (!pools)
2841 return;
2842
2843 mempool_destroy(pools->io_pool);
2844 mempool_destroy(pools->rq_pool);
2845
2846 if (pools->bs)
2847 bioset_free(pools->bs);
2848
2849 kfree(pools);
2850}
2851
2852struct dm_pr {
2853 u64 old_key;
2854 u64 new_key;
2855 u32 flags;
2856 bool fail_early;
2857};
2858
2859static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2860 void *data)
2861{
2862 struct mapped_device *md = bdev->bd_disk->private_data;
2863 struct dm_table *table;
2864 struct dm_target *ti;
2865 int ret = -ENOTTY, srcu_idx;
2866
2867 table = dm_get_live_table(md, &srcu_idx);
2868 if (!table || !dm_table_get_size(table))
2869 goto out;
2870
2871 /* We only support devices that have a single target */
2872 if (dm_table_get_num_targets(table) != 1)
2873 goto out;
2874 ti = dm_table_get_target(table, 0);
2875
2876 ret = -EINVAL;
2877 if (!ti->type->iterate_devices)
2878 goto out;
2879
2880 ret = ti->type->iterate_devices(ti, fn, data);
2881out:
2882 dm_put_live_table(md, srcu_idx);
2883 return ret;
2884}
2885
2886/*
2887 * For register / unregister we need to manually call out to every path.
2888 */
2889static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2890 sector_t start, sector_t len, void *data)
2891{
2892 struct dm_pr *pr = data;
2893 const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2894
2895 if (!ops || !ops->pr_register)
2896 return -EOPNOTSUPP;
2897 return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2898}
2899
2900static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2901 u32 flags)
2902{
2903 struct dm_pr pr = {
2904 .old_key = old_key,
2905 .new_key = new_key,
2906 .flags = flags,
2907 .fail_early = true,
2908 };
2909 int ret;
2910
2911 ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2912 if (ret && new_key) {
2913 /* unregister all paths if we failed to register any path */
2914 pr.old_key = new_key;
2915 pr.new_key = 0;
2916 pr.flags = 0;
2917 pr.fail_early = false;
2918 dm_call_pr(bdev, __dm_pr_register, &pr);
2919 }
2920
2921 return ret;
2922}
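/*
 * Illustrative only: semantics of the rollback above.  A registration is
 * fanned out to every underlying path; if any path fails part way through,
 * a second pass is run with old_key == new_key and new_key == 0, which per
 * the persistent-reservation register semantics unregisters the key from
 * the paths that had already succeeded, so no stale registrations remain.
 *
 *	dm_pr_register(bdev, 0, 0xabc, 0);	// register key 0xabc on all paths
 *	dm_pr_register(bdev, 0xabc, 0, 0);	// unregister it from all paths
 */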
2923
2924static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2925 u32 flags)
2926{
2927 struct mapped_device *md = bdev->bd_disk->private_data;
2928 const struct pr_ops *ops;
2929 fmode_t mode;
2930 int r;
2931
2932 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2933 if (r < 0)
2934 return r;
2935
2936 ops = bdev->bd_disk->fops->pr_ops;
2937 if (ops && ops->pr_reserve)
2938 r = ops->pr_reserve(bdev, key, type, flags);
2939 else
2940 r = -EOPNOTSUPP;
2941
2942 bdput(bdev);
2943 return r;
2944}
2945
2946static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2947{
2948 struct mapped_device *md = bdev->bd_disk->private_data;
2949 const struct pr_ops *ops;
2950 fmode_t mode;
2951 int r;
2952
2953 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2954 if (r < 0)
2955 return r;
2956
2957 ops = bdev->bd_disk->fops->pr_ops;
2958 if (ops && ops->pr_release)
2959 r = ops->pr_release(bdev, key, type);
2960 else
2961 r = -EOPNOTSUPP;
2962
2963 bdput(bdev);
2964 return r;
2965}
2966
2967static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
2968 enum pr_type type, bool abort)
2969{
2970 struct mapped_device *md = bdev->bd_disk->private_data;
2971 const struct pr_ops *ops;
2972 fmode_t mode;
2973 int r;
2974
2975 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2976 if (r < 0)
2977 return r;
2978
2979 ops = bdev->bd_disk->fops->pr_ops;
2980 if (ops && ops->pr_preempt)
2981 r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2982 else
2983 r = -EOPNOTSUPP;
2984
2985 bdput(bdev);
2986 return r;
2987}
2988
2989static int dm_pr_clear(struct block_device *bdev, u64 key)
2990{
2991 struct mapped_device *md = bdev->bd_disk->private_data;
2992 const struct pr_ops *ops;
2993 fmode_t mode;
2994 int r;
2995
2996 r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2997 if (r < 0)
2998 return r;
2999
3000 ops = bdev->bd_disk->fops->pr_ops;
3001 if (ops && ops->pr_clear)
3002 r = ops->pr_clear(bdev, key);
3003 else
3004 r = -EOPNOTSUPP;
3005
3006 bdput(bdev);
3007 return r;
3008}
3009
3010static const struct pr_ops dm_pr_ops = {
3011 .pr_register = dm_pr_register,
3012 .pr_reserve = dm_pr_reserve,
3013 .pr_release = dm_pr_release,
3014 .pr_preempt = dm_pr_preempt,
3015 .pr_clear = dm_pr_clear,
3016};
3017
3018static const struct block_device_operations dm_blk_dops = {
3019 .open = dm_blk_open,
3020 .release = dm_blk_close,
3021 .ioctl = dm_blk_ioctl,
3022 .direct_access = dm_blk_direct_access,
3023 .getgeo = dm_blk_getgeo,
3024 .pr_ops = &dm_pr_ops,
3025 .owner = THIS_MODULE
3026};
3027
3028/*
3029 * Module hooks.
3030 */
3031module_init(dm_init);
3032module_exit(dm_exit);
3033
3034module_param(major, uint, 0);
3035MODULE_PARM_DESC(major, "The major number of the device mapper");
3036
3037module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3038MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3039
3040module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3041MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
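/*
 * Illustrative only: the writable parameters above can be set at load time
 * or adjusted at runtime through sysfs, e.g.
 *
 *	modprobe dm_mod reserved_bio_based_ios=32 dm_numa_node=0
 *	echo 16 > /sys/module/dm_mod/parameters/reserved_bio_based_ios
 *
 * "major" has permission 0 and is therefore load-time only; leaving it at 0
 * keeps the default behaviour of allocating the major number dynamically.
 */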
3042
3043MODULE_DESCRIPTION(DM_NAME " driver");
3044MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3045MODULE_LICENSE("GPL");
3046