/*
 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm-core.h"
#include "dm-rq.h"
#include "dm-uevent.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/sched/signal.h>
#include <linux/blkpg.h>
#include <linux/bio.h>
#include <linux/mempool.h>
#include <linux/dax.h>
#include <linux/slab.h>
#include <linux/idr.h>
#include <linux/uio.h>
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/pr.h>
#include <linux/refcount.h>
#include <linux/part_stat.h>

#define DM_MSG_PREFIX "core"
31
/*
 * Cookies are numeric values passed in the DM_COOKIE environment variable
 * with CHANGE and REMOVE uevents while resuming a device, so that userspace
 * (libdevmapper) can match an event to the ioctl that triggered it.
 */
#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
#define DM_COOKIE_LENGTH 24
38
static const char *_name = DM_NAME;

static unsigned int major = 0;
static unsigned int _major = 0;

static DEFINE_IDR(_minor_idr);

static DEFINE_SPINLOCK(_minor_lock);

static void do_deferred_remove(struct work_struct *w);

static DECLARE_WORK(deferred_remove_work, do_deferred_remove);

static struct workqueue_struct *deferred_remove_workqueue;

atomic_t dm_global_event_nr = ATOMIC_INIT(0);
DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);

void dm_issue_global_event(void)
{
	atomic_inc(&dm_global_event_nr);
	wake_up(&dm_global_eventq);
}
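
/*
 * Note (added for clarity): dm_global_event_nr / dm_global_eventq back the
 * DM_DEV_ARM_POLL interface.  Any global device change bumps the counter via
 * dm_issue_global_event() and wakes pollers so userspace can re-scan devices.
 */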
62
/*
 * One of these is allocated (on stack) per original bio.
 */
struct clone_info {
	struct dm_table *map;
	struct bio *bio;
	struct dm_io *io;
	sector_t sector;
	unsigned sector_count;
};
73
/*
 * One of these is allocated per clone bio.
 */
#define DM_TIO_MAGIC 7282014
struct dm_target_io {
	unsigned magic;
	struct dm_io *io;
	struct dm_target *ti;
	unsigned target_bio_nr;
	unsigned *len_ptr;
	bool inside_dm_io;
	struct bio clone;
};
87
/*
 * One of these is allocated per original bio.
 * It contains the first clone used for that original.
 */
#define DM_IO_MAGIC 5191977
struct dm_io {
	unsigned magic;
	struct mapped_device *md;
	blk_status_t status;
	atomic_t io_count;
	struct bio *orig_bio;
	unsigned long start_time;
	spinlock_t endio_lock;
	struct dm_stats_aux stats_aux;
	/* last member of dm_target_io is 'struct bio' */
	struct dm_target_io tio;
};
105
void *dm_per_bio_data(struct bio *bio, size_t data_size)
{
	struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
	if (!tio->inside_dm_io)
		return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
	return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
}
EXPORT_SYMBOL_GPL(dm_per_bio_data);

struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
{
	struct dm_io *io = (struct dm_io *)((char *)data + data_size);
	if (io->magic == DM_IO_MAGIC)
		return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
	BUG_ON(io->magic != DM_TIO_MAGIC);
	return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
}
EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);

unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
{
	return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
}
EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
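
/*
 * Layout assumed by the helpers above (a sketch added for clarity, not text
 * from the original source): a target's per-bio data is carved out of the
 * bio_set front_pad immediately in front of either a bare dm_target_io or a
 * dm_io that embeds one:
 *
 *   [per-bio data][dm_target_io ............. clone (struct bio)]   !inside_dm_io
 *   [per-bio data][dm_io ... tio (dm_target_io) ... clone (bio)]     inside_dm_io
 *
 * dm_bio_from_per_bio_data() walks forward from the data pointer and uses the
 * DM_IO_MAGIC / DM_TIO_MAGIC fields to tell the two layouts apart.
 */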
130
#define MINOR_ALLOCED ((void *)-1)

/*
 * Bits for the md->flags field.
 */
#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
#define DMF_DEFERRED_REMOVE 6
#define DMF_SUSPENDED_INTERNALLY 7

#define DM_NUMA_NODE NUMA_NO_NODE
static int dm_numa_node = DM_NUMA_NODE;
147
/*
 * For mempools pre-allocation at the table loading time.
 */
struct dm_md_mempools {
	struct bio_set bs;
	struct bio_set io_bs;
};

struct table_device {
	struct list_head list;
	refcount_t count;
	struct dm_dev dm_dev;
};

/*
 * Bio-based DM's mempools' reserved IOs set by the user.
 */
#define RESERVED_BIO_BASED_IOS		16
static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
167
static int __dm_get_module_param_int(int *module_param, int min, int max)
{
	int param = READ_ONCE(*module_param);
	int modified_param = 0;
	bool modified = true;

	if (param < min)
		modified_param = min;
	else if (param > max)
		modified_param = max;
	else
		modified = false;

	if (modified) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned __dm_get_module_param(unsigned *module_param,
			       unsigned def, unsigned max)
{
	unsigned param = READ_ONCE(*module_param);
	unsigned modified_param = 0;

	if (!param)
		modified_param = def;
	else if (param > max)
		modified_param = max;

	if (modified_param) {
		(void)cmpxchg(module_param, param, modified_param);
		param = modified_param;
	}

	return param;
}

unsigned dm_get_reserved_bio_based_ios(void)
{
	return __dm_get_module_param(&reserved_bio_based_ios,
				     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
}
EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);

/* dm_numa_node may be NUMA_NO_NODE (-1), so this must return a signed int. */
static int dm_get_numa_node(void)
{
	return __dm_get_module_param_int(&dm_numa_node,
					 DM_NUMA_NODE, num_online_nodes() - 1);
}
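
/*
 * Example of the clamping behaviour (hypothetical values, not from the
 * original source): if the reserved_bio_based_ios module parameter is set to
 * 0, dm_get_reserved_bio_based_ios() falls back to RESERVED_BIO_BASED_IOS
 * (16); a value above DM_RESERVED_MAX_IOS is clamped to that maximum.  The
 * clamped value is written back with cmpxchg() so later readers see it.
 */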
220
221static int __init local_init(void)
222{
223 int r;
224
225 r = dm_uevent_init();
226 if (r)
227 return r;
228
229 deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
230 if (!deferred_remove_workqueue) {
231 r = -ENOMEM;
232 goto out_uevent_exit;
233 }
234
235 _major = major;
236 r = register_blkdev(_major, _name);
237 if (r < 0)
238 goto out_free_workqueue;
239
240 if (!_major)
241 _major = r;
242
243 return 0;
244
245out_free_workqueue:
246 destroy_workqueue(deferred_remove_workqueue);
247out_uevent_exit:
248 dm_uevent_exit();
249
250 return r;
251}
252
253static void local_exit(void)
254{
255 flush_scheduled_work();
256 destroy_workqueue(deferred_remove_workqueue);
257
258 unregister_blkdev(_major, _name);
259 dm_uevent_exit();
260
261 _major = 0;
262
263 DMINFO("cleaned up");
264}
265
266static int (*_inits[])(void) __initdata = {
267 local_init,
268 dm_target_init,
269 dm_linear_init,
270 dm_stripe_init,
271 dm_io_init,
272 dm_kcopyd_init,
273 dm_interface_init,
274 dm_statistics_init,
275};
276
277static void (*_exits[])(void) = {
278 local_exit,
279 dm_target_exit,
280 dm_linear_exit,
281 dm_stripe_exit,
282 dm_io_exit,
283 dm_kcopyd_exit,
284 dm_interface_exit,
285 dm_statistics_exit,
286};
287
288static int __init dm_init(void)
289{
290 const int count = ARRAY_SIZE(_inits);
291
292 int r, i;
293
294 for (i = 0; i < count; i++) {
295 r = _inits[i]();
296 if (r)
297 goto bad;
298 }
299
300 return 0;
301
302 bad:
303 while (i--)
304 _exits[i]();
305
306 return r;
307}
308
309static void __exit dm_exit(void)
310{
311 int i = ARRAY_SIZE(_exits);
312
313 while (i--)
314 _exits[i]();
315

	/*
	 * Should be empty by this point.
	 */
	idr_destroy(&_minor_idr);
320}
321
/*
 * Block device functions
 */
int dm_deleting_md(struct mapped_device *md)
{
	return test_bit(DMF_DELETING, &md->flags);
}
329
330static int dm_blk_open(struct block_device *bdev, fmode_t mode)
331{
332 struct mapped_device *md;
333
334 spin_lock(&_minor_lock);
335
336 md = bdev->bd_disk->private_data;
337 if (!md)
338 goto out;
339
340 if (test_bit(DMF_FREEING, &md->flags) ||
341 dm_deleting_md(md)) {
342 md = NULL;
343 goto out;
344 }
345
346 dm_get(md);
347 atomic_inc(&md->open_count);
348out:
349 spin_unlock(&_minor_lock);
350
351 return md ? 0 : -ENXIO;
352}
353
354static void dm_blk_close(struct gendisk *disk, fmode_t mode)
355{
356 struct mapped_device *md;
357
358 spin_lock(&_minor_lock);
359
360 md = disk->private_data;
361 if (WARN_ON(!md))
362 goto out;
363
364 if (atomic_dec_and_test(&md->open_count) &&
365 (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
366 queue_work(deferred_remove_workqueue, &deferred_remove_work);
367
368 dm_put(md);
369out:
370 spin_unlock(&_minor_lock);
371}
372
373int dm_open_count(struct mapped_device *md)
374{
375 return atomic_read(&md->open_count);
376}
377

/*
 * Guarantees nothing is using the device before it's deleted (i.e. deferred
 * remove): fails with -EBUSY while the device is still open.
 */
int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
382{
383 int r = 0;
384
385 spin_lock(&_minor_lock);
386
387 if (dm_open_count(md)) {
388 r = -EBUSY;
389 if (mark_deferred)
390 set_bit(DMF_DEFERRED_REMOVE, &md->flags);
391 } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
392 r = -EEXIST;
393 else
394 set_bit(DMF_DELETING, &md->flags);
395
396 spin_unlock(&_minor_lock);
397
398 return r;
399}
400
401int dm_cancel_deferred_remove(struct mapped_device *md)
402{
403 int r = 0;
404
405 spin_lock(&_minor_lock);
406
407 if (test_bit(DMF_DELETING, &md->flags))
408 r = -EBUSY;
409 else
410 clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
411
412 spin_unlock(&_minor_lock);
413
414 return r;
415}
416
417static void do_deferred_remove(struct work_struct *w)
418{
419 dm_deferred_remove();
420}
421
422sector_t dm_get_size(struct mapped_device *md)
423{
424 return get_capacity(md->disk);
425}
426
427struct request_queue *dm_get_md_queue(struct mapped_device *md)
428{
429 return md->queue;
430}
431
432struct dm_stats *dm_get_stats(struct mapped_device *md)
433{
434 return &md->stats;
435}
436
437static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
438{
439 struct mapped_device *md = bdev->bd_disk->private_data;
440
441 return dm_get_geometry(md, geo);
442}
443
#ifdef CONFIG_BLK_DEV_ZONED
int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
	struct dm_report_zones_args *args = data;
	sector_t sector_diff = args->tgt->begin - args->start;

	/*
	 * Ignore zones beyond the target range.
	 */
	if (zone->start >= args->start + args->tgt->len)
		return 0;

	/*
	 * Remap the start sector and write pointer of the reported zone from
	 * the underlying device to the zone's position within the target.
	 */
	zone->start += sector_diff;
	if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
		if (zone->cond == BLK_ZONE_COND_FULL)
			zone->wp = zone->start + zone->len;
		else if (zone->cond == BLK_ZONE_COND_EMPTY)
			zone->wp = zone->start;
		else
			zone->wp += sector_diff;
	}

	args->next_sector = zone->start + zone->len;
	return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}
EXPORT_SYMBOL_GPL(dm_report_zones_cb);
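
/*
 * Worked example of the remapping above (hypothetical numbers, not from the
 * original source): for a target at ti->begin = 4096 whose zoned region on
 * the underlying device starts at args->start = 0, sector_diff is 4096, so a
 * zone the drive reports at sector 1024 is presented to the caller at 5120,
 * with its write pointer shifted by the same amount unless the zone is
 * conventional, full or empty.
 */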
474
475static int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
476 unsigned int nr_zones, report_zones_cb cb, void *data)
477{
478 struct mapped_device *md = disk->private_data;
479 struct dm_table *map;
480 int srcu_idx, ret;
481 struct dm_report_zones_args args = {
482 .next_sector = sector,
483 .orig_data = data,
484 .orig_cb = cb,
485 };
486
487 if (dm_suspended_md(md))
488 return -EAGAIN;
489
490 map = dm_get_live_table(md, &srcu_idx);
491 if (!map)
492 return -EIO;
493
494 do {
495 struct dm_target *tgt;
496
497 tgt = dm_table_find_target(map, args.next_sector);
498 if (WARN_ON_ONCE(!tgt->type->report_zones)) {
499 ret = -EIO;
500 goto out;
501 }
502
503 args.tgt = tgt;
504 ret = tgt->type->report_zones(tgt, &args, nr_zones);
505 if (ret < 0)
506 goto out;
507 } while (args.zone_idx < nr_zones &&
508 args.next_sector < get_capacity(disk));
509
510 ret = args.zone_idx;
511out:
512 dm_put_live_table(md, srcu_idx);
513 return ret;
514}
515#else
516#define dm_blk_report_zones NULL
517#endif
518
519static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
520 struct block_device **bdev)
521 __acquires(md->io_barrier)
522{
523 struct dm_target *tgt;
524 struct dm_table *map;
525 int r;
526
527retry:
528 r = -ENOTTY;
529 map = dm_get_live_table(md, srcu_idx);
530 if (!map || !dm_table_get_size(map))
531 return r;
532
533
534 if (dm_table_get_num_targets(map) != 1)
535 return r;
536
537 tgt = dm_table_get_target(map, 0);
538 if (!tgt->type->prepare_ioctl)
539 return r;
540
541 if (dm_suspended_md(md))
542 return -EAGAIN;
543
544 r = tgt->type->prepare_ioctl(tgt, bdev);
545 if (r == -ENOTCONN && !fatal_signal_pending(current)) {
546 dm_put_live_table(md, *srcu_idx);
547 msleep(10);
548 goto retry;
549 }
550
551 return r;
552}
553
554static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
555 __releases(md->io_barrier)
556{
557 dm_put_live_table(md, srcu_idx);
558}
559
560static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
561 unsigned int cmd, unsigned long arg)
562{
563 struct mapped_device *md = bdev->bd_disk->private_data;
564 int r, srcu_idx;
565
566 r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
567 if (r < 0)
568 goto out;
569
	if (r > 0) {
		/*
		 * Target determined this ioctl is being issued against a
		 * subset of the parent bdev; require extra privileges.
		 */
		if (!capable(CAP_SYS_RAWIO)) {
			DMWARN_LIMIT(
	"%s: sending ioctl %x to DM device without required privilege.",
				current->comm, cmd);
			r = -ENOIOCTLCMD;
			goto out;
		}
	}
583
584 r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
585out:
586 dm_unprepare_ioctl(md, srcu_idx);
587 return r;
588}
589
590static void start_io_acct(struct dm_io *io);
591
592static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
593{
594 struct dm_io *io;
595 struct dm_target_io *tio;
596 struct bio *clone;
597
598 clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
599 if (!clone)
600 return NULL;
601
602 tio = container_of(clone, struct dm_target_io, clone);
603 tio->inside_dm_io = true;
604 tio->io = NULL;
605
606 io = container_of(tio, struct dm_io, tio);
607 io->magic = DM_IO_MAGIC;
608 io->status = 0;
609 atomic_set(&io->io_count, 1);
610 io->orig_bio = bio;
611 io->md = md;
612 spin_lock_init(&io->endio_lock);
613
614 start_io_acct(io);
615
616 return io;
617}
618
619static void free_io(struct mapped_device *md, struct dm_io *io)
620{
621 bio_put(&io->tio.clone);
622}
623
624static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
625 unsigned target_bio_nr, gfp_t gfp_mask)
626{
627 struct dm_target_io *tio;
628
	if (!ci->io->tio.io) {
		/* the dm_target_io embedded in ci->io is available */
		tio = &ci->io->tio;
632 } else {
633 struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
634 if (!clone)
635 return NULL;
636
637 tio = container_of(clone, struct dm_target_io, clone);
638 tio->inside_dm_io = false;
639 }
640
641 tio->magic = DM_TIO_MAGIC;
642 tio->io = ci->io;
643 tio->ti = ti;
644 tio->target_bio_nr = target_bio_nr;
645
646 return tio;
647}
648
649static void free_tio(struct dm_target_io *tio)
650{
651 if (tio->inside_dm_io)
652 return;
653 bio_put(&tio->clone);
654}
655
656static bool md_in_flight_bios(struct mapped_device *md)
657{
658 int cpu;
659 struct hd_struct *part = &dm_disk(md)->part0;
660 long sum = 0;
661
662 for_each_possible_cpu(cpu) {
663 sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
664 sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
665 }
666
667 return sum != 0;
668}
669
670static bool md_in_flight(struct mapped_device *md)
671{
672 if (queue_is_mq(md->queue))
673 return blk_mq_queue_inflight(md->queue);
674 else
675 return md_in_flight_bios(md);
676}
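
/*
 * Note (added for clarity): bio-based devices account in-flight I/O in the
 * per-cpu part_stat counters of part0 (summed in md_in_flight_bios()), while
 * request-based devices rely on blk-mq's own tag accounting via
 * blk_mq_queue_inflight().  dm_wait_for_completion() polls md_in_flight()
 * during suspend until both paths report idle.
 */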
677
678static void start_io_acct(struct dm_io *io)
679{
680 struct mapped_device *md = io->md;
681 struct bio *bio = io->orig_bio;
682
683 io->start_time = jiffies;
684
685 generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
686 &dm_disk(md)->part0);
687
688 if (unlikely(dm_stats_used(&md->stats)))
689 dm_stats_account_io(&md->stats, bio_data_dir(bio),
690 bio->bi_iter.bi_sector, bio_sectors(bio),
691 false, 0, &io->stats_aux);
692}
693
694static void end_io_acct(struct dm_io *io)
695{
696 struct mapped_device *md = io->md;
697 struct bio *bio = io->orig_bio;
698 unsigned long duration = jiffies - io->start_time;
699
700 generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
701 io->start_time);
702
703 if (unlikely(dm_stats_used(&md->stats)))
704 dm_stats_account_io(&md->stats, bio_data_dir(bio),
705 bio->bi_iter.bi_sector, bio_sectors(bio),
706 true, duration, &io->stats_aux);
707
708
709 if (unlikely(wq_has_sleeper(&md->wait)))
710 wake_up(&md->wait);
711}
712

/*
 * Add the bio to the list of deferred io.
 */
static void queue_io(struct mapped_device *md, struct bio *bio)
717{
718 unsigned long flags;
719
720 spin_lock_irqsave(&md->deferred_lock, flags);
721 bio_list_add(&md->deferred, bio);
722 spin_unlock_irqrestore(&md->deferred_lock, flags);
723 queue_work(md->wq, &md->work);
724}
725

/*
 * Everyone (including functions in this file), should use this
 * function to access the md->map field, and make sure they call
 * dm_put_live_table() when finished.
 */
struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
732{
733 *srcu_idx = srcu_read_lock(&md->io_barrier);
734
735 return srcu_dereference(md->map, &md->io_barrier);
736}
737
738void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
739{
740 srcu_read_unlock(&md->io_barrier, srcu_idx);
741}
742
743void dm_sync_table(struct mapped_device *md)
744{
745 synchronize_srcu(&md->io_barrier);
746 synchronize_rcu_expedited();
747}
748

/*
 * A fast alternative to dm_get_live_table/dm_put_live_table.
 * The caller must not block between these two functions.
 */
static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
754{
755 rcu_read_lock();
756 return rcu_dereference(md->map);
757}
758
759static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
760{
761 rcu_read_unlock();
762}
763
764static char *_dm_claim_ptr = "I belong to device-mapper";
765
/*
 * Open a table device so we can use it as a map destination.
 */
static int open_table_device(struct table_device *td, dev_t dev,
			     struct mapped_device *md)
771{
772 struct block_device *bdev;
773
774 int r;
775
776 BUG_ON(td->dm_dev.bdev);
777
778 bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
779 if (IS_ERR(bdev))
780 return PTR_ERR(bdev);
781
782 r = bd_link_disk_holder(bdev, dm_disk(md));
783 if (r) {
784 blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
785 return r;
786 }
787
788 td->dm_dev.bdev = bdev;
789 td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
790 return 0;
791}
792

/*
 * Close a table device that we've been using.
 */
static void close_table_device(struct table_device *td, struct mapped_device *md)
797{
798 if (!td->dm_dev.bdev)
799 return;
800
801 bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
802 blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
803 put_dax(td->dm_dev.dax_dev);
804 td->dm_dev.bdev = NULL;
805 td->dm_dev.dax_dev = NULL;
806}
807
808static struct table_device *find_table_device(struct list_head *l, dev_t dev,
809 fmode_t mode)
810{
811 struct table_device *td;
812
813 list_for_each_entry(td, l, list)
814 if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
815 return td;
816
817 return NULL;
818}
819
820int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
821 struct dm_dev **result)
822{
823 int r;
824 struct table_device *td;
825
826 mutex_lock(&md->table_devices_lock);
827 td = find_table_device(&md->table_devices, dev, mode);
828 if (!td) {
829 td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
830 if (!td) {
831 mutex_unlock(&md->table_devices_lock);
832 return -ENOMEM;
833 }
834
835 td->dm_dev.mode = mode;
836 td->dm_dev.bdev = NULL;
837
838 if ((r = open_table_device(td, dev, md))) {
839 mutex_unlock(&md->table_devices_lock);
840 kfree(td);
841 return r;
842 }
843
844 format_dev_t(td->dm_dev.name, dev);
845
846 refcount_set(&td->count, 1);
847 list_add(&td->list, &md->table_devices);
848 } else {
849 refcount_inc(&td->count);
850 }
851 mutex_unlock(&md->table_devices_lock);
852
853 *result = &td->dm_dev;
854 return 0;
855}
856EXPORT_SYMBOL_GPL(dm_get_table_device);
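
/*
 * Note (added for clarity): table devices are refcounted per (dev_t, mode)
 * pair.  Repeated dm_get_table_device() calls for the same underlying device
 * share a single exclusively-opened block_device, and dm_put_table_device()
 * only closes it once the last reference is dropped.
 */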
857
858void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
859{
860 struct table_device *td = container_of(d, struct table_device, dm_dev);
861
862 mutex_lock(&md->table_devices_lock);
863 if (refcount_dec_and_test(&td->count)) {
864 close_table_device(td, md);
865 list_del(&td->list);
866 kfree(td);
867 }
868 mutex_unlock(&md->table_devices_lock);
869}
870EXPORT_SYMBOL(dm_put_table_device);
871
872static void free_table_devices(struct list_head *devices)
873{
874 struct list_head *tmp, *next;
875
876 list_for_each_safe(tmp, next, devices) {
877 struct table_device *td = list_entry(tmp, struct table_device, list);
878
879 DMWARN("dm_destroy: %s still exists with %d references",
880 td->dm_dev.name, refcount_read(&td->count));
881 kfree(td);
882 }
883}
884

/*
 * Get the geometry associated with a dm device
 */
int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
889{
890 *geo = md->geometry;
891
892 return 0;
893}
894

/*
 * Set the geometry of a device.
 */
int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
899{
900 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
901
902 if (geo->start > sz) {
903 DMWARN("Start sector is beyond the geometry limits.");
904 return -EINVAL;
905 }
906
907 md->geometry = *geo;
908
909 return 0;
910}
911
912static int __noflush_suspending(struct mapped_device *md)
913{
914 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
915}
916

/*
 * Decrements the number of outstanding ios that a bio has been
 * cloned into, completing the original io if necessary.
 */
static void dec_pending(struct dm_io *io, blk_status_t error)
922{
923 unsigned long flags;
924 blk_status_t io_error;
925 struct bio *bio;
926 struct mapped_device *md = io->md;
927
928
929 if (unlikely(error)) {
930 spin_lock_irqsave(&io->endio_lock, flags);
931 if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
932 io->status = error;
933 spin_unlock_irqrestore(&io->endio_lock, flags);
934 }
935
936 if (atomic_dec_and_test(&io->io_count)) {
937 if (io->status == BLK_STS_DM_REQUEUE) {
938
939
940
941 spin_lock_irqsave(&md->deferred_lock, flags);
942 if (__noflush_suspending(md))
943
944 bio_list_add_head(&md->deferred, io->orig_bio);
945 else
946
947 io->status = BLK_STS_IOERR;
948 spin_unlock_irqrestore(&md->deferred_lock, flags);
949 }
950
951 io_error = io->status;
952 bio = io->orig_bio;
953 end_io_acct(io);
954 free_io(md, io);
955
956 if (io_error == BLK_STS_DM_REQUEUE)
957 return;
958
		if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
			/*
			 * Preflush done for flush with data, reissue
			 * without REQ_PREFLUSH.
			 */
			bio->bi_opf &= ~REQ_PREFLUSH;
			queue_io(md, bio);
		} else {
967
968 if (io_error)
969 bio->bi_status = io_error;
970 bio_endio(bio);
971 }
972 }
973}
974
975void disable_discard(struct mapped_device *md)
976{
977 struct queue_limits *limits = dm_get_queue_limits(md);
978
979
980 limits->max_discard_sectors = 0;
981 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
982}
983
984void disable_write_same(struct mapped_device *md)
985{
986 struct queue_limits *limits = dm_get_queue_limits(md);
987
988
989 limits->max_write_same_sectors = 0;
990}
991
992void disable_write_zeroes(struct mapped_device *md)
993{
994 struct queue_limits *limits = dm_get_queue_limits(md);
995
996
997 limits->max_write_zeroes_sectors = 0;
998}
999
1000static void clone_endio(struct bio *bio)
1001{
1002 blk_status_t error = bio->bi_status;
1003 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1004 struct dm_io *io = tio->io;
1005 struct mapped_device *md = tio->io->md;
1006 dm_endio_fn endio = tio->ti->type->end_io;
1007
1008 if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
1009 if (bio_op(bio) == REQ_OP_DISCARD &&
1010 !bio->bi_disk->queue->limits.max_discard_sectors)
1011 disable_discard(md);
1012 else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
1013 !bio->bi_disk->queue->limits.max_write_same_sectors)
1014 disable_write_same(md);
1015 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
1016 !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
1017 disable_write_zeroes(md);
1018 }
1019
1020 if (endio) {
1021 int r = endio(tio->ti, bio, &error);
1022 switch (r) {
1023 case DM_ENDIO_REQUEUE:
1024 error = BLK_STS_DM_REQUEUE;
1025
1026 case DM_ENDIO_DONE:
1027 break;
1028 case DM_ENDIO_INCOMPLETE:
1029
1030 return;
1031 default:
1032 DMWARN("unimplemented target endio return value: %d", r);
1033 BUG();
1034 }
1035 }
1036
1037 free_tio(tio);
1038 dec_pending(io, error);
1039}
1040
1041
1042
1043
1044
1045static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1046{
1047 sector_t target_offset = dm_target_offset(ti, sector);
1048
1049 return ti->len - target_offset;
1050}
1051
1052static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1053{
1054 sector_t len = max_io_len_target_boundary(sector, ti);
1055 sector_t offset, max_len;
1056
1057
1058
1059
1060 if (ti->max_io_len) {
1061 offset = dm_target_offset(ti, sector);
1062 if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1063 max_len = sector_div(offset, ti->max_io_len);
1064 else
1065 max_len = offset & (ti->max_io_len - 1);
1066 max_len = ti->max_io_len - max_len;
1067
1068 if (len > max_len)
1069 len = max_len;
1070 }
1071
1072 return len;
1073}
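
/*
 * Worked example (hypothetical numbers, not from the original source): with
 * ti->len = 1000 sectors and ti->max_io_len = 256, an I/O at target offset
 * 600 has 400 sectors left before the target boundary, but sits 600 & 255 =
 * 88 sectors into the current max_io_len chunk, so only 256 - 88 = 168
 * sectors may be issued; max_io_len() returns min(400, 168) = 168.
 */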
1074
1075int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1076{
1077 if (len > UINT_MAX) {
1078 DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1079 (unsigned long long)len, UINT_MAX);
1080 ti->error = "Maximum size of target IO is too large";
1081 return -EINVAL;
1082 }
1083
1084 ti->max_io_len = (uint32_t) len;
1085
1086 return 0;
1087}
1088EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
1089
1090static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1091 sector_t sector, int *srcu_idx)
1092 __acquires(md->io_barrier)
1093{
1094 struct dm_table *map;
1095 struct dm_target *ti;
1096
1097 map = dm_get_live_table(md, srcu_idx);
1098 if (!map)
1099 return NULL;
1100
1101 ti = dm_table_find_target(map, sector);
1102 if (!ti)
1103 return NULL;
1104
1105 return ti;
1106}
1107
1108static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1109 long nr_pages, void **kaddr, pfn_t *pfn)
1110{
1111 struct mapped_device *md = dax_get_private(dax_dev);
1112 sector_t sector = pgoff * PAGE_SECTORS;
1113 struct dm_target *ti;
1114 long len, ret = -EIO;
1115 int srcu_idx;
1116
1117 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1118
1119 if (!ti)
1120 goto out;
1121 if (!ti->type->direct_access)
1122 goto out;
1123 len = max_io_len(sector, ti) / PAGE_SECTORS;
1124 if (len < 1)
1125 goto out;
1126 nr_pages = min(len, nr_pages);
1127 ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1128
1129 out:
1130 dm_put_live_table(md, srcu_idx);
1131
1132 return ret;
1133}
1134
1135static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1136 int blocksize, sector_t start, sector_t len)
1137{
1138 struct mapped_device *md = dax_get_private(dax_dev);
1139 struct dm_table *map;
1140 int srcu_idx;
1141 bool ret;
1142
1143 map = dm_get_live_table(md, &srcu_idx);
1144 if (!map)
1145 return false;
1146
1147 ret = dm_table_supports_dax(map, device_supports_dax, &blocksize);
1148
1149 dm_put_live_table(md, srcu_idx);
1150
1151 return ret;
1152}
1153
1154static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1155 void *addr, size_t bytes, struct iov_iter *i)
1156{
1157 struct mapped_device *md = dax_get_private(dax_dev);
1158 sector_t sector = pgoff * PAGE_SECTORS;
1159 struct dm_target *ti;
1160 long ret = 0;
1161 int srcu_idx;
1162
1163 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1164
1165 if (!ti)
1166 goto out;
1167 if (!ti->type->dax_copy_from_iter) {
1168 ret = copy_from_iter(addr, bytes, i);
1169 goto out;
1170 }
1171 ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1172 out:
1173 dm_put_live_table(md, srcu_idx);
1174
1175 return ret;
1176}
1177
1178static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1179 void *addr, size_t bytes, struct iov_iter *i)
1180{
1181 struct mapped_device *md = dax_get_private(dax_dev);
1182 sector_t sector = pgoff * PAGE_SECTORS;
1183 struct dm_target *ti;
1184 long ret = 0;
1185 int srcu_idx;
1186
1187 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1188
1189 if (!ti)
1190 goto out;
1191 if (!ti->type->dax_copy_to_iter) {
1192 ret = copy_to_iter(addr, bytes, i);
1193 goto out;
1194 }
1195 ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1196 out:
1197 dm_put_live_table(md, srcu_idx);
1198
1199 return ret;
1200}
1201
1202static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1203 size_t nr_pages)
1204{
1205 struct mapped_device *md = dax_get_private(dax_dev);
1206 sector_t sector = pgoff * PAGE_SECTORS;
1207 struct dm_target *ti;
1208 int ret = -EIO;
1209 int srcu_idx;
1210
1211 ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1212
1213 if (!ti)
1214 goto out;
	if (WARN_ON(!ti->type->dax_zero_page_range)) {
		/*
		 * ->dax_zero_page_range() is a mandatory dax operation; if we
		 * get here, something is wrong.  Fall through to the common
		 * exit path so the live-table reference is dropped only once.
		 */
		goto out;
	}
	ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);

 out:
	dm_put_live_table(md, srcu_idx);
1227
1228 return ret;
1229}
1230

/*
 * A target may call dm_accept_partial_bio only from the map routine.  It is
 * allowed for all bio types except REQ_PREFLUSH and zone management
 * operations (REQ_OP_ZONE_*).
 *
 * dm_accept_partial_bio informs the dm that the target only wants to process
 * additional n_sectors sectors of the bio and the rest of the data should be
 * sent in a next bio.
 *
 * A diagram that explains the arithmetics:
 * +--------------------+---------------+-------+
 * |         1          |       2       |   3   |
 * +--------------------+---------------+-------+
 *
 * <-------------- *tio->len_ptr --------------->
 *                      <------- bi_size ------->
 *                      <-- n_sectors -->
 *
 * Region 1 was already iterated over with bio_advance or similar function.
 *	(it may be empty if the target doesn't use bio_advance)
 * Region 2 is the remaining bio size that the target wants to process.
 *	(it may be empty if region 1 is non-empty, although there is no reason
 *	 to make it empty)
 * The target requires that region 3 is to be sent in the next bio.
 *
 * If the target wants to receive multiple copies of the bio (via num_*bios,
 * etc), the partially processed part (the sum of regions 1+2) must be the
 * same for all copies of the bio.
 */
1260void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1261{
1262 struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1263 unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1264 BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1265 BUG_ON(bi_size > *tio->len_ptr);
1266 BUG_ON(n_sectors > bi_size);
1267 *tio->len_ptr -= bi_size - n_sectors;
1268 bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1269}
1270EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1271
1272static blk_qc_t __map_bio(struct dm_target_io *tio)
1273{
1274 int r;
1275 sector_t sector;
1276 struct bio *clone = &tio->clone;
1277 struct dm_io *io = tio->io;
1278 struct mapped_device *md = io->md;
1279 struct dm_target *ti = tio->ti;
1280 blk_qc_t ret = BLK_QC_T_NONE;
1281
1282 clone->bi_end_io = clone_endio;
1283
	/*
	 * Map the clone.  If r == 0 we don't need to do anything,
	 * the target has assumed ownership of this io.
	 */
	atomic_inc(&io->io_count);
1290 sector = clone->bi_iter.bi_sector;
1291
1292 r = ti->type->map(ti, clone);
1293 switch (r) {
1294 case DM_MAPIO_SUBMITTED:
1295 break;
1296 case DM_MAPIO_REMAPPED:
1297
1298 trace_block_bio_remap(clone->bi_disk->queue, clone,
1299 bio_dev(io->orig_bio), sector);
1300 if (md->type == DM_TYPE_NVME_BIO_BASED)
1301 ret = direct_make_request(clone);
1302 else
1303 ret = generic_make_request(clone);
1304 break;
1305 case DM_MAPIO_KILL:
1306 free_tio(tio);
1307 dec_pending(io, BLK_STS_IOERR);
1308 break;
1309 case DM_MAPIO_REQUEUE:
1310 free_tio(tio);
1311 dec_pending(io, BLK_STS_DM_REQUEUE);
1312 break;
1313 default:
1314 DMWARN("unimplemented target map return value: %d", r);
1315 BUG();
1316 }
1317
1318 return ret;
1319}
1320
1321static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1322{
1323 bio->bi_iter.bi_sector = sector;
1324 bio->bi_iter.bi_size = to_bytes(len);
1325}
1326
/*
 * Creates a bio that consists of range of complete bvecs.
 */
static int clone_bio(struct dm_target_io *tio, struct bio *bio,
		     sector_t sector, unsigned len)
1332{
1333 struct bio *clone = &tio->clone;
1334
1335 __bio_clone_fast(clone, bio);
1336
1337 if (bio_integrity(bio)) {
1338 int r;
1339
1340 if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1341 !dm_target_passes_integrity(tio->ti->type))) {
1342 DMWARN("%s: the target %s doesn't support integrity data.",
1343 dm_device_name(tio->io->md),
1344 tio->ti->type->name);
1345 return -EIO;
1346 }
1347
1348 r = bio_integrity_clone(clone, bio, GFP_NOIO);
1349 if (r < 0)
1350 return r;
1351 }
1352
1353 bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1354 clone->bi_iter.bi_size = to_bytes(len);
1355
1356 if (bio_integrity(bio))
1357 bio_integrity_trim(clone);
1358
1359 return 0;
1360}
1361
1362static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1363 struct dm_target *ti, unsigned num_bios)
1364{
1365 struct dm_target_io *tio;
1366 int try;
1367
1368 if (!num_bios)
1369 return;
1370
1371 if (num_bios == 1) {
1372 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1373 bio_list_add(blist, &tio->clone);
1374 return;
1375 }
1376
1377 for (try = 0; try < 2; try++) {
1378 int bio_nr;
1379 struct bio *bio;
1380
1381 if (try)
1382 mutex_lock(&ci->io->md->table_devices_lock);
1383 for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1384 tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1385 if (!tio)
1386 break;
1387
1388 bio_list_add(blist, &tio->clone);
1389 }
1390 if (try)
1391 mutex_unlock(&ci->io->md->table_devices_lock);
1392 if (bio_nr == num_bios)
1393 return;
1394
1395 while ((bio = bio_list_pop(blist))) {
1396 tio = container_of(bio, struct dm_target_io, clone);
1397 free_tio(tio);
1398 }
1399 }
1400}
1401
1402static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1403 struct dm_target_io *tio, unsigned *len)
1404{
1405 struct bio *clone = &tio->clone;
1406
1407 tio->len_ptr = len;
1408
1409 __bio_clone_fast(clone, ci->bio);
1410 if (len)
1411 bio_setup_sector(clone, ci->sector, *len);
1412
1413 return __map_bio(tio);
1414}
1415
1416static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1417 unsigned num_bios, unsigned *len)
1418{
1419 struct bio_list blist = BIO_EMPTY_LIST;
1420 struct bio *bio;
1421 struct dm_target_io *tio;
1422
1423 alloc_multiple_bios(&blist, ci, ti, num_bios);
1424
1425 while ((bio = bio_list_pop(&blist))) {
1426 tio = container_of(bio, struct dm_target_io, clone);
1427 (void) __clone_and_map_simple_bio(ci, tio, len);
1428 }
1429}
1430
1431static int __send_empty_flush(struct clone_info *ci)
1432{
1433 unsigned target_nr = 0;
1434 struct dm_target *ti;
1435
	/*
	 * Empty flush uses a statically initialized bio, as the base for
	 * cloning.  However, blkg association requires that a bdev is
	 * associated with a gendisk, which doesn't happen until the bdev is
	 * opened.  So, blkg association is done at issue time of the flush
	 * rather than when the device is created in alloc_dev().
	 */
	bio_set_dev(ci->bio, ci->io->md->bdev);
1444
1445 BUG_ON(bio_has_data(ci->bio));
1446 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1447 __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1448
1449 bio_disassociate_blkg(ci->bio);
1450
1451 return 0;
1452}
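
/*
 * Note (added for clarity): an empty flush is duplicated ti->num_flush_bios
 * times for every target in the live table.  For a flush that also carries
 * data, the data portion is re-queued from dec_pending() with REQ_PREFLUSH
 * cleared once all of these preflush clones have completed.
 */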
1453
1454static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1455 sector_t sector, unsigned *len)
1456{
1457 struct bio *bio = ci->bio;
1458 struct dm_target_io *tio;
1459 int r;
1460
1461 tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1462 tio->len_ptr = len;
1463 r = clone_bio(tio, bio, sector, *len);
1464 if (r < 0) {
1465 free_tio(tio);
1466 return r;
1467 }
1468 (void) __map_bio(tio);
1469
1470 return 0;
1471}
1472
1473typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1474
1475static unsigned get_num_discard_bios(struct dm_target *ti)
1476{
1477 return ti->num_discard_bios;
1478}
1479
1480static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1481{
1482 return ti->num_secure_erase_bios;
1483}
1484
1485static unsigned get_num_write_same_bios(struct dm_target *ti)
1486{
1487 return ti->num_write_same_bios;
1488}
1489
1490static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1491{
1492 return ti->num_write_zeroes_bios;
1493}
1494
1495static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1496 unsigned num_bios)
1497{
1498 unsigned len;
1499
	/*
	 * Even though the device advertised support for this type of
	 * request, that does not mean every target supports it, and
	 * reconfiguration might also have changed that since the
	 * check was performed.
	 */
	if (!num_bios)
		return -EOPNOTSUPP;
1508
1509 len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1510
1511 __send_duplicate_bios(ci, ti, num_bios, &len);
1512
1513 ci->sector += len;
1514 ci->sector_count -= len;
1515
1516 return 0;
1517}
1518
1519static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1520{
1521 return __send_changing_extent_only(ci, ti, get_num_discard_bios(ti));
1522}
1523
1524static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1525{
1526 return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios(ti));
1527}
1528
1529static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1530{
1531 return __send_changing_extent_only(ci, ti, get_num_write_same_bios(ti));
1532}
1533
1534static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1535{
1536 return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios(ti));
1537}
1538
1539static bool is_abnormal_io(struct bio *bio)
1540{
1541 bool r = false;
1542
1543 switch (bio_op(bio)) {
1544 case REQ_OP_DISCARD:
1545 case REQ_OP_SECURE_ERASE:
1546 case REQ_OP_WRITE_SAME:
1547 case REQ_OP_WRITE_ZEROES:
1548 r = true;
1549 break;
1550 }
1551
1552 return r;
1553}
1554
1555static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1556 int *result)
1557{
1558 struct bio *bio = ci->bio;
1559
1560 if (bio_op(bio) == REQ_OP_DISCARD)
1561 *result = __send_discard(ci, ti);
1562 else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1563 *result = __send_secure_erase(ci, ti);
1564 else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1565 *result = __send_write_same(ci, ti);
1566 else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1567 *result = __send_write_zeroes(ci, ti);
1568 else
1569 return false;
1570
1571 return true;
1572}
1573

/*
 * Select the correct strategy for processing a non-flush bio.
 */
static int __split_and_process_non_flush(struct clone_info *ci)
1578{
1579 struct dm_target *ti;
1580 unsigned len;
1581 int r;
1582
1583 ti = dm_table_find_target(ci->map, ci->sector);
1584 if (!ti)
1585 return -EIO;
1586
1587 if (__process_abnormal_io(ci, ti, &r))
1588 return r;
1589
1590 len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1591
1592 r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1593 if (r < 0)
1594 return r;
1595
1596 ci->sector += len;
1597 ci->sector_count -= len;
1598
1599 return 0;
1600}
1601
1602static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1603 struct dm_table *map, struct bio *bio)
1604{
1605 ci->map = map;
1606 ci->io = alloc_io(md, bio);
1607 ci->sector = bio->bi_iter.bi_sector;
1608}
1609
1610#define __dm_part_stat_sub(part, field, subnd) \
1611 (part_stat_get(part, field) -= (subnd))
1612

/*
 * Entry point to split a bio into clones and submit them to the targets.
 */
static blk_qc_t __split_and_process_bio(struct mapped_device *md,
					struct dm_table *map, struct bio *bio)
1618{
1619 struct clone_info ci;
1620 blk_qc_t ret = BLK_QC_T_NONE;
1621 int error = 0;
1622
1623 init_clone_info(&ci, md, map, bio);
1624
1625 if (bio->bi_opf & REQ_PREFLUSH) {
1626 struct bio flush_bio;
1627
		/*
		 * Use an on-stack bio for this, it's safe since we don't
		 * need to reference it after submit. It's just used as
		 * the basis for the clone(s).
		 */
		bio_init(&flush_bio, NULL, 0);
1634 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1635 ci.bio = &flush_bio;
1636 ci.sector_count = 0;
1637 error = __send_empty_flush(&ci);
1638
1639 } else if (op_is_zone_mgmt(bio_op(bio))) {
1640 ci.bio = bio;
1641 ci.sector_count = 0;
1642 error = __split_and_process_non_flush(&ci);
1643 } else {
1644 ci.bio = bio;
1645 ci.sector_count = bio_sectors(bio);
1646 while (ci.sector_count && !error) {
1647 error = __split_and_process_non_flush(&ci);
			if (current->bio_list && ci.sector_count && !error) {
				/*
				 * Remainder must be passed to generic_make_request()
				 * so that it gets handled *after* bios already submitted
				 * have been completely processed.
				 * We take a clone of the original to store in
				 * ci.io->orig_bio to be used by end_io_acct() and
				 * for dec_pending to use for completion handling.
				 */
				struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
							  GFP_NOIO, &md->queue->bio_split);
				ci.io->orig_bio = b;
1660
1661
1662
1663
1664
1665
1666
1667
1668 part_stat_lock();
1669 __dm_part_stat_sub(&dm_disk(md)->part0,
1670 sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1671 part_stat_unlock();
1672
1673 bio_chain(b, bio);
1674 trace_block_split(md->queue, b, bio->bi_iter.bi_sector);
1675 ret = generic_make_request(bio);
1676 break;
1677 }
1678 }
1679 }
1680
1681
1682 dec_pending(ci.io, errno_to_blk_status(error));
1683 return ret;
1684}
1685

/*
 * Optimized variant of __split_and_process_bio that leverages the
 * fact that targets that use it do _not_ have a need to split bios.
 */
static blk_qc_t __process_bio(struct mapped_device *md, struct dm_table *map,
			      struct bio *bio, struct dm_target *ti)
1692{
1693 struct clone_info ci;
1694 blk_qc_t ret = BLK_QC_T_NONE;
1695 int error = 0;
1696
1697 init_clone_info(&ci, md, map, bio);
1698
1699 if (bio->bi_opf & REQ_PREFLUSH) {
1700 struct bio flush_bio;
1701
		/*
		 * Use an on-stack bio for this, it's safe since we don't
		 * need to reference it after submit. It's just used as
		 * the basis for the clone(s).
		 */
		bio_init(&flush_bio, NULL, 0);
1708 flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1709 ci.bio = &flush_bio;
1710 ci.sector_count = 0;
1711 error = __send_empty_flush(&ci);
1712
1713 } else {
1714 struct dm_target_io *tio;
1715
1716 ci.bio = bio;
1717 ci.sector_count = bio_sectors(bio);
1718 if (__process_abnormal_io(&ci, ti, &error))
1719 goto out;
1720
1721 tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1722 ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1723 }
1724out:
1725
1726 dec_pending(ci.io, errno_to_blk_status(error));
1727 return ret;
1728}
1729
1730static void dm_queue_split(struct mapped_device *md, struct dm_target *ti, struct bio **bio)
1731{
1732 unsigned len, sector_count;
1733
1734 sector_count = bio_sectors(*bio);
1735 len = min_t(sector_t, max_io_len((*bio)->bi_iter.bi_sector, ti), sector_count);
1736
1737 if (sector_count > len) {
1738 struct bio *split = bio_split(*bio, len, GFP_NOIO, &md->queue->bio_split);
1739
1740 bio_chain(split, *bio);
1741 trace_block_split(md->queue, split, (*bio)->bi_iter.bi_sector);
1742 generic_make_request(*bio);
1743 *bio = split;
1744 }
1745}
1746
1747static blk_qc_t dm_process_bio(struct mapped_device *md,
1748 struct dm_table *map, struct bio *bio)
1749{
1750 blk_qc_t ret = BLK_QC_T_NONE;
1751 struct dm_target *ti = md->immutable_target;
1752
1753 if (unlikely(!map)) {
1754 bio_io_error(bio);
1755 return ret;
1756 }
1757
1758 if (!ti) {
1759 ti = dm_table_find_target(map, bio->bi_iter.bi_sector);
1760 if (unlikely(!ti)) {
1761 bio_io_error(bio);
1762 return ret;
1763 }
1764 }
1765
	/*
	 * If in ->make_request_fn we need to use blk_queue_split(), otherwise
	 * queue_limits for abnormal requests (e.g. discard, writesame, etc)
	 * won't be imposed.
	 */
	if (current->bio_list) {
1772 if (is_abnormal_io(bio))
1773 blk_queue_split(md->queue, &bio);
1774 else
1775 dm_queue_split(md, ti, &bio);
1776 }
1777
1778 if (dm_get_md_type(md) == DM_TYPE_NVME_BIO_BASED)
1779 return __process_bio(md, map, bio, ti);
1780 else
1781 return __split_and_process_bio(md, map, bio);
1782}
1783
1784static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1785{
1786 struct mapped_device *md = q->queuedata;
1787 blk_qc_t ret = BLK_QC_T_NONE;
1788 int srcu_idx;
1789 struct dm_table *map;
1790
1791 map = dm_get_live_table(md, &srcu_idx);
1792
1793
1794 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1795 dm_put_live_table(md, srcu_idx);
1796
1797 if (!(bio->bi_opf & REQ_RAHEAD))
1798 queue_io(md, bio);
1799 else
1800 bio_io_error(bio);
1801 return ret;
1802 }
1803
1804 ret = dm_process_bio(md, map, bio);
1805
1806 dm_put_live_table(md, srcu_idx);
1807 return ret;
1808}
1809
1810static int dm_any_congested(void *congested_data, int bdi_bits)
1811{
1812 int r = bdi_bits;
1813 struct mapped_device *md = congested_data;
1814 struct dm_table *map;
1815
1816 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1817 if (dm_request_based(md)) {
1818
1819
1820
1821
1822 struct backing_dev_info *bdi = md->queue->backing_dev_info;
1823 r = bdi->wb.congested->state & bdi_bits;
1824 } else {
1825 map = dm_get_live_table_fast(md);
1826 if (map)
1827 r = dm_table_any_congested(map, bdi_bits);
1828 dm_put_live_table_fast(md);
1829 }
1830 }
1831
1832 return r;
1833}
1834

/*
 * An IDR is used to keep track of allocated minor numbers.
 */
static void free_minor(int minor)
1839{
1840 spin_lock(&_minor_lock);
1841 idr_remove(&_minor_idr, minor);
1842 spin_unlock(&_minor_lock);
1843}
1844

/*
 * See if the device with a specific minor # is free.
 */
static int specific_minor(int minor)
1849{
1850 int r;
1851
1852 if (minor >= (1 << MINORBITS))
1853 return -EINVAL;
1854
1855 idr_preload(GFP_KERNEL);
1856 spin_lock(&_minor_lock);
1857
1858 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1859
1860 spin_unlock(&_minor_lock);
1861 idr_preload_end();
1862 if (r < 0)
1863 return r == -ENOSPC ? -EBUSY : r;
1864 return 0;
1865}
1866
1867static int next_free_minor(int *minor)
1868{
1869 int r;
1870
1871 idr_preload(GFP_KERNEL);
1872 spin_lock(&_minor_lock);
1873
1874 r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1875
1876 spin_unlock(&_minor_lock);
1877 idr_preload_end();
1878 if (r < 0)
1879 return r;
1880 *minor = r;
1881 return 0;
1882}
1883
1884static const struct block_device_operations dm_blk_dops;
1885static const struct dax_operations dm_dax_ops;
1886
1887static void dm_wq_work(struct work_struct *work);
1888
1889static void cleanup_mapped_device(struct mapped_device *md)
1890{
1891 if (md->wq)
1892 destroy_workqueue(md->wq);
1893 bioset_exit(&md->bs);
1894 bioset_exit(&md->io_bs);
1895
1896 if (md->dax_dev) {
1897 kill_dax(md->dax_dev);
1898 put_dax(md->dax_dev);
1899 md->dax_dev = NULL;
1900 }
1901
1902 if (md->disk) {
1903 spin_lock(&_minor_lock);
1904 md->disk->private_data = NULL;
1905 spin_unlock(&_minor_lock);
1906 del_gendisk(md->disk);
1907 put_disk(md->disk);
1908 }
1909
1910 if (md->queue)
1911 blk_cleanup_queue(md->queue);
1912
1913 cleanup_srcu_struct(&md->io_barrier);
1914
1915 if (md->bdev) {
1916 bdput(md->bdev);
1917 md->bdev = NULL;
1918 }
1919
1920 mutex_destroy(&md->suspend_lock);
1921 mutex_destroy(&md->type_lock);
1922 mutex_destroy(&md->table_devices_lock);
1923
1924 dm_mq_cleanup_mapped_device(md);
1925}
1926

/*
 * Allocate and initialise a blank device with a given minor.
 */
static struct mapped_device *alloc_dev(int minor)
1931{
1932 int r, numa_node_id = dm_get_numa_node();
1933 struct mapped_device *md;
1934 void *old_md;
1935
1936 md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1937 if (!md) {
1938 DMWARN("unable to allocate device, out of memory.");
1939 return NULL;
1940 }
1941
1942 if (!try_module_get(THIS_MODULE))
1943 goto bad_module_get;
1944
1945
1946 if (minor == DM_ANY_MINOR)
1947 r = next_free_minor(&minor);
1948 else
1949 r = specific_minor(minor);
1950 if (r < 0)
1951 goto bad_minor;
1952
1953 r = init_srcu_struct(&md->io_barrier);
1954 if (r < 0)
1955 goto bad_io_barrier;
1956
1957 md->numa_node_id = numa_node_id;
1958 md->init_tio_pdu = false;
1959 md->type = DM_TYPE_NONE;
1960 mutex_init(&md->suspend_lock);
1961 mutex_init(&md->type_lock);
1962 mutex_init(&md->table_devices_lock);
1963 spin_lock_init(&md->deferred_lock);
1964 atomic_set(&md->holders, 1);
1965 atomic_set(&md->open_count, 0);
1966 atomic_set(&md->event_nr, 0);
1967 atomic_set(&md->uevent_seq, 0);
1968 INIT_LIST_HEAD(&md->uevent_list);
1969 INIT_LIST_HEAD(&md->table_devices);
1970 spin_lock_init(&md->uevent_lock);
1971
	/*
	 * default to bio-based until DM table is loaded and md->type
	 * established. If request-based table is loaded: blk-mq will
	 * override accordingly.
	 */
	md->queue = blk_alloc_queue(dm_make_request, numa_node_id);
1978 if (!md->queue)
1979 goto bad;
1980 md->queue->queuedata = md;
1981
1982 md->disk = alloc_disk_node(1, md->numa_node_id);
1983 if (!md->disk)
1984 goto bad;
1985
1986 init_waitqueue_head(&md->wait);
1987 INIT_WORK(&md->work, dm_wq_work);
1988 init_waitqueue_head(&md->eventq);
1989 init_completion(&md->kobj_holder.completion);
1990
1991 md->disk->major = _major;
1992 md->disk->first_minor = minor;
1993 md->disk->fops = &dm_blk_dops;
1994 md->disk->queue = md->queue;
1995 md->disk->private_data = md;
1996 sprintf(md->disk->disk_name, "dm-%d", minor);
1997
1998 if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1999 md->dax_dev = alloc_dax(md, md->disk->disk_name,
2000 &dm_dax_ops, 0);
2001 if (IS_ERR(md->dax_dev))
2002 goto bad;
2003 }
2004
2005 add_disk_no_queue_reg(md->disk);
2006 format_dev_t(md->name, MKDEV(_major, minor));
2007
2008 md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2009 if (!md->wq)
2010 goto bad;
2011
2012 md->bdev = bdget_disk(md->disk, 0);
2013 if (!md->bdev)
2014 goto bad;
2015
2016 dm_stats_init(&md->stats);
2017
	/* Populate the mapping, nobody knows we exist yet */
	spin_lock(&_minor_lock);
2020 old_md = idr_replace(&_minor_idr, md, minor);
2021 spin_unlock(&_minor_lock);
2022
2023 BUG_ON(old_md != MINOR_ALLOCED);
2024
2025 return md;
2026
2027bad:
2028 cleanup_mapped_device(md);
2029bad_io_barrier:
2030 free_minor(minor);
2031bad_minor:
2032 module_put(THIS_MODULE);
2033bad_module_get:
2034 kvfree(md);
2035 return NULL;
2036}
2037
2038static void unlock_fs(struct mapped_device *md);
2039
2040static void free_dev(struct mapped_device *md)
2041{
2042 int minor = MINOR(disk_devt(md->disk));
2043
2044 unlock_fs(md);
2045
2046 cleanup_mapped_device(md);
2047
2048 free_table_devices(&md->table_devices);
2049 dm_stats_cleanup(&md->stats);
2050 free_minor(minor);
2051
2052 module_put(THIS_MODULE);
2053 kvfree(md);
2054}
2055
2056static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
2057{
2058 struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2059 int ret = 0;
2060
2061 if (dm_table_bio_based(t)) {
2062
2063
2064
2065
2066
2067 bioset_exit(&md->bs);
2068 bioset_exit(&md->io_bs);
2069
2070 } else if (bioset_initialized(&md->bs)) {
2071
2072
2073
2074
2075
2076
2077
2078
2079 goto out;
2080 }
2081
2082 BUG_ON(!p ||
2083 bioset_initialized(&md->bs) ||
2084 bioset_initialized(&md->io_bs));
2085
2086 ret = bioset_init_from_src(&md->bs, &p->bs);
2087 if (ret)
2088 goto out;
2089 ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2090 if (ret)
2091 bioset_exit(&md->bs);
2092out:
2093
2094 dm_table_free_md_mempools(t);
2095 return ret;
2096}
2097
2098
2099
2100
2101static void event_callback(void *context)
2102{
2103 unsigned long flags;
2104 LIST_HEAD(uevents);
2105 struct mapped_device *md = (struct mapped_device *) context;
2106
2107 spin_lock_irqsave(&md->uevent_lock, flags);
2108 list_splice_init(&md->uevent_list, &uevents);
2109 spin_unlock_irqrestore(&md->uevent_lock, flags);
2110
2111 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2112
2113 atomic_inc(&md->event_nr);
2114 wake_up(&md->eventq);
2115 dm_issue_global_event();
2116}
2117
2118
2119
2120
2121static void __set_size(struct mapped_device *md, sector_t size)
2122{
2123 lockdep_assert_held(&md->suspend_lock);
2124
2125 set_capacity(md->disk, size);
2126
2127 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2128}
2129

/*
 * Returns old map, which caller must destroy.
 */
static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
			       struct queue_limits *limits)
2135{
2136 struct dm_table *old_map;
2137 struct request_queue *q = md->queue;
2138 bool request_based = dm_table_request_based(t);
2139 sector_t size;
2140 int ret;
2141
2142 lockdep_assert_held(&md->suspend_lock);
2143
2144 size = dm_table_get_size(t);
2145
2146
2147
2148
2149 if (size != dm_get_size(md))
2150 memset(&md->geometry, 0, sizeof(md->geometry));
2151
2152 __set_size(md, size);
2153
2154 dm_table_event_callback(t, event_callback, md);
2155
2156
2157
2158
2159
2160
2161
2162
2163 if (request_based)
2164 dm_stop_queue(q);
2165
2166 if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2167
2168
2169
2170
2171
2172
2173 md->immutable_target = dm_table_get_immutable_target(t);
2174 }
2175
2176 ret = __bind_mempools(md, t);
2177 if (ret) {
2178 old_map = ERR_PTR(ret);
2179 goto out;
2180 }
2181
2182 old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2183 rcu_assign_pointer(md->map, (void *)t);
2184 md->immutable_target_type = dm_table_get_immutable_target_type(t);
2185
2186 dm_table_set_restrictions(t, q, limits);
2187 if (old_map)
2188 dm_sync_table(md);
2189
2190out:
2191 return old_map;
2192}
2193

/*
 * Returns unbound table for the caller to free.
 */
static struct dm_table *__unbind(struct mapped_device *md)
2198{
2199 struct dm_table *map = rcu_dereference_protected(md->map, 1);
2200
2201 if (!map)
2202 return NULL;
2203
2204 dm_table_event_callback(map, NULL, NULL);
2205 RCU_INIT_POINTER(md->map, NULL);
2206 dm_sync_table(md);
2207
2208 return map;
2209}
2210

/*
 * Constructor for a new device.
 */
int dm_create(int minor, struct mapped_device **result)
2215{
2216 int r;
2217 struct mapped_device *md;
2218
2219 md = alloc_dev(minor);
2220 if (!md)
2221 return -ENXIO;
2222
2223 r = dm_sysfs_init(md);
2224 if (r) {
2225 free_dev(md);
2226 return r;
2227 }
2228
2229 *result = md;
2230 return 0;
2231}
2232

/*
 * Functions to manage md->type.
 * All are required to hold md->type_lock.
 */
void dm_lock_md_type(struct mapped_device *md)
2238{
2239 mutex_lock(&md->type_lock);
2240}
2241
2242void dm_unlock_md_type(struct mapped_device *md)
2243{
2244 mutex_unlock(&md->type_lock);
2245}
2246
2247void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2248{
2249 BUG_ON(!mutex_is_locked(&md->type_lock));
2250 md->type = type;
2251}
2252
2253enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2254{
2255 return md->type;
2256}
2257
2258struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2259{
2260 return md->immutable_target_type;
2261}
2262

/*
 * The queue_limits are only valid as long as you have a reference
 * count on 'md'.
 */
struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2268{
2269 BUG_ON(!atomic_read(&md->holders));
2270 return &md->queue->limits;
2271}
2272EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2273
2274static void dm_init_congested_fn(struct mapped_device *md)
2275{
2276 md->queue->backing_dev_info->congested_data = md;
2277 md->queue->backing_dev_info->congested_fn = dm_any_congested;
2278}
2279

/*
 * Setup the DM device's queue based on md's type
 */
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2284{
2285 int r;
2286 struct queue_limits limits;
2287 enum dm_queue_mode type = dm_get_md_type(md);
2288
2289 switch (type) {
2290 case DM_TYPE_REQUEST_BASED:
2291 r = dm_mq_init_request_queue(md, t);
2292 if (r) {
2293 DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2294 return r;
2295 }
2296 dm_init_congested_fn(md);
2297 break;
2298 case DM_TYPE_BIO_BASED:
2299 case DM_TYPE_DAX_BIO_BASED:
2300 case DM_TYPE_NVME_BIO_BASED:
2301 dm_init_congested_fn(md);
2302 break;
2303 case DM_TYPE_NONE:
2304 WARN_ON_ONCE(true);
2305 break;
2306 }
2307
2308 r = dm_calculate_queue_limits(t, &limits);
2309 if (r) {
2310 DMERR("Cannot calculate initial queue limits");
2311 return r;
2312 }
2313 dm_table_set_restrictions(t, md->queue, &limits);
2314 blk_register_queue(md->disk);
2315
2316 return 0;
2317}
2318
2319struct mapped_device *dm_get_md(dev_t dev)
2320{
2321 struct mapped_device *md;
2322 unsigned minor = MINOR(dev);
2323
2324 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2325 return NULL;
2326
2327 spin_lock(&_minor_lock);
2328
2329 md = idr_find(&_minor_idr, minor);
2330 if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2331 test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2332 md = NULL;
2333 goto out;
2334 }
2335 dm_get(md);
2336out:
2337 spin_unlock(&_minor_lock);
2338
2339 return md;
2340}
2341EXPORT_SYMBOL_GPL(dm_get_md);
2342
2343void *dm_get_mdptr(struct mapped_device *md)
2344{
2345 return md->interface_ptr;
2346}
2347
2348void dm_set_mdptr(struct mapped_device *md, void *ptr)
2349{
2350 md->interface_ptr = ptr;
2351}
2352
2353void dm_get(struct mapped_device *md)
2354{
2355 atomic_inc(&md->holders);
2356 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2357}
2358
2359int dm_hold(struct mapped_device *md)
2360{
2361 spin_lock(&_minor_lock);
2362 if (test_bit(DMF_FREEING, &md->flags)) {
2363 spin_unlock(&_minor_lock);
2364 return -EBUSY;
2365 }
2366 dm_get(md);
2367 spin_unlock(&_minor_lock);
2368 return 0;
2369}
2370EXPORT_SYMBOL_GPL(dm_hold);
2371
2372const char *dm_device_name(struct mapped_device *md)
2373{
2374 return md->name;
2375}
2376EXPORT_SYMBOL_GPL(dm_device_name);
2377
2378static void __dm_destroy(struct mapped_device *md, bool wait)
2379{
2380 struct dm_table *map;
2381 int srcu_idx;
2382
2383 might_sleep();
2384
2385 spin_lock(&_minor_lock);
2386 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2387 set_bit(DMF_FREEING, &md->flags);
2388 spin_unlock(&_minor_lock);
2389
2390 blk_set_queue_dying(md->queue);
2391
2392
2393
2394
2395
2396 mutex_lock(&md->suspend_lock);
2397 map = dm_get_live_table(md, &srcu_idx);
2398 if (!dm_suspended_md(md)) {
2399 dm_table_presuspend_targets(map);
2400 set_bit(DMF_SUSPENDED, &md->flags);
2401 dm_table_postsuspend_targets(map);
2402 }
2403
2404 dm_put_live_table(md, srcu_idx);
2405 mutex_unlock(&md->suspend_lock);
2406
2407
2408
2409
2410
2411
2412
2413 if (wait)
2414 while (atomic_read(&md->holders))
2415 msleep(1);
2416 else if (atomic_read(&md->holders))
2417 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2418 dm_device_name(md), atomic_read(&md->holders));
2419
2420 dm_sysfs_exit(md);
2421 dm_table_destroy(__unbind(md));
2422 free_dev(md);
2423}
2424
2425void dm_destroy(struct mapped_device *md)
2426{
2427 __dm_destroy(md, true);
2428}
2429
2430void dm_destroy_immediate(struct mapped_device *md)
2431{
2432 __dm_destroy(md, false);
2433}
2434
2435void dm_put(struct mapped_device *md)
2436{
2437 atomic_dec(&md->holders);
2438}
2439EXPORT_SYMBOL_GPL(dm_put);
2440
2441static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2442{
2443 int r = 0;
2444 DEFINE_WAIT(wait);
2445
2446 while (1) {
2447 prepare_to_wait(&md->wait, &wait, task_state);
2448
2449 if (!md_in_flight(md))
2450 break;
2451
2452 if (signal_pending_state(task_state, current)) {
2453 r = -EINTR;
2454 break;
2455 }
2456
2457 io_schedule();
2458 }
2459 finish_wait(&md->wait, &wait);
2460
2461 return r;
2462}
2463

/*
 * Process the deferred bios
 */
static void dm_wq_work(struct work_struct *work)
2468{
2469 struct mapped_device *md = container_of(work, struct mapped_device,
2470 work);
2471 struct bio *c;
2472 int srcu_idx;
2473 struct dm_table *map;
2474
2475 map = dm_get_live_table(md, &srcu_idx);
2476
2477 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2478 spin_lock_irq(&md->deferred_lock);
2479 c = bio_list_pop(&md->deferred);
2480 spin_unlock_irq(&md->deferred_lock);
2481
2482 if (!c)
2483 break;
2484
2485 if (dm_request_based(md))
2486 (void) generic_make_request(c);
2487 else
2488 (void) dm_process_bio(md, map, c);
2489 }
2490
2491 dm_put_live_table(md, srcu_idx);
2492}
2493
2494static void dm_queue_flush(struct mapped_device *md)
2495{
2496 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2497 smp_mb__after_atomic();
2498 queue_work(md->wq, &md->work);
2499}
2500

/*
 * Swap in a new table, returning the old one for the caller to destroy.
 */
struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2505{
2506 struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2507 struct queue_limits limits;
2508 int r;
2509
2510 mutex_lock(&md->suspend_lock);
2511
2512
2513 if (!dm_suspended_md(md))
2514 goto out;
2515
2516
2517
2518
2519
2520
2521
2522 if (dm_table_has_no_data_devices(table)) {
2523 live_map = dm_get_live_table_fast(md);
2524 if (live_map)
2525 limits = md->queue->limits;
2526 dm_put_live_table_fast(md);
2527 }
2528
2529 if (!live_map) {
2530 r = dm_calculate_queue_limits(table, &limits);
2531 if (r) {
2532 map = ERR_PTR(r);
2533 goto out;
2534 }
2535 }
2536
2537 map = __bind(md, table, &limits);
2538 dm_issue_global_event();
2539
2540out:
2541 mutex_unlock(&md->suspend_lock);
2542 return map;
2543}
2544
2545
2546
2547
2548
static int lock_fs(struct mapped_device *md)
{
	int r;

	WARN_ON(md->frozen_sb);

	md->frozen_sb = freeze_bdev(md->bdev);
	if (IS_ERR(md->frozen_sb)) {
		r = PTR_ERR(md->frozen_sb);
		md->frozen_sb = NULL;
		return r;
	}

	set_bit(DMF_FROZEN, &md->flags);

	return 0;
}

static void unlock_fs(struct mapped_device *md)
{
	if (!test_bit(DMF_FROZEN, &md->flags))
		return;

	thaw_bdev(md->bdev, md->frozen_sb);
	md->frozen_sb = NULL;
	clear_bit(DMF_FROZEN, &md->flags);
}

/*
 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
 *
 * On success the device is quiescent: no I/O is in flight and any new
 * bios are added to md->deferred until the device is resumed.
 */
static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
			unsigned suspend_flags, long task_state,
			int dmf_suspended_flag)
{
	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
	int r;

	lockdep_assert_held(&md->suspend_lock);

	/*
	 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
	 * The flag is cleared again before this function returns.
	 */
	if (noflush)
		set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	else
		pr_debug("%s: suspending with flush\n", dm_device_name(md));

	/*
	 * This is reverted via dm_table_presuspend_undo_targets() if an
	 * error occurs later on.
	 */
	dm_table_presuspend_targets(map);

	/*
	 * Flush I/O to the device.
	 * Any I/O submitted after lock_fs() may not be flushed.
	 * noflush takes precedence over do_lockfs.
	 * (lock_fs() flushes I/Os and waits for them to complete.)
	 */
	if (!noflush && do_lockfs) {
		r = lock_fs(md);
		if (r) {
			dm_table_presuspend_undo_targets(map);
			return r;
		}
	}

	/*
	 * Make sure no new I/O can reach the targets and that the deferred
	 * worker (dm_wq_work) is quiesced: DMF_BLOCK_IO_FOR_SUSPEND causes
	 * new bios to be deferred, the SRCU synchronize waits out any
	 * in-progress submitters, and flush_workqueue() drains md->wq.
	 */
	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/*
	 * Stop md->queue before flushing md->wq in case request-based
	 * dm defers requests to md->wq from md->queue.
	 */
	if (dm_request_based(md))
		dm_stop_queue(md->queue);

	flush_workqueue(md->wq);

	/*
	 * At this point no more requests are entering target request
	 * routines.  Wait for all outstanding I/O to complete.
	 */
	r = dm_wait_for_completion(md, task_state);
	if (!r)
		set_bit(dmf_suspended_flag, &md->flags);

	if (noflush)
		clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
	if (map)
		synchronize_srcu(&md->io_barrier);

	/* were we interrupted? undo the partial suspend */
	if (r < 0) {
		dm_queue_flush(md);

		if (dm_request_based(md))
			dm_start_queue(md->queue);

		unlock_fs(md);
		dm_table_presuspend_undo_targets(map);
	}

	return r;
}

/*
 * We need to be able to change a mapping table under a mounted
 * filesystem.  For example we might want to move some data in
 * the background.  Before the table can be swapped with
 * dm_swap_table(), dm_suspend() must be called to flush any in-flight
 * bios and ensure that any further I/O gets deferred.
 *
 * Suspend mechanism for request-based dm:
 *
 * 1. Flush all I/Os by lock_fs() if needed.
 * 2. Stop dispatching any I/O by stopping the request_queue.
 * 3. Wait for all in-flight I/Os to be completed or requeued.
 *
 * To abort suspend, start the request_queue.
 */
int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;
	int r = 0;

retry:
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (dm_suspended_md(md)) {
		r = -EINVAL;
		goto out_unlock;
	}

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
	if (r)
		goto out_unlock;

	dm_table_postsuspend_targets(map);

out_unlock:
	mutex_unlock(&md->suspend_lock);
	return r;
}

static int __dm_resume(struct mapped_device *md, struct dm_table *map)
{
	if (map) {
		int r = dm_table_resume_targets(map);
		if (r)
			return r;
	}

	dm_queue_flush(md);

	/*
	 * Deferred bios are only flushed (dm_queue_flush above) after the
	 * targets have been resumed, so that remapping works correctly.
	 * Request-based dm holds deferred requests in the request_queue
	 * until it is restarted here.
	 */
	if (dm_request_based(md))
		dm_start_queue(md->queue);

	unlock_fs(md);

	return 0;
}

int dm_resume(struct mapped_device *md)
{
	int r;
	struct dm_table *map = NULL;

retry:
	r = -EINVAL;
	mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);

	if (!dm_suspended_md(md))
		goto out;

	if (dm_suspended_internally_md(md)) {
		/* already internally suspended, wait for internal resume */
		mutex_unlock(&md->suspend_lock);
		r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
		if (r)
			return r;
		goto retry;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
	if (!map || !dm_table_get_size(map))
		goto out;

	r = __dm_resume(md, map);
	if (r)
		goto out;

	clear_bit(DMF_SUSPENDED, &md->flags);
out:
	mutex_unlock(&md->suspend_lock);

	return r;
}

/*
 * Internal suspend/resume works like userspace-driven suspend. It waits
 * until all bios finish and prevents issuing new bios to the target drivers.
 * It may be used only from the kernel.
 */
static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
{
	struct dm_table *map = NULL;

	lockdep_assert_held(&md->suspend_lock);

	if (md->internal_suspend_count++)
		return; /* nested internal suspend */

	if (dm_suspended_md(md)) {
		/* already suspended by userspace; just mark it internal too */
		set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
		return;
	}

	map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));

	/*
	 * TASK_UNINTERRUPTIBLE is used and the return value is ignored:
	 * an internal (noflush) suspend must not fail part-way through.
	 */
	(void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
			    DMF_SUSPENDED_INTERNALLY);

	dm_table_postsuspend_targets(map);
}

static void __dm_internal_resume(struct mapped_device *md)
{
	BUG_ON(!md->internal_suspend_count);

	if (--md->internal_suspend_count)
		return; /* resume from nested internal suspend */

	if (dm_suspended_md(md))
		goto done; /* still suspended by userspace */

	/* passing a NULL map skips dm_table_resume_targets() */
	(void) __dm_resume(md, NULL);

done:
	clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
	smp_mb__after_atomic();
	wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
}

void dm_internal_suspend_noflush(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);

void dm_internal_resume(struct mapped_device *md)
{
	mutex_lock(&md->suspend_lock);
	__dm_internal_resume(md);
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume);

/*
 * Fast variants of internal suspend/resume hold md->suspend_lock,
 * which prevents interaction with userspace-driven suspend.
 */
void dm_internal_suspend_fast(struct mapped_device *md)
{
	/* suspend_lock stays held on return; dm_internal_resume_fast() drops it */
	mutex_lock(&md->suspend_lock);
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		return;

	set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
	synchronize_srcu(&md->io_barrier);
	flush_workqueue(md->wq);
	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);

void dm_internal_resume_fast(struct mapped_device *md)
{
	if (dm_suspended_md(md) || dm_suspended_internally_md(md))
		goto done;

	dm_queue_flush(md);

done:
	mutex_unlock(&md->suspend_lock);
}
EXPORT_SYMBOL_GPL(dm_internal_resume_fast);

/*-----------------------------------------------------------------
 * Event notification.
 *---------------------------------------------------------------*/
int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
		      unsigned cookie)
{
	char udev_cookie[DM_COOKIE_LENGTH];
	char *envp[] = { udev_cookie, NULL };

	if (!cookie)
		return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
	else {
		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
			 DM_COOKIE_ENV_VAR_NAME, cookie);
		return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
					  action, envp);
	}
}

uint32_t dm_next_uevent_seq(struct mapped_device *md)
{
	return atomic_add_return(1, &md->uevent_seq);
}

uint32_t dm_get_event_nr(struct mapped_device *md)
{
	return atomic_read(&md->event_nr);
}

int dm_wait_event(struct mapped_device *md, int event_nr)
{
	return wait_event_interruptible(md->eventq,
			(event_nr != atomic_read(&md->event_nr)));
}

void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
{
	unsigned long flags;

	spin_lock_irqsave(&md->uevent_lock, flags);
	list_add(elist, &md->uevent_list);
	spin_unlock_irqrestore(&md->uevent_lock, flags);
}

/*
 * The gendisk is only valid as long as you have a reference
 * count on 'md'.
 */
struct gendisk *dm_disk(struct mapped_device *md)
{
	return md->disk;
}
EXPORT_SYMBOL_GPL(dm_disk);

struct kobject *dm_kobject(struct mapped_device *md)
{
	return &md->kobj_holder.kobj;
}

struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
{
	struct mapped_device *md;

	md = container_of(kobj, struct mapped_device, kobj_holder.kobj);

	spin_lock(&_minor_lock);
	if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
		md = NULL;
		goto out;
	}
	dm_get(md);
out:
	spin_unlock(&_minor_lock);

	return md;
}

int dm_suspended_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED, &md->flags);
}

int dm_suspended_internally_md(struct mapped_device *md)
{
	return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
}

int dm_test_deferred_remove_flag(struct mapped_device *md)
{
	return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
}

int dm_suspended(struct dm_target *ti)
{
	return dm_suspended_md(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_suspended);

int dm_noflush_suspending(struct dm_target *ti)
{
	return __noflush_suspending(dm_table_get_md(ti->table));
}
EXPORT_SYMBOL_GPL(dm_noflush_suspending);

struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
					    unsigned integrity, unsigned per_io_data_size,
					    unsigned min_pool_size)
{
	struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
	unsigned int pool_size = 0;
	unsigned int front_pad, io_front_pad;
	int ret;

	if (!pools)
		return NULL;

	switch (type) {
	case DM_TYPE_BIO_BASED:
	case DM_TYPE_DAX_BIO_BASED:
	case DM_TYPE_NVME_BIO_BASED:
		pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
		front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
		io_front_pad = roundup(front_pad, __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
		ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
		if (ret)
			goto out;
		if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
			goto out;
		break;
	case DM_TYPE_REQUEST_BASED:
		pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
		front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
		/* per_io_data_size is not used here; request-based dm carries it elsewhere */
		break;
	default:
		BUG();
	}

	ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
	if (ret)
		goto out;

	if (integrity && bioset_integrity_create(&pools->bs, pool_size))
		goto out;

	return pools;

out:
	dm_free_md_mempools(pools);

	return NULL;
}

void dm_free_md_mempools(struct dm_md_mempools *pools)
{
	if (!pools)
		return;

	bioset_exit(&pools->bs);
	bioset_exit(&pools->io_bs);

	kfree(pools);
}

struct dm_pr {
	u64	old_key;
	u64	new_key;
	u32	flags;
	bool	fail_early;
};

static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
		      void *data)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	struct dm_table *table;
	struct dm_target *ti;
	int ret = -ENOTTY, srcu_idx;

	table = dm_get_live_table(md, &srcu_idx);
	if (!table || !dm_table_get_size(table))
		goto out;

	/* We only support devices that have a single target */
	if (dm_table_get_num_targets(table) != 1)
		goto out;
	ti = dm_table_get_target(table, 0);

	ret = -EINVAL;
	if (!ti->type->iterate_devices)
		goto out;

	ret = ti->type->iterate_devices(ti, fn, data);
out:
	dm_put_live_table(md, srcu_idx);
	return ret;
}

/*
 * For register / unregister we need to manually call out to every path.
 */
static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
			    sector_t start, sector_t len, void *data)
{
	struct dm_pr *pr = data;
	const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;

	if (!ops || !ops->pr_register)
		return -EOPNOTSUPP;
	return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
}

static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
			  u32 flags)
{
	struct dm_pr pr = {
		.old_key = old_key,
		.new_key = new_key,
		.flags = flags,
		.fail_early = true,
	};
	int ret;

	ret = dm_call_pr(bdev, __dm_pr_register, &pr);
	if (ret && new_key) {
		/* unregister all paths if we failed to register any path */
		pr.old_key = new_key;
		pr.new_key = 0;
		pr.flags = 0;
		pr.fail_early = false;
		dm_call_pr(bdev, __dm_pr_register, &pr);
	}

	return ret;
}
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
			 u32 flags)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_reserve)
		r = ops->pr_reserve(bdev, key, type, flags);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_release)
		r = ops->pr_release(bdev, key, type);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
			 enum pr_type type, bool abort)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_preempt)
		r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static int dm_pr_clear(struct block_device *bdev, u64 key)
{
	struct mapped_device *md = bdev->bd_disk->private_data;
	const struct pr_ops *ops;
	int r, srcu_idx;

	r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
	if (r < 0)
		goto out;

	ops = bdev->bd_disk->fops->pr_ops;
	if (ops && ops->pr_clear)
		r = ops->pr_clear(bdev, key);
	else
		r = -EOPNOTSUPP;
out:
	dm_unprepare_ioctl(md, srcu_idx);
	return r;
}

static const struct pr_ops dm_pr_ops = {
	.pr_register	= dm_pr_register,
	.pr_reserve	= dm_pr_reserve,
	.pr_release	= dm_pr_release,
	.pr_preempt	= dm_pr_preempt,
	.pr_clear	= dm_pr_clear,
};

static const struct block_device_operations dm_blk_dops = {
	.open = dm_blk_open,
	.release = dm_blk_close,
	.ioctl = dm_blk_ioctl,
	.getgeo = dm_blk_getgeo,
	.report_zones = dm_blk_report_zones,
	.pr_ops = &dm_pr_ops,
	.owner = THIS_MODULE
};

static const struct dax_operations dm_dax_ops = {
	.direct_access = dm_dax_direct_access,
	.dax_supported = dm_dax_supported,
	.copy_from_iter = dm_dax_copy_from_iter,
	.copy_to_iter = dm_dax_copy_to_iter,
	.zero_page_range = dm_dax_zero_page_range,
};

/*
 * module hooks
 */
module_init(dm_init);
module_exit(dm_exit);

module_param(major, uint, 0);
MODULE_PARM_DESC(major, "The major number of the device mapper");

module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");

module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");

MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");