linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm.h"
   9#include "dm-uevent.h"
  10
  11#include <linux/init.h>
  12#include <linux/module.h>
  13#include <linux/mutex.h>
  14#include <linux/moduleparam.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/mempool.h>
  18#include <linux/slab.h>
  19#include <linux/idr.h>
  20#include <linux/hdreg.h>
  21#include <linux/delay.h>
  22#include <linux/wait.h>
  23#include <linux/kthread.h>
  24#include <linux/ktime.h>
  25#include <linux/elevator.h> /* for rq_end_sector() */
  26#include <linux/blk-mq.h>
  27#include <linux/pr.h>
  28
  29#include <trace/events/block.h>
  30
  31#define DM_MSG_PREFIX "core"
  32
  33#ifdef CONFIG_PRINTK
  34/*
  35 * ratelimit state to be used in DMXXX_LIMIT().
  36 */
  37DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  38                       DEFAULT_RATELIMIT_INTERVAL,
  39                       DEFAULT_RATELIMIT_BURST);
  40EXPORT_SYMBOL(dm_ratelimit_state);
  41#endif
  42
  43/*
  44 * Cookies are numeric values sent with CHANGE and REMOVE
  45 * uevents while resuming, removing or renaming the device.
  46 */
  47#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  48#define DM_COOKIE_LENGTH 24
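/*
 * Editor's sketch (not from the original source): the cookie is handed to
 * udev as a single environment variable of the form DM_COOKIE=<value>,
 * built roughly along the lines of
 *
 *	char udev_cookie[DM_COOKIE_LENGTH];
 *
 *	snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
 *		 DM_COOKIE_ENV_VAR_NAME, cookie);
 *
 * DM_COOKIE_LENGTH (24) leaves room for "DM_COOKIE=", a 32-bit decimal
 * value and the terminating NUL.
 */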
  49
  50static const char *_name = DM_NAME;
  51
  52static unsigned int major = 0;
  53static unsigned int _major = 0;
  54
  55static DEFINE_IDR(_minor_idr);
  56
  57static DEFINE_SPINLOCK(_minor_lock);
  58
  59static void do_deferred_remove(struct work_struct *w);
  60
  61static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  62
  63static struct workqueue_struct *deferred_remove_workqueue;
  64
  65/*
  66 * For bio-based dm.
  67 * One of these is allocated per bio.
  68 */
  69struct dm_io {
  70        struct mapped_device *md;
  71        int error;
  72        atomic_t io_count;
  73        struct bio *bio;
  74        unsigned long start_time;
  75        spinlock_t endio_lock;
  76        struct dm_stats_aux stats_aux;
  77};
  78
  79/*
  80 * For request-based dm.
  81 * One of these is allocated per request.
  82 */
  83struct dm_rq_target_io {
  84        struct mapped_device *md;
  85        struct dm_target *ti;
  86        struct request *orig, *clone;
  87        struct kthread_work work;
  88        int error;
  89        union map_info info;
  90        struct dm_stats_aux stats_aux;
  91        unsigned long duration_jiffies;
  92        unsigned n_sectors;
  93};
  94
  95/*
  96 * For request-based dm - the bio clones we allocate are embedded in these
  97 * structs.
  98 *
  99 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
 100 * the bioset is created - this means the bio has to come at the end of the
 101 * struct.
 102 */
 103struct dm_rq_clone_bio_info {
 104        struct bio *orig;
 105        struct dm_rq_target_io *tio;
 106        struct bio clone;
 107};
 108
 109union map_info *dm_get_rq_mapinfo(struct request *rq)
 110{
 111        if (rq && rq->end_io_data)
 112                return &((struct dm_rq_target_io *)rq->end_io_data)->info;
 113        return NULL;
 114}
 115EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
 116
 117#define MINOR_ALLOCED ((void *)-1)
 118
 119/*
 120 * Bits for the md->flags field.
 121 */
 122#define DMF_BLOCK_IO_FOR_SUSPEND 0
 123#define DMF_SUSPENDED 1
 124#define DMF_FROZEN 2
 125#define DMF_FREEING 3
 126#define DMF_DELETING 4
 127#define DMF_NOFLUSH_SUSPENDING 5
 128#define DMF_DEFERRED_REMOVE 6
 129#define DMF_SUSPENDED_INTERNALLY 7
 130
 131/*
 132 * A dummy definition to make RCU happy.
 133 * struct dm_table should never be dereferenced in this file.
 134 */
 135struct dm_table {
 136        int undefined__;
 137};
 138
 139/*
 140 * Work processed by per-device workqueue.
 141 */
 142struct mapped_device {
 143        struct srcu_struct io_barrier;
 144        struct mutex suspend_lock;
 145        atomic_t holders;
 146        atomic_t open_count;
 147
 148        /*
 149         * The current mapping.
 150         * Use dm_get_live_table{_fast} or take suspend_lock for
 151         * dereference.
 152         */
 153        struct dm_table __rcu *map;
 154
 155        struct list_head table_devices;
 156        struct mutex table_devices_lock;
 157
 158        unsigned long flags;
 159
 160        struct request_queue *queue;
 161        unsigned type;
 162        /* Protect queue and type against concurrent access. */
 163        struct mutex type_lock;
 164
 165        struct target_type *immutable_target_type;
 166
 167        struct gendisk *disk;
 168        char name[16];
 169
 170        void *interface_ptr;
 171
 172        /*
 173         * A list of ios that arrived while we were suspended.
 174         */
 175        atomic_t pending[2];
 176        wait_queue_head_t wait;
 177        struct work_struct work;
 178        struct bio_list deferred;
 179        spinlock_t deferred_lock;
 180
 181        /*
 182         * Processing queue (flush)
 183         */
 184        struct workqueue_struct *wq;
 185
 186        /*
 187         * io objects are allocated from here.
 188         */
 189        mempool_t *io_pool;
 190        mempool_t *rq_pool;
 191
 192        struct bio_set *bs;
 193
 194        /*
 195         * Event handling.
 196         */
 197        atomic_t event_nr;
 198        wait_queue_head_t eventq;
 199        atomic_t uevent_seq;
 200        struct list_head uevent_list;
 201        spinlock_t uevent_lock; /* Protect access to uevent_list */
 202
 203        /*
  204         * freeze/thaw support requires holding onto a super block
 205         */
 206        struct super_block *frozen_sb;
 207        struct block_device *bdev;
 208
 209        /* forced geometry settings */
 210        struct hd_geometry geometry;
 211
 212        /* kobject and completion */
 213        struct dm_kobject_holder kobj_holder;
 214
 215        /* zero-length flush that will be cloned and submitted to targets */
 216        struct bio flush_bio;
 217
 218        /* the number of internal suspends */
 219        unsigned internal_suspend_count;
 220
 221        struct dm_stats stats;
 222
 223        struct kthread_worker kworker;
 224        struct task_struct *kworker_task;
 225
 226        /* for request-based merge heuristic in dm_request_fn() */
 227        unsigned seq_rq_merge_deadline_usecs;
 228        int last_rq_rw;
 229        sector_t last_rq_pos;
 230        ktime_t last_rq_start_time;
 231
 232        /* for blk-mq request-based DM support */
 233        struct blk_mq_tag_set tag_set;
 234        bool use_blk_mq;
 235};
 236
 237#ifdef CONFIG_DM_MQ_DEFAULT
 238static bool use_blk_mq = true;
 239#else
 240static bool use_blk_mq = false;
 241#endif
 242
 243bool dm_use_blk_mq(struct mapped_device *md)
 244{
 245        return md->use_blk_mq;
 246}
 247
 248/*
  249 * For pre-allocation of mempools at table load time.
 250 */
 251struct dm_md_mempools {
 252        mempool_t *io_pool;
 253        mempool_t *rq_pool;
 254        struct bio_set *bs;
 255};
 256
 257struct table_device {
 258        struct list_head list;
 259        atomic_t count;
 260        struct dm_dev dm_dev;
 261};
 262
 263#define RESERVED_BIO_BASED_IOS          16
 264#define RESERVED_REQUEST_BASED_IOS      256
 265#define RESERVED_MAX_IOS                1024
 266static struct kmem_cache *_io_cache;
 267static struct kmem_cache *_rq_tio_cache;
 268static struct kmem_cache *_rq_cache;
 269
 270/*
 271 * Bio-based DM's mempools' reserved IOs set by the user.
 272 */
 273static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 274
 275/*
 276 * Request-based DM's mempools' reserved IOs set by the user.
 277 */
 278static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 279
 280static unsigned __dm_get_module_param(unsigned *module_param,
 281                                      unsigned def, unsigned max)
 282{
 283        unsigned param = ACCESS_ONCE(*module_param);
 284        unsigned modified_param = 0;
 285
 286        if (!param)
 287                modified_param = def;
 288        else if (param > max)
 289                modified_param = max;
 290
 291        if (modified_param) {
 292                (void)cmpxchg(module_param, param, modified_param);
 293                param = modified_param;
 294        }
 295
 296        return param;
 297}
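/*
 * Editor's sketch of the clamping done by __dm_get_module_param()
 * (hypothetical module parameter values):
 *
 *	*module_param == 0    -> def is returned (and written back)
 *	*module_param > max   -> max is returned (and written back via cmpxchg)
 *	otherwise             -> the user-supplied value is returned unchanged
 */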
 298
 299unsigned dm_get_reserved_bio_based_ios(void)
 300{
 301        return __dm_get_module_param(&reserved_bio_based_ios,
 302                                     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 303}
 304EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 305
 306unsigned dm_get_reserved_rq_based_ios(void)
 307{
 308        return __dm_get_module_param(&reserved_rq_based_ios,
 309                                     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 310}
 311EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
 312
 313static int __init local_init(void)
 314{
 315        int r = -ENOMEM;
 316
 317        /* allocate a slab for the dm_ios */
 318        _io_cache = KMEM_CACHE(dm_io, 0);
 319        if (!_io_cache)
 320                return r;
 321
 322        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 323        if (!_rq_tio_cache)
 324                goto out_free_io_cache;
 325
 326        _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
 327                                      __alignof__(struct request), 0, NULL);
 328        if (!_rq_cache)
 329                goto out_free_rq_tio_cache;
 330
 331        r = dm_uevent_init();
 332        if (r)
 333                goto out_free_rq_cache;
 334
 335        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 336        if (!deferred_remove_workqueue) {
 337                r = -ENOMEM;
 338                goto out_uevent_exit;
 339        }
 340
 341        _major = major;
 342        r = register_blkdev(_major, _name);
 343        if (r < 0)
 344                goto out_free_workqueue;
 345
 346        if (!_major)
 347                _major = r;
 348
 349        return 0;
 350
 351out_free_workqueue:
 352        destroy_workqueue(deferred_remove_workqueue);
 353out_uevent_exit:
 354        dm_uevent_exit();
 355out_free_rq_cache:
 356        kmem_cache_destroy(_rq_cache);
 357out_free_rq_tio_cache:
 358        kmem_cache_destroy(_rq_tio_cache);
 359out_free_io_cache:
 360        kmem_cache_destroy(_io_cache);
 361
 362        return r;
 363}
 364
 365static void local_exit(void)
 366{
 367        flush_scheduled_work();
 368        destroy_workqueue(deferred_remove_workqueue);
 369
 370        kmem_cache_destroy(_rq_cache);
 371        kmem_cache_destroy(_rq_tio_cache);
 372        kmem_cache_destroy(_io_cache);
 373        unregister_blkdev(_major, _name);
 374        dm_uevent_exit();
 375
 376        _major = 0;
 377
 378        DMINFO("cleaned up");
 379}
 380
 381static int (*_inits[])(void) __initdata = {
 382        local_init,
 383        dm_target_init,
 384        dm_linear_init,
 385        dm_stripe_init,
 386        dm_io_init,
 387        dm_kcopyd_init,
 388        dm_interface_init,
 389        dm_statistics_init,
 390};
 391
 392static void (*_exits[])(void) = {
 393        local_exit,
 394        dm_target_exit,
 395        dm_linear_exit,
 396        dm_stripe_exit,
 397        dm_io_exit,
 398        dm_kcopyd_exit,
 399        dm_interface_exit,
 400        dm_statistics_exit,
 401};
 402
 403static int __init dm_init(void)
 404{
 405        const int count = ARRAY_SIZE(_inits);
 406
 407        int r, i;
 408
 409        for (i = 0; i < count; i++) {
 410                r = _inits[i]();
 411                if (r)
 412                        goto bad;
 413        }
 414
 415        return 0;
 416
 417      bad:
 418        while (i--)
 419                _exits[i]();
 420
 421        return r;
 422}
 423
 424static void __exit dm_exit(void)
 425{
 426        int i = ARRAY_SIZE(_exits);
 427
 428        while (i--)
 429                _exits[i]();
 430
 431        /*
 432         * Should be empty by this point.
 433         */
 434        idr_destroy(&_minor_idr);
 435}
 436
 437/*
 438 * Block device functions
 439 */
 440int dm_deleting_md(struct mapped_device *md)
 441{
 442        return test_bit(DMF_DELETING, &md->flags);
 443}
 444
 445static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 446{
 447        struct mapped_device *md;
 448
 449        spin_lock(&_minor_lock);
 450
 451        md = bdev->bd_disk->private_data;
 452        if (!md)
 453                goto out;
 454
 455        if (test_bit(DMF_FREEING, &md->flags) ||
 456            dm_deleting_md(md)) {
 457                md = NULL;
 458                goto out;
 459        }
 460
 461        dm_get(md);
 462        atomic_inc(&md->open_count);
 463out:
 464        spin_unlock(&_minor_lock);
 465
 466        return md ? 0 : -ENXIO;
 467}
 468
 469static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 470{
 471        struct mapped_device *md;
 472
 473        spin_lock(&_minor_lock);
 474
 475        md = disk->private_data;
 476        if (WARN_ON(!md))
 477                goto out;
 478
 479        if (atomic_dec_and_test(&md->open_count) &&
 480            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 481                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 482
 483        dm_put(md);
 484out:
 485        spin_unlock(&_minor_lock);
 486}
 487
 488int dm_open_count(struct mapped_device *md)
 489{
 490        return atomic_read(&md->open_count);
 491}
 492
 493/*
 494 * Guarantees nothing is using the device before it's deleted.
 495 */
 496int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 497{
 498        int r = 0;
 499
 500        spin_lock(&_minor_lock);
 501
 502        if (dm_open_count(md)) {
 503                r = -EBUSY;
 504                if (mark_deferred)
 505                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 506        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 507                r = -EEXIST;
 508        else
 509                set_bit(DMF_DELETING, &md->flags);
 510
 511        spin_unlock(&_minor_lock);
 512
 513        return r;
 514}
 515
 516int dm_cancel_deferred_remove(struct mapped_device *md)
 517{
 518        int r = 0;
 519
 520        spin_lock(&_minor_lock);
 521
 522        if (test_bit(DMF_DELETING, &md->flags))
 523                r = -EBUSY;
 524        else
 525                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 526
 527        spin_unlock(&_minor_lock);
 528
 529        return r;
 530}
 531
 532static void do_deferred_remove(struct work_struct *w)
 533{
 534        dm_deferred_remove();
 535}
 536
 537sector_t dm_get_size(struct mapped_device *md)
 538{
 539        return get_capacity(md->disk);
 540}
 541
 542struct request_queue *dm_get_md_queue(struct mapped_device *md)
 543{
 544        return md->queue;
 545}
 546
 547struct dm_stats *dm_get_stats(struct mapped_device *md)
 548{
 549        return &md->stats;
 550}
 551
 552static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 553{
 554        struct mapped_device *md = bdev->bd_disk->private_data;
 555
 556        return dm_get_geometry(md, geo);
 557}
 558
 559static int dm_get_live_table_for_ioctl(struct mapped_device *md,
 560                struct dm_target **tgt, struct block_device **bdev,
 561                fmode_t *mode, int *srcu_idx)
 562{
 563        struct dm_table *map;
 564        int r;
 565
 566retry:
 567        r = -ENOTTY;
 568        map = dm_get_live_table(md, srcu_idx);
 569        if (!map || !dm_table_get_size(map))
 570                goto out;
 571
 572        /* We only support devices that have a single target */
 573        if (dm_table_get_num_targets(map) != 1)
 574                goto out;
 575
 576        *tgt = dm_table_get_target(map, 0);
 577
 578        if (!(*tgt)->type->prepare_ioctl)
 579                goto out;
 580
 581        if (dm_suspended_md(md)) {
 582                r = -EAGAIN;
 583                goto out;
 584        }
 585
 586        r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
 587        if (r < 0)
 588                goto out;
 589
 590        return r;
 591
 592out:
 593        dm_put_live_table(md, *srcu_idx);
 594        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 595                msleep(10);
 596                goto retry;
 597        }
 598        return r;
 599}
 600
 601static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 602                        unsigned int cmd, unsigned long arg)
 603{
 604        struct mapped_device *md = bdev->bd_disk->private_data;
 605        struct dm_target *tgt;
 606        struct block_device *tgt_bdev = NULL;
 607        int srcu_idx, r;
 608
 609        r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
 610        if (r < 0)
 611                return r;
 612
 613        if (r > 0) {
 614                /*
 615                 * Target determined this ioctl is being issued against
 616                 * a logical partition of the parent bdev; so extra
 617                 * validation is needed.
 618                 */
 619                r = scsi_verify_blk_ioctl(NULL, cmd);
 620                if (r)
 621                        goto out;
 622        }
 623
 624        r =  __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
 625out:
 626        dm_put_live_table(md, srcu_idx);
 627        return r;
 628}
 629
 630static struct dm_io *alloc_io(struct mapped_device *md)
 631{
 632        return mempool_alloc(md->io_pool, GFP_NOIO);
 633}
 634
 635static void free_io(struct mapped_device *md, struct dm_io *io)
 636{
 637        mempool_free(io, md->io_pool);
 638}
 639
 640static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 641{
 642        bio_put(&tio->clone);
 643}
 644
 645static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 646                                            gfp_t gfp_mask)
 647{
 648        return mempool_alloc(md->io_pool, gfp_mask);
 649}
 650
 651static void free_rq_tio(struct dm_rq_target_io *tio)
 652{
 653        mempool_free(tio, tio->md->io_pool);
 654}
 655
 656static struct request *alloc_clone_request(struct mapped_device *md,
 657                                           gfp_t gfp_mask)
 658{
 659        return mempool_alloc(md->rq_pool, gfp_mask);
 660}
 661
 662static void free_clone_request(struct mapped_device *md, struct request *rq)
 663{
 664        mempool_free(rq, md->rq_pool);
 665}
 666
 667static int md_in_flight(struct mapped_device *md)
 668{
 669        return atomic_read(&md->pending[READ]) +
 670               atomic_read(&md->pending[WRITE]);
 671}
 672
 673static void start_io_acct(struct dm_io *io)
 674{
 675        struct mapped_device *md = io->md;
 676        struct bio *bio = io->bio;
 677        int cpu;
 678        int rw = bio_data_dir(bio);
 679
 680        io->start_time = jiffies;
 681
 682        cpu = part_stat_lock();
 683        part_round_stats(cpu, &dm_disk(md)->part0);
 684        part_stat_unlock();
 685        atomic_set(&dm_disk(md)->part0.in_flight[rw],
 686                atomic_inc_return(&md->pending[rw]));
 687
 688        if (unlikely(dm_stats_used(&md->stats)))
 689                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 690                                    bio_sectors(bio), false, 0, &io->stats_aux);
 691}
 692
 693static void end_io_acct(struct dm_io *io)
 694{
 695        struct mapped_device *md = io->md;
 696        struct bio *bio = io->bio;
 697        unsigned long duration = jiffies - io->start_time;
 698        int pending;
 699        int rw = bio_data_dir(bio);
 700
 701        generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 702
 703        if (unlikely(dm_stats_used(&md->stats)))
 704                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 705                                    bio_sectors(bio), true, duration, &io->stats_aux);
 706
 707        /*
 708         * After this is decremented the bio must not be touched if it is
 709         * a flush.
 710         */
 711        pending = atomic_dec_return(&md->pending[rw]);
 712        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 713        pending += atomic_read(&md->pending[rw^0x1]);
 714
 715        /* nudge anyone waiting on suspend queue */
 716        if (!pending)
 717                wake_up(&md->wait);
 718}
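/*
 * Editor's note: md->pending[READ] and md->pending[WRITE] mirror the
 * per-disk in_flight counters; the suspend path waits on md->wait until
 * md_in_flight() reaches zero, hence the wake_up() above once no I/O
 * remains in flight in either direction.
 */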
 719
 720/*
 721 * Add the bio to the list of deferred io.
 722 */
 723static void queue_io(struct mapped_device *md, struct bio *bio)
 724{
 725        unsigned long flags;
 726
 727        spin_lock_irqsave(&md->deferred_lock, flags);
 728        bio_list_add(&md->deferred, bio);
 729        spin_unlock_irqrestore(&md->deferred_lock, flags);
 730        queue_work(md->wq, &md->work);
 731}
 732
 733/*
  734 * Everyone (including functions in this file) should use this
 735 * function to access the md->map field, and make sure they call
 736 * dm_put_live_table() when finished.
 737 */
 738struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 739{
 740        *srcu_idx = srcu_read_lock(&md->io_barrier);
 741
 742        return srcu_dereference(md->map, &md->io_barrier);
 743}
 744
 745void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 746{
 747        srcu_read_unlock(&md->io_barrier, srcu_idx);
 748}
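/*
 * Editor's sketch of the intended calling pattern (hypothetical caller):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... inspect the table, possibly blocking ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 *
 * dm_put_live_table() must always be called with the srcu_idx returned by
 * dm_get_live_table(), even when the table is NULL, so that the SRCU
 * read-side critical section is closed.
 */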
 749
 750void dm_sync_table(struct mapped_device *md)
 751{
 752        synchronize_srcu(&md->io_barrier);
 753        synchronize_rcu_expedited();
 754}
 755
 756/*
 757 * A fast alternative to dm_get_live_table/dm_put_live_table.
 758 * The caller must not block between these two functions.
 759 */
 760static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 761{
 762        rcu_read_lock();
 763        return rcu_dereference(md->map);
 764}
 765
 766static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 767{
 768        rcu_read_unlock();
 769}
 770
 771/*
 772 * Open a table device so we can use it as a map destination.
 773 */
 774static int open_table_device(struct table_device *td, dev_t dev,
 775                             struct mapped_device *md)
 776{
 777        static char *_claim_ptr = "I belong to device-mapper";
 778        struct block_device *bdev;
 779
 780        int r;
 781
 782        BUG_ON(td->dm_dev.bdev);
 783
 784        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 785        if (IS_ERR(bdev))
 786                return PTR_ERR(bdev);
 787
 788        r = bd_link_disk_holder(bdev, dm_disk(md));
 789        if (r) {
 790                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 791                return r;
 792        }
 793
 794        td->dm_dev.bdev = bdev;
 795        return 0;
 796}
 797
 798/*
 799 * Close a table device that we've been using.
 800 */
 801static void close_table_device(struct table_device *td, struct mapped_device *md)
 802{
 803        if (!td->dm_dev.bdev)
 804                return;
 805
 806        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 807        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 808        td->dm_dev.bdev = NULL;
 809}
 810
 811static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 812                                              fmode_t mode) {
 813        struct table_device *td;
 814
 815        list_for_each_entry(td, l, list)
 816                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 817                        return td;
 818
 819        return NULL;
 820}
 821
 822int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 823                        struct dm_dev **result) {
 824        int r;
 825        struct table_device *td;
 826
 827        mutex_lock(&md->table_devices_lock);
 828        td = find_table_device(&md->table_devices, dev, mode);
 829        if (!td) {
 830                td = kmalloc(sizeof(*td), GFP_KERNEL);
 831                if (!td) {
 832                        mutex_unlock(&md->table_devices_lock);
 833                        return -ENOMEM;
 834                }
 835
 836                td->dm_dev.mode = mode;
 837                td->dm_dev.bdev = NULL;
 838
 839                if ((r = open_table_device(td, dev, md))) {
 840                        mutex_unlock(&md->table_devices_lock);
 841                        kfree(td);
 842                        return r;
 843                }
 844
 845                format_dev_t(td->dm_dev.name, dev);
 846
 847                atomic_set(&td->count, 0);
 848                list_add(&td->list, &md->table_devices);
 849        }
 850        atomic_inc(&td->count);
 851        mutex_unlock(&md->table_devices_lock);
 852
 853        *result = &td->dm_dev;
 854        return 0;
 855}
 856EXPORT_SYMBOL_GPL(dm_get_table_device);
 857
 858void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 859{
 860        struct table_device *td = container_of(d, struct table_device, dm_dev);
 861
 862        mutex_lock(&md->table_devices_lock);
 863        if (atomic_dec_and_test(&td->count)) {
 864                close_table_device(td, md);
 865                list_del(&td->list);
 866                kfree(td);
 867        }
 868        mutex_unlock(&md->table_devices_lock);
 869}
 870EXPORT_SYMBOL(dm_put_table_device);
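/*
 * Editor's sketch of how the two exports above pair up (hypothetical
 * caller and dev_t):
 *
 *	struct dm_dev *dev;
 *	int r = dm_get_table_device(md, some_devt, FMODE_READ, &dev);
 *
 *	if (r)
 *		return r;
 *	... use dev->bdev ...
 *	dm_put_table_device(md, dev);
 *
 * Devices are refcounted per (dev_t, mode) pair: the underlying block
 * device is opened on the first get and closed again on the last put.
 */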
 871
 872static void free_table_devices(struct list_head *devices)
 873{
 874        struct list_head *tmp, *next;
 875
 876        list_for_each_safe(tmp, next, devices) {
 877                struct table_device *td = list_entry(tmp, struct table_device, list);
 878
 879                DMWARN("dm_destroy: %s still exists with %d references",
 880                       td->dm_dev.name, atomic_read(&td->count));
 881                kfree(td);
 882        }
 883}
 884
 885/*
 886 * Get the geometry associated with a dm device
 887 */
 888int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 889{
 890        *geo = md->geometry;
 891
 892        return 0;
 893}
 894
 895/*
 896 * Set the geometry of a device.
 897 */
 898int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 899{
 900        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 901
 902        if (geo->start > sz) {
 903                DMWARN("Start sector is beyond the geometry limits.");
 904                return -EINVAL;
 905        }
 906
 907        md->geometry = *geo;
 908
 909        return 0;
 910}
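/*
 * Editor's worked example for the check above (arbitrary values):
 * cylinders = 1024, heads = 255, sectors = 63 gives
 * sz = 1024 * 255 * 63 = 16450560 sectors (~7.8 GiB), so any geo->start
 * greater than 16450560 is rejected with -EINVAL.
 */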
 911
 912/*-----------------------------------------------------------------
 913 * CRUD START:
  914 *   A more elegant solution is in the works that uses the queue
  915 *   merge fn; unfortunately, there are a couple of changes to
 916 *   the block layer that I want to make for this.  So in the
 917 *   interests of getting something for people to use I give
 918 *   you this clearly demarcated crap.
 919 *---------------------------------------------------------------*/
 920
 921static int __noflush_suspending(struct mapped_device *md)
 922{
 923        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 924}
 925
 926/*
 927 * Decrements the number of outstanding ios that a bio has been
  928 * cloned into, completing the original io if necessary.
 929 */
 930static void dec_pending(struct dm_io *io, int error)
 931{
 932        unsigned long flags;
 933        int io_error;
 934        struct bio *bio;
 935        struct mapped_device *md = io->md;
 936
 937        /* Push-back supersedes any I/O errors */
 938        if (unlikely(error)) {
 939                spin_lock_irqsave(&io->endio_lock, flags);
 940                if (!(io->error > 0 && __noflush_suspending(md)))
 941                        io->error = error;
 942                spin_unlock_irqrestore(&io->endio_lock, flags);
 943        }
 944
 945        if (atomic_dec_and_test(&io->io_count)) {
 946                if (io->error == DM_ENDIO_REQUEUE) {
 947                        /*
 948                         * Target requested pushing back the I/O.
 949                         */
 950                        spin_lock_irqsave(&md->deferred_lock, flags);
 951                        if (__noflush_suspending(md))
 952                                bio_list_add_head(&md->deferred, io->bio);
 953                        else
 954                                /* noflush suspend was interrupted. */
 955                                io->error = -EIO;
 956                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 957                }
 958
 959                io_error = io->error;
 960                bio = io->bio;
 961                end_io_acct(io);
 962                free_io(md, io);
 963
 964                if (io_error == DM_ENDIO_REQUEUE)
 965                        return;
 966
 967                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
 968                        /*
 969                         * Preflush done for flush with data, reissue
 970                         * without REQ_FLUSH.
 971                         */
 972                        bio->bi_rw &= ~REQ_FLUSH;
 973                        queue_io(md, bio);
 974                } else {
 975                        /* done with normal IO or empty flush */
 976                        trace_block_bio_complete(md->queue, bio, io_error);
 977                        bio->bi_error = io_error;
 978                        bio_endio(bio);
 979                }
 980        }
 981}
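/*
 * Editor's note on the reference counting handled by dec_pending():
 * io_count starts at 1 in __split_and_process_bio(), is incremented once
 * per clone in __map_bio(), and every completion plus the final "drop the
 * extra reference count" call ends up here; only the last decrement
 * completes the original bio.
 */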
 982
 983static void disable_write_same(struct mapped_device *md)
 984{
 985        struct queue_limits *limits = dm_get_queue_limits(md);
 986
 987        /* device doesn't really support WRITE SAME, disable it */
 988        limits->max_write_same_sectors = 0;
 989}
 990
 991static void clone_endio(struct bio *bio)
 992{
 993        int error = bio->bi_error;
 994        int r = error;
 995        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 996        struct dm_io *io = tio->io;
 997        struct mapped_device *md = tio->io->md;
 998        dm_endio_fn endio = tio->ti->type->end_io;
 999
1000        if (endio) {
1001                r = endio(tio->ti, bio, error);
1002                if (r < 0 || r == DM_ENDIO_REQUEUE)
1003                        /*
1004                         * error and requeue request are handled
1005                         * in dec_pending().
1006                         */
1007                        error = r;
1008                else if (r == DM_ENDIO_INCOMPLETE)
1009                        /* The target will handle the io */
1010                        return;
1011                else if (r) {
1012                        DMWARN("unimplemented target endio return value: %d", r);
1013                        BUG();
1014                }
1015        }
1016
1017        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
1018                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
1019                disable_write_same(md);
1020
1021        free_tio(md, tio);
1022        dec_pending(io, error);
1023}
1024
1025/*
1026 * Partial completion handling for request-based dm
1027 */
1028static void end_clone_bio(struct bio *clone)
1029{
1030        struct dm_rq_clone_bio_info *info =
1031                container_of(clone, struct dm_rq_clone_bio_info, clone);
1032        struct dm_rq_target_io *tio = info->tio;
1033        struct bio *bio = info->orig;
1034        unsigned int nr_bytes = info->orig->bi_iter.bi_size;
1035        int error = clone->bi_error;
1036
1037        bio_put(clone);
1038
1039        if (tio->error)
1040                /*
1041                 * An error has already been detected on the request.
 1042                 * Once an error has occurred, just let clone->end_io() handle
1043                 * the remainder.
1044                 */
1045                return;
1046        else if (error) {
1047                /*
 1048                 * Don't report the error to the upper layer yet.
 1049                 * The error handling decision is made by the target driver
 1050                 * when the request is completed.
1051                 */
1052                tio->error = error;
1053                return;
1054        }
1055
1056        /*
1057         * I/O for the bio successfully completed.
 1058         * Report the data completion to the upper layer.
1059         */
1060
1061        /*
1062         * bios are processed from the head of the list.
1063         * So the completing bio should always be rq->bio.
 1064         * If it's not, something is wrong.
1065         */
1066        if (tio->orig->bio != bio)
1067                DMERR("bio completion is going in the middle of the request");
1068
1069        /*
1070         * Update the original request.
1071         * Do not use blk_end_request() here, because it may complete
1072         * the original request before the clone, and break the ordering.
1073         */
1074        blk_update_request(tio->orig, 0, nr_bytes);
1075}
1076
1077static struct dm_rq_target_io *tio_from_request(struct request *rq)
1078{
1079        return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special);
1080}
1081
1082static void rq_end_stats(struct mapped_device *md, struct request *orig)
1083{
1084        if (unlikely(dm_stats_used(&md->stats))) {
1085                struct dm_rq_target_io *tio = tio_from_request(orig);
1086                tio->duration_jiffies = jiffies - tio->duration_jiffies;
1087                dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
1088                                    tio->n_sectors, true, tio->duration_jiffies,
1089                                    &tio->stats_aux);
1090        }
1091}
1092
1093/*
1094 * Don't touch any member of the md after calling this function because
1095 * the md may be freed in dm_put() at the end of this function.
 1096 * Alternatively, do dm_get() before calling this function and dm_put() afterwards.
1097 */
1098static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1099{
1100        atomic_dec(&md->pending[rw]);
1101
1102        /* nudge anyone waiting on suspend queue */
1103        if (!md_in_flight(md))
1104                wake_up(&md->wait);
1105
1106        /*
1107         * Run this off this callpath, as drivers could invoke end_io while
1108         * inside their request_fn (and holding the queue lock). Calling
1109         * back into ->request_fn() could deadlock attempting to grab the
1110         * queue lock again.
1111         */
1112        if (run_queue) {
1113                if (md->queue->mq_ops)
1114                        blk_mq_run_hw_queues(md->queue, true);
1115                else
1116                        blk_run_queue_async(md->queue);
1117        }
1118
1119        /*
1120         * dm_put() must be at the end of this function. See the comment above
1121         */
1122        dm_put(md);
1123}
1124
1125static void free_rq_clone(struct request *clone)
1126{
1127        struct dm_rq_target_io *tio = clone->end_io_data;
1128        struct mapped_device *md = tio->md;
1129
1130        blk_rq_unprep_clone(clone);
1131
1132        if (md->type == DM_TYPE_MQ_REQUEST_BASED)
1133                /* stacked on blk-mq queue(s) */
1134                tio->ti->type->release_clone_rq(clone);
1135        else if (!md->queue->mq_ops)
1136                /* request_fn queue stacked on request_fn queue(s) */
1137                free_clone_request(md, clone);
1138        /*
1139         * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
1140         * no need to call free_clone_request() because we leverage blk-mq by
1141         * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
1142         */
1143
1144        if (!md->queue->mq_ops)
1145                free_rq_tio(tio);
1146}
1147
1148/*
1149 * Complete the clone and the original request.
 1150 * Must be called without the clone's queue lock held;
1151 * see end_clone_request() for more details.
1152 */
1153static void dm_end_request(struct request *clone, int error)
1154{
1155        int rw = rq_data_dir(clone);
1156        struct dm_rq_target_io *tio = clone->end_io_data;
1157        struct mapped_device *md = tio->md;
1158        struct request *rq = tio->orig;
1159
1160        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
1161                rq->errors = clone->errors;
1162                rq->resid_len = clone->resid_len;
1163
1164                if (rq->sense)
1165                        /*
1166                         * We are using the sense buffer of the original
1167                         * request.
1168                         * So setting the length of the sense data is enough.
1169                         */
1170                        rq->sense_len = clone->sense_len;
1171        }
1172
1173        free_rq_clone(clone);
1174        rq_end_stats(md, rq);
1175        if (!rq->q->mq_ops)
1176                blk_end_request_all(rq, error);
1177        else
1178                blk_mq_end_request(rq, error);
1179        rq_completed(md, rw, true);
1180}
1181
1182static void dm_unprep_request(struct request *rq)
1183{
1184        struct dm_rq_target_io *tio = tio_from_request(rq);
1185        struct request *clone = tio->clone;
1186
1187        if (!rq->q->mq_ops) {
1188                rq->special = NULL;
1189                rq->cmd_flags &= ~REQ_DONTPREP;
1190        }
1191
1192        if (clone)
1193                free_rq_clone(clone);
1194        else if (!tio->md->queue->mq_ops)
1195                free_rq_tio(tio);
1196}
1197
1198/*
1199 * Requeue the original request of a clone.
1200 */
1201static void old_requeue_request(struct request *rq)
1202{
1203        struct request_queue *q = rq->q;
1204        unsigned long flags;
1205
1206        spin_lock_irqsave(q->queue_lock, flags);
1207        blk_requeue_request(q, rq);
1208        blk_run_queue_async(q);
1209        spin_unlock_irqrestore(q->queue_lock, flags);
1210}
1211
1212static void dm_requeue_original_request(struct mapped_device *md,
1213                                        struct request *rq)
1214{
1215        int rw = rq_data_dir(rq);
1216
1217        dm_unprep_request(rq);
1218
1219        rq_end_stats(md, rq);
1220        if (!rq->q->mq_ops)
1221                old_requeue_request(rq);
1222        else {
1223                blk_mq_requeue_request(rq);
1224                blk_mq_kick_requeue_list(rq->q);
1225        }
1226
1227        rq_completed(md, rw, false);
1228}
1229
1230static void old_stop_queue(struct request_queue *q)
1231{
1232        unsigned long flags;
1233
1234        if (blk_queue_stopped(q))
1235                return;
1236
1237        spin_lock_irqsave(q->queue_lock, flags);
1238        blk_stop_queue(q);
1239        spin_unlock_irqrestore(q->queue_lock, flags);
1240}
1241
1242static void stop_queue(struct request_queue *q)
1243{
1244        if (!q->mq_ops)
1245                old_stop_queue(q);
1246        else
1247                blk_mq_stop_hw_queues(q);
1248}
1249
1250static void old_start_queue(struct request_queue *q)
1251{
1252        unsigned long flags;
1253
1254        spin_lock_irqsave(q->queue_lock, flags);
1255        if (blk_queue_stopped(q))
1256                blk_start_queue(q);
1257        spin_unlock_irqrestore(q->queue_lock, flags);
1258}
1259
1260static void start_queue(struct request_queue *q)
1261{
1262        if (!q->mq_ops)
1263                old_start_queue(q);
1264        else
1265                blk_mq_start_stopped_hw_queues(q, true);
1266}
1267
1268static void dm_done(struct request *clone, int error, bool mapped)
1269{
1270        int r = error;
1271        struct dm_rq_target_io *tio = clone->end_io_data;
1272        dm_request_endio_fn rq_end_io = NULL;
1273
1274        if (tio->ti) {
1275                rq_end_io = tio->ti->type->rq_end_io;
1276
1277                if (mapped && rq_end_io)
1278                        r = rq_end_io(tio->ti, clone, error, &tio->info);
1279        }
1280
1281        if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
1282                     !clone->q->limits.max_write_same_sectors))
1283                disable_write_same(tio->md);
1284
1285        if (r <= 0)
1286                /* The target wants to complete the I/O */
1287                dm_end_request(clone, r);
1288        else if (r == DM_ENDIO_INCOMPLETE)
1289                /* The target will handle the I/O */
1290                return;
1291        else if (r == DM_ENDIO_REQUEUE)
1292                /* The target wants to requeue the I/O */
1293                dm_requeue_original_request(tio->md, tio->orig);
1294        else {
1295                DMWARN("unimplemented target endio return value: %d", r);
1296                BUG();
1297        }
1298}
1299
1300/*
1301 * Request completion handler for request-based dm
1302 */
1303static void dm_softirq_done(struct request *rq)
1304{
1305        bool mapped = true;
1306        struct dm_rq_target_io *tio = tio_from_request(rq);
1307        struct request *clone = tio->clone;
1308        int rw;
1309
1310        if (!clone) {
1311                rq_end_stats(tio->md, rq);
1312                rw = rq_data_dir(rq);
1313                if (!rq->q->mq_ops) {
1314                        blk_end_request_all(rq, tio->error);
1315                        rq_completed(tio->md, rw, false);
1316                        free_rq_tio(tio);
1317                } else {
1318                        blk_mq_end_request(rq, tio->error);
1319                        rq_completed(tio->md, rw, false);
1320                }
1321                return;
1322        }
1323
1324        if (rq->cmd_flags & REQ_FAILED)
1325                mapped = false;
1326
1327        dm_done(clone, tio->error, mapped);
1328}
1329
1330/*
1331 * Complete the clone and the original request with the error status
1332 * through softirq context.
1333 */
1334static void dm_complete_request(struct request *rq, int error)
1335{
1336        struct dm_rq_target_io *tio = tio_from_request(rq);
1337
1338        tio->error = error;
1339        blk_complete_request(rq);
1340}
1341
1342/*
1343 * Complete the not-mapped clone and the original request with the error status
1344 * through softirq context.
1345 * Target's rq_end_io() function isn't called.
1346 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
1347 */
1348static void dm_kill_unmapped_request(struct request *rq, int error)
1349{
1350        rq->cmd_flags |= REQ_FAILED;
1351        dm_complete_request(rq, error);
1352}
1353
1354/*
1355 * Called with the clone's queue lock held (for non-blk-mq)
1356 */
1357static void end_clone_request(struct request *clone, int error)
1358{
1359        struct dm_rq_target_io *tio = clone->end_io_data;
1360
1361        if (!clone->q->mq_ops) {
1362                /*
 1363                 * This just cleans up the bookkeeping for the queue in which
 1364                 * the clone was dispatched.
 1365                 * The clone is *NOT* actually freed here because it was allocated
 1366                 * from dm's own mempool (REQ_ALLOCED isn't set).
1367                 */
1368                __blk_put_request(clone->q, clone);
1369        }
1370
1371        /*
1372         * Actual request completion is done in a softirq context which doesn't
1373         * hold the clone's queue lock.  Otherwise, deadlock could occur because:
 1374         *     - another request may be submitted by the upper-level stacking
 1375         *       driver during the completion
 1376         *     - that submission, which requires the queue lock, may be issued
 1377         *       against this clone's queue
1378         */
1379        dm_complete_request(tio->orig, error);
1380}
1381
1382/*
1383 * Return maximum size of I/O possible at the supplied sector up to the current
1384 * target boundary.
1385 */
1386static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1387{
1388        sector_t target_offset = dm_target_offset(ti, sector);
1389
1390        return ti->len - target_offset;
1391}
1392
1393static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1394{
1395        sector_t len = max_io_len_target_boundary(sector, ti);
1396        sector_t offset, max_len;
1397
1398        /*
1399         * Does the target need to split even further?
1400         */
1401        if (ti->max_io_len) {
1402                offset = dm_target_offset(ti, sector);
1403                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1404                        max_len = sector_div(offset, ti->max_io_len);
1405                else
1406                        max_len = offset & (ti->max_io_len - 1);
1407                max_len = ti->max_io_len - max_len;
1408
1409                if (len > max_len)
1410                        len = max_len;
1411        }
1412
1413        return len;
1414}
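/*
 * Editor's sketch with hypothetical numbers: for ti->max_io_len = 8 (a
 * power of two) and an I/O starting at target offset 21,
 * offset & (max_io_len - 1) = 5, so max_len = 8 - 5 = 3 sectors, which
 * keeps the clone from crossing the next 8-sector boundary.  The result
 * is further capped by the distance to the end of the target.
 */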
1415
1416int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1417{
1418        if (len > UINT_MAX) {
1419                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1420                      (unsigned long long)len, UINT_MAX);
1421                ti->error = "Maximum size of target IO is too large";
1422                return -EINVAL;
1423        }
1424
1425        ti->max_io_len = (uint32_t) len;
1426
1427        return 0;
1428}
1429EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
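/*
 * Editor's sketch of a typical caller (hypothetical target constructor,
 * using 128 sectors = 64KiB as the split size):
 *
 *	static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
 *	{
 *		int r = dm_set_target_max_io_len(ti, 128);
 *
 *		if (r)
 *			return r;
 *		...
 *	}
 *
 * On failure ti->error has already been filled in by
 * dm_set_target_max_io_len().
 */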
1430
1431/*
1432 * A target may call dm_accept_partial_bio only from the map routine.  It is
1433 * allowed for all bio types except REQ_FLUSH.
1434 *
1435 * dm_accept_partial_bio informs the dm that the target only wants to process
1436 * additional n_sectors sectors of the bio and the rest of the data should be
 1437 * sent in a subsequent bio.
1438 *
 1439 * A diagram that explains the arithmetic:
1440 * +--------------------+---------------+-------+
1441 * |         1          |       2       |   3   |
1442 * +--------------------+---------------+-------+
1443 *
1444 * <-------------- *tio->len_ptr --------------->
1445 *                      <------- bi_size ------->
1446 *                      <-- n_sectors -->
1447 *
 1448 * Region 1 was already iterated over with bio_advance or a similar function.
1449 *      (it may be empty if the target doesn't use bio_advance)
1450 * Region 2 is the remaining bio size that the target wants to process.
1451 *      (it may be empty if region 1 is non-empty, although there is no reason
1452 *       to make it empty)
 1453 * The target requires that region 3 be sent in the next bio.
1454 *
1455 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1456 * the partially processed part (the sum of regions 1+2) must be the same for all
1457 * copies of the bio.
1458 */
1459void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1460{
1461        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1462        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1463        BUG_ON(bio->bi_rw & REQ_FLUSH);
1464        BUG_ON(bi_size > *tio->len_ptr);
1465        BUG_ON(n_sectors > bi_size);
1466        *tio->len_ptr -= bi_size - n_sectors;
1467        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1468}
1469EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
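/*
 * Editor's sketch of a caller (hypothetical target map function): a target
 * that can only handle the first 8 sectors of the clone right now might do
 *
 *	static int example_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		if (bio_sectors(bio) > 8)
 *			dm_accept_partial_bio(bio, 8);
 *		... remap bio and return DM_MAPIO_REMAPPED ...
 *	}
 *
 * Core dm then resubmits the remainder (region 3 in the diagram above) as
 * a separate bio.
 */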
1470
1471static void __map_bio(struct dm_target_io *tio)
1472{
1473        int r;
1474        sector_t sector;
1475        struct mapped_device *md;
1476        struct bio *clone = &tio->clone;
1477        struct dm_target *ti = tio->ti;
1478
1479        clone->bi_end_io = clone_endio;
1480
1481        /*
 1482         * Map the clone.  If r == 0 (DM_MAPIO_SUBMITTED) we don't need to do
 1483         * anything; the target has assumed ownership of
1484         * this io.
1485         */
1486        atomic_inc(&tio->io->io_count);
1487        sector = clone->bi_iter.bi_sector;
1488        r = ti->type->map(ti, clone);
1489        if (r == DM_MAPIO_REMAPPED) {
1490                /* the bio has been remapped so dispatch it */
1491
1492                trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1493                                      tio->io->bio->bi_bdev->bd_dev, sector);
1494
1495                generic_make_request(clone);
1496        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1497                /* error the io and bail out, or requeue it if needed */
1498                md = tio->io->md;
1499                dec_pending(tio->io, r);
1500                free_tio(md, tio);
1501        } else if (r != DM_MAPIO_SUBMITTED) {
1502                DMWARN("unimplemented target map return value: %d", r);
1503                BUG();
1504        }
1505}
1506
1507struct clone_info {
1508        struct mapped_device *md;
1509        struct dm_table *map;
1510        struct bio *bio;
1511        struct dm_io *io;
1512        sector_t sector;
1513        unsigned sector_count;
1514};
1515
1516static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1517{
1518        bio->bi_iter.bi_sector = sector;
1519        bio->bi_iter.bi_size = to_bytes(len);
1520}
1521
1522/*
 1523 * Creates a bio that consists of a range of complete bvecs.
1524 */
1525static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1526                      sector_t sector, unsigned len)
1527{
1528        struct bio *clone = &tio->clone;
1529
1530        __bio_clone_fast(clone, bio);
1531
1532        if (bio_integrity(bio))
1533                bio_integrity_clone(clone, bio, GFP_NOIO);
1534
1535        bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1536        clone->bi_iter.bi_size = to_bytes(len);
1537
1538        if (bio_integrity(bio))
1539                bio_integrity_trim(clone, 0, len);
1540}
1541
1542static struct dm_target_io *alloc_tio(struct clone_info *ci,
1543                                      struct dm_target *ti,
1544                                      unsigned target_bio_nr)
1545{
1546        struct dm_target_io *tio;
1547        struct bio *clone;
1548
1549        clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1550        tio = container_of(clone, struct dm_target_io, clone);
1551
1552        tio->io = ci->io;
1553        tio->ti = ti;
1554        tio->target_bio_nr = target_bio_nr;
1555
1556        return tio;
1557}
1558
1559static void __clone_and_map_simple_bio(struct clone_info *ci,
1560                                       struct dm_target *ti,
1561                                       unsigned target_bio_nr, unsigned *len)
1562{
1563        struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1564        struct bio *clone = &tio->clone;
1565
1566        tio->len_ptr = len;
1567
1568        __bio_clone_fast(clone, ci->bio);
1569        if (len)
1570                bio_setup_sector(clone, ci->sector, *len);
1571
1572        __map_bio(tio);
1573}
1574
1575static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1576                                  unsigned num_bios, unsigned *len)
1577{
1578        unsigned target_bio_nr;
1579
1580        for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1581                __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1582}
1583
1584static int __send_empty_flush(struct clone_info *ci)
1585{
1586        unsigned target_nr = 0;
1587        struct dm_target *ti;
1588
1589        BUG_ON(bio_has_data(ci->bio));
1590        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1591                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1592
1593        return 0;
1594}
1595
1596static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1597                                     sector_t sector, unsigned *len)
1598{
1599        struct bio *bio = ci->bio;
1600        struct dm_target_io *tio;
1601        unsigned target_bio_nr;
1602        unsigned num_target_bios = 1;
1603
1604        /*
1605         * Does the target want to receive duplicate copies of the bio?
1606         */
1607        if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1608                num_target_bios = ti->num_write_bios(ti, bio);
1609
1610        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1611                tio = alloc_tio(ci, ti, target_bio_nr);
1612                tio->len_ptr = len;
1613                clone_bio(tio, bio, sector, *len);
1614                __map_bio(tio);
1615        }
1616}
1617
1618typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1619
1620static unsigned get_num_discard_bios(struct dm_target *ti)
1621{
1622        return ti->num_discard_bios;
1623}
1624
1625static unsigned get_num_write_same_bios(struct dm_target *ti)
1626{
1627        return ti->num_write_same_bios;
1628}
1629
1630typedef bool (*is_split_required_fn)(struct dm_target *ti);
1631
1632static bool is_split_required_for_discard(struct dm_target *ti)
1633{
1634        return ti->split_discard_bios;
1635}
1636
1637static int __send_changing_extent_only(struct clone_info *ci,
1638                                       get_num_bios_fn get_num_bios,
1639                                       is_split_required_fn is_split_required)
1640{
1641        struct dm_target *ti;
1642        unsigned len;
1643        unsigned num_bios;
1644
1645        do {
1646                ti = dm_table_find_target(ci->map, ci->sector);
1647                if (!dm_target_is_valid(ti))
1648                        return -EIO;
1649
1650                /*
1651                 * Even though the device advertised support for this type of
1652                 * request, that does not mean every target supports it, and
1653                 * reconfiguration might also have changed that since the
1654                 * check was performed.
1655                 */
1656                num_bios = get_num_bios ? get_num_bios(ti) : 0;
1657                if (!num_bios)
1658                        return -EOPNOTSUPP;
1659
1660                if (is_split_required && !is_split_required(ti))
1661                        len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1662                else
1663                        len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1664
1665                __send_duplicate_bios(ci, ti, num_bios, &len);
1666
1667                ci->sector += len;
1668        } while (ci->sector_count -= len);
1669
1670        return 0;
1671}
1672
1673static int __send_discard(struct clone_info *ci)
1674{
1675        return __send_changing_extent_only(ci, get_num_discard_bios,
1676                                           is_split_required_for_discard);
1677}
1678
1679static int __send_write_same(struct clone_info *ci)
1680{
1681        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1682}
1683
1684/*
1685 * Select the correct strategy for processing a non-flush bio.
1686 */
1687static int __split_and_process_non_flush(struct clone_info *ci)
1688{
1689        struct bio *bio = ci->bio;
1690        struct dm_target *ti;
1691        unsigned len;
1692
1693        if (unlikely(bio->bi_rw & REQ_DISCARD))
1694                return __send_discard(ci);
1695        else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1696                return __send_write_same(ci);
1697
1698        ti = dm_table_find_target(ci->map, ci->sector);
1699        if (!dm_target_is_valid(ti))
1700                return -EIO;
1701
1702        len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1703
1704        __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1705
1706        ci->sector += len;
1707        ci->sector_count -= len;
1708
1709        return 0;
1710}
1711
1712/*
1713 * Entry point to split a bio into clones and submit them to the targets.
1714 */
1715static void __split_and_process_bio(struct mapped_device *md,
1716                                    struct dm_table *map, struct bio *bio)
1717{
1718        struct clone_info ci;
1719        int error = 0;
1720
1721        if (unlikely(!map)) {
1722                bio_io_error(bio);
1723                return;
1724        }
1725
1726        ci.map = map;
1727        ci.md = md;
1728        ci.io = alloc_io(md);
1729        ci.io->error = 0;
1730        atomic_set(&ci.io->io_count, 1);
1731        ci.io->bio = bio;
1732        ci.io->md = md;
1733        spin_lock_init(&ci.io->endio_lock);
1734        ci.sector = bio->bi_iter.bi_sector;
1735
1736        start_io_acct(ci.io);
1737
1738        if (bio->bi_rw & REQ_FLUSH) {
1739                ci.bio = &ci.md->flush_bio;
1740                ci.sector_count = 0;
1741                error = __send_empty_flush(&ci);
1742                /* dec_pending submits any data associated with flush */
1743        } else {
1744                ci.bio = bio;
1745                ci.sector_count = bio_sectors(bio);
1746                while (ci.sector_count && !error)
1747                        error = __split_and_process_non_flush(&ci);
1748        }
1749
1750        /* drop the extra reference count */
1751        dec_pending(ci.io, error);
1752}
1753/*-----------------------------------------------------------------
1754 * CRUD END
1755 *---------------------------------------------------------------*/
1756
1757/*
1758 * The bio-based request function: splits the bio into clones and
1759 * maps them to the targets, or defers the bio while suspended.
1760 */
1761static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1762{
1763        int rw = bio_data_dir(bio);
1764        struct mapped_device *md = q->queuedata;
1765        int srcu_idx;
1766        struct dm_table *map;
1767
1768        map = dm_get_live_table(md, &srcu_idx);
1769
1770        generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1771
1772        /* if we're suspended, we have to queue this io for later */
1773        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1774                dm_put_live_table(md, srcu_idx);
1775
1776                if (bio_rw(bio) != READA)
1777                        queue_io(md, bio);
1778                else
1779                        bio_io_error(bio);
1780                return BLK_QC_T_NONE;
1781        }
1782
1783        __split_and_process_bio(md, map, bio);
1784        dm_put_live_table(md, srcu_idx);
1785        return BLK_QC_T_NONE;
1786}
1787
1788int dm_request_based(struct mapped_device *md)
1789{
1790        return blk_queue_stackable(md->queue);
1791}
1792
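    /*
     * Dispatch an already-prepared clone of the original request to the
     * underlying device's queue.  If insertion fails, complete the
     * original request with the error.
     */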
1793static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1794{
1795        int r;
1796
1797        if (blk_queue_io_stat(clone->q))
1798                clone->cmd_flags |= REQ_IO_STAT;
1799
1800        clone->start_time = jiffies;
1801        r = blk_insert_cloned_request(clone->q, clone);
1802        if (r)
1803                /* must complete clone in terms of original request */
1804                dm_complete_request(rq, r);
1805}
1806
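    /*
     * blk_rq_prep_clone() callback: link each cloned bio back to its
     * original bio and owning tio, and route completion through
     * end_clone_bio().
     */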
1807static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1808                                 void *data)
1809{
1810        struct dm_rq_target_io *tio = data;
1811        struct dm_rq_clone_bio_info *info =
1812                container_of(bio, struct dm_rq_clone_bio_info, clone);
1813
1814        info->orig = bio_orig;
1815        info->tio = tio;
1816        bio->bi_end_io = end_clone_bio;
1817
1818        return 0;
1819}
1820
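    /*
     * Prepare @clone from @rq: clone the bios using the md's bioset,
     * copy the command fields and wire completion back to this tio.
     */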
1821static int setup_clone(struct request *clone, struct request *rq,
1822                       struct dm_rq_target_io *tio, gfp_t gfp_mask)
1823{
1824        int r;
1825
1826        r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
1827                              dm_rq_bio_constructor, tio);
1828        if (r)
1829                return r;
1830
1831        clone->cmd = rq->cmd;
1832        clone->cmd_len = rq->cmd_len;
1833        clone->sense = rq->sense;
1834        clone->end_io = end_clone_request;
1835        clone->end_io_data = tio;
1836
1837        tio->clone = clone;
1838
1839        return 0;
1840}
1841
1842static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1843                                struct dm_rq_target_io *tio, gfp_t gfp_mask)
1844{
1845        /*
1846         * Do not allocate a clone if tio->clone was already set
1847         * (see: dm_mq_queue_rq).
1848         */
1849        bool alloc_clone = !tio->clone;
1850        struct request *clone;
1851
1852        if (alloc_clone) {
1853                clone = alloc_clone_request(md, gfp_mask);
1854                if (!clone)
1855                        return NULL;
1856        } else
1857                clone = tio->clone;
1858
1859        blk_rq_init(NULL, clone);
1860        if (setup_clone(clone, rq, tio, gfp_mask)) {
1861                /* -ENOMEM */
1862                if (alloc_clone)
1863                        free_clone_request(md, clone);
1864                return NULL;
1865        }
1866
1867        return clone;
1868}
1869
1870static void map_tio_request(struct kthread_work *work);
1871
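    /*
     * Reset a tio for @rq.  The kthread work item (map_tio_request) is
     * only initialised when the md has a request-mapping kworker.
     */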
1872static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
1873                     struct mapped_device *md)
1874{
1875        tio->md = md;
1876        tio->ti = NULL;
1877        tio->clone = NULL;
1878        tio->orig = rq;
1879        tio->error = 0;
1880        memset(&tio->info, 0, sizeof(tio->info));
1881        if (md->kworker_task)
1882                init_kthread_work(&tio->work, map_tio_request);
1883}
1884
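    /*
     * Allocate and initialise the tio for @rq.  For tables that are not
     * blk-mq request-based the request clone is allocated here as well.
     */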
1885static struct dm_rq_target_io *prep_tio(struct request *rq,
1886                                        struct mapped_device *md, gfp_t gfp_mask)
1887{
1888        struct dm_rq_target_io *tio;
1889        int srcu_idx;
1890        struct dm_table *table;
1891
1892        tio = alloc_rq_tio(md, gfp_mask);
1893        if (!tio)
1894                return NULL;
1895
1896        init_tio(tio, rq, md);
1897
1898        table = dm_get_live_table(md, &srcu_idx);
1899        if (!dm_table_mq_request_based(table)) {
1900                if (!clone_rq(rq, md, tio, gfp_mask)) {
1901                        dm_put_live_table(md, srcu_idx);
1902                        free_rq_tio(tio);
1903                        return NULL;
1904                }
1905        }
1906        dm_put_live_table(md, srcu_idx);
1907
1908        return tio;
1909}
1910
1911/*
1912 * Called with the queue lock held.
1913 */
1914static int dm_prep_fn(struct request_queue *q, struct request *rq)
1915{
1916        struct mapped_device *md = q->queuedata;
1917        struct dm_rq_target_io *tio;
1918
1919        if (unlikely(rq->special)) {
1920                DMWARN("Already has something in rq->special.");
1921                return BLKPREP_KILL;
1922        }
1923
1924        tio = prep_tio(rq, md, GFP_ATOMIC);
1925        if (!tio)
1926                return BLKPREP_DEFER;
1927
1928        rq->special = tio;
1929        rq->cmd_flags |= REQ_DONTPREP;
1930
1931        return BLKPREP_OK;
1932}
1933
1934/*
1935 * Returns:
1936 * 0                : the request has been processed
1937 * DM_MAPIO_REQUEUE : the original request needs to be requeued
1938 * < 0              : the request was completed due to failure
1939 */
1940static int map_request(struct dm_rq_target_io *tio, struct request *rq,
1941                       struct mapped_device *md)
1942{
1943        int r;
1944        struct dm_target *ti = tio->ti;
1945        struct request *clone = NULL;
1946
1947        if (tio->clone) {
1948                clone = tio->clone;
1949                r = ti->type->map_rq(ti, clone, &tio->info);
1950        } else {
1951                r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
1952                if (r < 0) {
1953                        /* The target wants to complete the I/O */
1954                        dm_kill_unmapped_request(rq, r);
1955                        return r;
1956                }
1957                if (r != DM_MAPIO_REMAPPED)
1958                        return r;
1959                if (setup_clone(clone, rq, tio, GFP_ATOMIC)) {
1960                        /* -ENOMEM */
1961                        ti->type->release_clone_rq(clone);
1962                        return DM_MAPIO_REQUEUE;
1963                }
1964        }
1965
1966        switch (r) {
1967        case DM_MAPIO_SUBMITTED:
1968                /* The target has taken the I/O to submit by itself later */
1969                break;
1970        case DM_MAPIO_REMAPPED:
1971                /* The target has remapped the I/O so dispatch it */
1972                trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1973                                     blk_rq_pos(rq));
1974                dm_dispatch_clone_request(clone, rq);
1975                break;
1976        case DM_MAPIO_REQUEUE:
1977                /* The target wants to requeue the I/O */
1978                dm_requeue_original_request(md, tio->orig);
1979                break;
1980        default:
1981                if (r > 0) {
1982                        DMWARN("unimplemented target map return value: %d", r);
1983                        BUG();
1984                }
1985
1986                /* The target wants to complete the I/O */
1987                dm_kill_unmapped_request(rq, r);
1988                return r;
1989        }
1990
1991        return 0;
1992}
1993
1994static void map_tio_request(struct kthread_work *work)
1995{
1996        struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
1997        struct request *rq = tio->orig;
1998        struct mapped_device *md = tio->md;
1999
2000        if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE)
2001                dm_requeue_original_request(md, rq);
2002}
2003
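    /*
     * Mark @orig in flight: start it on its queue, bump md->pending,
     * record merge-deadline and dm-stats state where enabled, and take
     * an md reference for the duration of the I/O.
     */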
2004static void dm_start_request(struct mapped_device *md, struct request *orig)
2005{
2006        if (!orig->q->mq_ops)
2007                blk_start_request(orig);
2008        else
2009                blk_mq_start_request(orig);
2010        atomic_inc(&md->pending[rq_data_dir(orig)]);
2011
2012        if (md->seq_rq_merge_deadline_usecs) {
2013                md->last_rq_pos = rq_end_sector(orig);
2014                md->last_rq_rw = rq_data_dir(orig);
2015                md->last_rq_start_time = ktime_get();
2016        }
2017
2018        if (unlikely(dm_stats_used(&md->stats))) {
2019                struct dm_rq_target_io *tio = tio_from_request(orig);
2020                tio->duration_jiffies = jiffies;
2021                tio->n_sectors = blk_rq_sectors(orig);
2022                dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig),
2023                                    tio->n_sectors, false, 0, &tio->stats_aux);
2024        }
2025
2026        /*
2027         * Hold the md reference here for the in-flight I/O.
2028         * We can't rely on the reference count held by the device opener,
2029         * because the device may be closed during request completion,
2030         * once all of its bios have completed.
2031         * See the comment in rq_completed() too.
2032         */
2033        dm_get(md);
2034}
2035
2036#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000
2037
2038ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf)
2039{
2040        return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs);
2041}
2042
2043ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md,
2044                                                     const char *buf, size_t count)
2045{
2046        unsigned deadline;
2047
2048        if (!dm_request_based(md) || md->use_blk_mq)
2049                return count;
2050
2051        if (kstrtouint(buf, 10, &deadline))
2052                return -EINVAL;
2053
2054        if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS)
2055                deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS;
2056
2057        md->seq_rq_merge_deadline_usecs = deadline;
2058
2059        return count;
2060}
2061
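    /*
     * Return true if we are still within seq_rq_merge_deadline_usecs of
     * the last request start, in which case dm_request_fn may hold back
     * dispatch to give merging a chance.
     */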
2062static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
2063{
2064        ktime_t kt_deadline;
2065
2066        if (!md->seq_rq_merge_deadline_usecs)
2067                return false;
2068
2069        kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC);
2070        kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline);
2071
2072        return !ktime_after(ktime_get(), kt_deadline);
2073}
2074
2075/*
2076 * q->request_fn for request-based dm.
2077 * Called with the queue lock held.
2078 */
2079static void dm_request_fn(struct request_queue *q)
2080{
2081        struct mapped_device *md = q->queuedata;
2082        int srcu_idx;
2083        struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2084        struct dm_target *ti;
2085        struct request *rq;
2086        struct dm_rq_target_io *tio;
2087        sector_t pos;
2088
2089        /*
2090         * For suspend, check blk_queue_stopped() and increment
2091         * ->pending within a single queue_lock section so that the
2092         * number of in-flight I/Os cannot be incremented after the
2093         * queue has been stopped in dm_suspend().
2094         */
2095        while (!blk_queue_stopped(q)) {
2096                rq = blk_peek_request(q);
2097                if (!rq)
2098                        goto out;
2099
2100                /* always use block 0 to find the target for flushes for now */
2101                pos = 0;
2102                if (!(rq->cmd_flags & REQ_FLUSH))
2103                        pos = blk_rq_pos(rq);
2104
2105                ti = dm_table_find_target(map, pos);
2106                if (!dm_target_is_valid(ti)) {
2107                        /*
2108                         * Must perform the setup that rq_completed() requires
2109                         * before calling dm_kill_unmapped_request().
2110                         */
2111                        DMERR_LIMIT("request attempted access beyond the end of device");
2112                        dm_start_request(md, rq);
2113                        dm_kill_unmapped_request(rq, -EIO);
2114                        continue;
2115                }
2116
2117                if (dm_request_peeked_before_merge_deadline(md) &&
2118                    md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
2119                    md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
2120                        goto delay_and_out;
2121
2122                if (ti->type->busy && ti->type->busy(ti))
2123                        goto delay_and_out;
2124
2125                dm_start_request(md, rq);
2126
2127                tio = tio_from_request(rq);
2128                /* Establish tio->ti before queuing work (map_tio_request) */
2129                tio->ti = ti;
2130                queue_kthread_work(&md->kworker, &tio->work);
2131                BUG_ON(!irqs_disabled());
2132        }
2133
2134        goto out;
2135
2136delay_and_out:
2137        blk_delay_queue(q, HZ / 100);
2138out:
2139        dm_put_live_table(md, srcu_idx);
2140}
2141
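    /*
     * congested_fn for the md's backing_dev_info: request-based dm
     * reports its own queue's state, bio-based dm asks the live table's
     * underlying devices.
     */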
2142static int dm_any_congested(void *congested_data, int bdi_bits)
2143{
2144        int r = bdi_bits;
2145        struct mapped_device *md = congested_data;
2146        struct dm_table *map;
2147
2148        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2149                map = dm_get_live_table_fast(md);
2150                if (map) {
2151                        /*
2152                         * Request-based dm cares only about its own queue when
2153                         * queried for the congestion status of the request_queue.
2154                         */
2155                        if (dm_request_based(md))
2156                                r = md->queue->backing_dev_info.wb.state &
2157                                    bdi_bits;
2158                        else
2159                                r = dm_table_any_congested(map, bdi_bits);
2160                }
2161                dm_put_live_table_fast(md);
2162        }
2163
2164        return r;
2165}
2166
2167/*-----------------------------------------------------------------
2168 * An IDR is used to keep track of allocated minor numbers.
2169 *---------------------------------------------------------------*/
2170static void free_minor(int minor)
2171{
2172        spin_lock(&_minor_lock);
2173        idr_remove(&_minor_idr, minor);
2174        spin_unlock(&_minor_lock);
2175}
2176
2177/*
2178 * See if the device with a specific minor # is free.
2179 */
2180static int specific_minor(int minor)
2181{
2182        int r;
2183
2184        if (minor >= (1 << MINORBITS))
2185                return -EINVAL;
2186
2187        idr_preload(GFP_KERNEL);
2188        spin_lock(&_minor_lock);
2189
2190        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2191
2192        spin_unlock(&_minor_lock);
2193        idr_preload_end();
2194        if (r < 0)
2195                return r == -ENOSPC ? -EBUSY : r;
2196        return 0;
2197}
2198
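    /*
     * Allocate the next free minor number.
     */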
2199static int next_free_minor(int *minor)
2200{
2201        int r;
2202
2203        idr_preload(GFP_KERNEL);
2204        spin_lock(&_minor_lock);
2205
2206        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2207
2208        spin_unlock(&_minor_lock);
2209        idr_preload_end();
2210        if (r < 0)
2211                return r;
2212        *minor = r;
2213        return 0;
2214}
2215
2216static const struct block_device_operations dm_blk_dops;
2217
2218static void dm_wq_work(struct work_struct *work);
2219
2220static void dm_init_md_queue(struct mapped_device *md)
2221{
2222        /*
2223         * Request-based dm devices cannot be stacked on top of bio-based dm
2224         * devices.  The type of this dm device may not have been decided yet.
2225         * The type is decided at the first table loading time.
2226         * To prevent problematic device stacking, clear the queue flag
2227         * for request stacking support until then.
2228         *
2229         * This queue is new, so no concurrency on the queue_flags.
2230         */
2231        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2232
2233        /*
2234         * Initialize data that will only be used by a non-blk-mq DM queue
2235         * - must do so here (in alloc_dev callchain) before queue is used
2236         */
2237        md->queue->queuedata = md;
2238        md->queue->backing_dev_info.congested_data = md;
2239}
2240
2241static void dm_init_old_md_queue(struct mapped_device *md)
2242{
2243        md->use_blk_mq = false;
2244        dm_init_md_queue(md);
2245
2246        /*
2247         * Initialize aspects of queue that aren't relevant for blk-mq
2248         */
2249        md->queue->backing_dev_info.congested_fn = dm_any_congested;
2250        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2251}
2252
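    /*
     * Release everything alloc_dev() may have set up so far: workqueue,
     * kworker, mempools, bioset, SRCU state, gendisk, queue and bdev.
     * Safe to call on a partially constructed md; also used by free_dev().
     */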
2253static void cleanup_mapped_device(struct mapped_device *md)
2254{
2255        if (md->wq)
2256                destroy_workqueue(md->wq);
2257        if (md->kworker_task)
2258                kthread_stop(md->kworker_task);
2259        mempool_destroy(md->io_pool);
2260        mempool_destroy(md->rq_pool);
2261        if (md->bs)
2262                bioset_free(md->bs);
2263
2264        cleanup_srcu_struct(&md->io_barrier);
2265
2266        if (md->disk) {
2267                spin_lock(&_minor_lock);
2268                md->disk->private_data = NULL;
2269                spin_unlock(&_minor_lock);
2270                del_gendisk(md->disk);
2271                put_disk(md->disk);
2272        }
2273
2274        if (md->queue)
2275                blk_cleanup_queue(md->queue);
2276
2277        if (md->bdev) {
2278                bdput(md->bdev);
2279                md->bdev = NULL;
2280        }
2281}
2282
2283/*
2284 * Allocate and initialise a blank device with a given minor.
2285 */
2286static struct mapped_device *alloc_dev(int minor)
2287{
2288        int r;
2289        struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
2290        void *old_md;
2291
2292        if (!md) {
2293                DMWARN("unable to allocate device, out of memory.");
2294                return NULL;
2295        }
2296
2297        if (!try_module_get(THIS_MODULE))
2298                goto bad_module_get;
2299
2300        /* get a minor number for the dev */
2301        if (minor == DM_ANY_MINOR)
2302                r = next_free_minor(&minor);
2303        else
2304                r = specific_minor(minor);
2305        if (r < 0)
2306                goto bad_minor;
2307
2308        r = init_srcu_struct(&md->io_barrier);
2309        if (r < 0)
2310                goto bad_io_barrier;
2311
2312        md->use_blk_mq = use_blk_mq;
2313        md->type = DM_TYPE_NONE;
2314        mutex_init(&md->suspend_lock);
2315        mutex_init(&md->type_lock);
2316        mutex_init(&md->table_devices_lock);
2317        spin_lock_init(&md->deferred_lock);
2318        atomic_set(&md->holders, 1);
2319        atomic_set(&md->open_count, 0);
2320        atomic_set(&md->event_nr, 0);
2321        atomic_set(&md->uevent_seq, 0);
2322        INIT_LIST_HEAD(&md->uevent_list);
2323        INIT_LIST_HEAD(&md->table_devices);
2324        spin_lock_init(&md->uevent_lock);
2325
2326        md->queue = blk_alloc_queue(GFP_KERNEL);
2327        if (!md->queue)
2328                goto bad;
2329
2330        dm_init_md_queue(md);
2331
2332        md->disk = alloc_disk(1);
2333        if (!md->disk)
2334                goto bad;
2335
2336        atomic_set(&md->pending[0], 0);
2337        atomic_set(&md->pending[1], 0);
2338        init_waitqueue_head(&md->wait);
2339        INIT_WORK(&md->work, dm_wq_work);
2340        init_waitqueue_head(&md->eventq);
2341        init_completion(&md->kobj_holder.completion);
2342        md->kworker_task = NULL;
2343
2344        md->disk->major = _major;
2345        md->disk->first_minor = minor;
2346        md->disk->fops = &dm_blk_dops;
2347        md->disk->queue = md->queue;
2348        md->disk->private_data = md;
2349        sprintf(md->disk->disk_name, "dm-%d", minor);
2350        add_disk(md->disk);
2351        format_dev_t(md->name, MKDEV(_major, minor));
2352
2353        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2354        if (!md->wq)
2355                goto bad;
2356
2357        md->bdev = bdget_disk(md->disk, 0);
2358        if (!md->bdev)
2359                goto bad;
2360
2361        bio_init(&md->flush_bio);
2362        md->flush_bio.bi_bdev = md->bdev;
2363        md->flush_bio.bi_rw = WRITE_FLUSH;
2364
2365        dm_stats_init(&md->stats);
2366
2367        /* Populate the mapping, nobody knows we exist yet */
2368        spin_lock(&_minor_lock);
2369        old_md = idr_replace(&_minor_idr, md, minor);
2370        spin_unlock(&_minor_lock);
2371
2372        BUG_ON(old_md != MINOR_ALLOCED);
2373
2374        return md;
2375
2376bad:
2377        cleanup_mapped_device(md);
2378bad_io_barrier:
2379        free_minor(minor);
2380bad_minor:
2381        module_put(THIS_MODULE);
2382bad_module_get:
2383        kfree(md);
2384        return NULL;
2385}
2386
2387static void unlock_fs(struct mapped_device *md);
2388
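    /*
     * Final teardown of a mapped_device, called from __dm_destroy() once
     * all holders are gone.
     */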
2389static void free_dev(struct mapped_device *md)
2390{
2391        int minor = MINOR(disk_devt(md->disk));
2392
2393        unlock_fs(md);
2394
2395        cleanup_mapped_device(md);
2396        if (md->use_blk_mq)
2397                blk_mq_free_tag_set(&md->tag_set);
2398
2399        free_table_devices(&md->table_devices);
2400        dm_stats_cleanup(&md->stats);
2401        free_minor(minor);
2402
2403        module_put(THIS_MODULE);
2404        kfree(md);
2405}
2406
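    /*
     * Take over the mempools/bioset supplied with the table being bound.
     * If the md already has mempools, at most the bioset is reloaded
     * (bio-based only); the table's copies are freed afterwards.
     */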
2407static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2408{
2409        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2410
2411        if (md->bs) {
2412                /* The md already has the necessary mempools. */
2413                if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2414                        /*
2415                         * Reload the bioset because front_pad may have changed
2416                         * when a different table was loaded.
2417                         */
2418                        bioset_free(md->bs);
2419                        md->bs = p->bs;
2420                        p->bs = NULL;
2421                }
2422                /*
2423                 * There's no need to reload with request-based dm
2424                 * because the size of front_pad doesn't change.
2425                 * Note for the future: if the bioset is ever reloaded,
2426                 * prepped requests in the queue may still refer to bios
2427                 * from the old bioset, so the queue would have to be
2428                 * walked to unprep them.
2429                 */
2430                goto out;
2431        }
2432
2433        BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
2434
2435        md->io_pool = p->io_pool;
2436        p->io_pool = NULL;
2437        md->rq_pool = p->rq_pool;
2438        p->rq_pool = NULL;
2439        md->bs = p->bs;
2440        p->bs = NULL;
2441
2442out:
2443        /* mempool bind completed, no longer need any mempools in the table */
2444        dm_table_free_md_mempools(t);
2445}
2446
2447/*
2448 * Helpers for binding a table to the device (see __bind() below).
2449 */
2450static void event_callback(void *context)
2451{
2452        unsigned long flags;
2453        LIST_HEAD(uevents);
2454        struct mapped_device *md = (struct mapped_device *) context;
2455
2456        spin_lock_irqsave(&md->uevent_lock, flags);
2457        list_splice_init(&md->uevent_list, &uevents);
2458        spin_unlock_irqrestore(&md->uevent_lock, flags);
2459
2460        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2461
2462        atomic_inc(&md->event_nr);
2463        wake_up(&md->eventq);
2464}
2465
2466/*
2467 * Protected by md->suspend_lock obtained by dm_swap_table().
2468 */
2469static void __set_size(struct mapped_device *md, sector_t size)
2470{
2471        set_capacity(md->disk, size);
2472
2473        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2474}
2475
2476/*
2477 * Returns old map, which caller must destroy.
2478 */
2479static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2480                               struct queue_limits *limits)
2481{
2482        struct dm_table *old_map;
2483        struct request_queue *q = md->queue;
2484        sector_t size;
2485
2486        size = dm_table_get_size(t);
2487
2488        /*
2489         * Wipe any geometry if the size of the table changed.
2490         */
2491        if (size != dm_get_size(md))
2492                memset(&md->geometry, 0, sizeof(md->geometry));
2493
2494        __set_size(md, size);
2495
2496        dm_table_event_callback(t, event_callback, md);
2497
2498        /*
2499         * If the old table type wasn't request-based, the queue hasn't
2500         * been stopped during suspension yet, so stop it now to prevent
2501         * I/O from being mapped before resume.
2502         * This must be done before setting the queue restrictions,
2503         * because request-based dm may start running as soon as they are set.
2504         */
2505        if (dm_table_request_based(t))
2506                stop_queue(q);
2507
2508        __bind_mempools(md, t);
2509
2510        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2511        rcu_assign_pointer(md->map, t);
2512        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2513
2514        dm_table_set_restrictions(t, q, limits);
2515        if (old_map)
2516                dm_sync_table(md);
2517
2518        return old_map;
2519}
2520
2521/*
2522 * Returns unbound table for the caller to free.
2523 */
2524static struct dm_table *__unbind(struct mapped_device *md)
2525{
2526        struct dm_table *map = rcu_dereference_protected(md->map, 1);
2527
2528        if (!map)
2529                return NULL;
2530
2531        dm_table_event_callback(map, NULL, NULL);
2532        RCU_INIT_POINTER(md->map, NULL);
2533        dm_sync_table(md);
2534
2535        return map;
2536}
2537
2538/*
2539 * Constructor for a new device.
2540 */
2541int dm_create(int minor, struct mapped_device **result)
2542{
2543        struct mapped_device *md;
2544
2545        md = alloc_dev(minor);
2546        if (!md)
2547                return -ENXIO;
2548
2549        dm_sysfs_init(md);
2550
2551        *result = md;
2552        return 0;
2553}
2554
2555/*
2556 * Functions to manage md->type.
2557 * All are required to hold md->type_lock.
2558 */
2559void dm_lock_md_type(struct mapped_device *md)
2560{
2561        mutex_lock(&md->type_lock);
2562}
2563
2564void dm_unlock_md_type(struct mapped_device *md)
2565{
2566        mutex_unlock(&md->type_lock);
2567}
2568
2569void dm_set_md_type(struct mapped_device *md, unsigned type)
2570{
2571        BUG_ON(!mutex_is_locked(&md->type_lock));
2572        md->type = type;
2573}
2574
2575unsigned dm_get_md_type(struct mapped_device *md)
2576{
2577        BUG_ON(!mutex_is_locked(&md->type_lock));
2578        return md->type;
2579}
2580
2581struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2582{
2583        return md->immutable_target_type;
2584}
2585
2586/*
2587 * The queue_limits are only valid as long as you have a reference
2588 * count on 'md'.
2589 */
2590struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2591{
2592        BUG_ON(!atomic_read(&md->holders));
2593        return &md->queue->limits;
2594}
2595EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2596
2597static void init_rq_based_worker_thread(struct mapped_device *md)
2598{
2599        /* Initialize the request-based DM worker thread */
2600        init_kthread_worker(&md->kworker);
2601        md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2602                                       "kdmwork-%s", dm_device_name(md));
2603}
2604
2605/*
2606 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2607 */
2608static int dm_init_request_based_queue(struct mapped_device *md)
2609{
2610        struct request_queue *q = NULL;
2611
2612        /* Fully initialize the queue */
2613        q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2614        if (!q)
2615                return -EINVAL;
2616
2617        /* disable dm_request_fn's merge heuristic by default */
2618        md->seq_rq_merge_deadline_usecs = 0;
2619
2620        md->queue = q;
2621        dm_init_old_md_queue(md);
2622        blk_queue_softirq_done(md->queue, dm_softirq_done);
2623        blk_queue_prep_rq(md->queue, dm_prep_fn);
2624
2625        init_rq_based_worker_thread(md);
2626
2627        elv_register_queue(md->queue);
2628
2629        return 0;
2630}
2631
2632static int dm_mq_init_request(void *data, struct request *rq,
2633                              unsigned int hctx_idx, unsigned int request_idx,
2634                              unsigned int numa_node)
2635{
2636        struct mapped_device *md = data;
2637        struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2638
2639        /*
2640         * Must initialize md member of tio, otherwise it won't
2641         * be available in dm_mq_queue_rq.
2642         */
2643        tio->md = md;
2644
2645        return 0;
2646}
2647
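    /*
     * blk-mq .queue_rq handler: look up the target for the request,
     * start it, then either hand the work to the kworker (when a clone
     * is needed for non-blk-mq underlying devices) or map it directly.
     */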
2648static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
2649                          const struct blk_mq_queue_data *bd)
2650{
2651        struct request *rq = bd->rq;
2652        struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
2653        struct mapped_device *md = tio->md;
2654        int srcu_idx;
2655        struct dm_table *map = dm_get_live_table(md, &srcu_idx);
2656        struct dm_target *ti;
2657        sector_t pos;
2658
2659        /* always use block 0 to find the target for flushes for now */
2660        pos = 0;
2661        if (!(rq->cmd_flags & REQ_FLUSH))
2662                pos = blk_rq_pos(rq);
2663
2664        ti = dm_table_find_target(map, pos);
2665        if (!dm_target_is_valid(ti)) {
2666                dm_put_live_table(md, srcu_idx);
2667                DMERR_LIMIT("request attempted access beyond the end of device");
2668                /*
2669                 * Must perform the setup that rq_completed() requires
2670                 * before returning BLK_MQ_RQ_QUEUE_ERROR.
2671                 */
2672                dm_start_request(md, rq);
2673                return BLK_MQ_RQ_QUEUE_ERROR;
2674        }
2675        dm_put_live_table(md, srcu_idx);
2676
2677        if (ti->type->busy && ti->type->busy(ti))
2678                return BLK_MQ_RQ_QUEUE_BUSY;
2679
2680        dm_start_request(md, rq);
2681
2682        /* Init tio using md established in .init_request */
2683        init_tio(tio, rq, md);
2684
2685        /*
2686         * Establish tio->ti before queuing work (map_tio_request)
2687         * or making direct call to map_request().
2688         */
2689        tio->ti = ti;
2690
2691        /* Clone the request if underlying devices aren't blk-mq */
2692        if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
2693                /* clone request is allocated at the end of the pdu */
2694                tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
2695                (void) clone_rq(rq, md, tio, GFP_ATOMIC);
2696                queue_kthread_work(&md->kworker, &tio->work);
2697        } else {
2698                /* Direct call is fine since .queue_rq allows allocations */
2699                if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
2700                        /* Undo dm_start_request() before requeuing */
2701                        rq_end_stats(md, rq);
2702                        rq_completed(md, rq_data_dir(rq), false);
2703                        return BLK_MQ_RQ_QUEUE_BUSY;
2704                }
2705        }
2706
2707        return BLK_MQ_RQ_QUEUE_OK;
2708}
2709
2710static struct blk_mq_ops dm_mq_ops = {
2711        .queue_rq = dm_mq_queue_rq,
2712        .map_queue = blk_mq_map_queue,
2713        .complete = dm_softirq_done,
2714        .init_request = dm_mq_init_request,
2715};
2716
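    /*
     * Initialise md->queue as a blk-mq queue with a single hw queue.
     * For DM_TYPE_REQUEST_BASED the pdu also carries space for the
     * non-blk-mq clone and the request-mapping kworker is started.
     */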
2717static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
2718{
2719        unsigned md_type = dm_get_md_type(md);
2720        struct request_queue *q;
2721        int err;
2722
2723        memset(&md->tag_set, 0, sizeof(md->tag_set));
2724        md->tag_set.ops = &dm_mq_ops;
2725        md->tag_set.queue_depth = BLKDEV_MAX_RQ;
2726        md->tag_set.numa_node = NUMA_NO_NODE;
2727        md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
2728        md->tag_set.nr_hw_queues = 1;
2729        if (md_type == DM_TYPE_REQUEST_BASED) {
2730                /* make the memory for non-blk-mq clone part of the pdu */
2731                md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
2732        } else
2733                md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
2734        md->tag_set.driver_data = md;
2735
2736        err = blk_mq_alloc_tag_set(&md->tag_set);
2737        if (err)
2738                return err;
2739
2740        q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
2741        if (IS_ERR(q)) {
2742                err = PTR_ERR(q);
2743                goto out_tag_set;
2744        }
2745        md->queue = q;
2746        dm_init_md_queue(md);
2747
2748        /* backfill 'mq' sysfs registration normally done in blk_register_queue */
2749        blk_mq_register_disk(md->disk);
2750
2751        if (md_type == DM_TYPE_REQUEST_BASED)
2752                init_rq_based_worker_thread(md);
2753
2754        return 0;
2755
2756out_tag_set:
2757        blk_mq_free_tag_set(&md->tag_set);
2758        return err;
2759}
2760
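    /*
     * Bio-based types are returned unchanged; request-based types are
     * resolved to blk-mq or not according to md->use_blk_mq.
     */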
2761static unsigned filter_md_type(unsigned type, struct mapped_device *md)
2762{
2763        if (type == DM_TYPE_BIO_BASED)
2764                return type;
2765
2766        return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED;
2767}
2768
2769/*
2770 * Set up the DM device's queue based on md's type.
2771 */
2772int dm_setup_md_queue(struct mapped_device *md)
2773{
2774        int r;
2775        unsigned md_type = filter_md_type(dm_get_md_type(md), md);
2776
2777        switch (md_type) {
2778        case DM_TYPE_REQUEST_BASED:
2779                r = dm_init_request_based_queue(md);
2780                if (r) {
2781                        DMWARN("Cannot initialize queue for request-based mapped device");
2782                        return r;
2783                }
2784                break;
2785        case DM_TYPE_MQ_REQUEST_BASED:
2786                r = dm_init_request_based_blk_mq_queue(md);
2787                if (r) {
2788                        DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
2789                        return r;
2790                }
2791                break;
2792        case DM_TYPE_BIO_BASED:
2793                dm_init_old_md_queue(md);
2794                blk_queue_make_request(md->queue, dm_make_request);
2795                /*
2796                 * DM handles splitting bios as needed.  Free the bio_split bioset
2797                 * since it won't be used (saves 1 process per bio-based DM device).
2798                 */
2799                bioset_free(md->queue->bio_split);
2800                md->queue->bio_split = NULL;
2801                break;
2802        }
2803
2804        return 0;
2805}
2806
2807struct mapped_device *dm_get_md(dev_t dev)
2808{
2809        struct mapped_device *md;
2810        unsigned minor = MINOR(dev);
2811
2812        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2813                return NULL;
2814
2815        spin_lock(&_minor_lock);
2816
2817        md = idr_find(&_minor_idr, minor);
2818        if (md) {
2819                if ((md == MINOR_ALLOCED ||
2820                     (MINOR(disk_devt(dm_disk(md))) != minor) ||
2821                     dm_deleting_md(md) ||
2822                     test_bit(DMF_FREEING, &md->flags))) {
2823                        md = NULL;
2824                        goto out;
2825                }
2826                dm_get(md);
2827        }
2828
2829out:
2830        spin_unlock(&_minor_lock);
2831
2832        return md;
2833}
2834EXPORT_SYMBOL_GPL(dm_get_md);
2835
2836void *dm_get_mdptr(struct mapped_device *md)
2837{
2838        return md->interface_ptr;
2839}
2840
2841void dm_set_mdptr(struct mapped_device *md, void *ptr)
2842{
2843        md->interface_ptr = ptr;
2844}
2845
2846void dm_get(struct mapped_device *md)
2847{
2848        atomic_inc(&md->holders);
2849        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2850}
2851
2852int dm_hold(struct mapped_device *md)
2853{
2854        spin_lock(&_minor_lock);
2855        if (test_bit(DMF_FREEING, &md->flags)) {
2856                spin_unlock(&_minor_lock);
2857                return -EBUSY;
2858        }
2859        dm_get(md);
2860        spin_unlock(&_minor_lock);
2861        return 0;
2862}
2863EXPORT_SYMBOL_GPL(dm_hold);
2864
2865const char *dm_device_name(struct mapped_device *md)
2866{
2867        return md->name;
2868}
2869EXPORT_SYMBOL_GPL(dm_device_name);
2870
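    /*
     * Common teardown for dm_destroy() and dm_destroy_immediate(): mark
     * the md DMF_FREEING, run the targets' pre/postsuspend hooks if the
     * device isn't already suspended and, if @wait, block until the last
     * holder drops before freeing the device.
     */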
2871static void __dm_destroy(struct mapped_device *md, bool wait)
2872{
2873        struct dm_table *map;
2874        int srcu_idx;
2875
2876        might_sleep();
2877
2878        spin_lock(&_minor_lock);
2879        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2880        set_bit(DMF_FREEING, &md->flags);
2881        spin_unlock(&_minor_lock);
2882
2883        if (dm_request_based(md) && md->kworker_task)
2884                flush_kthread_worker(&md->kworker);
2885
2886        /*
2887         * Take suspend_lock so that presuspend and postsuspend methods
2888         * do not race with internal suspend.
2889         */
2890        mutex_lock(&md->suspend_lock);
2891        map = dm_get_live_table(md, &srcu_idx);
2892        if (!dm_suspended_md(md)) {
2893                dm_table_presuspend_targets(map);
2894                dm_table_postsuspend_targets(map);
2895        }
2896        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2897        dm_put_live_table(md, srcu_idx);
2898        mutex_unlock(&md->suspend_lock);
2899
2900        /*
2901         * Rare, but there may still be I/O requests in flight that have
2902         * yet to complete.  Wait for all references to disappear.
2903         * No one should increment the reference count of the mapped_device
2904         * after its state becomes DMF_FREEING.
2905         */
2906        if (wait)
2907                while (atomic_read(&md->holders))
2908                        msleep(1);
2909        else if (atomic_read(&md->holders))
2910                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2911                       dm_device_name(md), atomic_read(&md->holders));
2912
2913        dm_sysfs_exit(md);
2914        dm_table_destroy(__unbind(md));
2915        free_dev(md);
2916}
2917
2918void dm_destroy(struct mapped_device *md)
2919{
2920        __dm_destroy(md, true);
2921}
2922
2923void dm_destroy_immediate(struct mapped_device *md)
2924{
2925        __dm_destroy(md, false);
2926}
2927
2928void dm_put(struct mapped_device *md)
2929{
2930        atomic_dec(&md->holders);
2931}
2932EXPORT_SYMBOL_GPL(dm_put);
2933
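    /*
     * Wait until there is no more I/O in flight on the md.  Returns
     * -EINTR if @interruptible is TASK_INTERRUPTIBLE and a signal is
     * received first.
     */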
2934static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2935{
2936        int r = 0;
2937        DECLARE_WAITQUEUE(wait, current);
2938
2939        add_wait_queue(&md->wait, &wait);
2940
2941        while (1) {
2942                set_current_state(interruptible);
2943
2944                if (!md_in_flight(md))
2945                        break;
2946
2947                if (interruptible == TASK_INTERRUPTIBLE &&
2948                    signal_pending(current)) {
2949                        r = -EINTR;
2950                        break;
2951                }
2952
2953                io_schedule();
2954        }
2955        set_current_state(TASK_RUNNING);
2956
2957        remove_wait_queue(&md->wait, &wait);
2958
2959        return r;
2960}
2961
2962/*
2963 * Process the deferred bios
2964 */
2965static void dm_wq_work(struct work_struct *work)
2966{
2967        struct mapped_device *md = container_of(work, struct mapped_device,
2968                                                work);
2969        struct bio *c;
2970        int srcu_idx;
2971        struct dm_table *map;
2972
2973        map = dm_get_live_table(md, &srcu_idx);
2974
2975        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2976                spin_lock_irq(&md->deferred_lock);
2977                c = bio_list_pop(&md->deferred);
2978                spin_unlock_irq(&md->deferred_lock);
2979
2980                if (!c)
2981                        break;
2982
2983                if (dm_request_based(md))
2984                        generic_make_request(c);
2985                else
2986                        __split_and_process_bio(md, map, c);
2987        }
2988
2989        dm_put_live_table(md, srcu_idx);
2990}
2991
2992static void dm_queue_flush(struct mapped_device *md)
2993{
2994        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2995        smp_mb__after_atomic();
2996        queue_work(md->wq, &md->work);
2997}
2998
2999/*
3000 * Swap in a new table, returning the old one for the caller to destroy.
3001 */
3002struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
3003{
3004        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
3005        struct queue_limits limits;
3006        int r;
3007
3008        mutex_lock(&md->suspend_lock);
3009
3010        /* device must be suspended */
3011        if (!dm_suspended_md(md))
3012                goto out;
3013
3014        /*
3015         * If the new table has no data devices, retain the existing limits.
3016         * This helps multipath with queue_if_no_path: if all paths disappear,
3017         * new I/O is queued based on these limits, and then some paths
3018         * reappear.
3019         */
3020        if (dm_table_has_no_data_devices(table)) {
3021                live_map = dm_get_live_table_fast(md);
3022                if (live_map)
3023                        limits = md->queue->limits;
3024                dm_put_live_table_fast(md);
3025        }
3026
3027        if (!live_map) {
3028                r = dm_calculate_queue_limits(table, &limits);
3029                if (r) {
3030                        map = ERR_PTR(r);
3031                        goto out;
3032                }
3033        }
3034
3035        map = __bind(md, table, &limits);
3036
3037out:
3038        mutex_unlock(&md->suspend_lock);
3039        return map;
3040}
3041
3042/*
3043 * Functions to lock and unlock any filesystem running on the
3044 * device.
3045 */
3046static int lock_fs(struct mapped_device *md)
3047{
3048        int r;
3049
3050        WARN_ON(md->frozen_sb);
3051
3052        md->frozen_sb = freeze_bdev(md->bdev);
3053        if (IS_ERR(md->frozen_sb)) {
3054                r = PTR_ERR(md->frozen_sb);
3055                md->frozen_sb = NULL;
3056                return r;
3057        }
3058
3059        set_bit(DMF_FROZEN, &md->flags);
3060
3061        return 0;
3062}
3063
3064static void unlock_fs(struct mapped_device *md)
3065{
3066        if (!test_bit(DMF_FROZEN, &md->flags))
3067                return;
3068
3069        thaw_bdev(md->bdev, md->frozen_sb);
3070        md->frozen_sb = NULL;
3071        clear_bit(DMF_FROZEN, &md->flags);
3072}
3073
3074/*
3075 * If __dm_suspend returns 0, the device is completely quiescent
3076 * now. There is no request-processing activity. All new requests
3077 * are being added to md->deferred list.
3078 *
3079 * Caller must hold md->suspend_lock
3080 */
3081static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
3082                        unsigned suspend_flags, int interruptible)
3083{
3084        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
3085        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
3086        int r;
3087
3088        /*
3089         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
3090         * This flag is cleared before dm_suspend returns.
3091         */
3092        if (noflush)
3093                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
3094
3095        /*
3096         * This gets reverted if there's an error later and the targets
3097         * provide the .presuspend_undo hook.
3098         */
3099        dm_table_presuspend_targets(map);
3100
3101        /*
3102         * Flush I/O to the device.
3103         * Any I/O submitted after lock_fs() may not be flushed.
3104         * noflush takes precedence over do_lockfs.
3105         * (lock_fs() flushes I/Os and waits for them to complete.)
3106         */
3107        if (!noflush && do_lockfs) {
3108                r = lock_fs(md);
3109                if (r) {
3110                        dm_table_presuspend_undo_targets(map);
3111                        return r;
3112                }
3113        }
3114
3115        /*
3116         * Here we must make sure that no processes are submitting requests
3117         * to target drivers, i.e. no one may be executing
3118         * __split_and_process_bio.  This is called from dm_make_request
3119         * and dm_wq_work.
3120         *
3121         * To get all processes out of __split_and_process_bio in
3122         * dm_make_request, we synchronize against md->io_barrier.  To
3123         * prevent any process from reentering __split_and_process_bio from
3124         * dm_make_request and to quiesce the thread (dm_wq_work), we set
3125         * DMF_BLOCK_IO_FOR_SUSPEND and call flush_workqueue(md->wq).
3126         */
3127        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3128        if (map)
3129                synchronize_srcu(&md->io_barrier);
3130
3131        /*
3132         * Stop md->queue before flushing md->wq in case request-based
3133         * dm defers requests to md->wq from md->queue.
3134         */
3135        if (dm_request_based(md)) {
3136                stop_queue(md->queue);
3137                if (md->kworker_task)
3138                        flush_kthread_worker(&md->kworker);
3139        }
3140
3141        flush_workqueue(md->wq);
3142
3143        /*
3144         * At this point no more requests are entering target request routines.
3145         * We call dm_wait_for_completion to wait for all existing requests
3146         * to finish.
3147         */
3148        r = dm_wait_for_completion(md, interruptible);
3149
3150        if (noflush)
3151                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
3152        if (map)
3153                synchronize_srcu(&md->io_barrier);
3154
3155        /* were we interrupted? */
3156        if (r < 0) {
3157                dm_queue_flush(md);
3158
3159                if (dm_request_based(md))
3160                        start_queue(md->queue);
3161
3162                unlock_fs(md);
3163                dm_table_presuspend_undo_targets(map);
3164                /* pushback list is already flushed, so skip flush */
3165        }
3166
3167        return r;
3168}
3169
3170/*
3171 * We need to be able to change a mapping table under a mounted
3172 * filesystem.  For example we might want to move some data in
3173 * the background.  Before the table can be swapped with
3174 * dm_bind_table, dm_suspend must be called to flush any in
3175 * flight bios and ensure that any further io gets deferred.
3176 */
3177/*
3178 * Suspend mechanism in request-based dm.
3179 *
3180 * 1. Flush all I/Os by lock_fs() if needed.
3181 * 2. Stop dispatching any I/O by stopping the request_queue.
3182 * 3. Wait for all in-flight I/Os to be completed or requeued.
3183 *
3184 * To abort suspend, start the request_queue.
3185 */
3186int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
3187{
3188        struct dm_table *map = NULL;
3189        int r = 0;
3190
3191retry:
3192        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3193
3194        if (dm_suspended_md(md)) {
3195                r = -EINVAL;
3196                goto out_unlock;
3197        }
3198
3199        if (dm_suspended_internally_md(md)) {
3200                /* already internally suspended, wait for internal resume */
3201                mutex_unlock(&md->suspend_lock);
3202                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3203                if (r)
3204                        return r;
3205                goto retry;
3206        }
3207
3208        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3209
3210        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
3211        if (r)
3212                goto out_unlock;
3213
3214        set_bit(DMF_SUSPENDED, &md->flags);
3215
3216        dm_table_postsuspend_targets(map);
3217
3218out_unlock:
3219        mutex_unlock(&md->suspend_lock);
3220        return r;
3221}
3222
3223static int __dm_resume(struct mapped_device *md, struct dm_table *map)
3224{
3225        if (map) {
3226                int r = dm_table_resume_targets(map);
3227                if (r)
3228                        return r;
3229        }
3230
3231        dm_queue_flush(md);
3232
3233        /*
3234         * Flushing deferred I/Os must be done after targets are resumed
3235         * so that the targets can map them correctly.
3236         * Request-based dm queues the deferred I/Os in its request_queue.
3237         */
3238        if (dm_request_based(md))
3239                start_queue(md->queue);
3240
3241        unlock_fs(md);
3242
3243        return 0;
3244}
3245
3246int dm_resume(struct mapped_device *md)
3247{
3248        int r = -EINVAL;
3249        struct dm_table *map = NULL;
3250
3251retry:
3252        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3253
3254        if (!dm_suspended_md(md))
3255                goto out;
3256
3257        if (dm_suspended_internally_md(md)) {
3258                /* already internally suspended, wait for internal resume */
3259                mutex_unlock(&md->suspend_lock);
3260                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3261                if (r)
3262                        return r;
3263                goto retry;
3264        }
3265
3266        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3267        if (!map || !dm_table_get_size(map))
3268                goto out;
3269
3270        r = __dm_resume(md, map);
3271        if (r)
3272                goto out;
3273
3274        clear_bit(DMF_SUSPENDED, &md->flags);
3275
3276        r = 0;
3277out:
3278        mutex_unlock(&md->suspend_lock);
3279
3280        return r;
3281}
3282
3283/*
3284 * Internal suspend/resume works like userspace-driven suspend. It waits
3285 * until all bios finish and prevents issuing new bios to the target drivers.
3286 * It may be used only from the kernel.
3287 */
3288
3289static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
3290{
3291        struct dm_table *map = NULL;
3292
3293        if (md->internal_suspend_count++)
3294                return; /* nested internal suspend */
3295
3296        if (dm_suspended_md(md)) {
3297                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3298                return; /* nested suspend */
3299        }
3300
3301        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3302
3303        /*
3304         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
3305         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
3306         * would require changing .presuspend to return an error -- avoid this
3307         * until there is a need for more elaborate variants of internal suspend.
3308         */
3309        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
3310
3311        set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3312
3313        dm_table_postsuspend_targets(map);
3314}
3315
3316static void __dm_internal_resume(struct mapped_device *md)
3317{
3318        BUG_ON(!md->internal_suspend_count);
3319
3320        if (--md->internal_suspend_count)
3321                return; /* resume from nested internal suspend */
3322
3323        if (dm_suspended_md(md))
3324                goto done; /* resume from nested suspend */
3325
3326        /*
3327         * NOTE: existing callers don't need to call dm_table_resume_targets
3328         * (which may fail -- so best to avoid it for now by passing NULL map)
3329         */
3330        (void) __dm_resume(md, NULL);
3331
3332done:
3333        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3334        smp_mb__after_atomic();
3335        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
3336}
3337
3338void dm_internal_suspend_noflush(struct mapped_device *md)
3339{
3340        mutex_lock(&md->suspend_lock);
3341        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
3342        mutex_unlock(&md->suspend_lock);
3343}
3344EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
3345
3346void dm_internal_resume(struct mapped_device *md)
3347{
3348        mutex_lock(&md->suspend_lock);
3349        __dm_internal_resume(md);
3350        mutex_unlock(&md->suspend_lock);
3351}
3352EXPORT_SYMBOL_GPL(dm_internal_resume);
3353
3354/*
3355 * Fast variants of internal suspend/resume hold md->suspend_lock,
3356 * which prevents interaction with userspace-driven suspend.
3357 */
3358
3359void dm_internal_suspend_fast(struct mapped_device *md)
3360{
3361        mutex_lock(&md->suspend_lock);
3362        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3363                return;
3364
3365        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3366        synchronize_srcu(&md->io_barrier);
3367        flush_workqueue(md->wq);
3368        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
3369}
3370EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
3371
3372void dm_internal_resume_fast(struct mapped_device *md)
3373{
3374        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3375                goto done;
3376
3377        dm_queue_flush(md);
3378
3379done:
3380        mutex_unlock(&md->suspend_lock);
3381}
3382EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
3383
3384/*-----------------------------------------------------------------
3385 * Event notification.
3386 *---------------------------------------------------------------*/
3387int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
3388                       unsigned cookie)
3389{
3390        char udev_cookie[DM_COOKIE_LENGTH];
3391        char *envp[] = { udev_cookie, NULL };
3392
3393        if (!cookie)
3394                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
3395        else {
3396                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3397                         DM_COOKIE_ENV_VAR_NAME, cookie);
3398                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
3399                                          action, envp);
3400        }
3401}
3402
3403uint32_t dm_next_uevent_seq(struct mapped_device *md)
3404{
3405        return atomic_add_return(1, &md->uevent_seq);
3406}
3407
3408uint32_t dm_get_event_nr(struct mapped_device *md)
3409{
3410        return atomic_read(&md->event_nr);
3411}
3412
3413int dm_wait_event(struct mapped_device *md, int event_nr)
3414{
3415        return wait_event_interruptible(md->eventq,
3416                        (event_nr != atomic_read(&md->event_nr)));
3417}
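
/*
 * Example (illustrative only): the event counter is meant to be sampled
 * first and waited on afterwards, so an event raised in between is not
 * missed -- the DM_DEV_WAIT ioctl is built on this:
 *
 *      uint32_t last_seen = dm_get_event_nr(md);
 *
 *      // ...report current state to the caller...
 *
 *      if (dm_wait_event(md, last_seen))
 *              return -ERESTARTSYS;    // interrupted by a signal
 *
 * dm_wait_event() returns 0 as soon as md->event_nr differs from the value
 * the caller last saw.
 */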
3418
3419void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3420{
3421        unsigned long flags;
3422
3423        spin_lock_irqsave(&md->uevent_lock, flags);
3424        list_add(elist, &md->uevent_list);
3425        spin_unlock_irqrestore(&md->uevent_lock, flags);
3426}
3427
3428/*
3429 * The gendisk is only valid as long as you have a reference
3430 * count on 'md'.
3431 */
3432struct gendisk *dm_disk(struct mapped_device *md)
3433{
3434        return md->disk;
3435}
3436EXPORT_SYMBOL_GPL(dm_disk);
3437
3438struct kobject *dm_kobject(struct mapped_device *md)
3439{
3440        return &md->kobj_holder.kobj;
3441}
3442
3443struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3444{
3445        struct mapped_device *md;
3446
3447        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3448
3449        if (test_bit(DMF_FREEING, &md->flags) ||
3450            dm_deleting_md(md))
3451                return NULL;
3452
3453        dm_get(md);
3454        return md;
3455}
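
/*
 * Example (illustrative only): the sysfs attribute handlers in dm-sysfs.c
 * use this to map the embedded kobject back to its mapped_device, and must
 * drop the reference they are handed:
 *
 *      struct mapped_device *md = dm_get_from_kobject(kobj);
 *
 *      if (!md)
 *              return -EINVAL;         // device is being freed or deleted
 *      // ...read what is needed from md...
 *      dm_put(md);
 *
 * The NULL return covers exactly the DMF_FREEING/deleting window tested
 * above.
 */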
3456
3457int dm_suspended_md(struct mapped_device *md)
3458{
3459        return test_bit(DMF_SUSPENDED, &md->flags);
3460}
3461
3462int dm_suspended_internally_md(struct mapped_device *md)
3463{
3464        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3465}
3466
3467int dm_test_deferred_remove_flag(struct mapped_device *md)
3468{
3469        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3470}
3471
3472int dm_suspended(struct dm_target *ti)
3473{
3474        return dm_suspended_md(dm_table_get_md(ti->table));
3475}
3476EXPORT_SYMBOL_GPL(dm_suspended);
3477
3478int dm_noflush_suspending(struct dm_target *ti)
3479{
3480        return __noflush_suspending(dm_table_get_md(ti->table));
3481}
3482EXPORT_SYMBOL_GPL(dm_noflush_suspending);
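
/*
 * Example (illustrative only -- 'no_usable_path' is a hypothetical target
 * condition): targets typically consult these helpers when they cannot
 * service an I/O, to choose between failing it and pushing it back to be
 * retried after the resume (broadly the dm-mpath pattern):
 *
 *      if (no_usable_path) {
 *              if (dm_noflush_suspending(ti))
 *                      return DM_MAPIO_REQUEUE;        // retry after resume
 *              return -EIO;                            // fail it now
 *      }
 */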
3483
3484struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
3485                                            unsigned integrity, unsigned per_bio_data_size)
3486{
3487        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
3488        struct kmem_cache *cachep = NULL;
3489        unsigned int pool_size = 0;
3490        unsigned int front_pad;
3491
3492        if (!pools)
3493                return NULL;
3494
3495        type = filter_md_type(type, md);
3496
3497        switch (type) {
3498        case DM_TYPE_BIO_BASED:
3499                cachep = _io_cache;
3500                pool_size = dm_get_reserved_bio_based_ios();
3501                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3502                break;
3503        case DM_TYPE_REQUEST_BASED:
3504                cachep = _rq_tio_cache;
3505                pool_size = dm_get_reserved_rq_based_ios();
3506                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
3507                if (!pools->rq_pool)
3508                        goto out;
3509                /* fall through to set up remaining rq-based pools */
3510        case DM_TYPE_MQ_REQUEST_BASED:
3511                if (!pool_size)
3512                        pool_size = dm_get_reserved_rq_based_ios();
3513                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3514                /* per_bio_data_size is not used. See __bind_mempools(). */
3515                WARN_ON(per_bio_data_size != 0);
3516                break;
3517        default:
3518                BUG();
3519        }
3520
3521        if (cachep) {
3522                pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
3523                if (!pools->io_pool)
3524                        goto out;
3525        }
3526
3527        pools->bs = bioset_create_nobvec(pool_size, front_pad);
3528        if (!pools->bs)
3529                goto out;
3530
3531        if (integrity && bioset_integrity_create(pools->bs, pool_size))
3532                goto out;
3533
3534        return pools;
3535
3536out:
3537        dm_free_md_mempools(pools);
3538
3539        return NULL;
3540}
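
/*
 * Layout note (illustrative; 'struct per_bio_data' stands for a target's
 * private per-bio type): for DM_TYPE_BIO_BASED the front_pad computed above
 * reserves room in front of every bio allocated from pools->bs:
 *
 *      [ per-bio data (rounded up) ][ struct dm_target_io ][ clone bio ]
 *
 * where the clone bio is the final member of struct dm_target_io and is the
 * bio actually handed to the target.  That is what lets a target that set
 * per_bio_data_size recover its private area from the bio alone:
 *
 *      struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(*pb));
 */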
3541
3542void dm_free_md_mempools(struct dm_md_mempools *pools)
3543{
3544        if (!pools)
3545                return;
3546
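        /*
         * mempool_destroy() copes with a NULL pool, so the calls below are
         * safe even for pools that were never created; bioset_free() does
         * not, hence the explicit pools->bs check.
         */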
3547        mempool_destroy(pools->io_pool);
3548        mempool_destroy(pools->rq_pool);
3549
3550        if (pools->bs)
3551                bioset_free(pools->bs);
3552
3553        kfree(pools);
3554}
3555
3556static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3557                u32 flags)
3558{
3559        struct mapped_device *md = bdev->bd_disk->private_data;
3560        const struct pr_ops *ops;
3561        struct dm_target *tgt;
3562        fmode_t mode;
3563        int srcu_idx, r;
3564
3565        r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
3566        if (r < 0)
3567                return r;
3568
3569        ops = bdev->bd_disk->fops->pr_ops;
3570        if (ops && ops->pr_register)
3571                r = ops->pr_register(bdev, old_key, new_key, flags);
3572        else
3573                r = -EOPNOTSUPP;
3574
3575        dm_put_live_table(md, srcu_idx);
3576        return r;
3577}
3578
3579static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3580                u32 flags)
3581{
3582        struct mapped_device *md = bdev->bd_disk->private_data;
3583        const struct pr_ops *ops;
3584        struct dm_target *tgt;
3585        fmode_t mode;
3586        int srcu_idx, r;
3587
3588        r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
3589        if (r < 0)
3590                return r;
3591
3592        ops = bdev->bd_disk->fops->pr_ops;
3593        if (ops && ops->pr_reserve)
3594                r = ops->pr_reserve(bdev, key, type, flags);
3595        else
3596                r = -EOPNOTSUPP;
3597
3598        dm_put_live_table(md, srcu_idx);
3599        return r;
3600}
3601
3602static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3603{
3604        struct mapped_device *md = bdev->bd_disk->private_data;
3605        const struct pr_ops *ops;
3606        struct dm_target *tgt;
3607        fmode_t mode;
3608        int srcu_idx, r;
3609
3610        r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
3611        if (r < 0)
3612                return r;
3613
3614        ops = bdev->bd_disk->fops->pr_ops;
3615        if (ops && ops->pr_release)
3616                r = ops->pr_release(bdev, key, type);
3617        else
3618                r = -EOPNOTSUPP;
3619
3620        dm_put_live_table(md, srcu_idx);
3621        return r;
3622}
3623
3624static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3625                enum pr_type type, bool abort)
3626{
3627        struct mapped_device *md = bdev->bd_disk->private_data;
3628        const struct pr_ops *ops;
3629        struct dm_target *tgt;
3630        fmode_t mode;
3631        int srcu_idx, r;
3632
3633        r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
3634        if (r < 0)
3635                return r;
3636
3637        ops = bdev->bd_disk->fops->pr_ops;
3638        if (ops && ops->pr_preempt)
3639                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3640        else
3641                r = -EOPNOTSUPP;
3642
3643        dm_put_live_table(md, srcu_idx);
3644        return r;
3645}
3646
3647static int dm_pr_clear(struct block_device *bdev, u64 key)
3648{
3649        struct mapped_device *md = bdev->bd_disk->private_data;
3650        const struct pr_ops *ops;
3651        struct dm_target *tgt;
3652        fmode_t mode;
3653        int srcu_idx, r;
3654
3655        r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
3656        if (r < 0)
3657                return r;
3658
3659        ops = bdev->bd_disk->fops->pr_ops;
3660        if (ops && ops->pr_clear)
3661                r = ops->pr_clear(bdev, key);
3662        else
3663                r = -EOPNOTSUPP;
3664
3665        dm_put_live_table(md, srcu_idx);
3666        return r;
3667}
3668
3669static const struct pr_ops dm_pr_ops = {
3670        .pr_register    = dm_pr_register,
3671        .pr_reserve     = dm_pr_reserve,
3672        .pr_release     = dm_pr_release,
3673        .pr_preempt     = dm_pr_preempt,
3674        .pr_clear       = dm_pr_clear,
3675};
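
/*
 * Example (illustrative only; error handling omitted): the hooks above are
 * reached through the block layer's persistent reservation ioctls from
 * <linux/pr.h>, so user space can issue e.g.
 *
 *      struct pr_registration reg = { .old_key = 0, .new_key = 0x1234 };
 *      int fd = open("/dev/dm-0", O_RDWR);
 *
 *      ioctl(fd, IOC_PR_REGISTER, &reg);       // lands in dm_pr_register()
 *
 * Each hook forwards to the pr_ops of the single underlying device that
 * dm_get_live_table_for_ioctl() resolved, or returns -EOPNOTSUPP if that
 * device has no persistent reservation support.
 */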
3676
3677static const struct block_device_operations dm_blk_dops = {
3678        .open = dm_blk_open,
3679        .release = dm_blk_close,
3680        .ioctl = dm_blk_ioctl,
3681        .getgeo = dm_blk_getgeo,
3682        .pr_ops = &dm_pr_ops,
3683        .owner = THIS_MODULE
3684};
3685
3686/*
3687 * module hooks
3688 */
3689module_init(dm_init);
3690module_exit(dm_exit);
3691
3692module_param(major, uint, 0);
3693MODULE_PARM_DESC(major, "The major number of the device mapper");
3694
3695module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3696MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3697
3698module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
3699MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
3700
3701module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
3702MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
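
/*
 * Note: with the S_IRUGO | S_IWUSR permissions above, the reserved-I/O and
 * use_blk_mq parameters can also be inspected and changed at runtime under
 * /sys/module/dm_mod/parameters/, or set at boot time in the usual way,
 * e.g. dm_mod.use_blk_mq=Y on the kernel command line.
 */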
3703
3704MODULE_DESCRIPTION(DM_NAME " driver");
3705MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3706MODULE_LICENSE("GPL");
3707