linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm.h"
   9#include "dm-uevent.h"
  10
  11#include <linux/init.h>
  12#include <linux/module.h>
  13#include <linux/mutex.h>
  14#include <linux/moduleparam.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/mempool.h>
  18#include <linux/slab.h>
  19#include <linux/idr.h>
  20#include <linux/hdreg.h>
  21#include <linux/delay.h>
  22#include <linux/wait.h>
  23#include <linux/kthread.h>
  24
  25#include <trace/events/block.h>
  26
  27#define DM_MSG_PREFIX "core"
  28
  29#ifdef CONFIG_PRINTK
  30/*
  31 * ratelimit state to be used in DMXXX_LIMIT().
  32 */
  33DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  34                       DEFAULT_RATELIMIT_INTERVAL,
  35                       DEFAULT_RATELIMIT_BURST);
  36EXPORT_SYMBOL(dm_ratelimit_state);
  37#endif
  38
  39/*
  40 * Cookies are numeric values sent with CHANGE and REMOVE
  41 * uevents while resuming, removing or renaming the device.
  42 */
  43#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  44#define DM_COOKIE_LENGTH 24
  45
  46static const char *_name = DM_NAME;
  47
  48static unsigned int major = 0;
  49static unsigned int _major = 0;
  50
  51static DEFINE_IDR(_minor_idr);
  52
  53static DEFINE_SPINLOCK(_minor_lock);
  54
  55static void do_deferred_remove(struct work_struct *w);
  56
  57static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  58
  59static struct workqueue_struct *deferred_remove_workqueue;
  60
  61/*
  62 * For bio-based dm.
  63 * One of these is allocated per bio.
  64 */
  65struct dm_io {
  66        struct mapped_device *md;
  67        int error;
  68        atomic_t io_count;
  69        struct bio *bio;
  70        unsigned long start_time;
  71        spinlock_t endio_lock;
  72        struct dm_stats_aux stats_aux;
  73};
  74
  75/*
  76 * For request-based dm.
  77 * One of these is allocated per request.
  78 */
  79struct dm_rq_target_io {
  80        struct mapped_device *md;
  81        struct dm_target *ti;
  82        struct request *orig, *clone;
  83        struct kthread_work work;
  84        int error;
  85        union map_info info;
  86};
  87
  88/*
  89 * For request-based dm - the bio clones we allocate are embedded in these
  90 * structs.
  91 *
  92 * We allocate these with bio_alloc_bioset, using the front_pad parameter when
  93 * the bioset is created - this means the bio has to come at the end of the
  94 * struct.
  95 */
  96struct dm_rq_clone_bio_info {
  97        struct bio *orig;
  98        struct dm_rq_target_io *tio;
  99        struct bio clone;
 100};
 101
 102union map_info *dm_get_rq_mapinfo(struct request *rq)
 103{
 104        if (rq && rq->end_io_data)
 105                return &((struct dm_rq_target_io *)rq->end_io_data)->info;
 106        return NULL;
 107}
 108EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
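
/*
 * Illustrative sketch (not part of dm.c): a request-based target can stash
 * per-request context in the union map_info that dm_get_rq_mapinfo() hands
 * back.  "example_map_rq" is a hypothetical map_rq hook; request-based
 * targets such as dm-multipath carry their per-request state this way.
 */
static int example_map_rq(struct dm_target *ti, struct request *clone,
                          union map_info *map_context)
{
        /* remembered here, retrievable later via dm_get_rq_mapinfo(clone) */
        map_context->ptr = ti->private;

        /* (a real target would also remap the clone to an underlying queue) */
        return DM_MAPIO_REMAPPED;
}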
 109
 110#define MINOR_ALLOCED ((void *)-1)
 111
 112/*
 113 * Bits for the md->flags field.
 114 */
 115#define DMF_BLOCK_IO_FOR_SUSPEND 0
 116#define DMF_SUSPENDED 1
 117#define DMF_FROZEN 2
 118#define DMF_FREEING 3
 119#define DMF_DELETING 4
 120#define DMF_NOFLUSH_SUSPENDING 5
 121#define DMF_MERGE_IS_OPTIONAL 6
 122#define DMF_DEFERRED_REMOVE 7
 123#define DMF_SUSPENDED_INTERNALLY 8
 124
 125/*
 126 * A dummy definition to make RCU happy.
 127 * struct dm_table should never be dereferenced in this file.
 128 */
 129struct dm_table {
 130        int undefined__;
 131};
 132
 133/*
 134 * Work processed by per-device workqueue.
 135 */
 136struct mapped_device {
 137        struct srcu_struct io_barrier;
 138        struct mutex suspend_lock;
 139        atomic_t holders;
 140        atomic_t open_count;
 141
 142        /*
 143         * The current mapping.
 144         * Use dm_get_live_table{_fast} or take suspend_lock for
 145         * dereference.
 146         */
 147        struct dm_table __rcu *map;
 148
 149        struct list_head table_devices;
 150        struct mutex table_devices_lock;
 151
 152        unsigned long flags;
 153
 154        struct request_queue *queue;
 155        unsigned type;
 156        /* Protect queue and type against concurrent access. */
 157        struct mutex type_lock;
 158
 159        struct target_type *immutable_target_type;
 160
 161        struct gendisk *disk;
 162        char name[16];
 163
 164        void *interface_ptr;
 165
 166        /*
 167         * A list of ios that arrived while we were suspended.
 168         */
 169        atomic_t pending[2];
 170        wait_queue_head_t wait;
 171        struct work_struct work;
 172        struct bio_list deferred;
 173        spinlock_t deferred_lock;
 174
 175        /*
 176         * Processing queue (flush)
 177         */
 178        struct workqueue_struct *wq;
 179
 180        /*
 181         * io objects are allocated from here.
 182         */
 183        mempool_t *io_pool;
 184        mempool_t *rq_pool;
 185
 186        struct bio_set *bs;
 187
 188        /*
 189         * Event handling.
 190         */
 191        atomic_t event_nr;
 192        wait_queue_head_t eventq;
 193        atomic_t uevent_seq;
 194        struct list_head uevent_list;
 195        spinlock_t uevent_lock; /* Protect access to uevent_list */
 196
 197        /*
  198         * freeze/thaw support requires holding onto a super block
 199         */
 200        struct super_block *frozen_sb;
 201        struct block_device *bdev;
 202
 203        /* forced geometry settings */
 204        struct hd_geometry geometry;
 205
 206        /* kobject and completion */
 207        struct dm_kobject_holder kobj_holder;
 208
 209        /* zero-length flush that will be cloned and submitted to targets */
 210        struct bio flush_bio;
 211
 212        /* the number of internal suspends */
 213        unsigned internal_suspend_count;
 214
 215        struct dm_stats stats;
 216
 217        struct kthread_worker kworker;
 218        struct task_struct *kworker_task;
 219};
 220
 221/*
 222 * For mempools pre-allocation at the table loading time.
 223 */
 224struct dm_md_mempools {
 225        mempool_t *io_pool;
 226        mempool_t *rq_pool;
 227        struct bio_set *bs;
 228};
 229
 230struct table_device {
 231        struct list_head list;
 232        atomic_t count;
 233        struct dm_dev dm_dev;
 234};
 235
 236#define RESERVED_BIO_BASED_IOS          16
 237#define RESERVED_REQUEST_BASED_IOS      256
 238#define RESERVED_MAX_IOS                1024
 239static struct kmem_cache *_io_cache;
 240static struct kmem_cache *_rq_tio_cache;
 241static struct kmem_cache *_rq_cache;
 242
 243/*
 244 * Bio-based DM's mempools' reserved IOs set by the user.
 245 */
 246static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 247
 248/*
 249 * Request-based DM's mempools' reserved IOs set by the user.
 250 */
 251static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
 252
 253static unsigned __dm_get_reserved_ios(unsigned *reserved_ios,
 254                                      unsigned def, unsigned max)
 255{
 256        unsigned ios = ACCESS_ONCE(*reserved_ios);
 257        unsigned modified_ios = 0;
 258
 259        if (!ios)
 260                modified_ios = def;
 261        else if (ios > max)
 262                modified_ios = max;
 263
 264        if (modified_ios) {
 265                (void)cmpxchg(reserved_ios, ios, modified_ios);
 266                ios = modified_ios;
 267        }
 268
 269        return ios;
 270}
 271
 272unsigned dm_get_reserved_bio_based_ios(void)
 273{
 274        return __dm_get_reserved_ios(&reserved_bio_based_ios,
 275                                     RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS);
 276}
 277EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 278
 279unsigned dm_get_reserved_rq_based_ios(void)
 280{
 281        return __dm_get_reserved_ios(&reserved_rq_based_ios,
 282                                     RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS);
 283}
 284EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
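
/*
 * Illustrative sketch (not part of dm.c): how the clamped reservation above
 * might be used to size a mempool.  "example_create_io_pool" is hypothetical;
 * the real pool construction lives elsewhere in this file, in
 * dm_alloc_md_mempools().
 */
static mempool_t *example_create_io_pool(void)
{
        /* a zero or over-limit module parameter is clamped to sane values */
        unsigned pool_size = dm_get_reserved_bio_based_ios();

        return mempool_create_slab_pool(pool_size, _io_cache);
}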
 285
 286static int __init local_init(void)
 287{
 288        int r = -ENOMEM;
 289
 290        /* allocate a slab for the dm_ios */
 291        _io_cache = KMEM_CACHE(dm_io, 0);
 292        if (!_io_cache)
 293                return r;
 294
 295        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 296        if (!_rq_tio_cache)
 297                goto out_free_io_cache;
 298
 299        _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
 300                                      __alignof__(struct request), 0, NULL);
 301        if (!_rq_cache)
 302                goto out_free_rq_tio_cache;
 303
 304        r = dm_uevent_init();
 305        if (r)
 306                goto out_free_rq_cache;
 307
 308        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 309        if (!deferred_remove_workqueue) {
 310                r = -ENOMEM;
 311                goto out_uevent_exit;
 312        }
 313
 314        _major = major;
 315        r = register_blkdev(_major, _name);
 316        if (r < 0)
 317                goto out_free_workqueue;
 318
 319        if (!_major)
 320                _major = r;
 321
 322        return 0;
 323
 324out_free_workqueue:
 325        destroy_workqueue(deferred_remove_workqueue);
 326out_uevent_exit:
 327        dm_uevent_exit();
 328out_free_rq_cache:
 329        kmem_cache_destroy(_rq_cache);
 330out_free_rq_tio_cache:
 331        kmem_cache_destroy(_rq_tio_cache);
 332out_free_io_cache:
 333        kmem_cache_destroy(_io_cache);
 334
 335        return r;
 336}
 337
 338static void local_exit(void)
 339{
 340        flush_scheduled_work();
 341        destroy_workqueue(deferred_remove_workqueue);
 342
 343        kmem_cache_destroy(_rq_cache);
 344        kmem_cache_destroy(_rq_tio_cache);
 345        kmem_cache_destroy(_io_cache);
 346        unregister_blkdev(_major, _name);
 347        dm_uevent_exit();
 348
 349        _major = 0;
 350
 351        DMINFO("cleaned up");
 352}
 353
 354static int (*_inits[])(void) __initdata = {
 355        local_init,
 356        dm_target_init,
 357        dm_linear_init,
 358        dm_stripe_init,
 359        dm_io_init,
 360        dm_kcopyd_init,
 361        dm_interface_init,
 362        dm_statistics_init,
 363};
 364
 365static void (*_exits[])(void) = {
 366        local_exit,
 367        dm_target_exit,
 368        dm_linear_exit,
 369        dm_stripe_exit,
 370        dm_io_exit,
 371        dm_kcopyd_exit,
 372        dm_interface_exit,
 373        dm_statistics_exit,
 374};
 375
 376static int __init dm_init(void)
 377{
 378        const int count = ARRAY_SIZE(_inits);
 379
 380        int r, i;
 381
 382        for (i = 0; i < count; i++) {
 383                r = _inits[i]();
 384                if (r)
 385                        goto bad;
 386        }
 387
 388        return 0;
 389
 390      bad:
 391        while (i--)
 392                _exits[i]();
 393
 394        return r;
 395}
 396
 397static void __exit dm_exit(void)
 398{
 399        int i = ARRAY_SIZE(_exits);
 400
 401        while (i--)
 402                _exits[i]();
 403
 404        /*
 405         * Should be empty by this point.
 406         */
 407        idr_destroy(&_minor_idr);
 408}
 409
 410/*
 411 * Block device functions
 412 */
 413int dm_deleting_md(struct mapped_device *md)
 414{
 415        return test_bit(DMF_DELETING, &md->flags);
 416}
 417
 418static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 419{
 420        struct mapped_device *md;
 421
 422        spin_lock(&_minor_lock);
 423
 424        md = bdev->bd_disk->private_data;
 425        if (!md)
 426                goto out;
 427
 428        if (test_bit(DMF_FREEING, &md->flags) ||
 429            dm_deleting_md(md)) {
 430                md = NULL;
 431                goto out;
 432        }
 433
 434        dm_get(md);
 435        atomic_inc(&md->open_count);
 436out:
 437        spin_unlock(&_minor_lock);
 438
 439        return md ? 0 : -ENXIO;
 440}
 441
 442static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 443{
 444        struct mapped_device *md;
 445
 446        spin_lock(&_minor_lock);
 447
 448        md = disk->private_data;
 449        if (WARN_ON(!md))
 450                goto out;
 451
 452        if (atomic_dec_and_test(&md->open_count) &&
 453            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 454                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 455
 456        dm_put(md);
 457out:
 458        spin_unlock(&_minor_lock);
 459}
 460
 461int dm_open_count(struct mapped_device *md)
 462{
 463        return atomic_read(&md->open_count);
 464}
 465
 466/*
 467 * Guarantees nothing is using the device before it's deleted.
 468 */
 469int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 470{
 471        int r = 0;
 472
 473        spin_lock(&_minor_lock);
 474
 475        if (dm_open_count(md)) {
 476                r = -EBUSY;
 477                if (mark_deferred)
 478                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 479        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 480                r = -EEXIST;
 481        else
 482                set_bit(DMF_DELETING, &md->flags);
 483
 484        spin_unlock(&_minor_lock);
 485
 486        return r;
 487}
 488
 489int dm_cancel_deferred_remove(struct mapped_device *md)
 490{
 491        int r = 0;
 492
 493        spin_lock(&_minor_lock);
 494
 495        if (test_bit(DMF_DELETING, &md->flags))
 496                r = -EBUSY;
 497        else
 498                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 499
 500        spin_unlock(&_minor_lock);
 501
 502        return r;
 503}
 504
 505static void do_deferred_remove(struct work_struct *w)
 506{
 507        dm_deferred_remove();
 508}
 509
 510sector_t dm_get_size(struct mapped_device *md)
 511{
 512        return get_capacity(md->disk);
 513}
 514
 515struct request_queue *dm_get_md_queue(struct mapped_device *md)
 516{
 517        return md->queue;
 518}
 519
 520struct dm_stats *dm_get_stats(struct mapped_device *md)
 521{
 522        return &md->stats;
 523}
 524
 525static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 526{
 527        struct mapped_device *md = bdev->bd_disk->private_data;
 528
 529        return dm_get_geometry(md, geo);
 530}
 531
 532static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 533                        unsigned int cmd, unsigned long arg)
 534{
 535        struct mapped_device *md = bdev->bd_disk->private_data;
 536        int srcu_idx;
 537        struct dm_table *map;
 538        struct dm_target *tgt;
 539        int r = -ENOTTY;
 540
 541retry:
 542        map = dm_get_live_table(md, &srcu_idx);
 543
 544        if (!map || !dm_table_get_size(map))
 545                goto out;
 546
 547        /* We only support devices that have a single target */
 548        if (dm_table_get_num_targets(map) != 1)
 549                goto out;
 550
 551        tgt = dm_table_get_target(map, 0);
 552        if (!tgt->type->ioctl)
 553                goto out;
 554
 555        if (dm_suspended_md(md)) {
 556                r = -EAGAIN;
 557                goto out;
 558        }
 559
 560        r = tgt->type->ioctl(tgt, cmd, arg);
 561
 562out:
 563        dm_put_live_table(md, srcu_idx);
 564
 565        if (r == -ENOTCONN) {
 566                msleep(10);
 567                goto retry;
 568        }
 569
 570        return r;
 571}
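
/*
 * Illustrative sketch (not part of dm.c): the single-target ioctl hook that
 * dm_blk_ioctl() ends up calling typically just passes the command through
 * to the underlying device, as dm-linear does.  "example_ioctl" and its use
 * of ti->private to hold a struct dm_dev are hypothetical.
 */
static int example_ioctl(struct dm_target *ti, unsigned int cmd, unsigned long arg)
{
        struct dm_dev *dev = ti->private;       /* set up by the target's ctr */

        return __blkdev_driver_ioctl(dev->bdev, dev->mode, cmd, arg);
}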
 572
 573static struct dm_io *alloc_io(struct mapped_device *md)
 574{
 575        return mempool_alloc(md->io_pool, GFP_NOIO);
 576}
 577
 578static void free_io(struct mapped_device *md, struct dm_io *io)
 579{
 580        mempool_free(io, md->io_pool);
 581}
 582
 583static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 584{
 585        bio_put(&tio->clone);
 586}
 587
 588static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
 589                                            gfp_t gfp_mask)
 590{
 591        return mempool_alloc(md->io_pool, gfp_mask);
 592}
 593
 594static void free_rq_tio(struct dm_rq_target_io *tio)
 595{
 596        mempool_free(tio, tio->md->io_pool);
 597}
 598
 599static struct request *alloc_clone_request(struct mapped_device *md,
 600                                           gfp_t gfp_mask)
 601{
 602        return mempool_alloc(md->rq_pool, gfp_mask);
 603}
 604
 605static void free_clone_request(struct mapped_device *md, struct request *rq)
 606{
 607        mempool_free(rq, md->rq_pool);
 608}
 609
 610static int md_in_flight(struct mapped_device *md)
 611{
 612        return atomic_read(&md->pending[READ]) +
 613               atomic_read(&md->pending[WRITE]);
 614}
 615
 616static void start_io_acct(struct dm_io *io)
 617{
 618        struct mapped_device *md = io->md;
 619        struct bio *bio = io->bio;
 620        int cpu;
 621        int rw = bio_data_dir(bio);
 622
 623        io->start_time = jiffies;
 624
 625        cpu = part_stat_lock();
 626        part_round_stats(cpu, &dm_disk(md)->part0);
 627        part_stat_unlock();
 628        atomic_set(&dm_disk(md)->part0.in_flight[rw],
 629                atomic_inc_return(&md->pending[rw]));
 630
 631        if (unlikely(dm_stats_used(&md->stats)))
 632                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 633                                    bio_sectors(bio), false, 0, &io->stats_aux);
 634}
 635
 636static void end_io_acct(struct dm_io *io)
 637{
 638        struct mapped_device *md = io->md;
 639        struct bio *bio = io->bio;
 640        unsigned long duration = jiffies - io->start_time;
 641        int pending;
 642        int rw = bio_data_dir(bio);
 643
 644        generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
 645
 646        if (unlikely(dm_stats_used(&md->stats)))
 647                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector,
 648                                    bio_sectors(bio), true, duration, &io->stats_aux);
 649
 650        /*
 651         * After this is decremented the bio must not be touched if it is
 652         * a flush.
 653         */
 654        pending = atomic_dec_return(&md->pending[rw]);
 655        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 656        pending += atomic_read(&md->pending[rw^0x1]);
 657
 658        /* nudge anyone waiting on suspend queue */
 659        if (!pending)
 660                wake_up(&md->wait);
 661}
 662
 663/*
 664 * Add the bio to the list of deferred io.
 665 */
 666static void queue_io(struct mapped_device *md, struct bio *bio)
 667{
 668        unsigned long flags;
 669
 670        spin_lock_irqsave(&md->deferred_lock, flags);
 671        bio_list_add(&md->deferred, bio);
 672        spin_unlock_irqrestore(&md->deferred_lock, flags);
 673        queue_work(md->wq, &md->work);
 674}
 675
 676/*
  677 * Everyone (including functions in this file) should use this
 678 * function to access the md->map field, and make sure they call
 679 * dm_put_live_table() when finished.
 680 */
 681struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 682{
 683        *srcu_idx = srcu_read_lock(&md->io_barrier);
 684
 685        return srcu_dereference(md->map, &md->io_barrier);
 686}
 687
 688void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 689{
 690        srcu_read_unlock(&md->io_barrier, srcu_idx);
 691}
 692
 693void dm_sync_table(struct mapped_device *md)
 694{
 695        synchronize_srcu(&md->io_barrier);
 696        synchronize_rcu_expedited();
 697}
 698
 699/*
 700 * A fast alternative to dm_get_live_table/dm_put_live_table.
 701 * The caller must not block between these two functions.
 702 */
 703static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 704{
 705        rcu_read_lock();
 706        return rcu_dereference(md->map);
 707}
 708
 709static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 710{
 711        rcu_read_unlock();
 712}
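
/*
 * Illustrative sketch (not part of dm.c): the expected pattern for reading
 * md->map under the SRCU io_barrier.  "example_live_table_size" is a
 * hypothetical caller; dm_blk_ioctl() above follows the same shape.
 */
static sector_t example_live_table_size(struct mapped_device *md)
{
        int srcu_idx;
        sector_t size = 0;
        struct dm_table *map = dm_get_live_table(md, &srcu_idx);

        if (map)
                size = dm_table_get_size(map);

        dm_put_live_table(md, srcu_idx);        /* never skip the release */

        return size;
}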
 713
 714/*
 715 * Open a table device so we can use it as a map destination.
 716 */
 717static int open_table_device(struct table_device *td, dev_t dev,
 718                             struct mapped_device *md)
 719{
 720        static char *_claim_ptr = "I belong to device-mapper";
 721        struct block_device *bdev;
 722
 723        int r;
 724
 725        BUG_ON(td->dm_dev.bdev);
 726
 727        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 728        if (IS_ERR(bdev))
 729                return PTR_ERR(bdev);
 730
 731        r = bd_link_disk_holder(bdev, dm_disk(md));
 732        if (r) {
 733                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 734                return r;
 735        }
 736
 737        td->dm_dev.bdev = bdev;
 738        return 0;
 739}
 740
 741/*
 742 * Close a table device that we've been using.
 743 */
 744static void close_table_device(struct table_device *td, struct mapped_device *md)
 745{
 746        if (!td->dm_dev.bdev)
 747                return;
 748
 749        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 750        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 751        td->dm_dev.bdev = NULL;
 752}
 753
 754static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 755                                              fmode_t mode) {
 756        struct table_device *td;
 757
 758        list_for_each_entry(td, l, list)
 759                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 760                        return td;
 761
 762        return NULL;
 763}
 764
 765int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 766                        struct dm_dev **result) {
 767        int r;
 768        struct table_device *td;
 769
 770        mutex_lock(&md->table_devices_lock);
 771        td = find_table_device(&md->table_devices, dev, mode);
 772        if (!td) {
 773                td = kmalloc(sizeof(*td), GFP_KERNEL);
 774                if (!td) {
 775                        mutex_unlock(&md->table_devices_lock);
 776                        return -ENOMEM;
 777                }
 778
 779                td->dm_dev.mode = mode;
 780                td->dm_dev.bdev = NULL;
 781
 782                if ((r = open_table_device(td, dev, md))) {
 783                        mutex_unlock(&md->table_devices_lock);
 784                        kfree(td);
 785                        return r;
 786                }
 787
 788                format_dev_t(td->dm_dev.name, dev);
 789
 790                atomic_set(&td->count, 0);
 791                list_add(&td->list, &md->table_devices);
 792        }
 793        atomic_inc(&td->count);
 794        mutex_unlock(&md->table_devices_lock);
 795
 796        *result = &td->dm_dev;
 797        return 0;
 798}
 799EXPORT_SYMBOL_GPL(dm_get_table_device);
 800
 801void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 802{
 803        struct table_device *td = container_of(d, struct table_device, dm_dev);
 804
 805        mutex_lock(&md->table_devices_lock);
 806        if (atomic_dec_and_test(&td->count)) {
 807                close_table_device(td, md);
 808                list_del(&td->list);
 809                kfree(td);
 810        }
 811        mutex_unlock(&md->table_devices_lock);
 812}
 813EXPORT_SYMBOL(dm_put_table_device);
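
/*
 * Illustrative sketch (not part of dm.c): dm_get_table_device() and
 * dm_put_table_device() form a refcounted open/close pair keyed on
 * (dev_t, mode).  "example_probe_device" is hypothetical; the real caller
 * is dm_get_device() in dm-table.c.
 */
static int example_probe_device(struct mapped_device *md, dev_t dev)
{
        struct dm_dev *d;
        int r = dm_get_table_device(md, dev, FMODE_READ, &d);

        if (r)
                return r;       /* open failed, nothing to put */

        DMINFO("opened %s read-only", d->name); /* d->bdev is now usable */

        dm_put_table_device(md, d);
        return 0;
}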
 814
 815static void free_table_devices(struct list_head *devices)
 816{
 817        struct list_head *tmp, *next;
 818
 819        list_for_each_safe(tmp, next, devices) {
 820                struct table_device *td = list_entry(tmp, struct table_device, list);
 821
 822                DMWARN("dm_destroy: %s still exists with %d references",
 823                       td->dm_dev.name, atomic_read(&td->count));
 824                kfree(td);
 825        }
 826}
 827
 828/*
 829 * Get the geometry associated with a dm device
 830 */
 831int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 832{
 833        *geo = md->geometry;
 834
 835        return 0;
 836}
 837
 838/*
 839 * Set the geometry of a device.
 840 */
 841int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 842{
 843        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 844
 845        if (geo->start > sz) {
 846                DMWARN("Start sector is beyond the geometry limits.");
 847                return -EINVAL;
 848        }
 849
 850        md->geometry = *geo;
 851
 852        return 0;
 853}
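
/*
 * Illustrative sketch (not part of dm.c): the check above treats the device
 * as cylinders * heads * sectors, so a caller forcing a classic 255-head,
 * 63-sector geometry would derive the cylinder count from the capacity
 * first.  "example_fill_geometry" is hypothetical.
 */
static void example_fill_geometry(struct hd_geometry *geo, sector_t capacity)
{
        geo->heads = 255;
        geo->sectors = 63;
        sector_div(capacity, 255 * 63);         /* 64-bit safe divide */
        geo->cylinders = min_t(sector_t, capacity, 0xffff);
        geo->start = 0;         /* must not exceed the size implied above */
}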
 854
 855/*-----------------------------------------------------------------
 856 * CRUD START:
  857 *   A more elegant solution is in the works that uses the queue
 858 *   merge fn, unfortunately there are a couple of changes to
 859 *   the block layer that I want to make for this.  So in the
 860 *   interests of getting something for people to use I give
 861 *   you this clearly demarcated crap.
 862 *---------------------------------------------------------------*/
 863
 864static int __noflush_suspending(struct mapped_device *md)
 865{
 866        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 867}
 868
 869/*
 870 * Decrements the number of outstanding ios that a bio has been
  871 * cloned into, completing the original io if necessary.
 872 */
 873static void dec_pending(struct dm_io *io, int error)
 874{
 875        unsigned long flags;
 876        int io_error;
 877        struct bio *bio;
 878        struct mapped_device *md = io->md;
 879
 880        /* Push-back supersedes any I/O errors */
 881        if (unlikely(error)) {
 882                spin_lock_irqsave(&io->endio_lock, flags);
 883                if (!(io->error > 0 && __noflush_suspending(md)))
 884                        io->error = error;
 885                spin_unlock_irqrestore(&io->endio_lock, flags);
 886        }
 887
 888        if (atomic_dec_and_test(&io->io_count)) {
 889                if (io->error == DM_ENDIO_REQUEUE) {
 890                        /*
 891                         * Target requested pushing back the I/O.
 892                         */
 893                        spin_lock_irqsave(&md->deferred_lock, flags);
 894                        if (__noflush_suspending(md))
 895                                bio_list_add_head(&md->deferred, io->bio);
 896                        else
 897                                /* noflush suspend was interrupted. */
 898                                io->error = -EIO;
 899                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 900                }
 901
 902                io_error = io->error;
 903                bio = io->bio;
 904                end_io_acct(io);
 905                free_io(md, io);
 906
 907                if (io_error == DM_ENDIO_REQUEUE)
 908                        return;
 909
 910                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) {
 911                        /*
 912                         * Preflush done for flush with data, reissue
 913                         * without REQ_FLUSH.
 914                         */
 915                        bio->bi_rw &= ~REQ_FLUSH;
 916                        queue_io(md, bio);
 917                } else {
 918                        /* done with normal IO or empty flush */
 919                        trace_block_bio_complete(md->queue, bio, io_error);
 920                        bio_endio(bio, io_error);
 921                }
 922        }
 923}
 924
 925static void disable_write_same(struct mapped_device *md)
 926{
 927        struct queue_limits *limits = dm_get_queue_limits(md);
 928
 929        /* device doesn't really support WRITE SAME, disable it */
 930        limits->max_write_same_sectors = 0;
 931}
 932
 933static void clone_endio(struct bio *bio, int error)
 934{
 935        int r = error;
 936        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 937        struct dm_io *io = tio->io;
 938        struct mapped_device *md = tio->io->md;
 939        dm_endio_fn endio = tio->ti->type->end_io;
 940
 941        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 942                error = -EIO;
 943
 944        if (endio) {
 945                r = endio(tio->ti, bio, error);
 946                if (r < 0 || r == DM_ENDIO_REQUEUE)
 947                        /*
 948                         * error and requeue request are handled
 949                         * in dec_pending().
 950                         */
 951                        error = r;
 952                else if (r == DM_ENDIO_INCOMPLETE)
 953                        /* The target will handle the io */
 954                        return;
 955                else if (r) {
 956                        DMWARN("unimplemented target endio return value: %d", r);
 957                        BUG();
 958                }
 959        }
 960
 961        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
 962                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
 963                disable_write_same(md);
 964
 965        free_tio(md, tio);
 966        dec_pending(io, error);
 967}
 968
 969/*
 970 * Partial completion handling for request-based dm
 971 */
 972static void end_clone_bio(struct bio *clone, int error)
 973{
 974        struct dm_rq_clone_bio_info *info =
 975                container_of(clone, struct dm_rq_clone_bio_info, clone);
 976        struct dm_rq_target_io *tio = info->tio;
 977        struct bio *bio = info->orig;
 978        unsigned int nr_bytes = info->orig->bi_iter.bi_size;
 979
 980        bio_put(clone);
 981
 982        if (tio->error)
 983                /*
 984                 * An error has already been detected on the request.
  985                  * Once an error has occurred, just let clone->end_io() handle
 986                 * the remainder.
 987                 */
 988                return;
 989        else if (error) {
 990                /*
  991                  * Don't report the error to the upper layer yet.
  992                  * The error handling decision is made by the target driver
  993                  * when the request is completed.
 994                 */
 995                tio->error = error;
 996                return;
 997        }
 998
 999        /*
1000         * I/O for the bio successfully completed.
 1001         * Report the data completion to the upper layer.
1002         */
1003
1004        /*
1005         * bios are processed from the head of the list.
1006         * So the completing bio should always be rq->bio.
 1007         * If it's not, something is wrong.
1008         */
1009        if (tio->orig->bio != bio)
1010                DMERR("bio completion is going in the middle of the request");
1011
1012        /*
1013         * Update the original request.
1014         * Do not use blk_end_request() here, because it may complete
1015         * the original request before the clone, and break the ordering.
1016         */
1017        blk_update_request(tio->orig, 0, nr_bytes);
1018}
1019
1020/*
1021 * Don't touch any member of the md after calling this function because
1022 * the md may be freed in dm_put() at the end of this function.
1023 * Or do dm_get() before calling this function and dm_put() later.
1024 */
1025static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
1026{
1027        atomic_dec(&md->pending[rw]);
1028
1029        /* nudge anyone waiting on suspend queue */
1030        if (!md_in_flight(md))
1031                wake_up(&md->wait);
1032
1033        /*
1034         * Run this off this callpath, as drivers could invoke end_io while
1035         * inside their request_fn (and holding the queue lock). Calling
1036         * back into ->request_fn() could deadlock attempting to grab the
1037         * queue lock again.
1038         */
1039        if (run_queue)
1040                blk_run_queue_async(md->queue);
1041
1042        /*
 1043         * dm_put() must be at the end of this function. See the comment above.
1044         */
1045        dm_put(md);
1046}
1047
1048static void free_rq_clone(struct request *clone)
1049{
1050        struct dm_rq_target_io *tio = clone->end_io_data;
1051
1052        blk_rq_unprep_clone(clone);
1053        if (clone->q && clone->q->mq_ops)
1054                tio->ti->type->release_clone_rq(clone);
1055        else
1056                free_clone_request(tio->md, clone);
1057        free_rq_tio(tio);
1058}
1059
1060/*
1061 * Complete the clone and the original request.
1062 * Must be called without clone's queue lock held,
1063 * see end_clone_request() for more details.
1064 */
1065static void dm_end_request(struct request *clone, int error)
1066{
1067        int rw = rq_data_dir(clone);
1068        struct dm_rq_target_io *tio = clone->end_io_data;
1069        struct mapped_device *md = tio->md;
1070        struct request *rq = tio->orig;
1071
1072        if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
1073                rq->errors = clone->errors;
1074                rq->resid_len = clone->resid_len;
1075
1076                if (rq->sense)
1077                        /*
1078                         * We are using the sense buffer of the original
1079                         * request.
1080                         * So setting the length of the sense data is enough.
1081                         */
1082                        rq->sense_len = clone->sense_len;
1083        }
1084
1085        free_rq_clone(clone);
1086        blk_end_request_all(rq, error);
1087        rq_completed(md, rw, true);
1088}
1089
1090static void dm_unprep_request(struct request *rq)
1091{
1092        struct dm_rq_target_io *tio = rq->special;
1093        struct request *clone = tio->clone;
1094
1095        rq->special = NULL;
1096        rq->cmd_flags &= ~REQ_DONTPREP;
1097
1098        if (clone)
1099                free_rq_clone(clone);
1100}
1101
1102/*
1103 * Requeue the original request of a clone.
1104 */
1105static void dm_requeue_unmapped_original_request(struct mapped_device *md,
1106                                                 struct request *rq)
1107{
1108        int rw = rq_data_dir(rq);
1109        struct request_queue *q = rq->q;
1110        unsigned long flags;
1111
1112        dm_unprep_request(rq);
1113
1114        spin_lock_irqsave(q->queue_lock, flags);
1115        blk_requeue_request(q, rq);
1116        spin_unlock_irqrestore(q->queue_lock, flags);
1117
1118        rq_completed(md, rw, false);
1119}
1120
1121static void dm_requeue_unmapped_request(struct request *clone)
1122{
1123        struct dm_rq_target_io *tio = clone->end_io_data;
1124
1125        dm_requeue_unmapped_original_request(tio->md, tio->orig);
1126}
1127
1128static void __stop_queue(struct request_queue *q)
1129{
1130        blk_stop_queue(q);
1131}
1132
1133static void stop_queue(struct request_queue *q)
1134{
1135        unsigned long flags;
1136
1137        spin_lock_irqsave(q->queue_lock, flags);
1138        __stop_queue(q);
1139        spin_unlock_irqrestore(q->queue_lock, flags);
1140}
1141
1142static void __start_queue(struct request_queue *q)
1143{
1144        if (blk_queue_stopped(q))
1145                blk_start_queue(q);
1146}
1147
1148static void start_queue(struct request_queue *q)
1149{
1150        unsigned long flags;
1151
1152        spin_lock_irqsave(q->queue_lock, flags);
1153        __start_queue(q);
1154        spin_unlock_irqrestore(q->queue_lock, flags);
1155}
1156
1157static void dm_done(struct request *clone, int error, bool mapped)
1158{
1159        int r = error;
1160        struct dm_rq_target_io *tio = clone->end_io_data;
1161        dm_request_endio_fn rq_end_io = NULL;
1162
1163        if (tio->ti) {
1164                rq_end_io = tio->ti->type->rq_end_io;
1165
1166                if (mapped && rq_end_io)
1167                        r = rq_end_io(tio->ti, clone, error, &tio->info);
1168        }
1169
1170        if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) &&
1171                     !clone->q->limits.max_write_same_sectors))
1172                disable_write_same(tio->md);
1173
1174        if (r <= 0)
1175                /* The target wants to complete the I/O */
1176                dm_end_request(clone, r);
1177        else if (r == DM_ENDIO_INCOMPLETE)
1178                /* The target will handle the I/O */
1179                return;
1180        else if (r == DM_ENDIO_REQUEUE)
1181                /* The target wants to requeue the I/O */
1182                dm_requeue_unmapped_request(clone);
1183        else {
1184                DMWARN("unimplemented target endio return value: %d", r);
1185                BUG();
1186        }
1187}
1188
1189/*
1190 * Request completion handler for request-based dm
1191 */
1192static void dm_softirq_done(struct request *rq)
1193{
1194        bool mapped = true;
1195        struct dm_rq_target_io *tio = rq->special;
1196        struct request *clone = tio->clone;
1197
1198        if (!clone) {
1199                blk_end_request_all(rq, tio->error);
1200                rq_completed(tio->md, rq_data_dir(rq), false);
1201                free_rq_tio(tio);
1202                return;
1203        }
1204
1205        if (rq->cmd_flags & REQ_FAILED)
1206                mapped = false;
1207
1208        dm_done(clone, tio->error, mapped);
1209}
1210
1211/*
1212 * Complete the clone and the original request with the error status
1213 * through softirq context.
1214 */
1215static void dm_complete_request(struct request *rq, int error)
1216{
1217        struct dm_rq_target_io *tio = rq->special;
1218
1219        tio->error = error;
1220        blk_complete_request(rq);
1221}
1222
1223/*
1224 * Complete the not-mapped clone and the original request with the error status
1225 * through softirq context.
1226 * Target's rq_end_io() function isn't called.
1227 * This may be used when the target's map_rq() or clone_and_map_rq() functions fail.
1228 */
1229static void dm_kill_unmapped_request(struct request *rq, int error)
1230{
1231        rq->cmd_flags |= REQ_FAILED;
1232        dm_complete_request(rq, error);
1233}
1234
1235/*
1236 * Called with the clone's queue lock held
1237 */
1238static void end_clone_request(struct request *clone, int error)
1239{
1240        struct dm_rq_target_io *tio = clone->end_io_data;
1241
1242        if (!clone->q->mq_ops) {
1243                /*
 1244                  * This just cleans up the bookkeeping for the queue on which
 1245                  * the clone was dispatched.
 1246                  * The clone is *NOT* actually freed here because it was allocated
 1247                  * from dm's own mempool (REQ_ALLOCED isn't set).
1248                 */
1249                __blk_put_request(clone->q, clone);
1250        }
1251
1252        /*
1253         * Actual request completion is done in a softirq context which doesn't
1254         * hold the clone's queue lock.  Otherwise, deadlock could occur because:
1255         *     - another request may be submitted by the upper level driver
1256         *       of the stacking during the completion
1257         *     - the submission which requires queue lock may be done
1258         *       against this clone's queue
1259         */
1260        dm_complete_request(tio->orig, error);
1261}
1262
1263/*
1264 * Return maximum size of I/O possible at the supplied sector up to the current
1265 * target boundary.
1266 */
1267static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1268{
1269        sector_t target_offset = dm_target_offset(ti, sector);
1270
1271        return ti->len - target_offset;
1272}
1273
1274static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1275{
1276        sector_t len = max_io_len_target_boundary(sector, ti);
1277        sector_t offset, max_len;
1278
1279        /*
1280         * Does the target need to split even further?
1281         */
1282        if (ti->max_io_len) {
1283                offset = dm_target_offset(ti, sector);
1284                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
1285                        max_len = sector_div(offset, ti->max_io_len);
1286                else
1287                        max_len = offset & (ti->max_io_len - 1);
1288                max_len = ti->max_io_len - max_len;
1289
1290                if (len > max_len)
1291                        len = max_len;
1292        }
1293
1294        return len;
1295}
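
/*
 * Worked example of the arithmetic above (illustrative numbers): for a
 * target with ti->len = 1000 sectors, ti->max_io_len = 256 and an I/O at
 * target offset 600, the boundary leaves 1000 - 600 = 400 sectors, the
 * offset within the current max_io_len chunk is 600 & 255 = 88, so
 * max_len = 256 - 88 = 168 and the I/O is capped at min(400, 168) = 168.
 */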
1296
1297int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1298{
1299        if (len > UINT_MAX) {
1300                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1301                      (unsigned long long)len, UINT_MAX);
1302                ti->error = "Maximum size of target IO is too large";
1303                return -EINVAL;
1304        }
1305
1306        ti->max_io_len = (uint32_t) len;
1307
1308        return 0;
1309}
1310EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
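
/*
 * Illustrative sketch (not part of dm.c): a target constructor capping its
 * per-bio I/O size so that no clone crosses a 64KiB chunk.  "example_ctr"
 * and the 128-sector value are hypothetical; targets such as dm-thin use
 * this helper the same way.
 */
static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
        int r = dm_set_target_max_io_len(ti, 128);      /* 128 * 512B = 64KiB */

        if (r)
                return r;       /* ti->error was set by the helper */

        return 0;
}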
1311
1312/*
1313 * A target may call dm_accept_partial_bio only from the map routine.  It is
1314 * allowed for all bio types except REQ_FLUSH.
1315 *
1316 * dm_accept_partial_bio informs the dm that the target only wants to process
1317 * additional n_sectors sectors of the bio and the rest of the data should be
 1318 * sent in a subsequent bio.
1319 *
1320 * A diagram that explains the arithmetics:
1321 * +--------------------+---------------+-------+
1322 * |         1          |       2       |   3   |
1323 * +--------------------+---------------+-------+
1324 *
1325 * <-------------- *tio->len_ptr --------------->
1326 *                      <------- bi_size ------->
1327 *                      <-- n_sectors -->
1328 *
1329 * Region 1 was already iterated over with bio_advance or similar function.
1330 *      (it may be empty if the target doesn't use bio_advance)
1331 * Region 2 is the remaining bio size that the target wants to process.
1332 *      (it may be empty if region 1 is non-empty, although there is no reason
1333 *       to make it empty)
1334 * The target requires that region 3 is to be sent in the next bio.
1335 *
1336 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1337 * the partially processed part (the sum of regions 1+2) must be the same for all
1338 * copies of the bio.
1339 */
1340void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1341{
1342        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1343        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1344        BUG_ON(bio->bi_rw & REQ_FLUSH);
1345        BUG_ON(bi_size > *tio->len_ptr);
1346        BUG_ON(n_sectors > bi_size);
1347        *tio->len_ptr -= bi_size - n_sectors;
1348        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1349}
1350EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
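
/*
 * Illustrative sketch (not part of dm.c): a map function that only accepts
 * the first chunk of a large bio and lets core dm resubmit the remainder
 * (region 3 in the diagram above) as a fresh bio.  "example_map" and its
 * 8-sector chunk are hypothetical; dm-snapshot uses the same pattern.
 */
static int example_map(struct dm_target *ti, struct bio *bio)
{
        struct dm_dev *dev = ti->private;       /* set up by a hypothetical ctr */
        unsigned chunk_sectors = 8;

        /* keep only region 2; dm will resend region 3 later */
        if (bio_sectors(bio) > chunk_sectors)
                dm_accept_partial_bio(bio, chunk_sectors);

        bio->bi_bdev = dev->bdev;
        bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);

        return DM_MAPIO_REMAPPED;
}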
1351
1352static void __map_bio(struct dm_target_io *tio)
1353{
1354        int r;
1355        sector_t sector;
1356        struct mapped_device *md;
1357        struct bio *clone = &tio->clone;
1358        struct dm_target *ti = tio->ti;
1359
1360        clone->bi_end_io = clone_endio;
1361
1362        /*
1363         * Map the clone.  If r == 0 we don't need to do
1364         * anything, the target has assumed ownership of
1365         * this io.
1366         */
1367        atomic_inc(&tio->io->io_count);
1368        sector = clone->bi_iter.bi_sector;
1369        r = ti->type->map(ti, clone);
1370        if (r == DM_MAPIO_REMAPPED) {
1371                /* the bio has been remapped so dispatch it */
1372
1373                trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1374                                      tio->io->bio->bi_bdev->bd_dev, sector);
1375
1376                generic_make_request(clone);
1377        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1378                /* error the io and bail out, or requeue it if needed */
1379                md = tio->io->md;
1380                dec_pending(tio->io, r);
1381                free_tio(md, tio);
1382        } else if (r) {
1383                DMWARN("unimplemented target map return value: %d", r);
1384                BUG();
1385        }
1386}
1387
1388struct clone_info {
1389        struct mapped_device *md;
1390        struct dm_table *map;
1391        struct bio *bio;
1392        struct dm_io *io;
1393        sector_t sector;
1394        unsigned sector_count;
1395};
1396
1397static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1398{
1399        bio->bi_iter.bi_sector = sector;
1400        bio->bi_iter.bi_size = to_bytes(len);
1401}
1402
1403/*
1404 * Creates a bio that consists of range of complete bvecs.
1405 */
1406static void clone_bio(struct dm_target_io *tio, struct bio *bio,
1407                      sector_t sector, unsigned len)
1408{
1409        struct bio *clone = &tio->clone;
1410
1411        __bio_clone_fast(clone, bio);
1412
1413        if (bio_integrity(bio))
1414                bio_integrity_clone(clone, bio, GFP_NOIO);
1415
1416        bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1417        clone->bi_iter.bi_size = to_bytes(len);
1418
1419        if (bio_integrity(bio))
1420                bio_integrity_trim(clone, 0, len);
1421}
1422
1423static struct dm_target_io *alloc_tio(struct clone_info *ci,
1424                                      struct dm_target *ti,
1425                                      unsigned target_bio_nr)
1426{
1427        struct dm_target_io *tio;
1428        struct bio *clone;
1429
1430        clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1431        tio = container_of(clone, struct dm_target_io, clone);
1432
1433        tio->io = ci->io;
1434        tio->ti = ti;
1435        tio->target_bio_nr = target_bio_nr;
1436
1437        return tio;
1438}
1439
1440static void __clone_and_map_simple_bio(struct clone_info *ci,
1441                                       struct dm_target *ti,
1442                                       unsigned target_bio_nr, unsigned *len)
1443{
1444        struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr);
1445        struct bio *clone = &tio->clone;
1446
1447        tio->len_ptr = len;
1448
1449        __bio_clone_fast(clone, ci->bio);
1450        if (len)
1451                bio_setup_sector(clone, ci->sector, *len);
1452
1453        __map_bio(tio);
1454}
1455
1456static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1457                                  unsigned num_bios, unsigned *len)
1458{
1459        unsigned target_bio_nr;
1460
1461        for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1462                __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1463}
1464
1465static int __send_empty_flush(struct clone_info *ci)
1466{
1467        unsigned target_nr = 0;
1468        struct dm_target *ti;
1469
1470        BUG_ON(bio_has_data(ci->bio));
1471        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1472                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1473
1474        return 0;
1475}
1476
1477static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1478                                     sector_t sector, unsigned *len)
1479{
1480        struct bio *bio = ci->bio;
1481        struct dm_target_io *tio;
1482        unsigned target_bio_nr;
1483        unsigned num_target_bios = 1;
1484
1485        /*
1486         * Does the target want to receive duplicate copies of the bio?
1487         */
1488        if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1489                num_target_bios = ti->num_write_bios(ti, bio);
1490
1491        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1492                tio = alloc_tio(ci, ti, target_bio_nr);
1493                tio->len_ptr = len;
1494                clone_bio(tio, bio, sector, *len);
1495                __map_bio(tio);
1496        }
1497}
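
/*
 * Illustrative sketch (not part of dm.c): a target that wants every WRITE
 * cloned twice (for example to keep two on-disk copies) can supply the
 * num_write_bios hook consulted above.  "example_num_write_bios" is
 * hypothetical.
 */
static unsigned example_num_write_bios(struct dm_target *ti, struct bio *bio)
{
        return 2;       /* core dm allocates and maps two clones of this bio */
}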
1498
1499typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1500
1501static unsigned get_num_discard_bios(struct dm_target *ti)
1502{
1503        return ti->num_discard_bios;
1504}
1505
1506static unsigned get_num_write_same_bios(struct dm_target *ti)
1507{
1508        return ti->num_write_same_bios;
1509}
1510
1511typedef bool (*is_split_required_fn)(struct dm_target *ti);
1512
1513static bool is_split_required_for_discard(struct dm_target *ti)
1514{
1515        return ti->split_discard_bios;
1516}
1517
1518static int __send_changing_extent_only(struct clone_info *ci,
1519                                       get_num_bios_fn get_num_bios,
1520                                       is_split_required_fn is_split_required)
1521{
1522        struct dm_target *ti;
1523        unsigned len;
1524        unsigned num_bios;
1525
1526        do {
1527                ti = dm_table_find_target(ci->map, ci->sector);
1528                if (!dm_target_is_valid(ti))
1529                        return -EIO;
1530
1531                /*
1532                 * Even though the device advertised support for this type of
1533                 * request, that does not mean every target supports it, and
1534                 * reconfiguration might also have changed that since the
1535                 * check was performed.
1536                 */
1537                num_bios = get_num_bios ? get_num_bios(ti) : 0;
1538                if (!num_bios)
1539                        return -EOPNOTSUPP;
1540
1541                if (is_split_required && !is_split_required(ti))
1542                        len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1543                else
1544                        len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1545
1546                __send_duplicate_bios(ci, ti, num_bios, &len);
1547
1548                ci->sector += len;
1549        } while (ci->sector_count -= len);
1550
1551        return 0;
1552}
1553
1554static int __send_discard(struct clone_info *ci)
1555{
1556        return __send_changing_extent_only(ci, get_num_discard_bios,
1557                                           is_split_required_for_discard);
1558}
1559
1560static int __send_write_same(struct clone_info *ci)
1561{
1562        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1563}
1564
1565/*
1566 * Select the correct strategy for processing a non-flush bio.
1567 */
1568static int __split_and_process_non_flush(struct clone_info *ci)
1569{
1570        struct bio *bio = ci->bio;
1571        struct dm_target *ti;
1572        unsigned len;
1573
1574        if (unlikely(bio->bi_rw & REQ_DISCARD))
1575                return __send_discard(ci);
1576        else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1577                return __send_write_same(ci);
1578
1579        ti = dm_table_find_target(ci->map, ci->sector);
1580        if (!dm_target_is_valid(ti))
1581                return -EIO;
1582
1583        len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
1584
1585        __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1586
1587        ci->sector += len;
1588        ci->sector_count -= len;
1589
1590        return 0;
1591}
1592
1593/*
1594 * Entry point to split a bio into clones and submit them to the targets.
1595 */
1596static void __split_and_process_bio(struct mapped_device *md,
1597                                    struct dm_table *map, struct bio *bio)
1598{
1599        struct clone_info ci;
1600        int error = 0;
1601
1602        if (unlikely(!map)) {
1603                bio_io_error(bio);
1604                return;
1605        }
1606
1607        ci.map = map;
1608        ci.md = md;
1609        ci.io = alloc_io(md);
1610        ci.io->error = 0;
1611        atomic_set(&ci.io->io_count, 1);
1612        ci.io->bio = bio;
1613        ci.io->md = md;
1614        spin_lock_init(&ci.io->endio_lock);
1615        ci.sector = bio->bi_iter.bi_sector;
1616
1617        start_io_acct(ci.io);
1618
1619        if (bio->bi_rw & REQ_FLUSH) {
1620                ci.bio = &ci.md->flush_bio;
1621                ci.sector_count = 0;
1622                error = __send_empty_flush(&ci);
1623                /* dec_pending submits any data associated with flush */
1624        } else {
1625                ci.bio = bio;
1626                ci.sector_count = bio_sectors(bio);
1627                while (ci.sector_count && !error)
1628                        error = __split_and_process_non_flush(&ci);
1629        }
1630
1631        /* drop the extra reference count */
1632        dec_pending(ci.io, error);
1633}
1634/*-----------------------------------------------------------------
1635 * CRUD END
1636 *---------------------------------------------------------------*/
1637
1638static int dm_merge_bvec(struct request_queue *q,
1639                         struct bvec_merge_data *bvm,
1640                         struct bio_vec *biovec)
1641{
1642        struct mapped_device *md = q->queuedata;
1643        struct dm_table *map = dm_get_live_table_fast(md);
1644        struct dm_target *ti;
1645        sector_t max_sectors;
1646        int max_size = 0;
1647
1648        if (unlikely(!map))
1649                goto out;
1650
1651        ti = dm_table_find_target(map, bvm->bi_sector);
1652        if (!dm_target_is_valid(ti))
1653                goto out;
1654
1655        /*
1656         * Find maximum amount of I/O that won't need splitting
1657         */
1658        max_sectors = min(max_io_len(bvm->bi_sector, ti),
1659                          (sector_t) queue_max_sectors(q));
1660        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1661        if (unlikely(max_size < 0)) /* this shouldn't _ever_ happen */
1662                max_size = 0;
1663
1664        /*
1665         * merge_bvec_fn() returns number of bytes
1666         * it can accept at this offset
1667         * max is precomputed maximal io size
1668         */
1669        if (max_size && ti->type->merge)
1670                max_size = ti->type->merge(ti, bvm, biovec, max_size);
1671        /*
 1672         * If the target doesn't support a merge method and some of the devices
1673         * provided their merge_bvec method (we know this by looking for the
1674         * max_hw_sectors that dm_set_device_limits may set), then we can't
1675         * allow bios with multiple vector entries.  So always set max_size
1676         * to 0, and the code below allows just one page.
1677         */
1678        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1679                max_size = 0;
1680
1681out:
1682        dm_put_live_table_fast(md);
1683        /*
1684         * Always allow an entire first page
1685         */
1686        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1687                max_size = biovec->bv_len;
1688
1689        return max_size;
1690}
1691
1692/*
1693 * The request function that just remaps the bio built up by
1694 * dm_merge_bvec.
1695 */
1696static void _dm_request(struct request_queue *q, struct bio *bio)
1697{
1698        int rw = bio_data_dir(bio);
1699        struct mapped_device *md = q->queuedata;
1700        int srcu_idx;
1701        struct dm_table *map;
1702
1703        map = dm_get_live_table(md, &srcu_idx);
1704
1705        generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0);
1706
1707        /* if we're suspended, we have to queue this io for later */
1708        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1709                dm_put_live_table(md, srcu_idx);
1710
1711                if (bio_rw(bio) != READA)
1712                        queue_io(md, bio);
1713                else
1714                        bio_io_error(bio);
1715                return;
1716        }
1717
1718        __split_and_process_bio(md, map, bio);
1719        dm_put_live_table(md, srcu_idx);
1720        return;
1721}
1722
1723int dm_request_based(struct mapped_device *md)
1724{
1725        return blk_queue_stackable(md->queue);
1726}
1727
1728static void dm_request(struct request_queue *q, struct bio *bio)
1729{
1730        struct mapped_device *md = q->queuedata;
1731
1732        if (dm_request_based(md))
1733                blk_queue_bio(q, bio);
1734        else
1735                _dm_request(q, bio);
1736}
1737
1738static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
1739{
1740        int r;
1741
1742        if (blk_queue_io_stat(clone->q))
1743                clone->cmd_flags |= REQ_IO_STAT;
1744
1745        clone->start_time = jiffies;
1746        r = blk_insert_cloned_request(clone->q, clone);
1747        if (r)
1748                /* must complete clone in terms of original request */
1749                dm_complete_request(rq, r);
1750}
1751
1752static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1753                                 void *data)
1754{
1755        struct dm_rq_target_io *tio = data;
1756        struct dm_rq_clone_bio_info *info =
1757                container_of(bio, struct dm_rq_clone_bio_info, clone);
1758
1759        info->orig = bio_orig;
1760        info->tio = tio;
1761        bio->bi_end_io = end_clone_bio;
1762
1763        return 0;
1764}
1765
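/*
 * Prepare a request clone: copy the command fields of the original,
 * clone its bios from tio->md->bs (wiring up end_clone_bio via
 * dm_rq_bio_constructor above) and set end_clone_request as the
 * completion handler.
 */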
1766static int setup_clone(struct request *clone, struct request *rq,
1767                       struct dm_rq_target_io *tio, gfp_t gfp_mask)
1768{
1769        int r;
1770
1771        r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask,
1772                              dm_rq_bio_constructor, tio);
1773        if (r)
1774                return r;
1775
1776        clone->cmd = rq->cmd;
1777        clone->cmd_len = rq->cmd_len;
1778        clone->sense = rq->sense;
1779        clone->end_io = end_clone_request;
1780        clone->end_io_data = tio;
1781
1782        tio->clone = clone;
1783
1784        return 0;
1785}
1786
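/*
 * Allocate a clone via alloc_clone_request() and prepare it with
 * setup_clone().  Returns NULL on failure so the caller can defer
 * the original request.
 */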
1787static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1788                                struct dm_rq_target_io *tio, gfp_t gfp_mask)
1789{
1790        struct request *clone = alloc_clone_request(md, gfp_mask);
1791
1792        if (!clone)
1793                return NULL;
1794
1795        blk_rq_init(NULL, clone);
1796        if (setup_clone(clone, rq, tio, gfp_mask)) {
1797                /* -ENOMEM */
1798                free_clone_request(md, clone);
1799                return NULL;
1800        }
1801
1802        return clone;
1803}
1804
1805static void map_tio_request(struct kthread_work *work);
1806
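/*
 * Allocate and initialise the per-request tio.  For tables that are
 * not blk-mq request-based the clone request is allocated up front
 * here; otherwise the target allocates it later through
 * clone_and_map_rq() in map_request().
 */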
1807static struct dm_rq_target_io *prep_tio(struct request *rq,
1808                                        struct mapped_device *md, gfp_t gfp_mask)
1809{
1810        struct dm_rq_target_io *tio;
1811        int srcu_idx;
1812        struct dm_table *table;
1813
1814        tio = alloc_rq_tio(md, gfp_mask);
1815        if (!tio)
1816                return NULL;
1817
1818        tio->md = md;
1819        tio->ti = NULL;
1820        tio->clone = NULL;
1821        tio->orig = rq;
1822        tio->error = 0;
1823        memset(&tio->info, 0, sizeof(tio->info));
1824        init_kthread_work(&tio->work, map_tio_request);
1825
1826        table = dm_get_live_table(md, &srcu_idx);
1827        if (!dm_table_mq_request_based(table)) {
1828                if (!clone_rq(rq, md, tio, gfp_mask)) {
1829                        dm_put_live_table(md, srcu_idx);
1830                        free_rq_tio(tio);
1831                        return NULL;
1832                }
1833        }
1834        dm_put_live_table(md, srcu_idx);
1835
1836        return tio;
1837}
1838
1839/*
1840 * Called with the queue lock held.
1841 */
1842static int dm_prep_fn(struct request_queue *q, struct request *rq)
1843{
1844        struct mapped_device *md = q->queuedata;
1845        struct dm_rq_target_io *tio;
1846
1847        if (unlikely(rq->special)) {
1848                DMWARN("Already has something in rq->special.");
1849                return BLKPREP_KILL;
1850        }
1851
1852        tio = prep_tio(rq, md, GFP_ATOMIC);
1853        if (!tio)
1854                return BLKPREP_DEFER;
1855
1856        rq->special = tio;
1857        rq->cmd_flags |= REQ_DONTPREP;
1858
1859        return BLKPREP_OK;
1860}
1861
1862/*
1863 * Returns:
1864 * 0                : the request has been processed
1865 * DM_MAPIO_REQUEUE : the original request needs to be requeued
1866 * < 0              : the request was completed due to failure
1867 */
1868static int map_request(struct dm_target *ti, struct request *rq,
1869                       struct mapped_device *md)
1870{
1871        int r;
1872        struct dm_rq_target_io *tio = rq->special;
1873        struct request *clone = NULL;
1874
1875        if (tio->clone) {
1876                clone = tio->clone;
1877                r = ti->type->map_rq(ti, clone, &tio->info);
1878        } else {
1879                r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
1880                if (r < 0) {
1881                        /* The target wants to complete the I/O */
1882                        dm_kill_unmapped_request(rq, r);
1883                        return r;
1884                }
1885                if (IS_ERR(clone))
1886                        return DM_MAPIO_REQUEUE;
1887                if (setup_clone(clone, rq, tio, GFP_KERNEL)) {
1888                        /* -ENOMEM */
1889                        ti->type->release_clone_rq(clone);
1890                        return DM_MAPIO_REQUEUE;
1891                }
1892        }
1893
1894        switch (r) {
1895        case DM_MAPIO_SUBMITTED:
1896                /* The target has taken the I/O to submit by itself later */
1897                break;
1898        case DM_MAPIO_REMAPPED:
1899                /* The target has remapped the I/O so dispatch it */
1900                trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1901                                     blk_rq_pos(rq));
1902                dm_dispatch_clone_request(clone, rq);
1903                break;
1904        case DM_MAPIO_REQUEUE:
1905                /* The target wants to requeue the I/O */
1906                dm_requeue_unmapped_request(clone);
1907                break;
1908        default:
1909                if (r > 0) {
1910                        DMWARN("unimplemented target map return value: %d", r);
1911                        BUG();
1912                }
1913
1914                /* The target wants to complete the I/O */
1915                dm_kill_unmapped_request(rq, r);
1916                return r;
1917        }
1918
1919        return 0;
1920}
1921
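/*
 * kthread worker callback: run map_request() for the tio queued by
 * dm_request_fn() and requeue the original request if the target
 * returned DM_MAPIO_REQUEUE.
 */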
1922static void map_tio_request(struct kthread_work *work)
1923{
1924        struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work);
1925        struct request *rq = tio->orig;
1926        struct mapped_device *md = tio->md;
1927
1928        if (map_request(tio->ti, rq, md) == DM_MAPIO_REQUEUE)
1929                dm_requeue_unmapped_original_request(md, rq);
1930}
1931
1932static void dm_start_request(struct mapped_device *md, struct request *orig)
1933{
1934        blk_start_request(orig);
1935        atomic_inc(&md->pending[rq_data_dir(orig)]);
1936
1937        /*
1938         * Hold an md reference here for the in-flight I/O.
1939         * We can't rely on the reference count taken by the device opener,
1940         * because the device may be closed during request completion
1941         * when all bios are completed.
1942         * See the comment in rq_completed() too.
1943         */
1944        dm_get(md);
1945}
1946
1947/*
1948 * q->request_fn for request-based dm.
1949 * Called with the queue lock held.
1950 */
1951static void dm_request_fn(struct request_queue *q)
1952{
1953        struct mapped_device *md = q->queuedata;
1954        int srcu_idx;
1955        struct dm_table *map = dm_get_live_table(md, &srcu_idx);
1956        struct dm_target *ti;
1957        struct request *rq;
1958        struct dm_rq_target_io *tio;
1959        sector_t pos;
1960
1961        /*
1962         * For suspend, check blk_queue_stopped() and increment
1963         * ->pending under a single queue_lock so that the number of
1964         * in-flight I/Os is not incremented after the queue has been
1965         * stopped in dm_suspend().
1966         */
1967        while (!blk_queue_stopped(q)) {
1968                rq = blk_peek_request(q);
1969                if (!rq)
1970                        goto delay_and_out;
1971
1972                /* always use block 0 to find the target for flushes for now */
1973                pos = 0;
1974                if (!(rq->cmd_flags & REQ_FLUSH))
1975                        pos = blk_rq_pos(rq);
1976
1977                ti = dm_table_find_target(map, pos);
1978                if (!dm_target_is_valid(ti)) {
1979                        /*
1980                         * Must perform the setup that rq_completed() requires
1981                         * before calling dm_kill_unmapped_request().
1982                         */
1983                        DMERR_LIMIT("request attempted access beyond the end of device");
1984                        dm_start_request(md, rq);
1985                        dm_kill_unmapped_request(rq, -EIO);
1986                        continue;
1987                }
1988
1989                if (ti->type->busy && ti->type->busy(ti))
1990                        goto delay_and_out;
1991
1992                dm_start_request(md, rq);
1993
1994                tio = rq->special;
1995                /* Establish tio->ti before queuing work (map_tio_request) */
1996                tio->ti = ti;
1997                queue_kthread_work(&md->kworker, &tio->work);
1998                BUG_ON(!irqs_disabled());
1999        }
2000
2001        goto out;
2002
2003delay_and_out:
2004        blk_delay_queue(q, HZ / 10);
2005out:
2006        dm_put_live_table(md, srcu_idx);
2007}
2008
2009int dm_underlying_device_busy(struct request_queue *q)
2010{
2011        return blk_lld_busy(q);
2012}
2013EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
2014
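/*
 * ->lld_busy_fn for the md queue: report busy while there is no live
 * table or I/O is blocked for suspend, otherwise ask the targets via
 * dm_table_any_busy_target().
 */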
2015static int dm_lld_busy(struct request_queue *q)
2016{
2017        int r;
2018        struct mapped_device *md = q->queuedata;
2019        struct dm_table *map = dm_get_live_table_fast(md);
2020
2021        if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
2022                r = 1;
2023        else
2024                r = dm_table_any_busy_target(map);
2025
2026        dm_put_live_table_fast(md);
2027
2028        return r;
2029}
2030
2031static int dm_any_congested(void *congested_data, int bdi_bits)
2032{
2033        int r = bdi_bits;
2034        struct mapped_device *md = congested_data;
2035        struct dm_table *map;
2036
2037        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2038                map = dm_get_live_table_fast(md);
2039                if (map) {
2040                        /*
2041                         * Request-based dm only cares about its own queue when
2042                         * queried about the congestion status of the request_queue.
2043                         */
2044                        if (dm_request_based(md))
2045                                r = md->queue->backing_dev_info.state &
2046                                    bdi_bits;
2047                        else
2048                                r = dm_table_any_congested(map, bdi_bits);
2049                }
2050                dm_put_live_table_fast(md);
2051        }
2052
2053        return r;
2054}
2055
2056/*-----------------------------------------------------------------
2057 * An IDR is used to keep track of allocated minor numbers.
2058 *---------------------------------------------------------------*/
2059static void free_minor(int minor)
2060{
2061        spin_lock(&_minor_lock);
2062        idr_remove(&_minor_idr, minor);
2063        spin_unlock(&_minor_lock);
2064}
2065
2066/*
2067 * See if the device with a specific minor # is free.
2068 */
2069static int specific_minor(int minor)
2070{
2071        int r;
2072
2073        if (minor >= (1 << MINORBITS))
2074                return -EINVAL;
2075
2076        idr_preload(GFP_KERNEL);
2077        spin_lock(&_minor_lock);
2078
2079        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
2080
2081        spin_unlock(&_minor_lock);
2082        idr_preload_end();
2083        if (r < 0)
2084                return r == -ENOSPC ? -EBUSY : r;
2085        return 0;
2086}
2087
2088static int next_free_minor(int *minor)
2089{
2090        int r;
2091
2092        idr_preload(GFP_KERNEL);
2093        spin_lock(&_minor_lock);
2094
2095        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
2096
2097        spin_unlock(&_minor_lock);
2098        idr_preload_end();
2099        if (r < 0)
2100                return r;
2101        *minor = r;
2102        return 0;
2103}
2104
2105static const struct block_device_operations dm_blk_dops;
2106
2107static void dm_wq_work(struct work_struct *work);
2108
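/*
 * Basic queue setup shared by all mapped devices: route bios through
 * dm_request(), congestion queries through dm_any_congested(), and
 * install dm_merge_bvec() as the merge_bvec_fn.
 */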
2109static void dm_init_md_queue(struct mapped_device *md)
2110{
2111        /*
2112         * Request-based dm devices cannot be stacked on top of bio-based dm
2113         * devices.  The type of this dm device has not been decided yet.
2114         * The type is decided at the first table loading time.
2115         * To prevent problematic device stacking, clear the queue flag
2116         * for request stacking support until then.
2117         *
2118         * This queue is new, so no concurrency on the queue_flags.
2119         */
2120        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
2121
2122        md->queue->queuedata = md;
2123        md->queue->backing_dev_info.congested_fn = dm_any_congested;
2124        md->queue->backing_dev_info.congested_data = md;
2125        blk_queue_make_request(md->queue, dm_request);
2126        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
2127        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2128}
2129
2130/*
2131 * Allocate and initialise a blank device with a given minor.
2132 */
2133static struct mapped_device *alloc_dev(int minor)
2134{
2135        int r;
2136        struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
2137        void *old_md;
2138
2139        if (!md) {
2140                DMWARN("unable to allocate device, out of memory.");
2141                return NULL;
2142        }
2143
2144        if (!try_module_get(THIS_MODULE))
2145                goto bad_module_get;
2146
2147        /* get a minor number for the dev */
2148        if (minor == DM_ANY_MINOR)
2149                r = next_free_minor(&minor);
2150        else
2151                r = specific_minor(minor);
2152        if (r < 0)
2153                goto bad_minor;
2154
2155        r = init_srcu_struct(&md->io_barrier);
2156        if (r < 0)
2157                goto bad_io_barrier;
2158
2159        md->type = DM_TYPE_NONE;
2160        mutex_init(&md->suspend_lock);
2161        mutex_init(&md->type_lock);
2162        mutex_init(&md->table_devices_lock);
2163        spin_lock_init(&md->deferred_lock);
2164        atomic_set(&md->holders, 1);
2165        atomic_set(&md->open_count, 0);
2166        atomic_set(&md->event_nr, 0);
2167        atomic_set(&md->uevent_seq, 0);
2168        INIT_LIST_HEAD(&md->uevent_list);
2169        INIT_LIST_HEAD(&md->table_devices);
2170        spin_lock_init(&md->uevent_lock);
2171
2172        md->queue = blk_alloc_queue(GFP_KERNEL);
2173        if (!md->queue)
2174                goto bad_queue;
2175
2176        dm_init_md_queue(md);
2177
2178        md->disk = alloc_disk(1);
2179        if (!md->disk)
2180                goto bad_disk;
2181
2182        atomic_set(&md->pending[0], 0);
2183        atomic_set(&md->pending[1], 0);
2184        init_waitqueue_head(&md->wait);
2185        INIT_WORK(&md->work, dm_wq_work);
2186        init_waitqueue_head(&md->eventq);
2187        init_completion(&md->kobj_holder.completion);
2188        md->kworker_task = NULL;
2189
2190        md->disk->major = _major;
2191        md->disk->first_minor = minor;
2192        md->disk->fops = &dm_blk_dops;
2193        md->disk->queue = md->queue;
2194        md->disk->private_data = md;
2195        sprintf(md->disk->disk_name, "dm-%d", minor);
2196        add_disk(md->disk);
2197        format_dev_t(md->name, MKDEV(_major, minor));
2198
2199        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
2200        if (!md->wq)
2201                goto bad_thread;
2202
2203        md->bdev = bdget_disk(md->disk, 0);
2204        if (!md->bdev)
2205                goto bad_bdev;
2206
2207        bio_init(&md->flush_bio);
2208        md->flush_bio.bi_bdev = md->bdev;
2209        md->flush_bio.bi_rw = WRITE_FLUSH;
2210
2211        dm_stats_init(&md->stats);
2212
2213        /* Populate the mapping, nobody knows we exist yet */
2214        spin_lock(&_minor_lock);
2215        old_md = idr_replace(&_minor_idr, md, minor);
2216        spin_unlock(&_minor_lock);
2217
2218        BUG_ON(old_md != MINOR_ALLOCED);
2219
2220        return md;
2221
2222bad_bdev:
2223        destroy_workqueue(md->wq);
2224bad_thread:
2225        del_gendisk(md->disk);
2226        put_disk(md->disk);
2227bad_disk:
2228        blk_cleanup_queue(md->queue);
2229bad_queue:
2230        cleanup_srcu_struct(&md->io_barrier);
2231bad_io_barrier:
2232        free_minor(minor);
2233bad_minor:
2234        module_put(THIS_MODULE);
2235bad_module_get:
2236        kfree(md);
2237        return NULL;
2238}
2239
2240static void unlock_fs(struct mapped_device *md);
2241
2242static void free_dev(struct mapped_device *md)
2243{
2244        int minor = MINOR(disk_devt(md->disk));
2245
2246        unlock_fs(md);
2247        destroy_workqueue(md->wq);
2248
2249        if (md->kworker_task)
2250                kthread_stop(md->kworker_task);
2251        if (md->io_pool)
2252                mempool_destroy(md->io_pool);
2253        if (md->rq_pool)
2254                mempool_destroy(md->rq_pool);
2255        if (md->bs)
2256                bioset_free(md->bs);
2257
2258        cleanup_srcu_struct(&md->io_barrier);
2259        free_table_devices(&md->table_devices);
2260        dm_stats_cleanup(&md->stats);
2261
2262        spin_lock(&_minor_lock);
2263        md->disk->private_data = NULL;
2264        spin_unlock(&_minor_lock);
2265        if (blk_get_integrity(md->disk))
2266                blk_integrity_unregister(md->disk);
2267        del_gendisk(md->disk);
2268        put_disk(md->disk);
2269        blk_cleanup_queue(md->queue);
2270        bdput(md->bdev);
2271        free_minor(minor);
2272
2273        module_put(THIS_MODULE);
2274        kfree(md);
2275}
2276
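/*
 * Adopt the mempools/bioset that came with the table.  If the md is
 * already populated, only a bio-based table's bioset is swapped in
 * (its front_pad may differ); anything left in the table's pools is
 * freed at the end.
 */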
2277static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2278{
2279        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
2280
2281        if (md->io_pool && md->bs) {
2282                /* The md already has necessary mempools. */
2283                if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) {
2284                        /*
2285                         * Reload the bioset because front_pad may have changed
2286                         * now that a different table was loaded.
2287                         */
2288                        bioset_free(md->bs);
2289                        md->bs = p->bs;
2290                        p->bs = NULL;
2291                }
2292                /*
2293                 * There's no need to reload with request-based dm
2294                 * because the size of front_pad doesn't change.
2295                 * Note for the future: if you ever do reload the bioset,
2296                 * prep-ed requests in the queue may still refer to bios
2297                 * from the old bioset, so you must walk through the
2298                 * queue and unprep them first.
2299                 */
2300                goto out;
2301        }
2302
2303        BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
2304
2305        md->io_pool = p->io_pool;
2306        p->io_pool = NULL;
2307        md->rq_pool = p->rq_pool;
2308        p->rq_pool = NULL;
2309        md->bs = p->bs;
2310        p->bs = NULL;
2311
2312out:
2313        /* mempool bind completed, the table no longer needs any mempools */
2314        dm_table_free_md_mempools(t);
2315}
2316
2317/*
2318 * Called via dm_table_event_callback() when the bound table reports an
2319 * event: send any queued uevents and wake waiters on md->eventq.
2320 */
2320static void event_callback(void *context)
2321{
2322        unsigned long flags;
2323        LIST_HEAD(uevents);
2324        struct mapped_device *md = (struct mapped_device *) context;
2325
2326        spin_lock_irqsave(&md->uevent_lock, flags);
2327        list_splice_init(&md->uevent_list, &uevents);
2328        spin_unlock_irqrestore(&md->uevent_lock, flags);
2329
2330        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2331
2332        atomic_inc(&md->event_nr);
2333        wake_up(&md->eventq);
2334}
2335
2336/*
2337 * Protected by md->suspend_lock obtained by dm_swap_table().
2338 */
2339static void __set_size(struct mapped_device *md, sector_t size)
2340{
2341        set_capacity(md->disk, size);
2342
2343        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2344}
2345
2346/*
2347 * Return 1 if the queue has a compulsory merge_bvec_fn function.
2348 *
2349 * If this function returns 0, then the device is either a non-dm
2350 * device without a merge_bvec_fn, or it is a dm device that is
2351 * able to split any bios it receives that are too big.
2352 */
2353int dm_queue_merge_is_compulsory(struct request_queue *q)
2354{
2355        struct mapped_device *dev_md;
2356
2357        if (!q->merge_bvec_fn)
2358                return 0;
2359
2360        if (q->make_request_fn == dm_request) {
2361                dev_md = q->queuedata;
2362                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2363                        return 0;
2364        }
2365
2366        return 1;
2367}
2368
2369static int dm_device_merge_is_compulsory(struct dm_target *ti,
2370                                         struct dm_dev *dev, sector_t start,
2371                                         sector_t len, void *data)
2372{
2373        struct block_device *bdev = dev->bdev;
2374        struct request_queue *q = bdev_get_queue(bdev);
2375
2376        return dm_queue_merge_is_compulsory(q);
2377}
2378
2379/*
2380 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2381 * on the properties of the underlying devices.
2382 */
2383static int dm_table_merge_is_optional(struct dm_table *table)
2384{
2385        unsigned i = 0;
2386        struct dm_target *ti;
2387
2388        while (i < dm_table_get_num_targets(table)) {
2389                ti = dm_table_get_target(table, i++);
2390
2391                if (ti->type->iterate_devices &&
2392                    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2393                        return 0;
2394        }
2395
2396        return 1;
2397}
2398
2399/*
2400 * Returns old map, which caller must destroy.
2401 */
2402static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2403                               struct queue_limits *limits)
2404{
2405        struct dm_table *old_map;
2406        struct request_queue *q = md->queue;
2407        sector_t size;
2408        int merge_is_optional;
2409
2410        size = dm_table_get_size(t);
2411
2412        /*
2413         * Wipe any geometry if the size of the table changed.
2414         */
2415        if (size != dm_get_size(md))
2416                memset(&md->geometry, 0, sizeof(md->geometry));
2417
2418        __set_size(md, size);
2419
2420        dm_table_event_callback(t, event_callback, md);
2421
2422        /*
2423         * The queue hasn't been stopped yet if the old table type wasn't
2424         * request-based during suspension, so stop it now to prevent I/O
2425         * from being mapped before resume.
2426         * This must be done before setting the queue restrictions,
2427         * because request-based dm may start running right after they are set.
2428         */
2429        if (dm_table_request_based(t) && !blk_queue_stopped(q))
2430                stop_queue(q);
2431
2432        __bind_mempools(md, t);
2433
2434        merge_is_optional = dm_table_merge_is_optional(t);
2435
2436        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2437        rcu_assign_pointer(md->map, t);
2438        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2439
2440        dm_table_set_restrictions(t, q, limits);
2441        if (merge_is_optional)
2442                set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2443        else
2444                clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2445        if (old_map)
2446                dm_sync_table(md);
2447
2448        return old_map;
2449}
2450
2451/*
2452 * Returns unbound table for the caller to free.
2453 */
2454static struct dm_table *__unbind(struct mapped_device *md)
2455{
2456        struct dm_table *map = rcu_dereference_protected(md->map, 1);
2457
2458        if (!map)
2459                return NULL;
2460
2461        dm_table_event_callback(map, NULL, NULL);
2462        RCU_INIT_POINTER(md->map, NULL);
2463        dm_sync_table(md);
2464
2465        return map;
2466}
2467
2468/*
2469 * Constructor for a new device.
2470 */
2471int dm_create(int minor, struct mapped_device **result)
2472{
2473        struct mapped_device *md;
2474
2475        md = alloc_dev(minor);
2476        if (!md)
2477                return -ENXIO;
2478
2479        dm_sysfs_init(md);
2480
2481        *result = md;
2482        return 0;
2483}
2484
2485/*
2486 * Functions to manage md->type.
2487 * All are required to hold md->type_lock.
2488 */
2489void dm_lock_md_type(struct mapped_device *md)
2490{
2491        mutex_lock(&md->type_lock);
2492}
2493
2494void dm_unlock_md_type(struct mapped_device *md)
2495{
2496        mutex_unlock(&md->type_lock);
2497}
2498
2499void dm_set_md_type(struct mapped_device *md, unsigned type)
2500{
2501        BUG_ON(!mutex_is_locked(&md->type_lock));
2502        md->type = type;
2503}
2504
2505unsigned dm_get_md_type(struct mapped_device *md)
2506{
2507        BUG_ON(!mutex_is_locked(&md->type_lock));
2508        return md->type;
2509}
2510
2511static bool dm_md_type_request_based(struct mapped_device *md)
2512{
2513        unsigned table_type = dm_get_md_type(md);
2514
2515        return (table_type == DM_TYPE_REQUEST_BASED ||
2516                table_type == DM_TYPE_MQ_REQUEST_BASED);
2517}
2518
2519struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2520{
2521        return md->immutable_target_type;
2522}
2523
2524/*
2525 * The queue_limits are only valid as long as you have a reference
2526 * count on 'md'.
2527 */
2528struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2529{
2530        BUG_ON(!atomic_read(&md->holders));
2531        return &md->queue->limits;
2532}
2533EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2534
2535/*
2536 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2537 */
2538static int dm_init_request_based_queue(struct mapped_device *md)
2539{
2540        struct request_queue *q = NULL;
2541
2542        if (md->queue->elevator)
2543                return 1;
2544
2545        /* Fully initialize the queue */
2546        q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2547        if (!q)
2548                return 0;
2549
2550        md->queue = q;
2551        dm_init_md_queue(md);
2552        blk_queue_softirq_done(md->queue, dm_softirq_done);
2553        blk_queue_prep_rq(md->queue, dm_prep_fn);
2554        blk_queue_lld_busy(md->queue, dm_lld_busy);
2555
2556        /* Also initialize the request-based DM worker thread */
2557        init_kthread_worker(&md->kworker);
2558        md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker,
2559                                       "kdmwork-%s", dm_device_name(md));
2560
2561        elv_register_queue(md->queue);
2562
2563        return 1;
2564}
2565
2566/*
2567 * Setup the DM device's queue based on md's type
2568 */
2569int dm_setup_md_queue(struct mapped_device *md)
2570{
2571        if (dm_md_type_request_based(md) && !dm_init_request_based_queue(md)) {
2572                DMWARN("Cannot initialize queue for request-based mapped device");
2573                return -EINVAL;
2574        }
2575
2576        return 0;
2577}
2578
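/*
 * Look up a mapped_device by dev_t and take a reference.  Returns NULL
 * if the minor is unused, still only reserved (MINOR_ALLOCED), or the
 * device is being deleted or freed.  Callers drop the reference with
 * dm_put(), e.g. (illustrative only):
 *
 *	md = dm_get_md(dev);
 *	if (md) {
 *		... use md ...
 *		dm_put(md);
 *	}
 */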
2579struct mapped_device *dm_get_md(dev_t dev)
2580{
2581        struct mapped_device *md;
2582        unsigned minor = MINOR(dev);
2583
2584        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2585                return NULL;
2586
2587        spin_lock(&_minor_lock);
2588
2589        md = idr_find(&_minor_idr, minor);
2590        if (md) {
2591                if ((md == MINOR_ALLOCED ||
2592                     (MINOR(disk_devt(dm_disk(md))) != minor) ||
2593                     dm_deleting_md(md) ||
2594                     test_bit(DMF_FREEING, &md->flags))) {
2595                        md = NULL;
2596                        goto out;
2597                }
2598                dm_get(md);
2599        }
2600
2601out:
2602        spin_unlock(&_minor_lock);
2603
2604        return md;
2605}
2606EXPORT_SYMBOL_GPL(dm_get_md);
2607
2608void *dm_get_mdptr(struct mapped_device *md)
2609{
2610        return md->interface_ptr;
2611}
2612
2613void dm_set_mdptr(struct mapped_device *md, void *ptr)
2614{
2615        md->interface_ptr = ptr;
2616}
2617
2618void dm_get(struct mapped_device *md)
2619{
2620        atomic_inc(&md->holders);
2621        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2622}
2623
2624int dm_hold(struct mapped_device *md)
2625{
2626        spin_lock(&_minor_lock);
2627        if (test_bit(DMF_FREEING, &md->flags)) {
2628                spin_unlock(&_minor_lock);
2629                return -EBUSY;
2630        }
2631        dm_get(md);
2632        spin_unlock(&_minor_lock);
2633        return 0;
2634}
2635EXPORT_SYMBOL_GPL(dm_hold);
2636
2637const char *dm_device_name(struct mapped_device *md)
2638{
2639        return md->name;
2640}
2641EXPORT_SYMBOL_GPL(dm_device_name);
2642
2643static void __dm_destroy(struct mapped_device *md, bool wait)
2644{
2645        struct dm_table *map;
2646        int srcu_idx;
2647
2648        might_sleep();
2649
2650        map = dm_get_live_table(md, &srcu_idx);
2651
2652        spin_lock(&_minor_lock);
2653        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2654        set_bit(DMF_FREEING, &md->flags);
2655        spin_unlock(&_minor_lock);
2656
2657        if (dm_request_based(md))
2658                flush_kthread_worker(&md->kworker);
2659
2660        /*
2661         * Take suspend_lock so that presuspend and postsuspend methods
2662         * do not race with internal suspend.
2663         */
2664        mutex_lock(&md->suspend_lock);
2665        if (!dm_suspended_md(md)) {
2666                dm_table_presuspend_targets(map);
2667                dm_table_postsuspend_targets(map);
2668        }
2669        mutex_unlock(&md->suspend_lock);
2670
2671        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2672        dm_put_live_table(md, srcu_idx);
2673
2674        /*
2675         * Rarely there may still be I/O requests waiting to complete,
2676         * so wait for all references to disappear.
2677         * No one should increment the reference count of the mapped_device
2678         * after its state becomes DMF_FREEING.
2679         */
2680        if (wait)
2681                while (atomic_read(&md->holders))
2682                        msleep(1);
2683        else if (atomic_read(&md->holders))
2684                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2685                       dm_device_name(md), atomic_read(&md->holders));
2686
2687        dm_sysfs_exit(md);
2688        dm_table_destroy(__unbind(md));
2689        free_dev(md);
2690}
2691
2692void dm_destroy(struct mapped_device *md)
2693{
2694        __dm_destroy(md, true);
2695}
2696
2697void dm_destroy_immediate(struct mapped_device *md)
2698{
2699        __dm_destroy(md, false);
2700}
2701
2702void dm_put(struct mapped_device *md)
2703{
2704        atomic_dec(&md->holders);
2705}
2706EXPORT_SYMBOL_GPL(dm_put);
2707
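/*
 * Sleep until all in-flight I/O counted by md_in_flight() has drained.
 * With TASK_INTERRUPTIBLE a pending signal aborts the wait with -EINTR,
 * which __dm_suspend() uses to unwind an interrupted suspend.
 */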
2708static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2709{
2710        int r = 0;
2711        DECLARE_WAITQUEUE(wait, current);
2712
2713        add_wait_queue(&md->wait, &wait);
2714
2715        while (1) {
2716                set_current_state(interruptible);
2717
2718                if (!md_in_flight(md))
2719                        break;
2720
2721                if (interruptible == TASK_INTERRUPTIBLE &&
2722                    signal_pending(current)) {
2723                        r = -EINTR;
2724                        break;
2725                }
2726
2727                io_schedule();
2728        }
2729        set_current_state(TASK_RUNNING);
2730
2731        remove_wait_queue(&md->wait, &wait);
2732
2733        return r;
2734}
2735
2736/*
2737 * Process the deferred bios
2738 */
2739static void dm_wq_work(struct work_struct *work)
2740{
2741        struct mapped_device *md = container_of(work, struct mapped_device,
2742                                                work);
2743        struct bio *c;
2744        int srcu_idx;
2745        struct dm_table *map;
2746
2747        map = dm_get_live_table(md, &srcu_idx);
2748
2749        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2750                spin_lock_irq(&md->deferred_lock);
2751                c = bio_list_pop(&md->deferred);
2752                spin_unlock_irq(&md->deferred_lock);
2753
2754                if (!c)
2755                        break;
2756
2757                if (dm_request_based(md))
2758                        generic_make_request(c);
2759                else
2760                        __split_and_process_bio(md, map, c);
2761        }
2762
2763        dm_put_live_table(md, srcu_idx);
2764}
2765
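/*
 * Clear DMF_BLOCK_IO_FOR_SUSPEND and kick dm_wq_work() so that bios
 * deferred while suspended are reissued.
 */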
2766static void dm_queue_flush(struct mapped_device *md)
2767{
2768        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2769        smp_mb__after_atomic();
2770        queue_work(md->wq, &md->work);
2771}
2772
2773/*
2774 * Swap in a new table, returning the old one for the caller to destroy.
2775 */
2776struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2777{
2778        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2779        struct queue_limits limits;
2780        int r;
2781
2782        mutex_lock(&md->suspend_lock);
2783
2784        /* device must be suspended */
2785        if (!dm_suspended_md(md))
2786                goto out;
2787
2788        /*
2789         * If the new table has no data devices, retain the existing limits.
2790         * This helps multipath with queue_if_no_path: if all paths disappear,
2791         * new I/O is queued based on these limits, and then some paths
2792         * reappear.
2793         */
2794        if (dm_table_has_no_data_devices(table)) {
2795                live_map = dm_get_live_table_fast(md);
2796                if (live_map)
2797                        limits = md->queue->limits;
2798                dm_put_live_table_fast(md);
2799        }
2800
2801        if (!live_map) {
2802                r = dm_calculate_queue_limits(table, &limits);
2803                if (r) {
2804                        map = ERR_PTR(r);
2805                        goto out;
2806                }
2807        }
2808
2809        map = __bind(md, table, &limits);
2810
2811out:
2812        mutex_unlock(&md->suspend_lock);
2813        return map;
2814}
2815
2816/*
2817 * Functions to lock and unlock any filesystem running on the
2818 * device.
2819 */
2820static int lock_fs(struct mapped_device *md)
2821{
2822        int r;
2823
2824        WARN_ON(md->frozen_sb);
2825
2826        md->frozen_sb = freeze_bdev(md->bdev);
2827        if (IS_ERR(md->frozen_sb)) {
2828                r = PTR_ERR(md->frozen_sb);
2829                md->frozen_sb = NULL;
2830                return r;
2831        }
2832
2833        set_bit(DMF_FROZEN, &md->flags);
2834
2835        return 0;
2836}
2837
2838static void unlock_fs(struct mapped_device *md)
2839{
2840        if (!test_bit(DMF_FROZEN, &md->flags))
2841                return;
2842
2843        thaw_bdev(md->bdev, md->frozen_sb);
2844        md->frozen_sb = NULL;
2845        clear_bit(DMF_FROZEN, &md->flags);
2846}
2847
2848/*
2849 * If __dm_suspend returns 0, the device is completely quiescent
2850 * now. There is no request-processing activity. All new requests
2851 * are being added to md->deferred list.
2852 *
2853 * Caller must hold md->suspend_lock
2854 */
2855static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2856                        unsigned suspend_flags, int interruptible)
2857{
2858        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2859        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2860        int r;
2861
2862        /*
2863         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2864         * This flag is cleared before dm_suspend returns.
2865         */
2866        if (noflush)
2867                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2868
2869        /*
2870         * This gets reverted if there's an error later and the targets
2871         * provide the .presuspend_undo hook.
2872         */
2873        dm_table_presuspend_targets(map);
2874
2875        /*
2876         * Flush I/O to the device.
2877         * Any I/O submitted after lock_fs() may not be flushed.
2878         * noflush takes precedence over do_lockfs.
2879         * (lock_fs() flushes I/Os and waits for them to complete.)
2880         */
2881        if (!noflush && do_lockfs) {
2882                r = lock_fs(md);
2883                if (r) {
2884                        dm_table_presuspend_undo_targets(map);
2885                        return r;
2886                }
2887        }
2888
2889        /*
2890         * Here we must make sure that no processes are submitting requests
2891         * to target drivers i.e. no one may be executing
2892         * __split_and_process_bio. This is called from dm_request and
2893         * dm_wq_work.
2894         *
2895         * To get all processes out of __split_and_process_bio in dm_request,
2896         * we take the write lock. To prevent any process from reentering
2897         * __split_and_process_bio from dm_request and quiesce the thread
2898         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2899         * flush_workqueue(md->wq).
2900         */
2901        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2902        if (map)
2903                synchronize_srcu(&md->io_barrier);
2904
2905        /*
2906         * Stop md->queue before flushing md->wq in case request-based
2907         * dm defers requests to md->wq from md->queue.
2908         */
2909        if (dm_request_based(md)) {
2910                stop_queue(md->queue);
2911                flush_kthread_worker(&md->kworker);
2912        }
2913
2914        flush_workqueue(md->wq);
2915
2916        /*
2917         * At this point no more requests are entering target request routines.
2918         * We call dm_wait_for_completion to wait for all existing requests
2919         * to finish.
2920         */
2921        r = dm_wait_for_completion(md, interruptible);
2922
2923        if (noflush)
2924                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2925        if (map)
2926                synchronize_srcu(&md->io_barrier);
2927
2928        /* were we interrupted ? */
2929        if (r < 0) {
2930                dm_queue_flush(md);
2931
2932                if (dm_request_based(md))
2933                        start_queue(md->queue);
2934
2935                unlock_fs(md);
2936                dm_table_presuspend_undo_targets(map);
2937                /* pushback list is already flushed, so skip flush */
2938        }
2939
2940        return r;
2941}
2942
2943/*
2944 * We need to be able to change a mapping table under a mounted
2945 * filesystem.  For example we might want to move some data in
2946 * the background.  Before the table can be swapped with
2947 * dm_bind_table, dm_suspend must be called to flush any in
2948 * flight bios and ensure that any further io gets deferred.
2949 */
2950/*
2951 * Suspend mechanism in request-based dm.
2952 *
2953 * 1. Flush all I/Os by lock_fs() if needed.
2954 * 2. Stop dispatching any I/O by stopping the request_queue.
2955 * 3. Wait for all in-flight I/Os to be completed or requeued.
2956 *
2957 * To abort suspend, start the request_queue.
2958 */
2959int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2960{
2961        struct dm_table *map = NULL;
2962        int r = 0;
2963
2964retry:
2965        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2966
2967        if (dm_suspended_md(md)) {
2968                r = -EINVAL;
2969                goto out_unlock;
2970        }
2971
2972        if (dm_suspended_internally_md(md)) {
2973                /* already internally suspended, wait for internal resume */
2974                mutex_unlock(&md->suspend_lock);
2975                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2976                if (r)
2977                        return r;
2978                goto retry;
2979        }
2980
2981        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2982
2983        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE);
2984        if (r)
2985                goto out_unlock;
2986
2987        set_bit(DMF_SUSPENDED, &md->flags);
2988
2989        dm_table_postsuspend_targets(map);
2990
2991out_unlock:
2992        mutex_unlock(&md->suspend_lock);
2993        return r;
2994}
2995
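/*
 * Counterpart of __dm_suspend(): resume the targets, reissue any
 * deferred bios (or restart the request_queue for request-based dm)
 * and thaw the filesystem if lock_fs() froze it.
 */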
2996static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2997{
2998        if (map) {
2999                int r = dm_table_resume_targets(map);
3000                if (r)
3001                        return r;
3002        }
3003
3004        dm_queue_flush(md);
3005
3006        /*
3007         * Flushing deferred I/Os must be done after targets are resumed
3008         * so that mapping of targets can work correctly.
3009         * Request-based dm is queueing the deferred I/Os in its request_queue.
3010         */
3011        if (dm_request_based(md))
3012                start_queue(md->queue);
3013
3014        unlock_fs(md);
3015
3016        return 0;
3017}
3018
3019int dm_resume(struct mapped_device *md)
3020{
3021        int r = -EINVAL;
3022        struct dm_table *map = NULL;
3023
3024retry:
3025        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
3026
3027        if (!dm_suspended_md(md))
3028                goto out;
3029
3030        if (dm_suspended_internally_md(md)) {
3031                /* already internally suspended, wait for internal resume */
3032                mutex_unlock(&md->suspend_lock);
3033                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
3034                if (r)
3035                        return r;
3036                goto retry;
3037        }
3038
3039        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3040        if (!map || !dm_table_get_size(map))
3041                goto out;
3042
3043        r = __dm_resume(md, map);
3044        if (r)
3045                goto out;
3046
3047        clear_bit(DMF_SUSPENDED, &md->flags);
3048
3049        r = 0;
3050out:
3051        mutex_unlock(&md->suspend_lock);
3052
3053        return r;
3054}
3055
3056/*
3057 * Internal suspend/resume works like userspace-driven suspend. It waits
3058 * until all bios finish and prevents issuing new bios to the target drivers.
3059 * It may be used only from the kernel.
3060 */
3061
3062static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
3063{
3064        struct dm_table *map = NULL;
3065
3066        if (md->internal_suspend_count++)
3067                return; /* nested internal suspend */
3068
3069        if (dm_suspended_md(md)) {
3070                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3071                return; /* nest suspend */
3072        }
3073
3074        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
3075
3076        /*
3077         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
3078         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
3079         * would require changing .presuspend to return an error -- avoid this
3080         * until there is a need for more elaborate variants of internal suspend.
3081         */
3082        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE);
3083
3084        set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3085
3086        dm_table_postsuspend_targets(map);
3087}
3088
3089static void __dm_internal_resume(struct mapped_device *md)
3090{
3091        BUG_ON(!md->internal_suspend_count);
3092
3093        if (--md->internal_suspend_count)
3094                return; /* resume from nested internal suspend */
3095
3096        if (dm_suspended_md(md))
3097                goto done; /* resume from nested suspend */
3098
3099        /*
3100         * NOTE: existing callers don't need to call dm_table_resume_targets
3101         * (which may fail -- so best to avoid it for now by passing NULL map)
3102         */
3103        (void) __dm_resume(md, NULL);
3104
3105done:
3106        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3107        smp_mb__after_atomic();
3108        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
3109}
3110
3111void dm_internal_suspend_noflush(struct mapped_device *md)
3112{
3113        mutex_lock(&md->suspend_lock);
3114        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
3115        mutex_unlock(&md->suspend_lock);
3116}
3117EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
3118
3119void dm_internal_resume(struct mapped_device *md)
3120{
3121        mutex_lock(&md->suspend_lock);
3122        __dm_internal_resume(md);
3123        mutex_unlock(&md->suspend_lock);
3124}
3125EXPORT_SYMBOL_GPL(dm_internal_resume);
3126
3127/*
3128 * Fast variants of internal suspend/resume hold md->suspend_lock,
3129 * which prevents interaction with userspace-driven suspend.
3130 */
3131
3132void dm_internal_suspend_fast(struct mapped_device *md)
3133{
3134        mutex_lock(&md->suspend_lock);
3135        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3136                return;
3137
3138        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
3139        synchronize_srcu(&md->io_barrier);
3140        flush_workqueue(md->wq);
3141        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
3142}
3143EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
3144
3145void dm_internal_resume_fast(struct mapped_device *md)
3146{
3147        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
3148                goto done;
3149
3150        dm_queue_flush(md);
3151
3152done:
3153        mutex_unlock(&md->suspend_lock);
3154}
3155EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
3156
3157/*-----------------------------------------------------------------
3158 * Event notification.
3159 *---------------------------------------------------------------*/
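/*
 * Send a uevent for the dm disk.  A non-zero cookie supplied by
 * userspace is exported in the uevent environment (see
 * DM_COOKIE_ENV_VAR_NAME below) so the listener can correlate the
 * event with the operation that triggered it.
 */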
3160int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
3161                       unsigned cookie)
3162{
3163        char udev_cookie[DM_COOKIE_LENGTH];
3164        char *envp[] = { udev_cookie, NULL };
3165
3166        if (!cookie)
3167                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
3168        else {
3169                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
3170                         DM_COOKIE_ENV_VAR_NAME, cookie);
3171                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
3172                                          action, envp);
3173        }
3174}
3175
3176uint32_t dm_next_uevent_seq(struct mapped_device *md)
3177{
3178        return atomic_add_return(1, &md->uevent_seq);
3179}
3180
3181uint32_t dm_get_event_nr(struct mapped_device *md)
3182{
3183        return atomic_read(&md->event_nr);
3184}
3185
3186int dm_wait_event(struct mapped_device *md, int event_nr)
3187{
3188        return wait_event_interruptible(md->eventq,
3189                        (event_nr != atomic_read(&md->event_nr)));
3190}
3191
3192void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
3193{
3194        unsigned long flags;
3195
3196        spin_lock_irqsave(&md->uevent_lock, flags);
3197        list_add(elist, &md->uevent_list);
3198        spin_unlock_irqrestore(&md->uevent_lock, flags);
3199}
3200
3201/*
3202 * The gendisk is only valid as long as you have a reference
3203 * count on 'md'.
3204 */
3205struct gendisk *dm_disk(struct mapped_device *md)
3206{
3207        return md->disk;
3208}
3209
3210struct kobject *dm_kobject(struct mapped_device *md)
3211{
3212        return &md->kobj_holder.kobj;
3213}
3214
3215struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
3216{
3217        struct mapped_device *md;
3218
3219        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
3220
3221        if (test_bit(DMF_FREEING, &md->flags) ||
3222            dm_deleting_md(md))
3223                return NULL;
3224
3225        dm_get(md);
3226        return md;
3227}
3228
3229int dm_suspended_md(struct mapped_device *md)
3230{
3231        return test_bit(DMF_SUSPENDED, &md->flags);
3232}
3233
3234int dm_suspended_internally_md(struct mapped_device *md)
3235{
3236        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
3237}
3238
3239int dm_test_deferred_remove_flag(struct mapped_device *md)
3240{
3241        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
3242}
3243
3244int dm_suspended(struct dm_target *ti)
3245{
3246        return dm_suspended_md(dm_table_get_md(ti->table));
3247}
3248EXPORT_SYMBOL_GPL(dm_suspended);
3249
3250int dm_noflush_suspending(struct dm_target *ti)
3251{
3252        return __noflush_suspending(dm_table_get_md(ti->table));
3253}
3254EXPORT_SYMBOL_GPL(dm_noflush_suspending);
3255
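/*
 * Allocate the mempools/bioset a table will hand to __bind_mempools():
 * an io/tio slab pool, an rq pool for non-mq request-based tables, and
 * a bioset whose front_pad embeds the per-bio data (bio-based only).
 */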
3256struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size)
3257{
3258        struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
3259        struct kmem_cache *cachep;
3260        unsigned int pool_size = 0;
3261        unsigned int front_pad;
3262
3263        if (!pools)
3264                return NULL;
3265
3266        switch (type) {
3267        case DM_TYPE_BIO_BASED:
3268                cachep = _io_cache;
3269                pool_size = dm_get_reserved_bio_based_ios();
3270                front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
3271                break;
3272        case DM_TYPE_REQUEST_BASED:
3273                pool_size = dm_get_reserved_rq_based_ios();
3274                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
3275                if (!pools->rq_pool)
3276                        goto out;
3277                /* fall through to setup remaining rq-based pools */
3278        case DM_TYPE_MQ_REQUEST_BASED:
3279                cachep = _rq_tio_cache;
3280                if (!pool_size)
3281                        pool_size = dm_get_reserved_rq_based_ios();
3282                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
3283                /* per_bio_data_size is not used. See __bind_mempools(). */
3284                WARN_ON(per_bio_data_size != 0);
3285                break;
3286        default:
3287                goto out;
3288        }
3289
3290        pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
3291        if (!pools->io_pool)
3292                goto out;
3293
3294        pools->bs = bioset_create_nobvec(pool_size, front_pad);
3295        if (!pools->bs)
3296                goto out;
3297
3298        if (integrity && bioset_integrity_create(pools->bs, pool_size))
3299                goto out;
3300
3301        return pools;
3302
3303out:
3304        dm_free_md_mempools(pools);
3305
3306        return NULL;
3307}
3308
3309void dm_free_md_mempools(struct dm_md_mempools *pools)
3310{
3311        if (!pools)
3312                return;
3313
3314        if (pools->io_pool)
3315                mempool_destroy(pools->io_pool);
3316
3317        if (pools->rq_pool)
3318                mempool_destroy(pools->rq_pool);
3319
3320        if (pools->bs)
3321                bioset_free(pools->bs);
3322
3323        kfree(pools);
3324}
3325
3326static const struct block_device_operations dm_blk_dops = {
3327        .open = dm_blk_open,
3328        .release = dm_blk_close,
3329        .ioctl = dm_blk_ioctl,
3330        .getgeo = dm_blk_getgeo,
3331        .owner = THIS_MODULE
3332};
3333
3334/*
3335 * module hooks
3336 */
3337module_init(dm_init);
3338module_exit(dm_exit);
3339
3340module_param(major, uint, 0);
3341MODULE_PARM_DESC(major, "The major number of the device mapper");
3342
3343module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3344MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3345
3346module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR);
3347MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools");
3348
3349MODULE_DESCRIPTION(DM_NAME " driver");
3350MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3351MODULE_LICENSE("GPL");
3352