linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11
  12#include <linux/init.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/mempool.h>
  18#include <linux/slab.h>
  19#include <linux/idr.h>
  20#include <linux/hdreg.h>
  21#include <linux/delay.h>
  22#include <linux/wait.h>
  23#include <linux/pr.h>
  24
  25#define DM_MSG_PREFIX "core"
  26
  27#ifdef CONFIG_PRINTK
  28/*
  29 * ratelimit state to be used in DMXXX_LIMIT().
  30 */
  31DEFINE_RATELIMIT_STATE(dm_ratelimit_state,
  32                       DEFAULT_RATELIMIT_INTERVAL,
  33                       DEFAULT_RATELIMIT_BURST);
  34EXPORT_SYMBOL(dm_ratelimit_state);
  35#endif
  36
  37/*
  38 * Cookies are numeric values sent with CHANGE and REMOVE
  39 * uevents while resuming, removing or renaming the device.
  40 */
  41#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  42#define DM_COOKIE_LENGTH 24
  43
  44static const char *_name = DM_NAME;
  45
  46static unsigned int major = 0;
  47static unsigned int _major = 0;
  48
  49static DEFINE_IDR(_minor_idr);
  50
  51static DEFINE_SPINLOCK(_minor_lock);
  52
  53static void do_deferred_remove(struct work_struct *w);
  54
  55static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  56
  57static struct workqueue_struct *deferred_remove_workqueue;
  58
  59/*
  60 * One of these is allocated per bio.
  61 */
  62struct dm_io {
  63        struct mapped_device *md;
  64        int error;
  65        atomic_t io_count;
  66        struct bio *bio;
  67        unsigned long start_time;
  68        spinlock_t endio_lock;
  69        struct dm_stats_aux stats_aux;
  70};
  71
  72#define MINOR_ALLOCED ((void *)-1)
  73
  74/*
  75 * Bits for the md->flags field.
  76 */
  77#define DMF_BLOCK_IO_FOR_SUSPEND 0
  78#define DMF_SUSPENDED 1
  79#define DMF_FROZEN 2
  80#define DMF_FREEING 3
  81#define DMF_DELETING 4
  82#define DMF_NOFLUSH_SUSPENDING 5
  83#define DMF_MERGE_IS_OPTIONAL 6
  84#define DMF_DEFERRED_REMOVE 7
  85#define DMF_SUSPENDED_INTERNALLY 8
  86
  87#define DM_NUMA_NODE NUMA_NO_NODE
  88static int dm_numa_node = DM_NUMA_NODE;
  89
  90/*
  91 * For mempools pre-allocation at the table loading time.
  92 */
  93struct dm_md_mempools {
  94        mempool_t *io_pool;
  95        mempool_t *rq_pool;
  96        struct bio_set *bs;
  97};
  98
  99struct table_device {
 100        struct list_head list;
 101        atomic_t count;
 102        struct dm_dev dm_dev;
 103};
 104
 105static struct kmem_cache *_io_cache;
 106static struct kmem_cache *_rq_tio_cache;
 107static struct kmem_cache *_rq_cache;
 108
 109/*
 110 * Bio-based DM's mempools' reserved IOs set by the user.
 111 */
 112#define RESERVED_BIO_BASED_IOS          16
 113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 114
 115static int __dm_get_module_param_int(int *module_param, int min, int max)
 116{
 117        int param = ACCESS_ONCE(*module_param);
 118        int modified_param = 0;
 119        bool modified = true;
 120
 121        if (param < min)
 122                modified_param = min;
 123        else if (param > max)
 124                modified_param = max;
 125        else
 126                modified = false;
 127
 128        if (modified) {
 129                (void)cmpxchg(module_param, param, modified_param);
 130                param = modified_param;
 131        }
 132
 133        return param;
 134}
 135
 136unsigned __dm_get_module_param(unsigned *module_param,
 137                               unsigned def, unsigned max)
 138{
 139        unsigned param = ACCESS_ONCE(*module_param);
 140        unsigned modified_param = 0;
 141
 142        if (!param)
 143                modified_param = def;
 144        else if (param > max)
 145                modified_param = max;
 146
 147        if (modified_param) {
 148                (void)cmpxchg(module_param, param, modified_param);
 149                param = modified_param;
 150        }
 151
 152        return param;
 153}
 154
 155unsigned dm_get_reserved_bio_based_ios(void)
 156{
 157        return __dm_get_module_param(&reserved_bio_based_ios,
 158                                     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 159}
 160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
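     /*
      * Illustrative note (not part of the original source): the two helpers
      * above clamp user-supplied module parameters on every read and write
      * the clamped value back with cmpxchg().  For example, if the
      * reserved_bio_based_ios parameter is set to 0, the call
      *
      *	unsigned ios = dm_get_reserved_bio_based_ios();
      *
      * returns the default RESERVED_BIO_BASED_IOS (16); a value above
      * DM_RESERVED_MAX_IOS is clamped to that maximum in the same way.
      */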
 161
 162static unsigned dm_get_numa_node(void)
 163{
 164        return __dm_get_module_param_int(&dm_numa_node,
 165                                         DM_NUMA_NODE, num_online_nodes() - 1);
 166}
 167
 168static int __init local_init(void)
 169{
 170        int r = -ENOMEM;
 171
 172        /* allocate a slab for the dm_ios */
 173        _io_cache = KMEM_CACHE(dm_io, 0);
 174        if (!_io_cache)
 175                return r;
 176
 177        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 178        if (!_rq_tio_cache)
 179                goto out_free_io_cache;
 180
 181        _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 182                                      __alignof__(struct request), 0, NULL);
 183        if (!_rq_cache)
 184                goto out_free_rq_tio_cache;
 185
 186        r = dm_uevent_init();
 187        if (r)
 188                goto out_free_rq_cache;
 189
 190        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 191        if (!deferred_remove_workqueue) {
 192                r = -ENOMEM;
 193                goto out_uevent_exit;
 194        }
 195
 196        _major = major;
 197        r = register_blkdev(_major, _name);
 198        if (r < 0)
 199                goto out_free_workqueue;
 200
 201        if (!_major)
 202                _major = r;
 203
 204        return 0;
 205
 206out_free_workqueue:
 207        destroy_workqueue(deferred_remove_workqueue);
 208out_uevent_exit:
 209        dm_uevent_exit();
 210out_free_rq_cache:
 211        kmem_cache_destroy(_rq_cache);
 212out_free_rq_tio_cache:
 213        kmem_cache_destroy(_rq_tio_cache);
 214out_free_io_cache:
 215        kmem_cache_destroy(_io_cache);
 216
 217        return r;
 218}
 219
 220static void local_exit(void)
 221{
 222        flush_scheduled_work();
 223        destroy_workqueue(deferred_remove_workqueue);
 224
 225        kmem_cache_destroy(_rq_cache);
 226        kmem_cache_destroy(_rq_tio_cache);
 227        kmem_cache_destroy(_io_cache);
 228        unregister_blkdev(_major, _name);
 229        dm_uevent_exit();
 230
 231        _major = 0;
 232
 233        DMINFO("cleaned up");
 234}
 235
 236static int (*_inits[])(void) __initdata = {
 237        local_init,
 238        dm_target_init,
 239        dm_linear_init,
 240        dm_stripe_init,
 241        dm_io_init,
 242        dm_kcopyd_init,
 243        dm_interface_init,
 244        dm_statistics_init,
 245};
 246
 247static void (*_exits[])(void) = {
 248        local_exit,
 249        dm_target_exit,
 250        dm_linear_exit,
 251        dm_stripe_exit,
 252        dm_io_exit,
 253        dm_kcopyd_exit,
 254        dm_interface_exit,
 255        dm_statistics_exit,
 256};
 257
 258static int __init dm_init(void)
 259{
 260        const int count = ARRAY_SIZE(_inits);
 261
 262        int r, i;
 263
 264        for (i = 0; i < count; i++) {
 265                r = _inits[i]();
 266                if (r)
 267                        goto bad;
 268        }
 269
 270        return 0;
 271
 272      bad:
 273        while (i--)
 274                _exits[i]();
 275
 276        return r;
 277}
 278
 279static void __exit dm_exit(void)
 280{
 281        int i = ARRAY_SIZE(_exits);
 282
 283        while (i--)
 284                _exits[i]();
 285
 286        /*
 287         * Should be empty by this point.
 288         */
 289        idr_destroy(&_minor_idr);
 290}
 291
 292/*
 293 * Block device functions
 294 */
 295int dm_deleting_md(struct mapped_device *md)
 296{
 297        return test_bit(DMF_DELETING, &md->flags);
 298}
 299
 300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 301{
 302        struct mapped_device *md;
 303
 304        spin_lock(&_minor_lock);
 305
 306        md = bdev->bd_disk->private_data;
 307        if (!md)
 308                goto out;
 309
 310        if (test_bit(DMF_FREEING, &md->flags) ||
 311            dm_deleting_md(md)) {
 312                md = NULL;
 313                goto out;
 314        }
 315
 316        dm_get(md);
 317        atomic_inc(&md->open_count);
 318out:
 319        spin_unlock(&_minor_lock);
 320
 321        return md ? 0 : -ENXIO;
 322}
 323
 324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 325{
 326        struct mapped_device *md;
 327
 328        spin_lock(&_minor_lock);
 329
 330        md = disk->private_data;
 331        if (WARN_ON(!md))
 332                goto out;
 333
 334        if (atomic_dec_and_test(&md->open_count) &&
 335            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 336                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 337
 338        dm_put(md);
 339out:
 340        spin_unlock(&_minor_lock);
 341}
 342
 343int dm_open_count(struct mapped_device *md)
 344{
 345        return atomic_read(&md->open_count);
 346}
 347
 348/*
 349 * Guarantees nothing is using the device before it's deleted.
 350 */
 351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 352{
 353        int r = 0;
 354
 355        spin_lock(&_minor_lock);
 356
 357        if (dm_open_count(md)) {
 358                r = -EBUSY;
 359                if (mark_deferred)
 360                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 361        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 362                r = -EEXIST;
 363        else
 364                set_bit(DMF_DELETING, &md->flags);
 365
 366        spin_unlock(&_minor_lock);
 367
 368        return r;
 369}
 370
 371int dm_cancel_deferred_remove(struct mapped_device *md)
 372{
 373        int r = 0;
 374
 375        spin_lock(&_minor_lock);
 376
 377        if (test_bit(DMF_DELETING, &md->flags))
 378                r = -EBUSY;
 379        else
 380                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 381
 382        spin_unlock(&_minor_lock);
 383
 384        return r;
 385}
 386
 387static void do_deferred_remove(struct work_struct *w)
 388{
 389        dm_deferred_remove();
 390}
 391
 392sector_t dm_get_size(struct mapped_device *md)
 393{
 394        return get_capacity(md->disk);
 395}
 396
 397struct request_queue *dm_get_md_queue(struct mapped_device *md)
 398{
 399        return md->queue;
 400}
 401
 402struct dm_stats *dm_get_stats(struct mapped_device *md)
 403{
 404        return &md->stats;
 405}
 406
 407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 408{
 409        struct mapped_device *md = bdev->bd_disk->private_data;
 410
 411        return dm_get_geometry(md, geo);
 412}
 413
 414static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
 415                                  struct block_device **bdev,
 416                                  fmode_t *mode)
 417{
 418        struct dm_target *tgt;
 419        struct dm_table *map;
 420        int srcu_idx, r;
 421
 422retry:
 423        r = -ENOTTY;
 424        map = dm_get_live_table(md, &srcu_idx);
 425        if (!map || !dm_table_get_size(map))
 426                goto out;
 427
 428        /* We only support devices that have a single target */
 429        if (dm_table_get_num_targets(map) != 1)
 430                goto out;
 431
 432        tgt = dm_table_get_target(map, 0);
 433        if (!tgt->type->prepare_ioctl)
 434                goto out;
 435
 436        if (dm_suspended_md(md)) {
 437                r = -EAGAIN;
 438                goto out;
 439        }
 440
 441        r = tgt->type->prepare_ioctl(tgt, bdev, mode);
 442        if (r < 0)
 443                goto out;
 444
 445        bdgrab(*bdev);
 446        dm_put_live_table(md, srcu_idx);
 447        return r;
 448
 449out:
 450        dm_put_live_table(md, srcu_idx);
 451        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 452                msleep(10);
 453                goto retry;
 454        }
 455        return r;
 456}
 457
 458static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 459                        unsigned int cmd, unsigned long arg)
 460{
 461        struct mapped_device *md = bdev->bd_disk->private_data;
 462        int r;
 463
 464        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
 465        if (r < 0)
 466                return r;
 467
 468        if (r > 0) {
 469                /*
 470                 * Target determined this ioctl is being issued against
 471                 * a logical partition of the parent bdev; so extra
 472                 * validation is needed.
 473                 */
 474                r = scsi_verify_blk_ioctl(NULL, cmd);
 475                if (r)
 476                        goto out;
 477        }
 478
  479        r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 480out:
 481        bdput(bdev);
 482        return r;
 483}
 484
 485static struct dm_io *alloc_io(struct mapped_device *md)
 486{
 487        return mempool_alloc(md->io_pool, GFP_NOIO);
 488}
 489
 490static void free_io(struct mapped_device *md, struct dm_io *io)
 491{
 492        mempool_free(io, md->io_pool);
 493}
 494
 495static void free_tio(struct dm_target_io *tio)
 496{
 497        bio_put(&tio->clone);
 498}
 499
 500int md_in_flight(struct mapped_device *md)
 501{
 502        return atomic_read(&md->pending[READ]) +
 503               atomic_read(&md->pending[WRITE]);
 504}
 505
 506static void start_io_acct(struct dm_io *io)
 507{
 508        struct mapped_device *md = io->md;
 509        struct bio *bio = io->bio;
 510        int cpu;
 511        int rw = bio_data_dir(bio);
 512
 513        io->start_time = jiffies;
 514
 515        cpu = part_stat_lock();
 516        part_round_stats(cpu, &dm_disk(md)->part0);
 517        part_stat_unlock();
 518        atomic_set(&dm_disk(md)->part0.in_flight[rw],
 519                atomic_inc_return(&md->pending[rw]));
 520
 521        if (unlikely(dm_stats_used(&md->stats)))
 522                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
 523                                    bio_sectors(bio), false, 0, &io->stats_aux);
 524}
 525
 526static void end_io_acct(struct dm_io *io)
 527{
 528        struct mapped_device *md = io->md;
 529        struct bio *bio = io->bio;
 530        unsigned long duration = jiffies - io->start_time;
 531        int pending, cpu;
 532        int rw = bio_data_dir(bio);
 533
 534        cpu = part_stat_lock();
 535        part_round_stats(cpu, &dm_disk(md)->part0);
 536        part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 537        part_stat_unlock();
 538
 539        if (unlikely(dm_stats_used(&md->stats)))
 540                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
 541                                    bio_sectors(bio), true, duration, &io->stats_aux);
 542
 543        /*
 544         * After this is decremented the bio must not be touched if it is
 545         * a flush.
 546         */
 547        pending = atomic_dec_return(&md->pending[rw]);
 548        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 549        pending += atomic_read(&md->pending[rw^0x1]);
 550
 551        /* nudge anyone waiting on suspend queue */
 552        if (!pending)
 553                wake_up(&md->wait);
 554}
 555
 556/*
 557 * Add the bio to the list of deferred io.
 558 */
 559static void queue_io(struct mapped_device *md, struct bio *bio)
 560{
 561        unsigned long flags;
 562
 563        spin_lock_irqsave(&md->deferred_lock, flags);
 564        bio_list_add(&md->deferred, bio);
 565        spin_unlock_irqrestore(&md->deferred_lock, flags);
 566        queue_work(md->wq, &md->work);
 567}
 568
 569/*
 570 * Everyone (including functions in this file), should use this
 571 * function to access the md->map field, and make sure they call
 572 * dm_put_live_table() when finished.
 573 */
 574struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 575{
 576        *srcu_idx = srcu_read_lock(&md->io_barrier);
 577
 578        return srcu_dereference(md->map, &md->io_barrier);
 579}
 580
 581void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 582{
 583        srcu_read_unlock(&md->io_barrier, srcu_idx);
 584}
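     /*
      * Illustrative usage sketch (not in the original source;
      * do_something_with() is a hypothetical helper): callers pair the two
      * functions above around any use of the live table, e.g.
      *
      *	int srcu_idx;
      *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
      *
      *	if (map)
      *		do_something_with(map);
      *	dm_put_live_table(md, srcu_idx);
      *
      * Sleeping is allowed between the two calls; the _fast variants below
      * use plain RCU and must not be held across anything that blocks.
      */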
 585
 586void dm_sync_table(struct mapped_device *md)
 587{
 588        synchronize_srcu(&md->io_barrier);
 589        synchronize_rcu_expedited();
 590}
 591
 592/*
 593 * A fast alternative to dm_get_live_table/dm_put_live_table.
 594 * The caller must not block between these two functions.
 595 */
 596static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 597{
 598        rcu_read_lock();
 599        return rcu_dereference(md->map);
 600}
 601
 602static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 603{
 604        rcu_read_unlock();
 605}
 606
 607/*
 608 * Open a table device so we can use it as a map destination.
 609 */
 610static int open_table_device(struct table_device *td, dev_t dev,
 611                             struct mapped_device *md)
 612{
 613        static char *_claim_ptr = "I belong to device-mapper";
 614        struct block_device *bdev;
 615
 616        int r;
 617
 618        BUG_ON(td->dm_dev.bdev);
 619
 620        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr);
 621        if (IS_ERR(bdev))
 622                return PTR_ERR(bdev);
 623
 624        r = bd_link_disk_holder(bdev, dm_disk(md));
 625        if (r) {
 626                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 627                return r;
 628        }
 629
 630        td->dm_dev.bdev = bdev;
 631        return 0;
 632}
 633
 634/*
 635 * Close a table device that we've been using.
 636 */
 637static void close_table_device(struct table_device *td, struct mapped_device *md)
 638{
 639        if (!td->dm_dev.bdev)
 640                return;
 641
 642        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 643        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 644        td->dm_dev.bdev = NULL;
 645}
 646
 647static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 648                                              fmode_t mode) {
 649        struct table_device *td;
 650
 651        list_for_each_entry(td, l, list)
 652                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 653                        return td;
 654
 655        return NULL;
 656}
 657
 658int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 659                        struct dm_dev **result) {
 660        int r;
 661        struct table_device *td;
 662
 663        mutex_lock(&md->table_devices_lock);
 664        td = find_table_device(&md->table_devices, dev, mode);
 665        if (!td) {
 666                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 667                if (!td) {
 668                        mutex_unlock(&md->table_devices_lock);
 669                        return -ENOMEM;
 670                }
 671
 672                td->dm_dev.mode = mode;
 673                td->dm_dev.bdev = NULL;
 674
 675                if ((r = open_table_device(td, dev, md))) {
 676                        mutex_unlock(&md->table_devices_lock);
 677                        kfree(td);
 678                        return r;
 679                }
 680
 681                format_dev_t(td->dm_dev.name, dev);
 682
 683                atomic_set(&td->count, 0);
 684                list_add(&td->list, &md->table_devices);
 685        }
 686        atomic_inc(&td->count);
 687        mutex_unlock(&md->table_devices_lock);
 688
 689        *result = &td->dm_dev;
 690        return 0;
 691}
 692EXPORT_SYMBOL_GPL(dm_get_table_device);
 693
 694void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 695{
 696        struct table_device *td = container_of(d, struct table_device, dm_dev);
 697
 698        mutex_lock(&md->table_devices_lock);
 699        if (atomic_dec_and_test(&td->count)) {
 700                close_table_device(td, md);
 701                list_del(&td->list);
 702                kfree(td);
 703        }
 704        mutex_unlock(&md->table_devices_lock);
 705}
 706EXPORT_SYMBOL(dm_put_table_device);
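     /*
      * Illustrative sketch (not in the original source): the pair above is
      * reference counted, so a caller such as dm_get_device() in dm-table.c
      * can pin an underlying block device for the lifetime of a table
      * roughly as follows (some_dev_t stands in for the dev_t of the
      * underlying device):
      *
      *	struct dm_dev *dev;
      *	int r = dm_get_table_device(md, some_dev_t, FMODE_READ | FMODE_WRITE, &dev);
      *
      *	if (r)
      *		return r;
      *	...
      *	dm_put_table_device(md, dev);
      */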
 707
 708static void free_table_devices(struct list_head *devices)
 709{
 710        struct list_head *tmp, *next;
 711
 712        list_for_each_safe(tmp, next, devices) {
 713                struct table_device *td = list_entry(tmp, struct table_device, list);
 714
 715                DMWARN("dm_destroy: %s still exists with %d references",
 716                       td->dm_dev.name, atomic_read(&td->count));
 717                kfree(td);
 718        }
 719}
 720
 721/*
 722 * Get the geometry associated with a dm device
 723 */
 724int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 725{
 726        *geo = md->geometry;
 727
 728        return 0;
 729}
 730
 731/*
 732 * Set the geometry of a device.
 733 */
 734int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 735{
 736        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 737
 738        if (geo->start > sz) {
 739                DMWARN("Start sector is beyond the geometry limits.");
 740                return -EINVAL;
 741        }
 742
 743        md->geometry = *geo;
 744
 745        return 0;
 746}
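     /*
      * Worked example (illustrative): with geo->cylinders = 1024,
      * geo->heads = 255 and geo->sectors = 63, the capacity implied by the
      * geometry is 1024 * 255 * 63 = 16450560 sectors, so dm_set_geometry()
      * above rejects any geo->start beyond that with -EINVAL.
      */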
 747
 748/*-----------------------------------------------------------------
 749 * CRUD START:
  750 *   A more elegant solution is in the works that uses the queue
 751 *   merge fn, unfortunately there are a couple of changes to
 752 *   the block layer that I want to make for this.  So in the
 753 *   interests of getting something for people to use I give
 754 *   you this clearly demarcated crap.
 755 *---------------------------------------------------------------*/
 756
 757static int __noflush_suspending(struct mapped_device *md)
 758{
 759        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 760}
 761
 762/*
 763 * Decrements the number of outstanding ios that a bio has been
  764 * cloned into, completing the original io if necessary.
 765 */
 766static void dec_pending(struct dm_io *io, int error)
 767{
 768        unsigned long flags;
 769        int io_error;
 770        struct bio *bio;
 771        struct mapped_device *md = io->md;
 772
 773        /* Push-back supersedes any I/O errors */
 774        if (unlikely(error)) {
 775                spin_lock_irqsave(&io->endio_lock, flags);
 776                if (!(io->error > 0 && __noflush_suspending(md)))
 777                        io->error = error;
 778                spin_unlock_irqrestore(&io->endio_lock, flags);
 779        }
 780
 781        if (atomic_dec_and_test(&io->io_count)) {
 782                if (io->error == DM_ENDIO_REQUEUE) {
 783                        /*
 784                         * Target requested pushing back the I/O.
 785                         */
 786                        spin_lock_irqsave(&md->deferred_lock, flags);
 787                        if (__noflush_suspending(md))
 788                                bio_list_add_head(&md->deferred, io->bio);
 789                        else
 790                                /* noflush suspend was interrupted. */
 791                                io->error = -EIO;
 792                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 793                }
 794
 795                io_error = io->error;
 796                bio = io->bio;
 797                end_io_acct(io);
 798                free_io(md, io);
 799
 800                if (io_error == DM_ENDIO_REQUEUE)
 801                        return;
 802
 803                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 804                        /*
 805                         * Preflush done for flush with data, reissue
 806                         * without REQ_FLUSH.
 807                         */
 808                        bio->bi_rw &= ~REQ_FLUSH;
 809                        queue_io(md, bio);
 810                } else {
 811                        /* done with normal IO or empty flush */
 812                        trace_block_bio_complete(md->queue, bio, io_error);
 813                        bio_endio(bio, io_error);
 814                }
 815        }
 816}
 817
 818void disable_write_same(struct mapped_device *md)
 819{
 820        struct queue_limits *limits = dm_get_queue_limits(md);
 821
 822        /* device doesn't really support WRITE SAME, disable it */
 823        limits->max_write_same_sectors = 0;
 824}
 825
 826static void clone_endio(struct bio *bio, int error)
 827{
 828        int r = error;
 829        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 830        struct dm_io *io = tio->io;
 831        struct mapped_device *md = tio->io->md;
 832        dm_endio_fn endio = tio->ti->type->end_io;
 833
 834        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 835                error = -EIO;
 836
 837        if (endio) {
 838                r = endio(tio->ti, bio, error);
 839                if (r < 0 || r == DM_ENDIO_REQUEUE)
 840                        /*
 841                         * error and requeue request are handled
 842                         * in dec_pending().
 843                         */
 844                        error = r;
 845                else if (r == DM_ENDIO_INCOMPLETE)
 846                        /* The target will handle the io */
 847                        return;
 848                else if (r) {
 849                        DMWARN("unimplemented target endio return value: %d", r);
 850                        BUG();
 851                }
 852        }
 853
 854        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
 855                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
 856                disable_write_same(md);
 857
 858        free_tio(tio);
 859        dec_pending(io, error);
 860}
 861
 862/*
 863 * Return maximum size of I/O possible at the supplied sector up to the current
 864 * target boundary.
 865 */
 866static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
 867{
 868        sector_t target_offset = dm_target_offset(ti, sector);
 869
 870        return ti->len - target_offset;
 871}
 872
 873static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 874{
 875        sector_t len = max_io_len_target_boundary(sector, ti);
 876        sector_t offset, max_len;
 877
 878        /*
 879         * Does the target need to split even further?
 880         */
 881        if (ti->max_io_len) {
 882                offset = dm_target_offset(ti, sector);
 883                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
 884                        max_len = sector_div(offset, ti->max_io_len);
 885                else
 886                        max_len = offset & (ti->max_io_len - 1);
 887                max_len = ti->max_io_len - max_len;
 888
 889                if (len > max_len)
 890                        len = max_len;
 891        }
 892
 893        return len;
 894}
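     /*
      * Worked example (illustrative): if ti->max_io_len is 128 sectors (a
      * power of two) and dm_target_offset(ti, sector) is 200, then
      * 200 & 127 = 72 and max_len = 128 - 72 = 56 sectors remain before the
      * next chunk boundary; max_io_len() returns the smaller of that and the
      * distance to the end of the target from max_io_len_target_boundary().
      */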
 895
 896int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 897{
 898        if (len > UINT_MAX) {
 899                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
 900                      (unsigned long long)len, UINT_MAX);
 901                ti->error = "Maximum size of target IO is too large";
 902                return -EINVAL;
 903        }
 904
 905        ti->max_io_len = (uint32_t) len;
 906
 907        return 0;
 908}
 909EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
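     /*
      * Illustrative sketch (not in the original source): a target
      * constructor that splits I/O on a fixed chunk size might call the
      * helper above like
      *
      *	r = dm_set_target_max_io_len(ti, chunk_sectors);
      *	if (r)
      *		return r;
      *
      * On failure ti->error has already been set by the helper;
      * chunk_sectors is a hypothetical sector_t holding the chunk size in
      * sectors.
      */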
 910
 911static long dm_blk_direct_access(struct block_device *bdev, sector_t sector,
 912                                 void **kaddr, pfn_t *pfn, long size)
 913{
 914        struct mapped_device *md = bdev->bd_disk->private_data;
 915        struct dm_table *map;
 916        struct dm_target *ti;
 917        int srcu_idx;
 918        long len, ret = -EIO;
 919
 920        map = dm_get_live_table(md, &srcu_idx);
 921        if (!map)
 922                goto out;
 923
 924        ti = dm_table_find_target(map, sector);
 925        if (!dm_target_is_valid(ti))
 926                goto out;
 927
 928        len = max_io_len(sector, ti) << SECTOR_SHIFT;
 929        size = min(len, size);
 930
 931        if (ti->type->direct_access)
 932                ret = ti->type->direct_access(ti, sector, kaddr, pfn, size);
 933out:
 934        dm_put_live_table(md, srcu_idx);
 935        return min(ret, size);
 936}
 937
 938/*
 939 * Flush current->bio_list when the target map method blocks.
 940 * This fixes deadlocks in snapshot and possibly in other targets.
 941 */
 942struct dm_offload {
 943        struct blk_plug plug;
 944        struct blk_plug_cb cb;
 945};
 946
 947static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 948{
 949        struct dm_offload *o = container_of(cb, struct dm_offload, cb);
 950        struct bio_list list;
 951        struct bio *bio;
 952        int i;
 953
 954        INIT_LIST_HEAD(&o->cb.list);
 955
 956        if (unlikely(!current->bio_list))
 957                return;
 958
 959        for (i = 0; i < 2; i++) {
 960                list = current->bio_list[i];
 961                bio_list_init(&current->bio_list[i]);
 962
 963                while ((bio = bio_list_pop(&list))) {
 964                        struct bio_set *bs = bio->bi_pool;
 965                        if (unlikely(!bs) || bs == fs_bio_set) {
 966                                bio_list_add(&current->bio_list[i], bio);
 967                                continue;
 968                        }
 969
 970                        spin_lock(&bs->rescue_lock);
 971                        bio_list_add(&bs->rescue_list, bio);
 972                        queue_work(bs->rescue_workqueue, &bs->rescue_work);
 973                        spin_unlock(&bs->rescue_lock);
 974                }
 975        }
 976}
 977
 978static void dm_offload_start(struct dm_offload *o)
 979{
 980        blk_start_plug(&o->plug);
 981        o->cb.callback = flush_current_bio_list;
 982        list_add(&o->cb.list, &current->plug->cb_list);
 983}
 984
 985static void dm_offload_end(struct dm_offload *o)
 986{
 987        list_del(&o->cb.list);
 988        blk_finish_plug(&o->plug);
 989}
 990
 991static void __map_bio(struct dm_target_io *tio)
 992{
 993        int r;
 994        sector_t sector;
 995        struct dm_offload o;
 996        struct bio *clone = &tio->clone;
 997        struct dm_target *ti = tio->ti;
 998
 999        clone->bi_end_io = clone_endio;
1000
1001        /*
1002         * Map the clone.  If r == 0 we don't need to do
1003         * anything, the target has assumed ownership of
1004         * this io.
1005         */
1006        atomic_inc(&tio->io->io_count);
1007        sector = clone->bi_sector;
1008
1009        dm_offload_start(&o);
1010        r = ti->type->map(ti, clone);
1011        dm_offload_end(&o);
1012
1013        if (r == DM_MAPIO_REMAPPED) {
1014                /* the bio has been remapped so dispatch it */
1015
1016                trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1017                                      tio->io->bio->bi_bdev->bd_dev, sector);
1018
1019                generic_make_request(clone);
1020        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1021                /* error the io and bail out, or requeue it if needed */
1022                dec_pending(tio->io, r);
1023                free_tio(tio);
1024        } else if (r != DM_MAPIO_SUBMITTED) {
1025                DMWARN("unimplemented target map return value: %d", r);
1026                BUG();
1027        }
1028}
1029
1030struct clone_info {
1031        struct mapped_device *md;
1032        struct dm_table *map;
1033        struct bio *bio;
1034        struct dm_io *io;
1035        sector_t sector;
1036        sector_t sector_count;
1037        unsigned short idx;
1038};
1039
1040static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1041{
1042        bio->bi_sector = sector;
1043        bio->bi_size = to_bytes(len);
1044}
1045
1046static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1047{
1048        bio->bi_idx = idx;
1049        bio->bi_vcnt = idx + bv_count;
1050        bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1051}
1052
1053static int clone_bio_integrity(struct bio *bio, struct bio *clone,
1054                               unsigned short idx, unsigned len, unsigned offset,
1055                               bool trim)
1056{
1057        int r;
1058
1059        r = bio_integrity_clone(clone, bio, GFP_NOIO);
1060        if (r < 0)
1061                return r;
1062
1063        if (trim)
1064                bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1065
1066        return 0;
1067}
1068
1069/*
1070 * Creates a little bio that just does part of a bvec.
1071 */
1072static int clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1073                           sector_t sector, unsigned short idx,
1074                           unsigned offset, unsigned len)
1075{
1076        struct bio *clone = &tio->clone;
1077        struct bio_vec *bv = bio->bi_io_vec + idx;
1078
1079        *clone->bi_io_vec = *bv;
1080
1081        bio_setup_sector(clone, sector, len);
1082
1083        clone->bi_bdev = bio->bi_bdev;
1084        clone->bi_rw = bio->bi_rw;
1085        clone->bi_vcnt = 1;
1086        clone->bi_io_vec->bv_offset = offset;
1087        clone->bi_io_vec->bv_len = clone->bi_size;
1088        clone->bi_flags |= 1 << BIO_CLONED;
1089
1090        if (bio_integrity(bio)) {
1091                int r = clone_bio_integrity(bio, clone, idx, len, offset, true);
1092                if (r < 0)
1093                        return r;
1094        }
1095
1096        return 0;
1097}
1098
1099/*
1100 * Creates a bio that consists of range of complete bvecs.
1101 */
1102static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1103                     sector_t sector, unsigned short idx,
1104                     unsigned short bv_count, unsigned len)
1105{
1106        struct bio *clone = &tio->clone;
1107
1108        __bio_clone(clone, bio);
1109        bio_setup_sector(clone, sector, len);
1110        bio_setup_bv(clone, idx, bv_count);
1111
1112        if (bio_integrity(bio)) {
1113                int r;
1114                bool trim = false;
1115
1116                if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1117                        trim = true;
1118                r = clone_bio_integrity(bio, clone, idx, len, 0, trim);
1119                if (r < 0)
1120                        return r;
1121        }
1122
1123        return 0;
1124}
1125
1126static struct dm_target_io *alloc_tio(struct clone_info *ci,
1127                                      struct dm_target *ti, int nr_iovecs,
1128                                      unsigned target_bio_nr)
1129{
1130        struct dm_target_io *tio;
1131        struct bio *clone;
1132
1133        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1134        tio = container_of(clone, struct dm_target_io, clone);
1135
1136        tio->io = ci->io;
1137        tio->ti = ti;
1138        tio->target_bio_nr = target_bio_nr;
1139
1140        return tio;
1141}
1142
1143static void __clone_and_map_simple_bio(struct clone_info *ci,
1144                                       struct dm_target *ti,
1145                                       unsigned target_bio_nr, sector_t len)
1146{
1147        struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
1148        struct bio *clone = &tio->clone;
1149
1150        /*
1151         * Discard requests require the bio's inline iovecs be initialized.
1152         * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1153         * and discard, so no need for concern about wasted bvec allocations.
1154         */
1155         __bio_clone(clone, ci->bio);
1156        if (len)
1157                bio_setup_sector(clone, ci->sector, len);
1158
1159        __map_bio(tio);
1160}
1161
1162static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1163                                  unsigned num_bios, sector_t len)
1164{
1165        unsigned target_bio_nr;
1166
1167        for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1168                __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1169}
1170
1171static int __send_empty_flush(struct clone_info *ci)
1172{
1173        unsigned target_nr = 0;
1174        struct dm_target *ti;
1175
1176        BUG_ON(bio_has_data(ci->bio));
1177        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1178                __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
1179
1180        return 0;
1181}
1182
1183static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1184                                    sector_t sector, int nr_iovecs,
1185                                    unsigned short idx, unsigned short bv_count,
1186                                    unsigned offset, unsigned len,
1187                                    bool split_bvec)
1188{
1189        struct bio *bio = ci->bio;
1190        struct dm_target_io *tio;
1191        unsigned target_bio_nr;
1192        unsigned num_target_bios = 1;
1193        int r = 0;
1194
1195        /*
1196         * Does the target want to receive duplicate copies of the bio?
1197         */
1198        if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1199                num_target_bios = ti->num_write_bios(ti, bio);
1200
1201        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1202                tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1203                if (split_bvec)
1204                        r = clone_split_bio(tio, bio, sector, idx, offset, len);
1205                else
1206                        r = clone_bio(tio, bio, sector, idx, bv_count, len);
1207                if (r < 0) {
1208                        free_tio(tio);
1209                        break;
1210                }
1211                __map_bio(tio);
1212        }
1213
1214        return r;
1215}
1216
1217typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1218
1219static unsigned get_num_discard_bios(struct dm_target *ti)
1220{
1221        return ti->num_discard_bios;
1222}
1223
1224static unsigned get_num_write_same_bios(struct dm_target *ti)
1225{
1226        return ti->num_write_same_bios;
1227}
1228
1229typedef bool (*is_split_required_fn)(struct dm_target *ti);
1230
1231static bool is_split_required_for_discard(struct dm_target *ti)
1232{
1233        return ti->split_discard_bios;
1234}
1235
1236static int __send_changing_extent_only(struct clone_info *ci,
1237                                       get_num_bios_fn get_num_bios,
1238                                       is_split_required_fn is_split_required)
1239{
1240        struct dm_target *ti;
1241        sector_t len;
1242        unsigned num_bios;
1243
1244        do {
1245                ti = dm_table_find_target(ci->map, ci->sector);
1246                if (!dm_target_is_valid(ti))
1247                        return -EIO;
1248
1249                /*
1250                 * Even though the device advertised support for this type of
1251                 * request, that does not mean every target supports it, and
1252                 * reconfiguration might also have changed that since the
1253                 * check was performed.
1254                 */
1255                num_bios = get_num_bios ? get_num_bios(ti) : 0;
1256                if (!num_bios)
1257                        return -EOPNOTSUPP;
1258
1259                if (is_split_required && !is_split_required(ti))
1260                        len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1261                else
1262                        len = min(ci->sector_count, max_io_len(ci->sector, ti));
1263
1264                __send_duplicate_bios(ci, ti, num_bios, len);
1265
1266                ci->sector += len;
1267        } while (ci->sector_count -= len);
1268
1269        return 0;
1270}
1271
1272static int __send_discard(struct clone_info *ci)
1273{
1274        return __send_changing_extent_only(ci, get_num_discard_bios,
1275                                           is_split_required_for_discard);
1276}
1277
1278static int __send_write_same(struct clone_info *ci)
1279{
1280        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1281}
1282
1283/*
1284 * Find maximum number of sectors / bvecs we can process with a single bio.
1285 */
1286static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1287{
1288        struct bio *bio = ci->bio;
1289        sector_t bv_len, total_len = 0;
1290
1291        for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1292                bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1293
1294                if (bv_len > max)
1295                        break;
1296
1297                max -= bv_len;
1298                total_len += bv_len;
1299        }
1300
1301        return total_len;
1302}
1303
1304static int __split_bvec_across_targets(struct clone_info *ci,
1305                                       struct dm_target *ti, sector_t max)
1306{
1307        struct bio *bio = ci->bio;
1308        struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1309        sector_t remaining = to_sector(bv->bv_len);
1310        unsigned offset = 0;
1311        sector_t len;
1312        int r;
1313
1314        do {
1315                if (offset) {
1316                        ti = dm_table_find_target(ci->map, ci->sector);
1317                        if (!dm_target_is_valid(ti))
1318                                return -EIO;
1319
1320                        max = max_io_len(ci->sector, ti);
1321                }
1322
1323                len = min(remaining, max);
1324
1325                r = __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1326                                             bv->bv_offset + offset, len, true);
1327                if (r < 0)
1328                        return r;
1329
1330                ci->sector += len;
1331                ci->sector_count -= len;
1332                offset += to_bytes(len);
1333        } while (remaining -= len);
1334
1335        ci->idx++;
1336
1337        return 0;
1338}
1339
1340/*
1341 * Select the correct strategy for processing a non-flush bio.
1342 */
1343static int __split_and_process_non_flush(struct clone_info *ci)
1344{
1345        struct bio *bio = ci->bio;
1346        struct dm_target *ti;
1347        sector_t len, max;
1348        int idx;
1349        int r;
1350
1351        if (unlikely(bio->bi_rw & REQ_DISCARD))
1352                return __send_discard(ci);
1353        else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1354                return __send_write_same(ci);
1355
1356        ti = dm_table_find_target(ci->map, ci->sector);
1357        if (!dm_target_is_valid(ti))
1358                return -EIO;
1359
1360        max = max_io_len(ci->sector, ti);
1361
1362        /*
1363         * Optimise for the simple case where we can do all of
1364         * the remaining io with a single clone.
1365         */
1366        if (ci->sector_count <= max) {
1367                r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1368                                             ci->idx, bio->bi_vcnt - ci->idx, 0,
1369                                             ci->sector_count, false);
1370                if (r < 0)
1371                        return r;
1372
1373                ci->sector_count = 0;
1374                return 0;
1375        }
1376
1377        /*
1378         * There are some bvecs that don't span targets.
1379         * Do as many of these as possible.
1380         */
1381        if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1382                len = __len_within_target(ci, max, &idx);
1383
1384                r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1385                                             ci->idx, idx - ci->idx, 0, len, false);
1386                if (r < 0)
1387                        return r;
1388
1389                ci->sector += len;
1390                ci->sector_count -= len;
1391                ci->idx = idx;
1392
1393                return 0;
1394        }
1395
1396        /*
1397         * Handle a bvec that must be split between two or more targets.
1398         */
1399        return __split_bvec_across_targets(ci, ti, max);
1400}
1401
1402/*
1403 * Entry point to split a bio into clones and submit them to the targets.
1404 */
1405static void __split_and_process_bio(struct mapped_device *md,
1406                                    struct dm_table *map, struct bio *bio)
1407{
1408        struct clone_info ci;
1409        int error = 0;
1410
1411        if (unlikely(!map)) {
1412                bio_io_error(bio);
1413                return;
1414        }
1415
1416        ci.map = map;
1417        ci.md = md;
1418        ci.io = alloc_io(md);
1419        ci.io->error = 0;
1420        atomic_set(&ci.io->io_count, 1);
1421        ci.io->bio = bio;
1422        ci.io->md = md;
1423        spin_lock_init(&ci.io->endio_lock);
1424        ci.sector = bio->bi_sector;
1425        ci.idx = bio->bi_idx;
1426
1427        start_io_acct(ci.io);
1428
1429        if (bio->bi_rw & REQ_FLUSH) {
1430                ci.bio = &ci.md->flush_bio;
1431                ci.sector_count = 0;
1432                error = __send_empty_flush(&ci);
1433                /* dec_pending submits any data associated with flush */
1434        } else {
1435                ci.bio = bio;
1436                ci.sector_count = bio_sectors(bio);
1437                while (ci.sector_count && !error)
1438                        error = __split_and_process_non_flush(&ci);
1439        }
1440
1441        /* drop the extra reference count */
1442        dec_pending(ci.io, error);
1443}
1444/*-----------------------------------------------------------------
1445 * CRUD END
1446 *---------------------------------------------------------------*/
1447
1448static int dm_merge_bvec(struct request_queue *q,
1449                         struct bvec_merge_data *bvm,
1450                         struct bio_vec *biovec)
1451{
1452        struct mapped_device *md = q->queuedata;
1453        struct dm_table *map = dm_get_live_table_fast(md);
1454        struct dm_target *ti;
1455        sector_t max_sectors;
1456        int max_size = 0;
1457
1458        if (unlikely(!map))
1459                goto out;
1460
1461        ti = dm_table_find_target(map, bvm->bi_sector);
1462        if (!dm_target_is_valid(ti))
1463                goto out;
1464
1465        /*
1466         * Find maximum amount of I/O that won't need splitting
1467         */
1468        max_sectors = min(max_io_len(bvm->bi_sector, ti),
1469                          (sector_t) BIO_MAX_SECTORS);
1470        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1471        if (max_size < 0)
1472                max_size = 0;
1473
1474        /*
1475         * merge_bvec_fn() returns number of bytes
1476         * it can accept at this offset
1477         * max is precomputed maximal io size
1478         */
1479        if (max_size && ti->type->merge)
1480                max_size = ti->type->merge(ti, bvm, biovec, max_size);
1481        /*
1482         * If the target doesn't support merge method and some of the devices
1483         * provided their merge_bvec method (we know this by looking at
1484         * queue_max_hw_sectors), then we can't allow bios with multiple vector
1485         * entries.  So always set max_size to 0, and the code below allows
1486         * just one page.
1487         */
1488        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1489                max_size = 0;
1490
1491out:
1492        dm_put_live_table_fast(md);
1493        /*
1494         * Always allow an entire first page
1495         */
1496        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1497                max_size = biovec->bv_len;
1498
1499        return max_size;
1500}
1501
1502/*
1503 * The request function that just remaps the bio built up by
1504 * dm_merge_bvec.
1505 */
1506static void dm_make_request(struct request_queue *q, struct bio *bio)
1507{
1508        int rw = bio_data_dir(bio);
1509        struct mapped_device *md = q->queuedata;
1510        int cpu;
1511        int srcu_idx;
1512        struct dm_table *map;
1513
1514        map = dm_get_live_table(md, &srcu_idx);
1515
1516        cpu = part_stat_lock();
1517        part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1518        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1519        part_stat_unlock();
1520
1521        /* if we're suspended, we have to queue this io for later */
1522        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1523                dm_put_live_table(md, srcu_idx);
1524
1525                if (bio_rw(bio) != READA)
1526                        queue_io(md, bio);
1527                else
1528                        bio_io_error(bio);
1529                return;
1530        }
1531
1532        __split_and_process_bio(md, map, bio);
1533        dm_put_live_table(md, srcu_idx);
1534        return;
1535}
1536
1537static int dm_any_congested(void *congested_data, int bdi_bits)
1538{
1539        int r = bdi_bits;
1540        struct mapped_device *md = congested_data;
1541        struct dm_table *map;
1542
1543        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1544                if (dm_request_based(md)) {
1545                        /*
1546                         * With request-based DM we only need to check the
1547                         * top-level queue for congestion.
1548                         */
1549                        r = md->queue->backing_dev_info.state & bdi_bits;
1550                } else {
1551                        map = dm_get_live_table_fast(md);
1552                        if (map)
1553                                r = dm_table_any_congested(map, bdi_bits);
1554                        dm_put_live_table_fast(md);
1555                }
1556        }
1557
1558        return r;
1559}
1560
1561/*-----------------------------------------------------------------
1562 * An IDR is used to keep track of allocated minor numbers.
1563 *---------------------------------------------------------------*/
1564static void free_minor(int minor)
1565{
1566        spin_lock(&_minor_lock);
1567        idr_remove(&_minor_idr, minor);
1568        spin_unlock(&_minor_lock);
1569}
1570
1571/*
1572 * See if the device with a specific minor # is free.
1573 */
1574static int specific_minor(int minor)
1575{
1576        int r;
1577
1578        if (minor >= (1 << MINORBITS))
1579                return -EINVAL;
1580
1581        idr_preload(GFP_KERNEL);
1582        spin_lock(&_minor_lock);
1583
1584        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1585
1586        spin_unlock(&_minor_lock);
1587        idr_preload_end();
1588        if (r < 0)
1589                return r == -ENOSPC ? -EBUSY : r;
1590        return 0;
1591}
1592
1593static int next_free_minor(int *minor)
1594{
1595        int r;
1596
1597        idr_preload(GFP_KERNEL);
1598        spin_lock(&_minor_lock);
1599
1600        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1601
1602        spin_unlock(&_minor_lock);
1603        idr_preload_end();
1604        if (r < 0)
1605                return r;
1606        *minor = r;
1607        return 0;
1608}
1609
1610static const struct block_device_operations dm_blk_dops;
1611
1612static void dm_wq_work(struct work_struct *work);
1613
1614void dm_init_md_queue(struct mapped_device *md)
1615{
1616        /*
1617         * Request-based dm devices cannot be stacked on top of bio-based dm
1618         * devices.  The type of this dm device may not have been decided yet.
1619         * The type is decided at the first table loading time.
1620         * To prevent problematic device stacking, clear the queue flag
1621         * for request stacking support until then.
1622         *
1623         * This queue is new, so no concurrency on the queue_flags.
1624         */
1625        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1626
1627        /*
1628         * Initialize data that will only be used by a non-blk-mq DM queue
1629         * - must do so here (in alloc_dev callchain) before queue is used
1630         */
1631        md->queue->queuedata = md;
1632        md->queue->backing_dev_info.congested_data = md;
1633}
1634
1635void dm_init_normal_md_queue(struct mapped_device *md)
1636{
1637        md->use_blk_mq = false;
1638        dm_init_md_queue(md);
1639
1640        /*
1641         * Initialize aspects of queue that aren't relevant for blk-mq
1642         */
1643        md->queue->backing_dev_info.congested_fn = dm_any_congested;
1644        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1645}
1646
1647/*
1648 * Allocate and initialise a blank device with a given minor.
1649 */
1650static struct mapped_device *alloc_dev(int minor)
1651{
1652        int r, numa_node_id = dm_get_numa_node();
1653        struct mapped_device *md;
1654        void *old_md;
1655
1656        md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1657        if (!md) {
1658                DMWARN("unable to allocate device, out of memory.");
1659                return NULL;
1660        }
1661
1662        if (!try_module_get(THIS_MODULE))
1663                goto bad_module_get;
1664
1665        /* get a minor number for the dev */
1666        if (minor == DM_ANY_MINOR)
1667                r = next_free_minor(&minor);
1668        else
1669                r = specific_minor(minor);
1670        if (r < 0)
1671                goto bad_minor;
1672
1673        r = init_srcu_struct(&md->io_barrier);
1674        if (r < 0)
1675                goto bad_io_barrier;
1676
1677        md->numa_node_id = numa_node_id;
1678        md->use_blk_mq = dm_use_blk_mq_default();
1679        md->init_tio_pdu = false;
1680        md->type = DM_TYPE_NONE;
1681        mutex_init(&md->suspend_lock);
1682        mutex_init(&md->type_lock);
1683        mutex_init(&md->table_devices_lock);
1684        spin_lock_init(&md->deferred_lock);
1685        atomic_set(&md->holders, 1);
1686        atomic_set(&md->open_count, 0);
1687        atomic_set(&md->event_nr, 0);
1688        atomic_set(&md->uevent_seq, 0);
1689        INIT_LIST_HEAD(&md->uevent_list);
1690        INIT_LIST_HEAD(&md->table_devices);
1691        spin_lock_init(&md->uevent_lock);
1692
1693        md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1694        if (!md->queue)
1695                goto bad_queue;
1696
1697        dm_init_md_queue(md);
1698
1699        md->disk = alloc_disk_node(1, numa_node_id);
1700        if (!md->disk)
1701                goto bad_disk;
1702
1703        atomic_set(&md->pending[0], 0);
1704        atomic_set(&md->pending[1], 0);
1705        init_waitqueue_head(&md->wait);
1706        INIT_WORK(&md->work, dm_wq_work);
1707        init_waitqueue_head(&md->eventq);
1708        init_completion(&md->kobj_holder.completion);
1709        md->kworker_task = NULL;
1710
1711        md->disk->major = _major;
1712        md->disk->first_minor = minor;
1713        md->disk->fops = &dm_blk_dops;
1714        md->disk->queue = md->queue;
1715        md->disk->private_data = md;
1716        sprintf(md->disk->disk_name, "dm-%d", minor);
1717        add_disk(md->disk);
1718        format_dev_t(md->name, MKDEV(_major, minor));
1719
1720        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1721        if (!md->wq)
1722                goto bad_thread;
1723
1724        md->bdev = bdget_disk(md->disk, 0);
1725        if (!md->bdev)
1726                goto bad_bdev;
1727
1728        bio_init(&md->flush_bio);
1729        md->flush_bio.bi_bdev = md->bdev;
1730        md->flush_bio.bi_rw = WRITE_FLUSH;
1731
1732        dm_stats_init(&md->stats);
1733
1734        /* Populate the mapping, nobody knows we exist yet */
1735        spin_lock(&_minor_lock);
1736        old_md = idr_replace(&_minor_idr, md, minor);
1737        spin_unlock(&_minor_lock);
1738
1739        BUG_ON(old_md != MINOR_ALLOCED);
1740
1741        return md;
1742
1743bad_bdev:
1744        destroy_workqueue(md->wq);
1745bad_thread:
1746        del_gendisk(md->disk);
1747        put_disk(md->disk);
1748bad_disk:
1749        blk_cleanup_queue(md->queue);
1750bad_queue:
1751        cleanup_srcu_struct(&md->io_barrier);
1752bad_io_barrier:
1753        free_minor(minor);
1754bad_minor:
1755        module_put(THIS_MODULE);
1756bad_module_get:
1757        kfree(md);
1758        return NULL;
1759}
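
/*
 * Illustrative sketch (editor's addition, not part of dm.c): the minor
 * handling above follows a two-phase idr pattern -- specific_minor() /
 * next_free_minor() park a sentinel (MINOR_ALLOCED) in the idr under
 * _minor_lock, and only once the mapped_device is fully constructed is the
 * real pointer published with idr_replace() (checked by the BUG_ON above).
 * A generic version of the pattern, with hypothetical names:
 */
#if 0
static DEFINE_SPINLOCK(example_lock);
static DEFINE_IDR(example_idr);
#define EXAMPLE_RESERVED ((void *)-1)

static int example_reserve_id(void)
{
	int id;

	idr_preload(GFP_KERNEL);
	spin_lock(&example_lock);
	/* reserve an id now; lookups only see the sentinel until publish */
	id = idr_alloc(&example_idr, EXAMPLE_RESERVED, 0, 0, GFP_NOWAIT);
	spin_unlock(&example_lock);
	idr_preload_end();

	return id;		/* negative errno on failure */
}

static void example_publish(int id, void *obj)
{
	void *old;

	spin_lock(&example_lock);
	old = idr_replace(&example_idr, obj, id);
	spin_unlock(&example_lock);
	BUG_ON(old != EXAMPLE_RESERVED);
}
#endif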
1760
1761static void unlock_fs(struct mapped_device *md);
1762
1763static void free_dev(struct mapped_device *md)
1764{
1765        int minor = MINOR(disk_devt(md->disk));
1766
1767        unlock_fs(md);
1768        destroy_workqueue(md->wq);
1769
1770        if (md->kworker_task)
1771                kthread_stop(md->kworker_task);
1772        mempool_destroy(md->io_pool);
1773        mempool_destroy(md->rq_pool);
1774        if (md->bs)
1775                bioset_free(md->bs);
1776
1777        spin_lock(&_minor_lock);
1778        md->disk->private_data = NULL;
1779        spin_unlock(&_minor_lock);
1780        if (blk_get_integrity(md->disk))
1781                blk_integrity_unregister(md->disk);
1782        del_gendisk(md->disk);
1783        put_disk(md->disk);
1784        blk_cleanup_queue(md->queue);
1785
1786        cleanup_srcu_struct(&md->io_barrier);
1787        free_table_devices(&md->table_devices);
1788        dm_stats_cleanup(&md->stats);
1789
1790        dm_mq_cleanup_mapped_device(md);
1791        bdput(md->bdev);
1792        free_minor(minor);
1793
1794        module_put(THIS_MODULE);
1795        kfree(md);
1796}
1797
1798static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1799{
1800        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1801
1802        if (md->bs) {
1803                /* The md already has necessary mempools. */
1804                if (dm_table_bio_based(t)) {
1805                        /*
1806                         * Reload the bioset because front_pad may have
1807                         * changed when a different table was loaded.
1808                         */
1809                        bioset_free(md->bs);
1810                        md->bs = p->bs;
1811                        p->bs = NULL;
1812                }
1813                /*
1814                 * There's no need to reload with request-based dm
1815                 * because the size of front_pad doesn't change.
1816                 * Note for the future: if the bioset is ever reloaded,
1817                 * prepped requests in the queue may still refer to bios
1818                 * from the old bioset, so the queue must be walked to
1819                 * unprep them first.
1820                 */
1821                goto out;
1822        }
1823
1824        BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
1825
1826        md->io_pool = p->io_pool;
1827        p->io_pool = NULL;
1828        md->rq_pool = p->rq_pool;
1829        p->rq_pool = NULL;
1830        md->bs = p->bs;
1831        p->bs = NULL;
1832
1833out:
1834        /* mempool bind completed, no longer need any mempools in the table */
1835        dm_table_free_md_mempools(t);
1836}
1837
1838/*
1839 * Bind a table to the device.
1840 */
1841static void event_callback(void *context)
1842{
1843        unsigned long flags;
1844        LIST_HEAD(uevents);
1845        struct mapped_device *md = (struct mapped_device *) context;
1846
1847        spin_lock_irqsave(&md->uevent_lock, flags);
1848        list_splice_init(&md->uevent_list, &uevents);
1849        spin_unlock_irqrestore(&md->uevent_lock, flags);
1850
1851        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1852
1853        atomic_inc(&md->event_nr);
1854        wake_up(&md->eventq);
1855}
1856
1857/*
1858 * Protected by md->suspend_lock obtained by dm_swap_table().
1859 */
1860static void __set_size(struct mapped_device *md, sector_t size)
1861{
1862        set_capacity(md->disk, size);
1863
1864        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1865}
1866
1867/*
1868 * Return 1 if the queue has a compulsory merge_bvec_fn.
1869 *
1870 * If this function returns 0, then the device is either a non-dm
1871 * device without a merge_bvec_fn, or it is a dm device that is
1872 * able to split any bios it receives that are too big.
1873 */
1874int dm_queue_merge_is_compulsory(struct request_queue *q)
1875{
1876        struct mapped_device *dev_md;
1877
1878        if (!q->merge_bvec_fn)
1879                return 0;
1880
1881        if (q->make_request_fn == dm_make_request) {
1882                dev_md = q->queuedata;
1883                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
1884                        return 0;
1885        }
1886
1887        return 1;
1888}
1889
1890static int dm_device_merge_is_compulsory(struct dm_target *ti,
1891                                         struct dm_dev *dev, sector_t start,
1892                                         sector_t len, void *data)
1893{
1894        struct block_device *bdev = dev->bdev;
1895        struct request_queue *q = bdev_get_queue(bdev);
1896
1897        return dm_queue_merge_is_compulsory(q);
1898}
1899
1900/*
1901 * Return 1 if it is acceptable to ignore merge_bvec_fn based
1902 * on the properties of the underlying devices.
1903 */
1904static int dm_table_merge_is_optional(struct dm_table *table)
1905{
1906        unsigned i = 0;
1907        struct dm_target *ti;
1908
1909        while (i < dm_table_get_num_targets(table)) {
1910                ti = dm_table_get_target(table, i++);
1911
1912                if (ti->type->iterate_devices &&
1913                    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
1914                        return 0;
1915        }
1916
1917        return 1;
1918}
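
/*
 * Illustrative sketch (editor's addition, not part of dm.c): the
 * iterate_devices calls above rely on each target walking its underlying
 * devices and invoking the callout once per device.  A single-device
 * target (in the spirit of dm-linear) would wire this up roughly as
 * follows; the struct and field names are hypothetical.
 */
#if 0
struct example_target {
	struct dm_dev *dev;
	sector_t start;
};

static int example_iterate_devices(struct dm_target *ti,
				   iterate_devices_callout_fn fn, void *data)
{
	struct example_target *et = ti->private;

	/* hand the one underlying device to the callout, e.g.
	 * dm_device_merge_is_compulsory() above */
	return fn(ti, et->dev, et->start, ti->len, data);
}
#endif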
1919
1920/*
1921 * Returns old map, which caller must destroy.
1922 */
1923static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1924                               struct queue_limits *limits)
1925{
1926        struct dm_table *old_map;
1927        struct request_queue *q = md->queue;
1928        sector_t size;
1929        int merge_is_optional;
1930
1931        lockdep_assert_held(&md->suspend_lock);
1932
1933        size = dm_table_get_size(t);
1934
1935        /*
1936         * Wipe any geometry if the size of the table changed.
1937         */
1938        if (size != dm_get_size(md))
1939                memset(&md->geometry, 0, sizeof(md->geometry));
1940
1941        __set_size(md, size);
1942
1943        dm_table_event_callback(t, event_callback, md);
1944
1945        /*
1946         * The queue hasn't been stopped yet if the old table type wasn't
1947         * request-based during suspension, so stop it now to prevent I/O
1948         * from being mapped before resume.
1949         * This must be done before setting the queue restrictions,
1950         * because request-based dm may run just after they are set.
1951         */
1952        if (dm_table_request_based(t)) {
1953                dm_stop_queue(q);
1954                /*
1955                 * Leverage the fact that request-based DM targets are
1956                 * immutable singletons and establish md->immutable_target
1957                 * - used to optimize both dm_request_fn and dm_mq_queue_rq
1958                 */
1959                md->immutable_target = dm_table_get_immutable_target(t);
1960        }
1961
1962        __bind_mempools(md, t);
1963
1964        merge_is_optional = dm_table_merge_is_optional(t);
1965
1966        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1967        rcu_assign_pointer(md->map, (void *)t);
1968        md->immutable_target_type = dm_table_get_immutable_target_type(t);
1969
1970        dm_table_set_restrictions(t, q, limits);
1971        if (merge_is_optional)
1972                set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
1973        else
1974                clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
1975        if (old_map)
1976                dm_sync_table(md);
1977
1978        return old_map;
1979}
1980
1981/*
1982 * Returns unbound table for the caller to free.
1983 */
1984static struct dm_table *__unbind(struct mapped_device *md)
1985{
1986        struct dm_table *map = rcu_dereference_protected(md->map, 1);
1987
1988        if (!map)
1989                return NULL;
1990
1991        dm_table_event_callback(map, NULL, NULL);
1992        RCU_INIT_POINTER(md->map, NULL);
1993        dm_sync_table(md);
1994
1995        return map;
1996}
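
/*
 * Illustrative sketch (editor's addition, not part of dm.c): md->map is
 * published by __bind() with rcu_assign_pointer() and retired by __unbind();
 * readers pin it through the SRCU-based helpers rather than dereferencing
 * it directly.  The reader pattern used throughout this file looks like:
 */
#if 0
static void example_reader(struct mapped_device *md)
{
	struct dm_table *map;
	int srcu_idx;

	map = dm_get_live_table(md, &srcu_idx);
	if (map) {
		/* the table cannot be freed while srcu_idx is held */
	}
	dm_put_live_table(md, srcu_idx);
}
#endif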
1997
1998/*
1999 * Constructor for a new device.
2000 */
2001int dm_create(int minor, struct mapped_device **result)
2002{
2003        struct mapped_device *md;
2004
2005        md = alloc_dev(minor);
2006        if (!md)
2007                return -ENXIO;
2008
2009        dm_sysfs_init(md);
2010
2011        *result = md;
2012        return 0;
2013}
2014
2015/*
2016 * Functions to manage md->type.
2017 * All are required to hold md->type_lock.
2018 */
2019void dm_lock_md_type(struct mapped_device *md)
2020{
2021        mutex_lock(&md->type_lock);
2022}
2023
2024void dm_unlock_md_type(struct mapped_device *md)
2025{
2026        mutex_unlock(&md->type_lock);
2027}
2028
2029void dm_set_md_type(struct mapped_device *md, unsigned type)
2030{
2031        BUG_ON(!mutex_is_locked(&md->type_lock));
2032        md->type = type;
2033}
2034
2035unsigned dm_get_md_type(struct mapped_device *md)
2036{
2037        return md->type;
2038}
2039
2040struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2041{
2042        return md->immutable_target_type;
2043}
2044
2045/*
2046 * The queue_limits are only valid as long as you have a reference
2047 * count on 'md'.
2048 */
2049struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2050{
2051        BUG_ON(!atomic_read(&md->holders));
2052        return &md->queue->limits;
2053}
2054EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2055
2056/*
2057 * Setup the DM device's queue based on md's type
2058 */
2059int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2060{
2061        int r;
2062        unsigned type = dm_get_md_type(md);
2063
2064        switch (type) {
2065        case DM_TYPE_REQUEST_BASED:
2066                r = dm_old_init_request_queue(md);
2067                if (r) {
2068                        DMERR("Cannot initialize queue for request-based mapped device");
2069                        return r;
2070                }
2071                break;
2072        case DM_TYPE_MQ_REQUEST_BASED:
2073                r = dm_mq_init_request_queue(md, t);
2074                if (r) {
2075                        DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2076                        return r;
2077                }
2078                break;
2079        case DM_TYPE_BIO_BASED:
2080        case DM_TYPE_DAX_BIO_BASED:
2081                dm_init_normal_md_queue(md);
2082                blk_queue_make_request(md->queue, dm_make_request);
2083                blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2084
2085                if (type == DM_TYPE_DAX_BIO_BASED)
2086                        queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2087                break;
2088        }
2089
2090        return 0;
2091}
2092
2093struct mapped_device *dm_get_md(dev_t dev)
2094{
2095        struct mapped_device *md;
2096        unsigned minor = MINOR(dev);
2097
2098        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2099                return NULL;
2100
2101        spin_lock(&_minor_lock);
2102
2103        md = idr_find(&_minor_idr, minor);
2104        if (md) {
2105                if ((md == MINOR_ALLOCED ||
2106                     (MINOR(disk_devt(dm_disk(md))) != minor) ||
2107                     dm_deleting_md(md) ||
2108                     test_bit(DMF_FREEING, &md->flags))) {
2109                        md = NULL;
2110                        goto out;
2111                }
2112                dm_get(md);
2113        }
2114
2115out:
2116        spin_unlock(&_minor_lock);
2117
2118        return md;
2119}
2120EXPORT_SYMBOL_GPL(dm_get_md);
2121
2122void *dm_get_mdptr(struct mapped_device *md)
2123{
2124        return md->interface_ptr;
2125}
2126
2127void dm_set_mdptr(struct mapped_device *md, void *ptr)
2128{
2129        md->interface_ptr = ptr;
2130}
2131
2132void dm_get(struct mapped_device *md)
2133{
2134        atomic_inc(&md->holders);
2135        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2136}
2137
2138int dm_hold(struct mapped_device *md)
2139{
2140        spin_lock(&_minor_lock);
2141        if (test_bit(DMF_FREEING, &md->flags)) {
2142                spin_unlock(&_minor_lock);
2143                return -EBUSY;
2144        }
2145        dm_get(md);
2146        spin_unlock(&_minor_lock);
2147        return 0;
2148}
2149EXPORT_SYMBOL_GPL(dm_hold);
2150
2151const char *dm_device_name(struct mapped_device *md)
2152{
2153        return md->name;
2154}
2155EXPORT_SYMBOL_GPL(dm_device_name);
2156
2157static void __dm_destroy(struct mapped_device *md, bool wait)
2158{
2159        struct request_queue *q = dm_get_md_queue(md);
2160        struct dm_table *map;
2161        int srcu_idx;
2162
2163        might_sleep();
2164
2165        spin_lock(&_minor_lock);
2166        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2167        set_bit(DMF_FREEING, &md->flags);
2168        spin_unlock(&_minor_lock);
2169
2170        blk_set_queue_dying(q);
2171
2172        if (dm_request_based(md) && md->kworker_task)
2173                flush_kthread_worker(&md->kworker);
2174
2175        /*
2176         * Take suspend_lock so that presuspend and postsuspend methods
2177         * do not race with internal suspend.
2178         */
2179        mutex_lock(&md->suspend_lock);
2180        map = dm_get_live_table(md, &srcu_idx);
2181        if (!dm_suspended_md(md)) {
2182                dm_table_presuspend_targets(map);
2183                dm_table_postsuspend_targets(map);
2184        }
2185        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2186        dm_put_live_table(md, srcu_idx);
2187        mutex_unlock(&md->suspend_lock);
2188
2189        /*
2190         * Rare, but there may still be I/O requests completing against
2191         * this device.  Wait for all references to disappear; no one may
2192         * take a new reference on the mapped_device once its state has
2193         * become DMF_FREEING.
2194         */
2195        if (wait)
2196                while (atomic_read(&md->holders))
2197                        msleep(1);
2198        else if (atomic_read(&md->holders))
2199                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2200                       dm_device_name(md), atomic_read(&md->holders));
2201
2202        dm_sysfs_exit(md);
2203        dm_table_destroy(__unbind(md));
2204        free_dev(md);
2205}
2206
2207void dm_destroy(struct mapped_device *md)
2208{
2209        __dm_destroy(md, true);
2210}
2211
2212void dm_destroy_immediate(struct mapped_device *md)
2213{
2214        __dm_destroy(md, false);
2215}
2216
2217void dm_put(struct mapped_device *md)
2218{
2219        atomic_dec(&md->holders);
2220}
2221EXPORT_SYMBOL_GPL(dm_put);
2222
2223static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2224{
2225        int r = 0;
2226        DEFINE_WAIT(wait);
2227
2228        while (1) {
2229                prepare_to_wait(&md->wait, &wait, task_state);
2230
2231                if (!md_in_flight(md))
2232                        break;
2233
2234                if (signal_pending_state(task_state, current)) {
2235                        r = -EINTR;
2236                        break;
2237                }
2238
2239                io_schedule();
2240        }
2241        finish_wait(&md->wait, &wait);
2242
2243        return r;
2244}
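
/*
 * Illustrative sketch (editor's addition, not part of dm.c): the loop above
 * is the classic open-coded wait; it is used instead of wait_event() so the
 * sleep can honour the caller-supplied task state and return -EINTR on a
 * signal.  The generic shape, with a hypothetical "pending" condition:
 */
#if 0
static int example_wait(wait_queue_head_t *wq, atomic_t *pending,
			long task_state)
{
	int r = 0;
	DEFINE_WAIT(wait);

	while (1) {
		prepare_to_wait(wq, &wait, task_state);

		if (!atomic_read(pending))
			break;

		if (signal_pending_state(task_state, current)) {
			r = -EINTR;
			break;
		}

		io_schedule();
	}
	finish_wait(wq, &wait);

	return r;
}
#endif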
2245
2246/*
2247 * Process the deferred bios
2248 */
2249static void dm_wq_work(struct work_struct *work)
2250{
2251        struct mapped_device *md = container_of(work, struct mapped_device,
2252                                                work);
2253        struct bio *c;
2254        int srcu_idx;
2255        struct dm_table *map;
2256
2257        map = dm_get_live_table(md, &srcu_idx);
2258
2259        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2260                spin_lock_irq(&md->deferred_lock);
2261                c = bio_list_pop(&md->deferred);
2262                spin_unlock_irq(&md->deferred_lock);
2263
2264                if (!c)
2265                        break;
2266
2267                if (dm_request_based(md))
2268                        generic_make_request(c);
2269                else
2270                        __split_and_process_bio(md, map, c);
2271        }
2272
2273        dm_put_live_table(md, srcu_idx);
2274}
2275
2276static void dm_queue_flush(struct mapped_device *md)
2277{
2278        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2279        smp_mb__after_clear_bit();
2280        queue_work(md->wq, &md->work);
2281}
2282
2283/*
2284 * Swap in a new table, returning the old one for the caller to destroy.
2285 */
2286struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2287{
2288        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2289        struct queue_limits limits;
2290        struct queue_limits_aux limits_aux;
2291        int r;
2292
2293        mutex_lock(&md->suspend_lock);
2294
2295        /* device must be suspended */
2296        if (!dm_suspended_md(md))
2297                goto out;
2298
2299        /*
2300         * Point limits.limits_aux at the on-stack queue_limits_aux
2301         * before the new limits are calculated.
2302         */
2303        limits.limits_aux = &limits_aux;
2304
2305        /*
2306         * If the new table has no data devices, retain the existing limits.
2307         * This helps multipath with queue_if_no_path if all paths disappear,
2308         * then new I/O is queued based on these limits, and then some paths
2309         * reappear.
2310         */
2311        if (dm_table_has_no_data_devices(table)) {
2312                live_map = dm_get_live_table_fast(md);
2313                if (live_map)
2314                        limits = md->queue->limits;
2315                dm_put_live_table_fast(md);
2316        }
2317
2318        if (!live_map) {
2319                r = dm_calculate_queue_limits(table, &limits);
2320                if (r) {
2321                        map = ERR_PTR(r);
2322                        goto out;
2323                }
2324        }
2325
2326        map = __bind(md, table, &limits);
2327
2328out:
2329        mutex_unlock(&md->suspend_lock);
2330        return map;
2331}
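
/*
 * Illustrative sketch (editor's addition, not part of dm.c): dm_swap_table()
 * insists on a suspended device, so a caller (the resume path in dm-ioctl,
 * for instance) brackets it roughly as below.  Error handling is trimmed
 * and the suspend flag choice is only an example.
 */
#if 0
static int example_replace_table(struct mapped_device *md,
				 struct dm_table *new_table)
{
	struct dm_table *old_map;
	int r;

	if (!dm_suspended_md(md)) {
		r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
		if (r)
			return r;
	}

	old_map = dm_swap_table(md, new_table);
	if (IS_ERR(old_map))
		return PTR_ERR(old_map);
	if (old_map)
		dm_table_destroy(old_map);	/* caller owns the old map */

	return dm_resume(md);
}
#endif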
2332
2333/*
2334 * Functions to lock and unlock any filesystem running on the
2335 * device.
2336 */
2337static int lock_fs(struct mapped_device *md)
2338{
2339        int r;
2340
2341        WARN_ON(md->frozen_sb);
2342
2343        md->frozen_sb = freeze_bdev(md->bdev);
2344        if (IS_ERR(md->frozen_sb)) {
2345                r = PTR_ERR(md->frozen_sb);
2346                md->frozen_sb = NULL;
2347                return r;
2348        }
2349
2350        set_bit(DMF_FROZEN, &md->flags);
2351
2352        return 0;
2353}
2354
2355static void unlock_fs(struct mapped_device *md)
2356{
2357        if (!test_bit(DMF_FROZEN, &md->flags))
2358                return;
2359
2360        thaw_bdev(md->bdev, md->frozen_sb);
2361        md->frozen_sb = NULL;
2362        clear_bit(DMF_FROZEN, &md->flags);
2363}
2364
2365/*
2366 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2367 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2368 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2369 *
2370 * If __dm_suspend returns 0, the device is completely quiescent
2371 * now. There is no request-processing activity. All new requests
2372 * are being added to md->deferred list.
2373 *
2374 * Caller must hold md->suspend_lock
2375 */
2376static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2377                        unsigned suspend_flags, long task_state,
2378                        int dmf_suspended_flag)
2379{
2380        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2381        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2382        int r;
2383
2384        lockdep_assert_held(&md->suspend_lock);
2385
2386        /*
2387         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2388         * This flag is cleared before dm_suspend returns.
2389         */
2390        if (noflush)
2391                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2392
2393        /*
2394         * This gets reverted if there's an error later and the targets
2395         * provide the .presuspend_undo hook.
2396         */
2397        dm_table_presuspend_targets(map);
2398
2399        /*
2400         * Flush I/O to the device.
2401         * Any I/O submitted after lock_fs() may not be flushed.
2402         * noflush takes precedence over do_lockfs.
2403         * (lock_fs() flushes I/Os and waits for them to complete.)
2404         */
2405        if (!noflush && do_lockfs) {
2406                r = lock_fs(md);
2407                if (r) {
2408                        dm_table_presuspend_undo_targets(map);
2409                        return r;
2410                }
2411        }
2412
2413        /*
2414         * Here we must make sure that no processes are submitting requests
2415         * to target drivers i.e. no one may be executing
2416         * __split_and_process_bio. This is called from dm_request and
2417         * dm_wq_work.
2418         *
2419         * To get all processes out of __split_and_process_bio in dm_request,
2420         * we take the write lock. To prevent any process from reentering
2421         * __split_and_process_bio from dm_request and quiesce the thread
2422         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2423         * flush_workqueue(md->wq).
2424         */
2425        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2426        if (map)
2427                synchronize_srcu(&md->io_barrier);
2428
2429        /*
2430         * Stop md->queue before flushing md->wq in case request-based
2431         * dm defers requests to md->wq from md->queue.
2432         */
2433        if (dm_request_based(md)) {
2434                dm_stop_queue(md->queue);
2435                if (md->kworker_task)
2436                        flush_kthread_worker(&md->kworker);
2437        }
2438
2439        flush_workqueue(md->wq);
2440
2441        /*
2442         * At this point no more requests are entering target request routines.
2443         * We call dm_wait_for_completion to wait for all existing requests
2444         * to finish.
2445         */
2446        r = dm_wait_for_completion(md, task_state);
2447        if (!r)
2448                set_bit(dmf_suspended_flag, &md->flags);
2449
2450        if (noflush)
2451                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2452        if (map)
2453                synchronize_srcu(&md->io_barrier);
2454
2455        /* were we interrupted? */
2456        if (r < 0) {
2457                dm_queue_flush(md);
2458
2459                if (dm_request_based(md))
2460                        dm_start_queue(md->queue);
2461
2462                unlock_fs(md);
2463                dm_table_presuspend_undo_targets(map);
2464                /* pushback list is already flushed, so skip flush */
2465        }
2466
2467        return r;
2468}
2469
2470/*
2471 * We need to be able to change a mapping table under a mounted
2472 * filesystem.  For example we might want to move some data in
2473 * the background.  Before the table can be swapped with
2474 * dm_bind_table, dm_suspend must be called to flush any in
2475 * flight bios and ensure that any further io gets deferred.
2476 */
2477/*
2478 * Suspend mechanism in request-based dm.
2479 *
2480 * 1. Flush all I/Os by lock_fs() if needed.
2481 * 2. Stop dispatching any I/O by stopping the request_queue.
2482 * 3. Wait for all in-flight I/Os to be completed or requeued.
2483 *
2484 * To abort suspend, start the request_queue.
2485 */
2486int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2487{
2488        struct dm_table *map = NULL;
2489        int r = 0;
2490
2491retry:
2492        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2493
2494        if (dm_suspended_md(md)) {
2495                r = -EINVAL;
2496                goto out_unlock;
2497        }
2498
2499        if (dm_suspended_internally_md(md)) {
2500                /* already internally suspended, wait for internal resume */
2501                mutex_unlock(&md->suspend_lock);
2502                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2503                if (r)
2504                        return r;
2505                goto retry;
2506        }
2507
2508        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2509
2510        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2511        if (r)
2512                goto out_unlock;
2513
2514        dm_table_postsuspend_targets(map);
2515
2516out_unlock:
2517        mutex_unlock(&md->suspend_lock);
2518        return r;
2519}
2520
2521static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2522{
2523        if (map) {
2524                int r = dm_table_resume_targets(map);
2525                if (r)
2526                        return r;
2527        }
2528
2529        dm_queue_flush(md);
2530
2531        /*
2532         * Flushing deferred I/Os must be done after targets are resumed
2533         * so that mapping of targets can work correctly.
2534         * Request-based dm is queueing the deferred I/Os in its request_queue.
2535         */
2536        if (dm_request_based(md))
2537                dm_start_queue(md->queue);
2538
2539        unlock_fs(md);
2540
2541        return 0;
2542}
2543
2544int dm_resume(struct mapped_device *md)
2545{
2546        int r;
2547        struct dm_table *map = NULL;
2548
2549retry:
2550        r = -EINVAL;
2551        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2552
2553        if (!dm_suspended_md(md))
2554                goto out;
2555
2556        if (dm_suspended_internally_md(md)) {
2557                /* already internally suspended, wait for internal resume */
2558                mutex_unlock(&md->suspend_lock);
2559                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2560                if (r)
2561                        return r;
2562                goto retry;
2563        }
2564
2565        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2566        if (!map || !dm_table_get_size(map))
2567                goto out;
2568
2569        r = __dm_resume(md, map);
2570        if (r)
2571                goto out;
2572
2573        clear_bit(DMF_SUSPENDED, &md->flags);
2574out:
2575        mutex_unlock(&md->suspend_lock);
2576
2577        return r;
2578}
2579
2580/*
2581 * Internal suspend/resume works like userspace-driven suspend. It waits
2582 * until all bios finish and prevents issuing new bios to the target drivers.
2583 * It may be used only from the kernel.
2584 */
2585
2586static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2587{
2588        struct dm_table *map = NULL;
2589
2590        if (md->internal_suspend_count++)
2591                return; /* nested internal suspend */
2592
2593        if (dm_suspended_md(md)) {
2594                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2595                return; /* nest suspend */
2596        }
2597
2598        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2599
2600        /*
2601         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2602         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2603         * would require changing .presuspend to return an error -- avoid this
2604         * until there is a need for more elaborate variants of internal suspend.
2605         */
2606        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2607                            DMF_SUSPENDED_INTERNALLY);
2608
2609        dm_table_postsuspend_targets(map);
2610}
2611
2612static void __dm_internal_resume(struct mapped_device *md)
2613{
2614        BUG_ON(!md->internal_suspend_count);
2615
2616        if (--md->internal_suspend_count)
2617                return; /* resume from nested internal suspend */
2618
2619        if (dm_suspended_md(md))
2620                goto done; /* resume from nested suspend */
2621
2622        /*
2623         * NOTE: existing callers don't need to call dm_table_resume_targets
2624         * (which may fail -- so best to avoid it for now by passing NULL map)
2625         */
2626        (void) __dm_resume(md, NULL);
2627
2628done:
2629        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2630        smp_mb__after_atomic();
2631        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2632}
2633
2634void dm_internal_suspend_noflush(struct mapped_device *md)
2635{
2636        mutex_lock(&md->suspend_lock);
2637        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2638        mutex_unlock(&md->suspend_lock);
2639}
2640EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2641
2642void dm_internal_resume(struct mapped_device *md)
2643{
2644        mutex_lock(&md->suspend_lock);
2645        __dm_internal_resume(md);
2646        mutex_unlock(&md->suspend_lock);
2647}
2648EXPORT_SYMBOL_GPL(dm_internal_resume);
2649
2650/*
2651 * Fast variants of internal suspend/resume hold md->suspend_lock,
2652 * which prevents interaction with userspace-driven suspend.
2653 */
2654
2655void dm_internal_suspend_fast(struct mapped_device *md)
2656{
2657        mutex_lock(&md->suspend_lock);
2658        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2659                return; /* nested suspend; dm_internal_resume_fast() will unlock */
2660
2661        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2662        synchronize_srcu(&md->io_barrier);
2663        flush_workqueue(md->wq);
2664        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2665}
2666EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2667
2668void dm_internal_resume_fast(struct mapped_device *md)
2669{
2670        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2671                goto done;
2672
2673        dm_queue_flush(md);
2674
2675done:
2676        mutex_unlock(&md->suspend_lock);
2677}
2678EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
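
/*
 * Illustrative sketch (editor's addition, not part of dm.c): the _fast
 * variants are used as a bracketing pair -- dm_internal_suspend_fast()
 * returns with md->suspend_lock held and dm_internal_resume_fast() drops
 * it -- so a caller that needs a quiesced device (dm-stats style message
 * handling, for example) wraps its work like this:
 */
#if 0
static void example_quiesced_operation(struct mapped_device *md)
{
	dm_internal_suspend_fast(md);
	/* no bios are in flight here; inspect or update device state */
	dm_internal_resume_fast(md);
}
#endif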
2679
2680/*-----------------------------------------------------------------
2681 * Event notification.
2682 *---------------------------------------------------------------*/
2683int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2684                       unsigned cookie)
2685{
2686        char udev_cookie[DM_COOKIE_LENGTH];
2687        char *envp[] = { udev_cookie, NULL };
2688
2689        if (!cookie)
2690                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2691        else {
2692                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2693                         DM_COOKIE_ENV_VAR_NAME, cookie);
2694                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2695                                          action, envp);
2696        }
2699}
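
/*
 * Illustrative sketch (editor's addition, not part of dm.c): with a
 * non-zero cookie the uevent carries "DM_COOKIE=<value>" in its
 * environment, which udev uses to pair the event with the ioctl that
 * triggered it.  A hypothetical caller:
 */
#if 0
static void example_announce_change(struct mapped_device *md, unsigned cookie)
{
	/* udev sees ACTION=change plus DM_COOKIE=<cookie> */
	dm_kobject_uevent(md, KOBJ_CHANGE, cookie);
}
#endif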
2700
2701uint32_t dm_next_uevent_seq(struct mapped_device *md)
2702{
2703        return atomic_add_return(1, &md->uevent_seq);
2704}
2705
2706uint32_t dm_get_event_nr(struct mapped_device *md)
2707{
2708        return atomic_read(&md->event_nr);
2709}
2710
2711int dm_wait_event(struct mapped_device *md, int event_nr)
2712{
2713        return wait_event_interruptible(md->eventq,
2714                        (event_nr != atomic_read(&md->event_nr)));
2715}
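
/*
 * Illustrative sketch (editor's addition, not part of dm.c): event_nr is a
 * monotonically increasing counter bumped by event_callback(); a waiter
 * samples it first and then sleeps until it changes:
 */
#if 0
static int example_wait_for_next_event(struct mapped_device *md)
{
	uint32_t seen = dm_get_event_nr(md);

	/* returns -ERESTARTSYS if interrupted by a signal */
	return dm_wait_event(md, seen);
}
#endif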
2716
2717void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2718{
2719        unsigned long flags;
2720
2721        spin_lock_irqsave(&md->uevent_lock, flags);
2722        list_add(elist, &md->uevent_list);
2723        spin_unlock_irqrestore(&md->uevent_lock, flags);
2724}
2725
2726/*
2727 * The gendisk is only valid as long as you have a reference
2728 * count on 'md'.
2729 */
2730struct gendisk *dm_disk(struct mapped_device *md)
2731{
2732        return md->disk;
2733}
2734EXPORT_SYMBOL_GPL(dm_disk);
2735
2736struct kobject *dm_kobject(struct mapped_device *md)
2737{
2738        return &md->kobj_holder.kobj;
2739}
2740
2741struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2742{
2743        struct mapped_device *md;
2744
2745        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2746
2747        if (test_bit(DMF_FREEING, &md->flags) ||
2748            dm_deleting_md(md))
2749                return NULL;
2750
2751        dm_get(md);
2752        return md;
2753}
2754
2755int dm_suspended_md(struct mapped_device *md)
2756{
2757        return test_bit(DMF_SUSPENDED, &md->flags);
2758}
2759
2760int dm_suspended_internally_md(struct mapped_device *md)
2761{
2762        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2763}
2764
2765int dm_test_deferred_remove_flag(struct mapped_device *md)
2766{
2767        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2768}
2769
2770int dm_suspended(struct dm_target *ti)
2771{
2772        return dm_suspended_md(dm_table_get_md(ti->table));
2773}
2774EXPORT_SYMBOL_GPL(dm_suspended);
2775
2776int dm_noflush_suspending(struct dm_target *ti)
2777{
2778        return __noflush_suspending(dm_table_get_md(ti->table));
2779}
2780EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2781
2782struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
2783                                            unsigned integrity, unsigned per_io_data_size)
2784{
2785        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2786        struct kmem_cache *cachep = NULL;
2787        unsigned int pool_size = 0;
2788        unsigned int front_pad;
2789
2790        if (!pools)
2791                return NULL;
2792
2793        switch (type) {
2794        case DM_TYPE_BIO_BASED:
2795        case DM_TYPE_DAX_BIO_BASED:
2796                cachep = _io_cache;
2797                pool_size = dm_get_reserved_bio_based_ios();
2798                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2799                break;
2800        case DM_TYPE_REQUEST_BASED:
2801                cachep = _rq_tio_cache;
2802                pool_size = dm_get_reserved_rq_based_ios();
2803                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
2804                if (!pools->rq_pool)
2805                        goto out;
2806                /* fall through to setup remaining rq-based pools */
2807        case DM_TYPE_MQ_REQUEST_BASED:
2808                if (!pool_size)
2809                        pool_size = dm_get_reserved_rq_based_ios();
2810                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2811                /* per_io_data_size is used for blk-mq pdu at queue allocation */
2812                break;
2813        default:
2814                BUG();
2815        }
2816
2817        if (cachep) {
2818                pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
2819                if (!pools->io_pool)
2820                        goto out;
2821        }
2822
2823        pools->bs = bioset_create(pool_size, front_pad);
2824        if (!pools->bs)
2825                goto out;
2826
2827        if (integrity && bioset_integrity_create(pools->bs, pool_size))
2828                goto out;
2829
2830        return pools;
2831
2832out:
2833        dm_free_md_mempools(pools);
2834
2835        return NULL;
2836}
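
/*
 * Illustrative sketch (editor's addition, not part of dm.c): the bio-based
 * front_pad computed above is what allows dm_per_bio_data() to hand a
 * target its scratch space.  A target declares ti->per_io_data_size in its
 * constructor and then pulls the per-bio data out of each clone; the names
 * below are hypothetical.
 */
#if 0
struct example_per_bio {
	unsigned long start_jiffies;
};

static int example_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	ti->per_io_data_size = sizeof(struct example_per_bio);
	return 0;
}

static int example_map(struct dm_target *ti, struct bio *bio)
{
	struct example_per_bio *pb =
		dm_per_bio_data(bio, sizeof(struct example_per_bio));

	pb->start_jiffies = jiffies;	/* record submission time */
	/* ... remap bio to the underlying device here ... */
	return DM_MAPIO_REMAPPED;
}
#endif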
2837
2838void dm_free_md_mempools(struct dm_md_mempools *pools)
2839{
2840        if (!pools)
2841                return;
2842
2843        mempool_destroy(pools->io_pool);
2844        mempool_destroy(pools->rq_pool);
2845
2846        if (pools->bs)
2847                bioset_free(pools->bs);
2848
2849        kfree(pools);
2850}
2851
2852struct dm_pr {
2853        u64     old_key;
2854        u64     new_key;
2855        u32     flags;
2856        bool    fail_early;
2857};
2858
2859static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2860                      void *data)
2861{
2862        struct mapped_device *md = bdev->bd_disk->private_data;
2863        struct dm_table *table;
2864        struct dm_target *ti;
2865        int ret = -ENOTTY, srcu_idx;
2866
2867        table = dm_get_live_table(md, &srcu_idx);
2868        if (!table || !dm_table_get_size(table))
2869                goto out;
2870
2871        /* We only support devices that have a single target */
2872        if (dm_table_get_num_targets(table) != 1)
2873                goto out;
2874        ti = dm_table_get_target(table, 0);
2875
2876        ret = -EINVAL;
2877        if (!ti->type->iterate_devices)
2878                goto out;
2879
2880        ret = ti->type->iterate_devices(ti, fn, data);
2881out:
2882        dm_put_live_table(md, srcu_idx);
2883        return ret;
2884}
2885
2886/*
2887 * For register / unregister we need to manually call out to every path.
2888 */
2889static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2890                            sector_t start, sector_t len, void *data)
2891{
2892        struct dm_pr *pr = data;
2893        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2894
2895        if (!ops || !ops->pr_register)
2896                return -EOPNOTSUPP;
2897        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2898}
2899
2900static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2901                          u32 flags)
2902{
2903        struct dm_pr pr = {
2904                .old_key        = old_key,
2905                .new_key        = new_key,
2906                .flags          = flags,
2907                .fail_early     = true,
2908        };
2909        int ret;
2910
2911        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2912        if (ret && new_key) {
2913                /* unregister all paths if we failed to register any path */
2914                pr.old_key = new_key;
2915                pr.new_key = 0;
2916                pr.flags = 0;
2917                pr.fail_early = false;
2918                dm_call_pr(bdev, __dm_pr_register, &pr);
2919        }
2920
2921        return ret;
2922}
2923
2924static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2925                         u32 flags)
2926{
2927        struct mapped_device *md = bdev->bd_disk->private_data;
2928        const struct pr_ops *ops;
2929        fmode_t mode;
2930        int r;
2931
2932        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2933        if (r < 0)
2934                return r;
2935
2936        ops = bdev->bd_disk->fops->pr_ops;
2937        if (ops && ops->pr_reserve)
2938                r = ops->pr_reserve(bdev, key, type, flags);
2939        else
2940                r = -EOPNOTSUPP;
2941
2942        bdput(bdev);
2943        return r;
2944}
2945
2946static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2947{
2948        struct mapped_device *md = bdev->bd_disk->private_data;
2949        const struct pr_ops *ops;
2950        fmode_t mode;
2951        int r;
2952
2953        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2954        if (r < 0)
2955                return r;
2956
2957        ops = bdev->bd_disk->fops->pr_ops;
2958        if (ops && ops->pr_release)
2959                r = ops->pr_release(bdev, key, type);
2960        else
2961                r = -EOPNOTSUPP;
2962
2963        bdput(bdev);
2964        return r;
2965}
2966
2967static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
2968                         enum pr_type type, bool abort)
2969{
2970        struct mapped_device *md = bdev->bd_disk->private_data;
2971        const struct pr_ops *ops;
2972        fmode_t mode;
2973        int r;
2974
2975        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2976        if (r < 0)
2977                return r;
2978
2979        ops = bdev->bd_disk->fops->pr_ops;
2980        if (ops && ops->pr_preempt)
2981                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2982        else
2983                r = -EOPNOTSUPP;
2984
2985        bdput(bdev);
2986        return r;
2987}
2988
2989static int dm_pr_clear(struct block_device *bdev, u64 key)
2990{
2991        struct mapped_device *md = bdev->bd_disk->private_data;
2992        const struct pr_ops *ops;
2993        fmode_t mode;
2994        int r;
2995
2996        r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
2997        if (r < 0)
2998                return r;
2999
3000        ops = bdev->bd_disk->fops->pr_ops;
3001        if (ops && ops->pr_clear)
3002                r = ops->pr_clear(bdev, key);
3003        else
3004                r = -EOPNOTSUPP;
3005
3006        bdput(bdev);
3007        return r;
3008}
3009
3010static const struct pr_ops dm_pr_ops = {
3011        .pr_register    = dm_pr_register,
3012        .pr_reserve     = dm_pr_reserve,
3013        .pr_release     = dm_pr_release,
3014        .pr_preempt     = dm_pr_preempt,
3015        .pr_clear       = dm_pr_clear,
3016};
3017
3018static const struct block_device_operations dm_blk_dops = {
3019        .open = dm_blk_open,
3020        .release = dm_blk_close,
3021        .ioctl = dm_blk_ioctl,
3022        .direct_access = dm_blk_direct_access,
3023        .getgeo = dm_blk_getgeo,
3024        .pr_ops = &dm_pr_ops,
3025        .owner = THIS_MODULE
3026};
3027
3028/*
3029 * module hooks
3030 */
3031module_init(dm_init);
3032module_exit(dm_exit);
3033
3034module_param(major, uint, 0);
3035MODULE_PARM_DESC(major, "The major number of the device mapper");
3036
3037module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3038MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3039
3040module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3041MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3042
3043MODULE_DESCRIPTION(DM_NAME " driver");
3044MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3045MODULE_LICENSE("GPL");
3046