linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11
  12#include <linux/init.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/mempool.h>
  18#include <linux/dax.h>
  19#include <linux/slab.h>
  20#include <linux/idr.h>
  21#include <linux/hdreg.h>
  22#include <linux/delay.h>
  23#include <linux/wait.h>
  24#include <linux/pr.h>
  25
  26#define DM_MSG_PREFIX "core"
  27
  28/*
  29 * Cookies are numeric values sent with CHANGE and REMOVE
  30 * uevents while resuming, removing or renaming the device.
  31 */
  32#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  33#define DM_COOKIE_LENGTH 24
  34
  35static const char *_name = DM_NAME;
  36
  37static unsigned int major = 0;
  38static unsigned int _major = 0;
  39
  40static DEFINE_IDR(_minor_idr);
  41
  42static DEFINE_SPINLOCK(_minor_lock);
  43
  44static void do_deferred_remove(struct work_struct *w);
  45
  46static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  47
  48static struct workqueue_struct *deferred_remove_workqueue;
  49
  50atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  51DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  52
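/*
 * Bump the global event counter and wake anyone sleeping on the global
 * event queue.  Called whenever a mapped device changes so that pollers
 * of the dm event interface are notified.
 */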
  53void dm_issue_global_event(void)
  54{
  55        atomic_inc(&dm_global_event_nr);
  56        wake_up(&dm_global_eventq);
  57}
  58
  59/*
  60 * One of these is allocated per bio.
  61 */
  62struct dm_io {
  63        struct mapped_device *md;
  64        int error;
  65        atomic_t io_count;
  66        struct bio *bio;
  67        unsigned long start_time;
  68        spinlock_t endio_lock;
  69        struct dm_stats_aux stats_aux;
  70};
  71
  72#define MINOR_ALLOCED ((void *)-1)
  73
  74/*
  75 * Bits for the md->flags field.
  76 */
  77#define DMF_BLOCK_IO_FOR_SUSPEND 0
  78#define DMF_SUSPENDED 1
  79#define DMF_FROZEN 2
  80#define DMF_FREEING 3
  81#define DMF_DELETING 4
  82#define DMF_NOFLUSH_SUSPENDING 5
  83#define DMF_MERGE_IS_OPTIONAL 6
  84#define DMF_DEFERRED_REMOVE 7
  85#define DMF_SUSPENDED_INTERNALLY 8
  86
  87#define DM_NUMA_NODE NUMA_NO_NODE
  88static int dm_numa_node = DM_NUMA_NODE;
  89
  90/*
  91 * For mempools pre-allocation at the table loading time.
  92 */
  93struct dm_md_mempools {
  94        mempool_t *io_pool;
  95        mempool_t *rq_pool;
  96        struct bio_set *bs;
  97};
  98
  99struct table_device {
 100        struct list_head list;
 101        atomic_t count;
 102        struct dm_dev dm_dev;
 103};
 104
 105static struct kmem_cache *_io_cache;
 106static struct kmem_cache *_rq_tio_cache;
 107static struct kmem_cache *_rq_cache;
 108
 109/*
 110 * Bio-based DM's mempools' reserved IOs set by the user.
 111 */
 112#define RESERVED_BIO_BASED_IOS          16
 113static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 114
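/*
 * Read a writable module parameter and clamp it to [min, max], writing
 * the clamped value back (best effort via cmpxchg) if it was out of range.
 */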
 115static int __dm_get_module_param_int(int *module_param, int min, int max)
 116{
 117        int param = ACCESS_ONCE(*module_param);
 118        int modified_param = 0;
 119        bool modified = true;
 120
 121        if (param < min)
 122                modified_param = min;
 123        else if (param > max)
 124                modified_param = max;
 125        else
 126                modified = false;
 127
 128        if (modified) {
 129                (void)cmpxchg(module_param, param, modified_param);
 130                param = modified_param;
 131        }
 132
 133        return param;
 134}
 135
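/*
 * As above, but for unsigned parameters: 0 selects the default 'def' and
 * values above 'max' are clamped to 'max'.
 */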
 136unsigned __dm_get_module_param(unsigned *module_param,
 137                               unsigned def, unsigned max)
 138{
 139        unsigned param = ACCESS_ONCE(*module_param);
 140        unsigned modified_param = 0;
 141
 142        if (!param)
 143                modified_param = def;
 144        else if (param > max)
 145                modified_param = max;
 146
 147        if (modified_param) {
 148                (void)cmpxchg(module_param, param, modified_param);
 149                param = modified_param;
 150        }
 151
 152        return param;
 153}
 154
 155unsigned dm_get_reserved_bio_based_ios(void)
 156{
 157        return __dm_get_module_param(&reserved_bio_based_ios,
 158                                     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 159}
 160EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 161
 162static unsigned dm_get_numa_node(void)
 163{
 164        return __dm_get_module_param_int(&dm_numa_node,
 165                                         DM_NUMA_NODE, num_online_nodes() - 1);
 166}
 167
 168static int __init local_init(void)
 169{
 170        int r = -ENOMEM;
 171
 172        /* allocate a slab for the dm_ios */
 173        _io_cache = KMEM_CACHE(dm_io, 0);
 174        if (!_io_cache)
 175                return r;
 176
 177        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 178        if (!_rq_tio_cache)
 179                goto out_free_io_cache;
 180
 181        _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 182                                      __alignof__(struct request), 0, NULL);
 183        if (!_rq_cache)
 184                goto out_free_rq_tio_cache;
 185
 186        r = dm_uevent_init();
 187        if (r)
 188                goto out_free_rq_cache;
 189
 190        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 191        if (!deferred_remove_workqueue) {
 192                r = -ENOMEM;
 193                goto out_uevent_exit;
 194        }
 195
 196        _major = major;
 197        r = register_blkdev(_major, _name);
 198        if (r < 0)
 199                goto out_free_workqueue;
 200
 201        if (!_major)
 202                _major = r;
 203
 204        return 0;
 205
 206out_free_workqueue:
 207        destroy_workqueue(deferred_remove_workqueue);
 208out_uevent_exit:
 209        dm_uevent_exit();
 210out_free_rq_cache:
 211        kmem_cache_destroy(_rq_cache);
 212out_free_rq_tio_cache:
 213        kmem_cache_destroy(_rq_tio_cache);
 214out_free_io_cache:
 215        kmem_cache_destroy(_io_cache);
 216
 217        return r;
 218}
 219
 220static void local_exit(void)
 221{
 222        flush_scheduled_work();
 223        destroy_workqueue(deferred_remove_workqueue);
 224
 225        kmem_cache_destroy(_rq_cache);
 226        kmem_cache_destroy(_rq_tio_cache);
 227        kmem_cache_destroy(_io_cache);
 228        unregister_blkdev(_major, _name);
 229        dm_uevent_exit();
 230
 231        _major = 0;
 232
 233        DMINFO("cleaned up");
 234}
 235
 236static int (*_inits[])(void) __initdata = {
 237        local_init,
 238        dm_target_init,
 239        dm_linear_init,
 240        dm_stripe_init,
 241        dm_io_init,
 242        dm_kcopyd_init,
 243        dm_interface_init,
 244        dm_statistics_init,
 245};
 246
 247static void (*_exits[])(void) = {
 248        local_exit,
 249        dm_target_exit,
 250        dm_linear_exit,
 251        dm_stripe_exit,
 252        dm_io_exit,
 253        dm_kcopyd_exit,
 254        dm_interface_exit,
 255        dm_statistics_exit,
 256};
 257
 258static int __init dm_init(void)
 259{
 260        const int count = ARRAY_SIZE(_inits);
 261
 262        int r, i;
 263
 264        for (i = 0; i < count; i++) {
 265                r = _inits[i]();
 266                if (r)
 267                        goto bad;
 268        }
 269
 270        return 0;
 271
 272      bad:
 273        while (i--)
 274                _exits[i]();
 275
 276        return r;
 277}
 278
 279static void __exit dm_exit(void)
 280{
 281        int i = ARRAY_SIZE(_exits);
 282
 283        while (i--)
 284                _exits[i]();
 285
 286        /*
 287         * Should be empty by this point.
 288         */
 289        idr_destroy(&_minor_idr);
 290}
 291
 292/*
 293 * Block device functions
 294 */
 295int dm_deleting_md(struct mapped_device *md)
 296{
 297        return test_bit(DMF_DELETING, &md->flags);
 298}
 299
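/*
 * Open callback for the mapped device's gendisk: take a reference on the
 * mapped_device and bump its open count, unless the device is in the
 * middle of being freed or deleted.
 */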
 300static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 301{
 302        struct mapped_device *md;
 303
 304        spin_lock(&_minor_lock);
 305
 306        md = bdev->bd_disk->private_data;
 307        if (!md)
 308                goto out;
 309
 310        if (test_bit(DMF_FREEING, &md->flags) ||
 311            dm_deleting_md(md)) {
 312                md = NULL;
 313                goto out;
 314        }
 315
 316        dm_get(md);
 317        atomic_inc(&md->open_count);
 318out:
 319        spin_unlock(&_minor_lock);
 320
 321        return md ? 0 : -ENXIO;
 322}
 323
 324static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 325{
 326        struct mapped_device *md;
 327
 328        spin_lock(&_minor_lock);
 329
 330        md = disk->private_data;
 331        if (WARN_ON(!md))
 332                goto out;
 333
 334        if (atomic_dec_and_test(&md->open_count) &&
 335            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 336                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 337
 338        dm_put(md);
 339out:
 340        spin_unlock(&_minor_lock);
 341}
 342
 343int dm_open_count(struct mapped_device *md)
 344{
 345        return atomic_read(&md->open_count);
 346}
 347
 348/*
 349 * Guarantees nothing is using the device before it's deleted.
 350 */
 351int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 352{
 353        int r = 0;
 354
 355        spin_lock(&_minor_lock);
 356
 357        if (dm_open_count(md)) {
 358                r = -EBUSY;
 359                if (mark_deferred)
 360                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 361        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 362                r = -EEXIST;
 363        else
 364                set_bit(DMF_DELETING, &md->flags);
 365
 366        spin_unlock(&_minor_lock);
 367
 368        return r;
 369}
 370
 371int dm_cancel_deferred_remove(struct mapped_device *md)
 372{
 373        int r = 0;
 374
 375        spin_lock(&_minor_lock);
 376
 377        if (test_bit(DMF_DELETING, &md->flags))
 378                r = -EBUSY;
 379        else
 380                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 381
 382        spin_unlock(&_minor_lock);
 383
 384        return r;
 385}
 386
 387static void do_deferred_remove(struct work_struct *w)
 388{
 389        dm_deferred_remove();
 390}
 391
 392sector_t dm_get_size(struct mapped_device *md)
 393{
 394        return get_capacity(md->disk);
 395}
 396
 397struct request_queue *dm_get_md_queue(struct mapped_device *md)
 398{
 399        return md->queue;
 400}
 401
 402struct dm_stats *dm_get_stats(struct mapped_device *md)
 403{
 404        return &md->stats;
 405}
 406
 407static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 408{
 409        struct mapped_device *md = bdev->bd_disk->private_data;
 410
 411        return dm_get_geometry(md, geo);
 412}
 413
 414static char *_dm_claim_ptr = "I belong to device-mapper";
 415
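/*
 * Resolve the single underlying block device for an ioctl via the
 * target's prepare_ioctl method and take a reference on it with
 * blkdev_get().  Retries while the target reports -ENOTCONN (e.g. a
 * multipath device with no usable path yet).
 */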
 416static int dm_get_bdev_for_ioctl(struct mapped_device *md,
 417                                 struct block_device **bdev,
 418                                 fmode_t *mode)
 419{
 420        struct dm_target *tgt;
 421        struct dm_table *map;
 422        int srcu_idx, r;
 423
 424retry:
 425        r = -ENOTTY;
 426        map = dm_get_live_table(md, &srcu_idx);
 427        if (!map || !dm_table_get_size(map))
 428                goto out;
 429
 430        /* We only support devices that have a single target */
 431        if (dm_table_get_num_targets(map) != 1)
 432                goto out;
 433
 434        tgt = dm_table_get_target(map, 0);
 435        if (!tgt->type->prepare_ioctl)
 436                goto out;
 437
 438        if (dm_suspended_md(md)) {
 439                r = -EAGAIN;
 440                goto out;
 441        }
 442
 443        r = tgt->type->prepare_ioctl(tgt, bdev, mode);
 444        if (r < 0)
 445                goto out;
 446
 447        bdgrab(*bdev);
 448        r = blkdev_get(*bdev, *mode, _dm_claim_ptr);
 449        if (r < 0)
 450                goto out;
 451
 452        dm_put_live_table(md, srcu_idx);
 453        return r;
 454
 455out:
 456        dm_put_live_table(md, srcu_idx);
 457        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 458                msleep(10);
 459                goto retry;
 460        }
 461        return r;
 462}
 463
 464static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 465                        unsigned int cmd, unsigned long arg)
 466{
 467        struct mapped_device *md = bdev->bd_disk->private_data;
 468        int r;
 469
 470        r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
 471        if (r < 0)
 472                return r;
 473
 474        if (r > 0) {
 475                /*
 476                 * Target determined this ioctl is being issued against
 477                 * a logical partition of the parent bdev; so extra
 478                 * validation is needed.
 479                 */
 480                r = scsi_verify_blk_ioctl(NULL, cmd);
 481                if (r)
 482                        goto out;
 483        }
 484
 485        r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 486out:
 487        blkdev_put(bdev, mode);
 488        return r;
 489}
 490
 491static struct dm_io *alloc_io(struct mapped_device *md)
 492{
 493        return mempool_alloc(md->io_pool, GFP_NOIO);
 494}
 495
 496static void free_io(struct mapped_device *md, struct dm_io *io)
 497{
 498        mempool_free(io, md->io_pool);
 499}
 500
 501static void free_tio(struct dm_target_io *tio)
 502{
 503        bio_put(&tio->clone);
 504}
 505
 506int md_in_flight(struct mapped_device *md)
 507{
 508        return atomic_read(&md->pending[READ]) +
 509               atomic_read(&md->pending[WRITE]);
 510}
 511
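/*
 * Start per-bio accounting: record the start time, update the disk's
 * in_flight counters and, if dm statistics are enabled, account the I/O
 * there too.  end_io_acct() below is the completion side and wakes up
 * suspend waiters once nothing is in flight.
 */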
 512static void start_io_acct(struct dm_io *io)
 513{
 514        struct mapped_device *md = io->md;
 515        struct bio *bio = io->bio;
 516        int cpu;
 517        int rw = bio_data_dir(bio);
 518
 519        io->start_time = jiffies;
 520
 521        cpu = part_stat_lock();
 522        part_round_stats(cpu, &dm_disk(md)->part0);
 523        part_stat_unlock();
 524        atomic_set(&dm_disk(md)->part0.in_flight[rw],
 525                atomic_inc_return(&md->pending[rw]));
 526
 527        if (unlikely(dm_stats_used(&md->stats)))
 528                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
 529                                    bio_sectors(bio), false, 0, &io->stats_aux);
 530}
 531
 532static void end_io_acct(struct dm_io *io)
 533{
 534        struct mapped_device *md = io->md;
 535        struct bio *bio = io->bio;
 536        unsigned long duration = jiffies - io->start_time;
 537        int pending, cpu;
 538        int rw = bio_data_dir(bio);
 539
 540        cpu = part_stat_lock();
 541        part_round_stats(cpu, &dm_disk(md)->part0);
 542        part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 543        part_stat_unlock();
 544
 545        if (unlikely(dm_stats_used(&md->stats)))
 546                dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_sector,
 547                                    bio_sectors(bio), true, duration, &io->stats_aux);
 548
 549        /*
 550         * After this is decremented the bio must not be touched if it is
 551         * a flush.
 552         */
 553        pending = atomic_dec_return(&md->pending[rw]);
 554        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 555        pending += atomic_read(&md->pending[rw^0x1]);
 556
 557        /* nudge anyone waiting on suspend queue */
 558        if (!pending)
 559                wake_up(&md->wait);
 560}
 561
 562/*
 563 * Add the bio to the list of deferred io.
 564 */
 565static void queue_io(struct mapped_device *md, struct bio *bio)
 566{
 567        unsigned long flags;
 568
 569        spin_lock_irqsave(&md->deferred_lock, flags);
 570        bio_list_add(&md->deferred, bio);
 571        spin_unlock_irqrestore(&md->deferred_lock, flags);
 572        queue_work(md->wq, &md->work);
 573}
 574
 575/*
 576 * Everyone (including functions in this file), should use this
 577 * function to access the md->map field, and make sure they call
 578 * dm_put_live_table() when finished.
 579 */
 580struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 581{
 582        *srcu_idx = srcu_read_lock(&md->io_barrier);
 583
 584        return srcu_dereference(md->map, &md->io_barrier);
 585}
 586
 587void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 588{
 589        srcu_read_unlock(&md->io_barrier, srcu_idx);
 590}
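
/*
 * Typical usage pattern (illustrative sketch only; it mirrors what
 * dm_make_request() and the ioctl paths in this file already do):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		... walk or query the table ...
 *	}
 *	dm_put_live_table(md, srcu_idx);
 */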
 591
 592void dm_sync_table(struct mapped_device *md)
 593{
 594        synchronize_srcu(&md->io_barrier);
 595        synchronize_rcu_expedited();
 596}
 597
 598/*
 599 * A fast alternative to dm_get_live_table/dm_put_live_table.
 600 * The caller must not block between these two functions.
 601 */
 602static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 603{
 604        rcu_read_lock();
 605        return rcu_dereference(md->map);
 606}
 607
 608static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 609{
 610        rcu_read_unlock();
 611}
 612
 613/*
 614 * Open a table device so we can use it as a map destination.
 615 */
 616static int open_table_device(struct table_device *td, dev_t dev,
 617                             struct mapped_device *md)
 618{
 619        struct block_device *bdev;
 620
 621        int r;
 622
 623        BUG_ON(td->dm_dev.bdev);
 624
 625        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 626        if (IS_ERR(bdev))
 627                return PTR_ERR(bdev);
 628
 629        r = bd_link_disk_holder(bdev, dm_disk(md));
 630        if (r) {
 631                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 632                return r;
 633        }
 634
 635        td->dm_dev.bdev = bdev;
 636        td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 637        return 0;
 638}
 639
 640/*
 641 * Close a table device that we've been using.
 642 */
 643static void close_table_device(struct table_device *td, struct mapped_device *md)
 644{
 645        if (!td->dm_dev.bdev)
 646                return;
 647
 648        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 649        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 650        put_dax(td->dm_dev.dax_dev);
 651        td->dm_dev.bdev = NULL;
 652        td->dm_dev.dax_dev = NULL;
 653}
 654
 655static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 656                                              fmode_t mode) {
 657        struct table_device *td;
 658
 659        list_for_each_entry(td, l, list)
 660                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 661                        return td;
 662
 663        return NULL;
 664}
 665
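/*
 * Open (or reuse) the underlying device identified by dev/mode on behalf
 * of a table and return a ref-counted dm_dev for it.  Each (dev, mode)
 * pair is opened once per mapped device and shared; dm_put_table_device()
 * drops the reference and closes the device on the last put.
 */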
 666int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 667                        struct dm_dev **result) {
 668        int r;
 669        struct table_device *td;
 670
 671        mutex_lock(&md->table_devices_lock);
 672        td = find_table_device(&md->table_devices, dev, mode);
 673        if (!td) {
 674                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 675                if (!td) {
 676                        mutex_unlock(&md->table_devices_lock);
 677                        return -ENOMEM;
 678                }
 679
 680                td->dm_dev.mode = mode;
 681                td->dm_dev.bdev = NULL;
 682
 683                if ((r = open_table_device(td, dev, md))) {
 684                        mutex_unlock(&md->table_devices_lock);
 685                        kfree(td);
 686                        return r;
 687                }
 688
 689                format_dev_t(td->dm_dev.name, dev);
 690
 691                atomic_set(&td->count, 0);
 692                list_add(&td->list, &md->table_devices);
 693        }
 694        atomic_inc(&td->count);
 695        mutex_unlock(&md->table_devices_lock);
 696
 697        *result = &td->dm_dev;
 698        return 0;
 699}
 700EXPORT_SYMBOL_GPL(dm_get_table_device);
 701
 702void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 703{
 704        struct table_device *td = container_of(d, struct table_device, dm_dev);
 705
 706        mutex_lock(&md->table_devices_lock);
 707        if (atomic_dec_and_test(&td->count)) {
 708                close_table_device(td, md);
 709                list_del(&td->list);
 710                kfree(td);
 711        }
 712        mutex_unlock(&md->table_devices_lock);
 713}
 714EXPORT_SYMBOL(dm_put_table_device);
 715
 716static void free_table_devices(struct list_head *devices)
 717{
 718        struct list_head *tmp, *next;
 719
 720        list_for_each_safe(tmp, next, devices) {
 721                struct table_device *td = list_entry(tmp, struct table_device, list);
 722
 723                DMWARN("dm_destroy: %s still exists with %d references",
 724                       td->dm_dev.name, atomic_read(&td->count));
 725                kfree(td);
 726        }
 727}
 728
 729/*
 730 * Get the geometry associated with a dm device
 731 */
 732int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 733{
 734        *geo = md->geometry;
 735
 736        return 0;
 737}
 738
 739/*
 740 * Set the geometry of a device.
 741 */
 742int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 743{
 744        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 745
 746        if (geo->start > sz) {
 747                DMWARN("Start sector is beyond the geometry limits.");
 748                return -EINVAL;
 749        }
 750
 751        md->geometry = *geo;
 752
 753        return 0;
 754}
 755
 756/*-----------------------------------------------------------------
 757 * CRUD START:
 758 *   A more elegant soln is in the works that uses the queue
 759 *   merge fn, unfortunately there are a couple of changes to
 760 *   the block layer that I want to make for this.  So in the
 761 *   interests of getting something for people to use I give
 762 *   you this clearly demarcated crap.
 763 *---------------------------------------------------------------*/
 764
 765static int __noflush_suspending(struct mapped_device *md)
 766{
 767        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 768}
 769
 770/*
 771 * Decrements the number of outstanding ios that a bio has been
 772 * cloned into, completing the original io if necc.
 773 */
 774static void dec_pending(struct dm_io *io, int error)
 775{
 776        unsigned long flags;
 777        int io_error;
 778        struct bio *bio;
 779        struct mapped_device *md = io->md;
 780
 781        /* Push-back supersedes any I/O errors */
 782        if (unlikely(error)) {
 783                spin_lock_irqsave(&io->endio_lock, flags);
 784                if (!(io->error > 0 && __noflush_suspending(md)))
 785                        io->error = error;
 786                spin_unlock_irqrestore(&io->endio_lock, flags);
 787        }
 788
 789        if (atomic_dec_and_test(&io->io_count)) {
 790                if (io->error == DM_ENDIO_REQUEUE) {
 791                        /*
 792                         * Target requested pushing back the I/O.
 793                         */
 794                        spin_lock_irqsave(&md->deferred_lock, flags);
 795                        if (__noflush_suspending(md))
 796                                bio_list_add_head(&md->deferred, io->bio);
 797                        else
 798                                /* noflush suspend was interrupted. */
 799                                io->error = -EIO;
 800                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 801                }
 802
 803                io_error = io->error;
 804                bio = io->bio;
 805                end_io_acct(io);
 806                free_io(md, io);
 807
 808                if (io_error == DM_ENDIO_REQUEUE)
 809                        return;
 810
 811                if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
 812                        /*
 813                         * Preflush done for flush with data, reissue
 814                         * without REQ_FLUSH.
 815                         */
 816                        bio->bi_rw &= ~REQ_FLUSH;
 817                        queue_io(md, bio);
 818                } else {
 819                        /* done with normal IO or empty flush */
 820                        trace_block_bio_complete(md->queue, bio, io_error);
 821                        bio_endio(bio, io_error);
 822                }
 823        }
 824}
 825
 826void disable_write_same(struct mapped_device *md)
 827{
 828        struct queue_limits *limits = dm_get_queue_limits(md);
 829
 830        /* device doesn't really support WRITE SAME, disable it */
 831        limits->max_write_same_sectors = 0;
 832}
 833
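/*
 * bi_end_io for cloned bios: give the target's optional end_io hook a
 * chance to act on the result, then, unless the target chose to keep the
 * I/O (DM_ENDIO_INCOMPLETE), free the clone and pass the final error on
 * to dec_pending() for the original bio.
 */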
 834static void clone_endio(struct bio *bio, int error)
 835{
 836        int r = error;
 837        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 838        struct dm_io *io = tio->io;
 839        struct mapped_device *md = tio->io->md;
 840        dm_endio_fn endio = tio->ti->type->end_io;
 841
 842        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 843                error = -EIO;
 844
 845        if (endio) {
 846                r = endio(tio->ti, bio, error);
 847                if (r < 0 || r == DM_ENDIO_REQUEUE)
 848                        /*
 849                         * error and requeue request are handled
 850                         * in dec_pending().
 851                         */
 852                        error = r;
 853                else if (r == DM_ENDIO_INCOMPLETE)
 854                        /* The target will handle the io */
 855                        return;
 856                else if (r) {
 857                        DMWARN("unimplemented target endio return value: %d", r);
 858                        BUG();
 859                }
 860        }
 861
 862        if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) &&
 863                     !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors))
 864                disable_write_same(md);
 865
 866        free_tio(tio);
 867        dec_pending(io, error);
 868}
 869
 870/*
 871 * Return maximum size of I/O possible at the supplied sector up to the current
 872 * target boundary.
 873 */
 874static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
 875{
 876        sector_t target_offset = dm_target_offset(ti, sector);
 877
 878        return ti->len - target_offset;
 879}
 880
 881static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 882{
 883        sector_t len = max_io_len_target_boundary(sector, ti);
 884        sector_t offset, max_len;
 885
 886        /*
 887         * Does the target need to split even further?
 888         */
 889        if (ti->max_io_len) {
 890                offset = dm_target_offset(ti, sector);
 891                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
 892                        max_len = sector_div(offset, ti->max_io_len);
 893                else
 894                        max_len = offset & (ti->max_io_len - 1);
 895                max_len = ti->max_io_len - max_len;
 896
 897                if (len > max_len)
 898                        len = max_len;
 899        }
 900
 901        return len;
 902}
 903
 904int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 905{
 906        if (len > UINT_MAX) {
 907                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
 908                      (unsigned long long)len, UINT_MAX);
 909                ti->error = "Maximum size of target IO is too large";
 910                return -EINVAL;
 911        }
 912
 913        ti->max_io_len = (uint32_t) len;
 914
 915        return 0;
 916}
 917EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
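
/*
 * Illustrative sketch (not from this file): a target constructor would
 * typically bound its per-bio I/O size like this, where "chunk_sectors"
 * is a hypothetical per-target value:
 *
 *	r = dm_set_target_max_io_len(ti, chunk_sectors);
 *	if (r)
 *		return r;
 */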
 918
 919static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
 920                sector_t sector, int *srcu_idx)
 921{
 922        struct dm_table *map;
 923        struct dm_target *ti;
 924
 925        map = dm_get_live_table(md, srcu_idx);
 926        if (!map)
 927                return NULL;
 928
 929        ti = dm_table_find_target(map, sector);
 930        if (!dm_target_is_valid(ti))
 931                return NULL;
 932
 933        return ti;
 934}
 935
 936static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 937                long nr_pages, void **kaddr, pfn_t *pfn)
 938{
 939        struct mapped_device *md = dax_get_private(dax_dev);
 940        sector_t sector = pgoff * PAGE_SECTORS;
 941        struct dm_target *ti;
 942        long len, ret = -EIO;
 943        int srcu_idx;
 944
 945        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
 946
 947        if (!ti)
 948                goto out;
 949        if (!ti->type->direct_access)
 950                goto out;
 951        len = max_io_len(sector, ti) / PAGE_SECTORS;
 952        if (len < 1)
 953                goto out;
 954        nr_pages = min(len, nr_pages);
 955        if (ti->type->direct_access)
 956                ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
 957
 958 out:
 959        dm_put_live_table(md, srcu_idx);
 960
 961        return ret;
 962}
 963
 964/*
 965 * Flush current->bio_list when the target map method blocks.
 966 * This fixes deadlocks in snapshot and possibly in other targets.
 967 */
 968struct dm_offload {
 969        struct blk_plug plug;
 970        struct blk_plug_cb cb;
 971};
 972
 973static void flush_current_bio_list(struct blk_plug_cb *cb, bool from_schedule)
 974{
 975        struct dm_offload *o = container_of(cb, struct dm_offload, cb);
 976        struct bio_list list;
 977        struct bio *bio;
 978        int i;
 979
 980        INIT_LIST_HEAD(&o->cb.list);
 981
 982        if (unlikely(!current->bio_list))
 983                return;
 984
 985        for (i = 0; i < 2; i++) {
 986                list = current->bio_list[i];
 987                bio_list_init(&current->bio_list[i]);
 988
 989                while ((bio = bio_list_pop(&list))) {
 990                        struct bio_set *bs = bio->bi_pool;
 991                        if (unlikely(!bs) || bs == fs_bio_set) {
 992                                bio_list_add(&current->bio_list[i], bio);
 993                                continue;
 994                        }
 995
 996                        spin_lock(&bs->rescue_lock);
 997                        bio_list_add(&bs->rescue_list, bio);
 998                        queue_work(bs->rescue_workqueue, &bs->rescue_work);
 999                        spin_unlock(&bs->rescue_lock);
1000                }
1001        }
1002}
1003
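/*
 * Bracket a call into a target's map method: install a plug callback so
 * that, if ->map() blocks, bios already queued on current->bio_list are
 * punted to their bio_set's rescue workqueue instead of deadlocking
 * (see flush_current_bio_list() above).
 */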
1004static void dm_offload_start(struct dm_offload *o)
1005{
1006        blk_start_plug(&o->plug);
1007        o->cb.callback = flush_current_bio_list;
1008        list_add(&o->cb.list, &current->plug->cb_list);
1009}
1010
1011static void dm_offload_end(struct dm_offload *o)
1012{
1013        list_del(&o->cb.list);
1014        blk_finish_plug(&o->plug);
1015}
1016
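/*
 * Hand a clone to the target's map method and act on the result: dispatch
 * a remapped bio, drop it on error or requeue, or do nothing if the
 * target has taken ownership (DM_MAPIO_SUBMITTED).
 */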
1017static void __map_bio(struct dm_target_io *tio)
1018{
1019        int r;
1020        sector_t sector;
1021        struct dm_offload o;
1022        struct bio *clone = &tio->clone;
1023        struct dm_target *ti = tio->ti;
1024
1025        clone->bi_end_io = clone_endio;
1026
1027        /*
1028         * Map the clone.  If r == 0 we don't need to do
1029         * anything, the target has assumed ownership of
1030         * this io.
1031         */
1032        atomic_inc(&tio->io->io_count);
1033        sector = clone->bi_sector;
1034
1035        dm_offload_start(&o);
1036        r = ti->type->map(ti, clone);
1037        dm_offload_end(&o);
1038
1039        if (r == DM_MAPIO_REMAPPED) {
1040                /* the bio has been remapped so dispatch it */
1041
1042                trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1043                                      tio->io->bio->bi_bdev->bd_dev, sector);
1044
1045                generic_make_request(clone);
1046        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1047                /* error the io and bail out, or requeue it if needed */
1048                dec_pending(tio->io, r);
1049                free_tio(tio);
1050        } else if (r != DM_MAPIO_SUBMITTED) {
1051                DMWARN("unimplemented target map return value: %d", r);
1052                BUG();
1053        }
1054}
1055
1056struct clone_info {
1057        struct mapped_device *md;
1058        struct dm_table *map;
1059        struct bio *bio;
1060        struct dm_io *io;
1061        sector_t sector;
1062        sector_t sector_count;
1063        unsigned short idx;
1064};
1065
1066static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len)
1067{
1068        bio->bi_sector = sector;
1069        bio->bi_size = to_bytes(len);
1070}
1071
1072static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count)
1073{
1074        bio->bi_idx = idx;
1075        bio->bi_vcnt = idx + bv_count;
1076        bio->bi_flags &= ~(1 << BIO_SEG_VALID);
1077}
1078
1079static int clone_bio_integrity(struct bio *bio, struct bio *clone,
1080                               unsigned short idx, unsigned len, unsigned offset,
1081                               bool trim)
1082{
1083        int r;
1084
1085        r = bio_integrity_clone(clone, bio, GFP_NOIO);
1086        if (r < 0)
1087                return r;
1088
1089        if (trim)
1090                bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len);
1091
1092        return 0;
1093}
1094
1095/*
1096 * Creates a little bio that just does part of a bvec.
1097 */
1098static int clone_split_bio(struct dm_target_io *tio, struct bio *bio,
1099                           sector_t sector, unsigned short idx,
1100                           unsigned offset, unsigned len)
1101{
1102        struct bio *clone = &tio->clone;
1103        struct bio_vec *bv = bio->bi_io_vec + idx;
1104
1105        *clone->bi_io_vec = *bv;
1106
1107        bio_setup_sector(clone, sector, len);
1108
1109        clone->bi_bdev = bio->bi_bdev;
1110        clone->bi_rw = bio->bi_rw;
1111        clone->bi_vcnt = 1;
1112        clone->bi_io_vec->bv_offset = offset;
1113        clone->bi_io_vec->bv_len = clone->bi_size;
1114        clone->bi_flags |= 1 << BIO_CLONED;
1115
1116        if (bio_integrity(bio)) {
1117                int r = clone_bio_integrity(bio, clone, idx, len, offset, true);
1118                if (r < 0)
1119                        return r;
1120        }
1121
1122        return 0;
1123}
1124
1125/*
1126 * Creates a bio that consists of range of complete bvecs.
1127 */
1128static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1129                     sector_t sector, unsigned short idx,
1130                     unsigned short bv_count, unsigned len)
1131{
1132        struct bio *clone = &tio->clone;
1133
1134        __bio_clone(clone, bio);
1135        bio_setup_sector(clone, sector, len);
1136        bio_setup_bv(clone, idx, bv_count);
1137
1138        if (bio_integrity(bio)) {
1139                int r;
1140                bool trim = false;
1141
1142                if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1143                        trim = true;
1144                r = clone_bio_integrity(bio, clone, idx, len, 0, trim);
1145                if (r < 0)
1146                        return r;
1147        }
1148
1149        return 0;
1150}
1151
1152static struct dm_target_io *alloc_tio(struct clone_info *ci,
1153                                      struct dm_target *ti, int nr_iovecs,
1154                                      unsigned target_bio_nr)
1155{
1156        struct dm_target_io *tio;
1157        struct bio *clone;
1158
1159        clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs);
1160        tio = container_of(clone, struct dm_target_io, clone);
1161
1162        tio->io = ci->io;
1163        tio->ti = ti;
1164        tio->target_bio_nr = target_bio_nr;
1165
1166        return tio;
1167}
1168
1169static void __clone_and_map_simple_bio(struct clone_info *ci,
1170                                       struct dm_target *ti,
1171                                       unsigned target_bio_nr, sector_t len)
1172{
1173        struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr);
1174        struct bio *clone = &tio->clone;
1175
1176        /*
1177         * Discard requests require the bio's inline iovecs be initialized.
1178         * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1179         * and discard, so no need for concern about wasted bvec allocations.
1180         */
 1181        __bio_clone(clone, ci->bio);
1182        if (len)
1183                bio_setup_sector(clone, ci->sector, len);
1184
1185        __map_bio(tio);
1186}
1187
1188static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1189                                  unsigned num_bios, sector_t len)
1190{
1191        unsigned target_bio_nr;
1192
1193        for (target_bio_nr = 0; target_bio_nr < num_bios; target_bio_nr++)
1194                __clone_and_map_simple_bio(ci, ti, target_bio_nr, len);
1195}
1196
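/*
 * An empty flush is replicated to every target: each one receives
 * num_flush_bios clones of the (data-less) flush bio.
 */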
1197static int __send_empty_flush(struct clone_info *ci)
1198{
1199        unsigned target_nr = 0;
1200        struct dm_target *ti;
1201
1202        BUG_ON(bio_has_data(ci->bio));
1203        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1204                __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0);
1205
1206        return 0;
1207}
1208
1209static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1210                                    sector_t sector, int nr_iovecs,
1211                                    unsigned short idx, unsigned short bv_count,
1212                                    unsigned offset, unsigned len,
1213                                    bool split_bvec)
1214{
1215        struct bio *bio = ci->bio;
1216        struct dm_target_io *tio;
1217        unsigned target_bio_nr;
1218        unsigned num_target_bios = 1;
1219        int r = 0;
1220
1221        /*
1222         * Does the target want to receive duplicate copies of the bio?
1223         */
1224        if (bio_data_dir(bio) == WRITE && ti->num_write_bios)
1225                num_target_bios = ti->num_write_bios(ti, bio);
1226
1227        for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
1228                tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr);
1229                if (split_bvec)
1230                        r = clone_split_bio(tio, bio, sector, idx, offset, len);
1231                else
1232                        r = clone_bio(tio, bio, sector, idx, bv_count, len);
1233                if (r < 0) {
1234                        free_tio(tio);
1235                        break;
1236                }
1237                __map_bio(tio);
1238        }
1239
1240        return r;
1241}
1242
1243typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1244
1245static unsigned get_num_discard_bios(struct dm_target *ti)
1246{
1247        return ti->num_discard_bios;
1248}
1249
1250static unsigned get_num_write_same_bios(struct dm_target *ti)
1251{
1252        return ti->num_write_same_bios;
1253}
1254
1255typedef bool (*is_split_required_fn)(struct dm_target *ti);
1256
1257static bool is_split_required_for_discard(struct dm_target *ti)
1258{
1259        return ti->split_discard_bios;
1260}
1261
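/*
 * Handle bios that only change extents (discard, write same): walk the
 * affected targets, splitting at target boundaries, and send each target
 * the number of duplicate bios it asked for.
 */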
1262static int __send_changing_extent_only(struct clone_info *ci,
1263                                       get_num_bios_fn get_num_bios,
1264                                       is_split_required_fn is_split_required)
1265{
1266        struct dm_target *ti;
1267        sector_t len;
1268        unsigned num_bios;
1269
1270        do {
1271                ti = dm_table_find_target(ci->map, ci->sector);
1272                if (!dm_target_is_valid(ti))
1273                        return -EIO;
1274
1275                /*
1276                 * Even though the device advertised support for this type of
1277                 * request, that does not mean every target supports it, and
1278                 * reconfiguration might also have changed that since the
1279                 * check was performed.
1280                 */
1281                num_bios = get_num_bios ? get_num_bios(ti) : 0;
1282                if (!num_bios)
1283                        return -EOPNOTSUPP;
1284
1285                if (is_split_required && !is_split_required(ti))
1286                        len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1287                else
1288                        len = min(ci->sector_count, max_io_len(ci->sector, ti));
1289
1290                __send_duplicate_bios(ci, ti, num_bios, len);
1291
1292                ci->sector += len;
1293        } while (ci->sector_count -= len);
1294
1295        return 0;
1296}
1297
1298static int __send_discard(struct clone_info *ci)
1299{
1300        return __send_changing_extent_only(ci, get_num_discard_bios,
1301                                           is_split_required_for_discard);
1302}
1303
1304static int __send_write_same(struct clone_info *ci)
1305{
1306        return __send_changing_extent_only(ci, get_num_write_same_bios, NULL);
1307}
1308
1309/*
1310 * Find maximum number of sectors / bvecs we can process with a single bio.
1311 */
1312static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx)
1313{
1314        struct bio *bio = ci->bio;
1315        sector_t bv_len, total_len = 0;
1316
1317        for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) {
1318                bv_len = to_sector(bio->bi_io_vec[*idx].bv_len);
1319
1320                if (bv_len > max)
1321                        break;
1322
1323                max -= bv_len;
1324                total_len += bv_len;
1325        }
1326
1327        return total_len;
1328}
1329
1330static int __split_bvec_across_targets(struct clone_info *ci,
1331                                       struct dm_target *ti, sector_t max)
1332{
1333        struct bio *bio = ci->bio;
1334        struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1335        sector_t remaining = to_sector(bv->bv_len);
1336        unsigned offset = 0;
1337        sector_t len;
1338        int r;
1339
1340        do {
1341                if (offset) {
1342                        ti = dm_table_find_target(ci->map, ci->sector);
1343                        if (!dm_target_is_valid(ti))
1344                                return -EIO;
1345
1346                        max = max_io_len(ci->sector, ti);
1347                }
1348
1349                len = min(remaining, max);
1350
1351                r = __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0,
1352                                             bv->bv_offset + offset, len, true);
1353                if (r < 0)
1354                        return r;
1355
1356                ci->sector += len;
1357                ci->sector_count -= len;
1358                offset += to_bytes(len);
1359        } while (remaining -= len);
1360
1361        ci->idx++;
1362
1363        return 0;
1364}
1365
1366/*
1367 * Select the correct strategy for processing a non-flush bio.
1368 */
1369static int __split_and_process_non_flush(struct clone_info *ci)
1370{
1371        struct bio *bio = ci->bio;
1372        struct dm_target *ti;
1373        sector_t len, max;
1374        int idx;
1375        int r;
1376
1377        if (unlikely(bio->bi_rw & REQ_DISCARD))
1378                return __send_discard(ci);
1379        else if (unlikely(bio->bi_rw & REQ_WRITE_SAME))
1380                return __send_write_same(ci);
1381
1382        ti = dm_table_find_target(ci->map, ci->sector);
1383        if (!dm_target_is_valid(ti))
1384                return -EIO;
1385
1386        max = max_io_len(ci->sector, ti);
1387
1388        /*
1389         * Optimise for the simple case where we can do all of
1390         * the remaining io with a single clone.
1391         */
1392        if (ci->sector_count <= max) {
1393                r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1394                                             ci->idx, bio->bi_vcnt - ci->idx, 0,
1395                                             ci->sector_count, false);
1396                if (r < 0)
1397                        return r;
1398
1399                ci->sector_count = 0;
1400                return 0;
1401        }
1402
1403        /*
1404         * There are some bvecs that don't span targets.
1405         * Do as many of these as possible.
1406         */
1407        if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1408                len = __len_within_target(ci, max, &idx);
1409
1410                r = __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs,
1411                                             ci->idx, idx - ci->idx, 0, len, false);
1412                if (r < 0)
1413                        return r;
1414
1415                ci->sector += len;
1416                ci->sector_count -= len;
1417                ci->idx = idx;
1418
1419                return 0;
1420        }
1421
1422        /*
1423         * Handle a bvec that must be split between two or more targets.
1424         */
1425        return __split_bvec_across_targets(ci, ti, max);
1426}
1427
1428/*
1429 * Entry point to split a bio into clones and submit them to the targets.
1430 */
1431static void __split_and_process_bio(struct mapped_device *md,
1432                                    struct dm_table *map, struct bio *bio)
1433{
1434        struct clone_info ci;
1435        int error = 0;
1436
1437        if (unlikely(!map)) {
1438                bio_io_error(bio);
1439                return;
1440        }
1441
1442        ci.map = map;
1443        ci.md = md;
1444        ci.io = alloc_io(md);
1445        ci.io->error = 0;
1446        atomic_set(&ci.io->io_count, 1);
1447        ci.io->bio = bio;
1448        ci.io->md = md;
1449        spin_lock_init(&ci.io->endio_lock);
1450        ci.sector = bio->bi_sector;
1451        ci.idx = bio->bi_idx;
1452
1453        start_io_acct(ci.io);
1454
1455        if (bio->bi_rw & REQ_FLUSH) {
1456                ci.bio = &ci.md->flush_bio;
1457                ci.sector_count = 0;
1458                error = __send_empty_flush(&ci);
1459                /* dec_pending submits any data associated with flush */
1460        } else {
1461                ci.bio = bio;
1462                ci.sector_count = bio_sectors(bio);
1463                while (ci.sector_count && !error)
1464                        error = __split_and_process_non_flush(&ci);
1465        }
1466
1467        /* drop the extra reference count */
1468        dec_pending(ci.io, error);
1469}
1470/*-----------------------------------------------------------------
1471 * CRUD END
1472 *---------------------------------------------------------------*/
1473
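/*
 * merge_bvec_fn for the dm queue: report how many bytes may be added to
 * a bio at the given sector without crossing a target boundary, deferring
 * to the target's own merge method when it has one.
 */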
1474static int dm_merge_bvec(struct request_queue *q,
1475                         struct bvec_merge_data *bvm,
1476                         struct bio_vec *biovec)
1477{
1478        struct mapped_device *md = q->queuedata;
1479        struct dm_table *map = dm_get_live_table_fast(md);
1480        struct dm_target *ti;
1481        sector_t max_sectors;
1482        int max_size = 0;
1483
1484        if (unlikely(!map))
1485                goto out;
1486
1487        ti = dm_table_find_target(map, bvm->bi_sector);
1488        if (!dm_target_is_valid(ti))
1489                goto out;
1490
1491        /*
1492         * Find maximum amount of I/O that won't need splitting
1493         */
1494        max_sectors = min(max_io_len(bvm->bi_sector, ti),
1495                          (sector_t) BIO_MAX_SECTORS);
1496        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1497        if (max_size < 0)
1498                max_size = 0;
1499
1500        /*
1501         * merge_bvec_fn() returns number of bytes
1502         * it can accept at this offset
1503         * max is precomputed maximal io size
1504         */
1505        if (max_size && ti->type->merge)
1506                max_size = ti->type->merge(ti, bvm, biovec, max_size);
1507        /*
1508         * If the target doesn't support merge method and some of the devices
1509         * provided their merge_bvec method (we know this by looking at
1510         * queue_max_hw_sectors), then we can't allow bios with multiple vector
1511         * entries.  So always set max_size to 0, and the code below allows
1512         * just one page.
1513         */
1514        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1515                max_size = 0;
1516
1517out:
1518        dm_put_live_table_fast(md);
1519        /*
1520         * Always allow an entire first page
1521         */
1522        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1523                max_size = biovec->bv_len;
1524
1525        return max_size;
1526}
1527
1528/*
1529 * The request function that just remaps the bio built up by
1530 * dm_merge_bvec.
1531 */
1532static void dm_make_request(struct request_queue *q, struct bio *bio)
1533{
1534        int rw = bio_data_dir(bio);
1535        struct mapped_device *md = q->queuedata;
1536        int cpu;
1537        int srcu_idx;
1538        struct dm_table *map;
1539
1540        map = dm_get_live_table(md, &srcu_idx);
1541
1542        cpu = part_stat_lock();
1543        part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1544        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1545        part_stat_unlock();
1546
1547        /* if we're suspended, we have to queue this io for later */
1548        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1549                dm_put_live_table(md, srcu_idx);
1550
1551                if (bio_rw(bio) != READA)
1552                        queue_io(md, bio);
1553                else
1554                        bio_io_error(bio);
1555                return;
1556        }
1557
1558        __split_and_process_bio(md, map, bio);
1559        dm_put_live_table(md, srcu_idx);
1560        return;
1561}
1562
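/*
 * Congestion callback: for request-based dm only the top-level queue
 * matters; for bio-based dm ask every underlying device in the live table.
 */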
1563static int dm_any_congested(void *congested_data, int bdi_bits)
1564{
1565        int r = bdi_bits;
1566        struct mapped_device *md = congested_data;
1567        struct dm_table *map;
1568
1569        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1570                if (dm_request_based(md)) {
1571                        /*
1572                         * With request-based DM we only need to check the
1573                         * top-level queue for congestion.
1574                         */
1575                        r = md->queue->backing_dev_info.state & bdi_bits;
1576                } else {
1577                        map = dm_get_live_table_fast(md);
1578                        if (map)
1579                                r = dm_table_any_congested(map, bdi_bits);
1580                        dm_put_live_table_fast(md);
1581                }
1582        }
1583
1584        return r;
1585}
1586
1587/*-----------------------------------------------------------------
1588 * An IDR is used to keep track of allocated minor numbers.
1589 *---------------------------------------------------------------*/
1590static void free_minor(int minor)
1591{
1592        spin_lock(&_minor_lock);
1593        idr_remove(&_minor_idr, minor);
1594        spin_unlock(&_minor_lock);
1595}
1596
1597/*
1598 * See if the device with a specific minor # is free.
1599 */
1600static int specific_minor(int minor)
1601{
1602        int r;
1603
1604        if (minor >= (1 << MINORBITS))
1605                return -EINVAL;
1606
1607        idr_preload(GFP_KERNEL);
1608        spin_lock(&_minor_lock);
1609
1610        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1611
1612        spin_unlock(&_minor_lock);
1613        idr_preload_end();
1614        if (r < 0)
1615                return r == -ENOSPC ? -EBUSY : r;
1616        return 0;
1617}
1618
1619static int next_free_minor(int *minor)
1620{
1621        int r;
1622
1623        idr_preload(GFP_KERNEL);
1624        spin_lock(&_minor_lock);
1625
1626        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1627
1628        spin_unlock(&_minor_lock);
1629        idr_preload_end();
1630        if (r < 0)
1631                return r;
1632        *minor = r;
1633        return 0;
1634}
1635
1636static const struct block_device_operations dm_blk_dops;
1637static const struct dax_operations dm_dax_ops;
1638
1639static void dm_wq_work(struct work_struct *work);
1640
1641void dm_init_md_queue(struct mapped_device *md)
1642{
1643        /*
1644         * Request-based dm devices cannot be stacked on top of bio-based dm
1645         * devices.  The type of this dm device may not have been decided yet.
1646         * The type is decided at the first table loading time.
1647         * To prevent problematic device stacking, clear the queue flag
1648         * for request stacking support until then.
1649         *
1650         * This queue is new, so no concurrency on the queue_flags.
1651         */
1652        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1653
1654        /*
1655         * Initialize data that will only be used by a non-blk-mq DM queue
1656         * - must do so here (in alloc_dev callchain) before queue is used
1657         */
1658        md->queue->queuedata = md;
1659        md->queue->backing_dev_info.congested_data = md;
1660}
1661
1662void dm_init_normal_md_queue(struct mapped_device *md)
1663{
1664        md->use_blk_mq = false;
1665        dm_init_md_queue(md);
1666
1667        /*
1668         * Initialize aspects of queue that aren't relevant for blk-mq
1669         */
1670        md->queue->backing_dev_info.congested_fn = dm_any_congested;
1671        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1672}
1673
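/*
 * Tear down everything alloc_dev() may have set up.  Safe to call on a
 * partially constructed mapped_device, so it doubles as the error path
 * for alloc_dev().
 */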
1674static void cleanup_mapped_device(struct mapped_device *md)
1675{
1676        if (md->wq)
1677                destroy_workqueue(md->wq);
1678        if (md->kworker_task)
1679                kthread_stop(md->kworker_task);
1680        mempool_destroy(md->io_pool);
1681        mempool_destroy(md->rq_pool);
1682        if (md->bs)
1683                bioset_free(md->bs);
1684
1685        if (md->dax_dev) {
1686                kill_dax(md->dax_dev);
1687                put_dax(md->dax_dev);
1688                md->dax_dev = NULL;
1689        }
1690
1691        if (md->disk) {
1692                spin_lock(&_minor_lock);
1693                md->disk->private_data = NULL;
1694                spin_unlock(&_minor_lock);
1695                if (blk_get_integrity(md->disk))
1696                        blk_integrity_unregister(md->disk);
1697                del_gendisk(md->disk);
1698                put_disk(md->disk);
1699        }
1700
1701        if (md->queue)
1702                blk_cleanup_queue(md->queue);
1703
1704        cleanup_srcu_struct(&md->io_barrier);
1705
1706        if (md->bdev) {
1707                bdput(md->bdev);
1708                md->bdev = NULL;
1709        }
1710
1711        dm_mq_cleanup_mapped_device(md);
1712}
1713
1714/*
1715 * Allocate and initialise a blank device with a given minor.
1716 */
1717static struct mapped_device *alloc_dev(int minor)
1718{
1719        int r, numa_node_id = dm_get_numa_node();
1720        struct dax_device *dax_dev;
1721        struct mapped_device *md;
1722        void *old_md;
1723
1724        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1725        if (!md) {
1726                DMWARN("unable to allocate device, out of memory.");
1727                return NULL;
1728        }
1729
1730        if (!try_module_get(THIS_MODULE))
1731                goto bad_module_get;
1732
1733        /* get a minor number for the dev */
1734        if (minor == DM_ANY_MINOR)
1735                r = next_free_minor(&minor);
1736        else
1737                r = specific_minor(minor);
1738        if (r < 0)
1739                goto bad_minor;
1740
1741        r = init_srcu_struct(&md->io_barrier);
1742        if (r < 0)
1743                goto bad_io_barrier;
1744
1745        md->numa_node_id = numa_node_id;
1746        md->use_blk_mq = dm_use_blk_mq_default();
1747        md->init_tio_pdu = false;
1748        md->type = DM_TYPE_NONE;
1749        mutex_init(&md->suspend_lock);
1750        mutex_init(&md->type_lock);
1751        mutex_init(&md->table_devices_lock);
1752        spin_lock_init(&md->deferred_lock);
1753        atomic_set(&md->holders, 1);
1754        atomic_set(&md->open_count, 0);
1755        atomic_set(&md->event_nr, 0);
1756        atomic_set(&md->uevent_seq, 0);
1757        INIT_LIST_HEAD(&md->uevent_list);
1758        INIT_LIST_HEAD(&md->table_devices);
1759        spin_lock_init(&md->uevent_lock);
1760
1761        md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
1762        if (!md->queue)
1763                goto bad;
1764
1765        dm_init_md_queue(md);
1766
1767        md->disk = alloc_disk_node(1, numa_node_id);
1768        if (!md->disk)
1769                goto bad;
1770
1771        atomic_set(&md->pending[0], 0);
1772        atomic_set(&md->pending[1], 0);
1773        init_waitqueue_head(&md->wait);
1774        INIT_WORK(&md->work, dm_wq_work);
1775        init_waitqueue_head(&md->eventq);
1776        init_completion(&md->kobj_holder.completion);
1777        md->kworker_task = NULL;
1778
1779        md->disk->major = _major;
1780        md->disk->first_minor = minor;
1781        md->disk->fops = &dm_blk_dops;
1782        md->disk->queue = md->queue;
1783        md->disk->private_data = md;
1784        sprintf(md->disk->disk_name, "dm-%d", minor);
1785
1786        dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1787        if (!dax_dev)
1788                goto bad;
1789        md->dax_dev = dax_dev;
1790
1791        add_disk_no_queue_reg(md->disk);
1792        format_dev_t(md->name, MKDEV(_major, minor));
1793
1794        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1795        if (!md->wq)
1796                goto bad;
1797
1798        md->bdev = bdget_disk(md->disk, 0);
1799        if (!md->bdev)
1800                goto bad;
1801
1802        bio_init(&md->flush_bio);
1803        md->flush_bio.bi_bdev = md->bdev;
1804        md->flush_bio.bi_rw = WRITE_FLUSH;
1805
1806        dm_stats_init(&md->stats);
1807
1808        /* Populate the mapping, nobody knows we exist yet */
1809        spin_lock(&_minor_lock);
1810        old_md = idr_replace(&_minor_idr, md, minor);
1811        spin_unlock(&_minor_lock);
1812
1813        BUG_ON(old_md != MINOR_ALLOCED);
1814
1815        return md;
1816
1817bad:
1818        cleanup_mapped_device(md);
1819bad_io_barrier:
1820        free_minor(minor);
1821bad_minor:
1822        module_put(THIS_MODULE);
1823bad_module_get:
1824        kvfree(md);
1825        return NULL;
1826}
1827
1828static void unlock_fs(struct mapped_device *md);
1829
1830static void free_dev(struct mapped_device *md)
1831{
1832        int minor = MINOR(disk_devt(md->disk));
1833
1834        unlock_fs(md);
1835
1836        cleanup_mapped_device(md);
1837
1838        free_table_devices(&md->table_devices);
1839        dm_stats_cleanup(&md->stats);
1840        free_minor(minor);
1841
1842        module_put(THIS_MODULE);
1843        kvfree(md);
1844}
1845
1846static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1847{
1848        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1849
1850        if (md->bs) {
1851                /* The md already has the necessary mempools. */
1852                if (dm_table_bio_based(t)) {
1853                        /*
1854                         * Reload the bioset because front_pad may have changed
1855                         * when a different table was loaded.
1856                         */
1857                        bioset_free(md->bs);
1858                        md->bs = p->bs;
1859                        p->bs = NULL;
1860                }
1861                /*
1862                 * There's no need to reload with request-based dm
1863                 * because the size of front_pad doesn't change.
1864                 * Note for the future: if the bioset is ever reloaded here,
1865                 * prepped requests in the queue may still refer to bios from
1866                 * the old bioset, so the queue would have to be walked to
1867                 * unprep them.
1868                 */
1869                goto out;
1870        }
1871
1872        BUG_ON(!p || md->io_pool || md->rq_pool || md->bs);
1873
1874        md->io_pool = p->io_pool;
1875        p->io_pool = NULL;
1876        md->rq_pool = p->rq_pool;
1877        p->rq_pool = NULL;
1878        md->bs = p->bs;
1879        p->bs = NULL;
1880
1881out:
1882        /* mempool bind completed, no longer need any mempools in the table */
1883        dm_table_free_md_mempools(t);
1884}
1885
1886/*
1887 * Bind a table to the device.
1888 */
1889static void event_callback(void *context)
1890{
1891        unsigned long flags;
1892        LIST_HEAD(uevents);
1893        struct mapped_device *md = (struct mapped_device *) context;
1894
1895        spin_lock_irqsave(&md->uevent_lock, flags);
1896        list_splice_init(&md->uevent_list, &uevents);
1897        spin_unlock_irqrestore(&md->uevent_lock, flags);
1898
1899        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1900
1901        atomic_inc(&md->event_nr);
1902        wake_up(&md->eventq);
1903        dm_issue_global_event();
1904}
1905
1906/*
1907 * Protected by md->suspend_lock obtained by dm_swap_table().
1908 */
1909static void __set_size(struct mapped_device *md, sector_t size)
1910{
1911        lockdep_assert_held(&md->suspend_lock);
1912
1913        set_capacity(md->disk, size);
1914
1915        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1916}
1917
1918/*
1919 * Return 1 if the queue has a compulsory merge_bvec_fn.
1920 *
1921 * If this function returns 0, then the device is either a non-dm
1922 * device without a merge_bvec_fn, or it is a dm device that is
1923 * able to split any bios it receives that are too big.
1924 */
1925int dm_queue_merge_is_compulsory(struct request_queue *q)
1926{
1927        struct mapped_device *dev_md;
1928
1929        if (!q->merge_bvec_fn)
1930                return 0;
1931
1932        if (q->make_request_fn == dm_make_request) {
1933                dev_md = q->queuedata;
1934                if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
1935                        return 0;
1936        }
1937
1938        return 1;
1939}
1940
1941static int dm_device_merge_is_compulsory(struct dm_target *ti,
1942                                         struct dm_dev *dev, sector_t start,
1943                                         sector_t len, void *data)
1944{
1945        struct block_device *bdev = dev->bdev;
1946        struct request_queue *q = bdev_get_queue(bdev);
1947
1948        return dm_queue_merge_is_compulsory(q);
1949}
1950
1951/*
1952 * Return 1 if it is acceptable to ignore merge_bvec_fn based
1953 * on the properties of the underlying devices.
1954 */
1955static int dm_table_merge_is_optional(struct dm_table *table)
1956{
1957        unsigned i = 0;
1958        struct dm_target *ti;
1959
1960        while (i < dm_table_get_num_targets(table)) {
1961                ti = dm_table_get_target(table, i++);
1962
1963                if (ti->type->iterate_devices &&
1964                    ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
1965                        return 0;
1966        }
1967
1968        return 1;
1969}
1970
1971/*
1972 * Returns old map, which caller must destroy.
1973 */
1974static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1975                               struct queue_limits *limits)
1976{
1977        struct dm_table *old_map;
1978        struct request_queue *q = md->queue;
1979        sector_t size;
1980        int merge_is_optional;
1981
1982        lockdep_assert_held(&md->suspend_lock);
1983
1984        size = dm_table_get_size(t);
1985
1986        /*
1987         * Wipe any geometry if the size of the table changed.
1988         */
1989        if (size != dm_get_size(md))
1990                memset(&md->geometry, 0, sizeof(md->geometry));
1991
1992        __set_size(md, size);
1993
1994        dm_table_event_callback(t, event_callback, md);
1995
1996        /*
1997         * If the old table type wasn't request-based, the queue has not been
1998         * stopped during suspension, so stop it now to prevent I/O from being
1999         * mapped before resume.
2000         * This must be done before setting the queue restrictions,
2001         * because request-based dm may run just after they are set.
2002         */
2003        if (dm_table_request_based(t)) {
2004                dm_stop_queue(q);
2005                /*
2006                 * Leverage the fact that request-based DM targets are
2007                 * immutable singletons and establish md->immutable_target
2008                 * - used to optimize both dm_request_fn and dm_mq_queue_rq
2009                 */
2010                md->immutable_target = dm_table_get_immutable_target(t);
2011        }
2012
2013        __bind_mempools(md, t);
2014
2015        merge_is_optional = dm_table_merge_is_optional(t);
2016
2017        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2018        rcu_assign_pointer(md->map, (void *)t);
2019        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2020
2021        dm_table_set_restrictions(t, q, limits);
2022        if (merge_is_optional)
2023                set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2024        else
2025                clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2026        if (old_map)
2027                dm_sync_table(md);
2028
2029        return old_map;
2030}
2031
2032/*
2033 * Returns unbound table for the caller to free.
2034 */
2035static struct dm_table *__unbind(struct mapped_device *md)
2036{
2037        struct dm_table *map = rcu_dereference_protected(md->map, 1);
2038
2039        if (!map)
2040                return NULL;
2041
2042        dm_table_event_callback(map, NULL, NULL);
2043        RCU_INIT_POINTER(md->map, NULL);
2044        dm_sync_table(md);
2045
2046        return map;
2047}
2048
2049/*
2050 * Constructor for a new device.
2051 */
2052int dm_create(int minor, struct mapped_device **result)
2053{
2054        struct mapped_device *md;
2055
2056        md = alloc_dev(minor);
2057        if (!md)
2058                return -ENXIO;
2059
2060        dm_sysfs_init(md);
2061
2062        *result = md;
2063        return 0;
2064}
2065
2066/*
2067 * Functions to manage md->type.
2068 * All are required to hold md->type_lock.
2069 */
2070void dm_lock_md_type(struct mapped_device *md)
2071{
2072        mutex_lock(&md->type_lock);
2073}
2074
2075void dm_unlock_md_type(struct mapped_device *md)
2076{
2077        mutex_unlock(&md->type_lock);
2078}
2079
2080void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2081{
2082        BUG_ON(!mutex_is_locked(&md->type_lock));
2083        md->type = type;
2084}
2085
2086enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2087{
2088        return md->type;
2089}
2090
2091struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2092{
2093        return md->immutable_target_type;
2094}
2095
2096/*
2097 * The queue_limits are only valid as long as you have a reference
2098 * count on 'md'.
2099 */
2100struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2101{
2102        BUG_ON(!atomic_read(&md->holders));
2103        return &md->queue->limits;
2104}
2105EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2106
2107/*
2108 * Set up the DM device's queue based on md's type
2109 */
2110int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2111{
2112        int r;
2113        struct queue_limits limits;
2114        struct queue_limits_aux limits_aux;
2115        enum dm_queue_mode type = dm_get_md_type(md);
2116
2117        switch (type) {
2118        case DM_TYPE_REQUEST_BASED:
2119                r = dm_old_init_request_queue(md);
2120                if (r) {
2121                        DMERR("Cannot initialize queue for request-based mapped device");
2122                        return r;
2123                }
2124                break;
2125        case DM_TYPE_MQ_REQUEST_BASED:
2126                r = dm_mq_init_request_queue(md, t);
2127                if (r) {
2128                        DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2129                        return r;
2130                }
2131                break;
2132        case DM_TYPE_BIO_BASED:
2133        case DM_TYPE_DAX_BIO_BASED:
2134                dm_init_normal_md_queue(md);
2135                blk_queue_make_request(md->queue, dm_make_request);
2136                blk_queue_merge_bvec(md->queue, dm_merge_bvec);
2137
2138                if (type == DM_TYPE_DAX_BIO_BASED)
2139                        queue_flag_set_unlocked(QUEUE_FLAG_DAX, md->queue);
2140                break;
2141        case DM_TYPE_NONE:
2142                WARN_ON_ONCE(true);
2143                break;
2144        }
2145
2146        limits.limits_aux = &limits_aux;
2147        r = dm_calculate_queue_limits(t, &limits);
2148        if (r) {
2149                DMERR("Cannot calculate initial queue limits");
2150                return r;
2151        }
2152        dm_table_set_restrictions(t, md->queue, &limits);
2153        blk_register_queue(md->disk);
2154
2155        return 0;
2156}
2157
2158struct mapped_device *dm_get_md(dev_t dev)
2159{
2160        struct mapped_device *md;
2161        unsigned minor = MINOR(dev);
2162
2163        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2164                return NULL;
2165
2166        spin_lock(&_minor_lock);
2167
2168        md = idr_find(&_minor_idr, minor);
2169        if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2170            test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2171                md = NULL;
2172                goto out;
2173        }
2174        dm_get(md);
2175out:
2176        spin_unlock(&_minor_lock);
2177
2178        return md;
2179}
2180EXPORT_SYMBOL_GPL(dm_get_md);
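
/*
 * Illustrative sketch only, not part of this driver: how an in-kernel
 * caller might look up a mapped_device by dev_t using dm_get_md() and
 * drop the reference it returns.  The device numbers are made-up
 * example values.
 */
#if 0	/* example usage only, never compiled */
static void example_md_lookup(void)
{
        struct mapped_device *md;

        md = dm_get_md(MKDEV(253, 0));  /* takes a reference on success */
        if (!md)
                return;

        pr_info("found mapped device %s\n", dm_device_name(md));

        dm_put(md);                     /* drop the reference from dm_get_md() */
}
#endif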
2181
2182void *dm_get_mdptr(struct mapped_device *md)
2183{
2184        return md->interface_ptr;
2185}
2186
2187void dm_set_mdptr(struct mapped_device *md, void *ptr)
2188{
2189        md->interface_ptr = ptr;
2190}
2191
2192void dm_get(struct mapped_device *md)
2193{
2194        atomic_inc(&md->holders);
2195        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2196}
2197
2198int dm_hold(struct mapped_device *md)
2199{
2200        spin_lock(&_minor_lock);
2201        if (test_bit(DMF_FREEING, &md->flags)) {
2202                spin_unlock(&_minor_lock);
2203                return -EBUSY;
2204        }
2205        dm_get(md);
2206        spin_unlock(&_minor_lock);
2207        return 0;
2208}
2209EXPORT_SYMBOL_GPL(dm_hold);
2210
2211const char *dm_device_name(struct mapped_device *md)
2212{
2213        return md->name;
2214}
2215EXPORT_SYMBOL_GPL(dm_device_name);
2216
2217static void __dm_destroy(struct mapped_device *md, bool wait)
2218{
2219        struct request_queue *q = dm_get_md_queue(md);
2220        struct dm_table *map;
2221        int srcu_idx;
2222
2223        might_sleep();
2224
2225        spin_lock(&_minor_lock);
2226        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2227        set_bit(DMF_FREEING, &md->flags);
2228        spin_unlock(&_minor_lock);
2229
2230        blk_set_queue_dying(q);
2231
2232        if (dm_request_based(md) && md->kworker_task)
2233                flush_kthread_worker(&md->kworker);
2234
2235        /*
2236         * Take suspend_lock so that presuspend and postsuspend methods
2237         * do not race with internal suspend.
2238         */
2239        mutex_lock(&md->suspend_lock);
2240        map = dm_get_live_table(md, &srcu_idx);
2241        if (!dm_suspended_md(md)) {
2242                dm_table_presuspend_targets(map);
2243                dm_table_postsuspend_targets(map);
2244        }
2245        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2246        dm_put_live_table(md, srcu_idx);
2247        mutex_unlock(&md->suspend_lock);
2248
2249        /*
2250         * Rare, but there may still be I/O requests in flight that have yet
2251         * to complete.  Wait for all references to disappear.
2252         * No one should take a new reference to the mapped_device once its
2253         * state becomes DMF_FREEING.
2254         */
2255        if (wait)
2256                while (atomic_read(&md->holders))
2257                        msleep(1);
2258        else if (atomic_read(&md->holders))
2259                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2260                       dm_device_name(md), atomic_read(&md->holders));
2261
2262        dm_sysfs_exit(md);
2263        dm_table_destroy(__unbind(md));
2264        free_dev(md);
2265}
2266
2267void dm_destroy(struct mapped_device *md)
2268{
2269        __dm_destroy(md, true);
2270}
2271
2272void dm_destroy_immediate(struct mapped_device *md)
2273{
2274        __dm_destroy(md, false);
2275}
2276
2277void dm_put(struct mapped_device *md)
2278{
2279        atomic_dec(&md->holders);
2280}
2281EXPORT_SYMBOL_GPL(dm_put);
2282
2283static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2284{
2285        int r = 0;
2286        DEFINE_WAIT(wait);
2287
2288        while (1) {
2289                prepare_to_wait(&md->wait, &wait, task_state);
2290
2291                if (!md_in_flight(md))
2292                        break;
2293
2294                if (signal_pending_state(task_state, current)) {
2295                        r = -EINTR;
2296                        break;
2297                }
2298
2299                io_schedule();
2300        }
2301        finish_wait(&md->wait, &wait);
2302
2303        return r;
2304}
2305
2306/*
2307 * Process the deferred bios
2308 */
2309static void dm_wq_work(struct work_struct *work)
2310{
2311        struct mapped_device *md = container_of(work, struct mapped_device,
2312                                                work);
2313        struct bio *c;
2314        int srcu_idx;
2315        struct dm_table *map;
2316
2317        map = dm_get_live_table(md, &srcu_idx);
2318
2319        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2320                spin_lock_irq(&md->deferred_lock);
2321                c = bio_list_pop(&md->deferred);
2322                spin_unlock_irq(&md->deferred_lock);
2323
2324                if (!c)
2325                        break;
2326
2327                if (dm_request_based(md))
2328                        generic_make_request(c);
2329                else
2330                        __split_and_process_bio(md, map, c);
2331        }
2332
2333        dm_put_live_table(md, srcu_idx);
2334}
2335
2336static void dm_queue_flush(struct mapped_device *md)
2337{
2338        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2339        smp_mb__after_clear_bit();
2340        queue_work(md->wq, &md->work);
2341}
2342
2343/*
2344 * Swap in a new table, returning the old one for the caller to destroy.
2345 */
2346struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2347{
2348        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2349        struct queue_limits limits;
2350        struct queue_limits_aux limits_aux;
2351        int r;
2352
2353        mutex_lock(&md->suspend_lock);
2354
2355        /* device must be suspended */
2356        if (!dm_suspended_md(md))
2357                goto out;
2358
2359        /*
2360         * Point limits.limits_aux at the queue_limits_aux allocated on the
2361         * stack above.
2362         */
2363        limits.limits_aux = &limits_aux;
2364
2365        /*
2366         * If the new table has no data devices, retain the existing limits.
2367         * This helps multipath with queue_if_no_path: if all paths disappear,
2368         * new I/O is queued based on these limits, and then some paths
2369         * reappear.
2370         */
2371        if (dm_table_has_no_data_devices(table)) {
2372                live_map = dm_get_live_table_fast(md);
2373                if (live_map)
2374                        limits = md->queue->limits;
2375                dm_put_live_table_fast(md);
2376        }
2377
2378        if (!live_map) {
2379                r = dm_calculate_queue_limits(table, &limits);
2380                if (r) {
2381                        map = ERR_PTR(r);
2382                        goto out;
2383                }
2384        }
2385
2386        map = __bind(md, table, &limits);
2387        dm_issue_global_event();
2388
2389out:
2390        mutex_unlock(&md->suspend_lock);
2391        return map;
2392}
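
/*
 * Illustrative sketch only, not part of this driver: the rough order of
 * operations a caller (e.g. the ioctl layer) follows when replacing the
 * live table with dm_swap_table().  Error handling is simplified and the
 * helper name is hypothetical.
 */
#if 0	/* example usage only, never compiled */
static int example_replace_table(struct mapped_device *md, struct dm_table *t)
{
        struct dm_table *old_map;
        int r;

        /* dm_swap_table() requires the device to be suspended */
        r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
        if (r)
                return r;

        old_map = dm_swap_table(md, t);
        if (IS_ERR(old_map)) {
                dm_resume(md);
                return PTR_ERR(old_map);
        }

        /* the caller must destroy the table returned by dm_swap_table() */
        if (old_map)
                dm_table_destroy(old_map);

        return dm_resume(md);
}
#endif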
2393
2394/*
2395 * Functions to lock and unlock any filesystem running on the
2396 * device.
2397 */
2398static int lock_fs(struct mapped_device *md)
2399{
2400        int r;
2401
2402        WARN_ON(md->frozen_sb);
2403
2404        md->frozen_sb = freeze_bdev(md->bdev);
2405        if (IS_ERR(md->frozen_sb)) {
2406                r = PTR_ERR(md->frozen_sb);
2407                md->frozen_sb = NULL;
2408                return r;
2409        }
2410
2411        set_bit(DMF_FROZEN, &md->flags);
2412
2413        return 0;
2414}
2415
2416static void unlock_fs(struct mapped_device *md)
2417{
2418        if (!test_bit(DMF_FROZEN, &md->flags))
2419                return;
2420
2421        thaw_bdev(md->bdev, md->frozen_sb);
2422        md->frozen_sb = NULL;
2423        clear_bit(DMF_FROZEN, &md->flags);
2424}
2425
2426/*
2427 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2428 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2429 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2430 *
2431 * If __dm_suspend returns 0, the device is completely quiescent
2432 * now: there is no request-processing activity, and all new requests
2433 * are added to the md->deferred list.
2434 */
2435static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2436                        unsigned suspend_flags, long task_state,
2437                        int dmf_suspended_flag)
2438{
2439        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2440        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2441        int r;
2442
2443        lockdep_assert_held(&md->suspend_lock);
2444
2445        /*
2446         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2447         * This flag is cleared before dm_suspend returns.
2448         */
2449        if (noflush)
2450                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2451        else
2452                pr_debug("%s: suspending with flush\n", dm_device_name(md));
2453
2454        /*
2455         * This gets reverted if there's an error later and the targets
2456         * provide the .presuspend_undo hook.
2457         */
2458        dm_table_presuspend_targets(map);
2459
2460        /*
2461         * Flush I/O to the device.
2462         * Any I/O submitted after lock_fs() may not be flushed.
2463         * noflush takes precedence over do_lockfs.
2464         * (lock_fs() flushes I/Os and waits for them to complete.)
2465         */
2466        if (!noflush && do_lockfs) {
2467                r = lock_fs(md);
2468                if (r) {
2469                        dm_table_presuspend_undo_targets(map);
2470                        return r;
2471                }
2472        }
2473
2474        /*
2475         * Here we must make sure that no processes are submitting requests
2476         * to target drivers i.e. no one may be executing
2477         * __split_and_process_bio. This is called from dm_request and
2478         * dm_wq_work.
2479         *
2480         * To get all processes out of __split_and_process_bio in dm_request,
2481         * we take the write lock. To prevent any process from reentering
2482         * __split_and_process_bio from dm_request and quiesce the thread
2483         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2484         * flush_workqueue(md->wq).
2485         */
2486        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2487        if (map)
2488                synchronize_srcu(&md->io_barrier);
2489
2490        /*
2491         * Stop md->queue before flushing md->wq in case request-based
2492         * dm defers requests to md->wq from md->queue.
2493         */
2494        if (dm_request_based(md)) {
2495                dm_stop_queue(md->queue);
2496                if (md->kworker_task)
2497                        flush_kthread_worker(&md->kworker);
2498        }
2499
2500        flush_workqueue(md->wq);
2501
2502        /*
2503         * At this point no more requests are entering target request routines.
2504         * We call dm_wait_for_completion to wait for all existing requests
2505         * to finish.
2506         */
2507        r = dm_wait_for_completion(md, task_state);
2508        if (!r)
2509                set_bit(dmf_suspended_flag, &md->flags);
2510
2511        if (noflush)
2512                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2513        if (map)
2514                synchronize_srcu(&md->io_barrier);
2515
2516        /* were we interrupted ? */
2517        if (r < 0) {
2518                dm_queue_flush(md);
2519
2520                if (dm_request_based(md))
2521                        dm_start_queue(md->queue);
2522
2523                unlock_fs(md);
2524                dm_table_presuspend_undo_targets(map);
2525                /* pushback list is already flushed, so skip flush */
2526        }
2527
2528        return r;
2529}
2530
2531/*
2532 * We need to be able to change a mapping table under a mounted
2533 * filesystem.  For example, we might want to move some data in
2534 * the background.  Before the table can be swapped with
2535 * dm_bind_table, dm_suspend must be called to flush any in-flight
2536 * bios and ensure that any further I/O gets deferred.
2537 */
2538/*
2539 * Suspend mechanism in request-based dm.
2540 *
2541 * 1. Flush all I/Os by lock_fs() if needed.
2542 * 2. Stop dispatching any I/O by stopping the request_queue.
2543 * 3. Wait for all in-flight I/Os to be completed or requeued.
2544 *
2545 * To abort suspend, start the request_queue.
2546 */
2547int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2548{
2549        struct dm_table *map = NULL;
2550        int r = 0;
2551
2552retry:
2553        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2554
2555        if (dm_suspended_md(md)) {
2556                r = -EINVAL;
2557                goto out_unlock;
2558        }
2559
2560        if (dm_suspended_internally_md(md)) {
2561                /* already internally suspended, wait for internal resume */
2562                mutex_unlock(&md->suspend_lock);
2563                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2564                if (r)
2565                        return r;
2566                goto retry;
2567        }
2568
2569        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2570
2571        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2572        if (r)
2573                goto out_unlock;
2574
2575        dm_table_postsuspend_targets(map);
2576
2577out_unlock:
2578        mutex_unlock(&md->suspend_lock);
2579        return r;
2580}
2581
2582static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2583{
2584        if (map) {
2585                int r = dm_table_resume_targets(map);
2586                if (r)
2587                        return r;
2588        }
2589
2590        dm_queue_flush(md);
2591
2592        /*
2593         * Flushing deferred I/Os must be done after targets are resumed
2594         * so that the targets can map them correctly.
2595         * Request-based dm queues its deferred I/Os in the request_queue.
2596         */
2597        if (dm_request_based(md))
2598                dm_start_queue(md->queue);
2599
2600        unlock_fs(md);
2601
2602        return 0;
2603}
2604
2605int dm_resume(struct mapped_device *md)
2606{
2607        int r;
2608        struct dm_table *map = NULL;
2609
2610retry:
2611        r = -EINVAL;
2612        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2613
2614        if (!dm_suspended_md(md))
2615                goto out;
2616
2617        if (dm_suspended_internally_md(md)) {
2618                /* already internally suspended, wait for internal resume */
2619                mutex_unlock(&md->suspend_lock);
2620                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2621                if (r)
2622                        return r;
2623                goto retry;
2624        }
2625
2626        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2627        if (!map || !dm_table_get_size(map))
2628                goto out;
2629
2630        r = __dm_resume(md, map);
2631        if (r)
2632                goto out;
2633
2634        clear_bit(DMF_SUSPENDED, &md->flags);
2635out:
2636        mutex_unlock(&md->suspend_lock);
2637
2638        return r;
2639}
2640
2641/*
2642 * Internal suspend/resume works like userspace-driven suspend. It waits
2643 * until all bios finish and prevents issuing new bios to the target drivers.
2644 * It may be used only from the kernel.
2645 */
2646
2647static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2648{
2649        struct dm_table *map = NULL;
2650
2651        lockdep_assert_held(&md->suspend_lock);
2652
2653        if (md->internal_suspend_count++)
2654                return; /* nested internal suspend */
2655
2656        if (dm_suspended_md(md)) {
2657                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2658                return; /* nest suspend */
2659        }
2660
2661        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2662
2663        /*
2664         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2665         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2666         * would require changing .presuspend to return an error -- avoid this
2667         * until there is a need for more elaborate variants of internal suspend.
2668         */
2669        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2670                            DMF_SUSPENDED_INTERNALLY);
2671
2672        dm_table_postsuspend_targets(map);
2673}
2674
2675static void __dm_internal_resume(struct mapped_device *md)
2676{
2677        BUG_ON(!md->internal_suspend_count);
2678
2679        if (--md->internal_suspend_count)
2680                return; /* resume from nested internal suspend */
2681
2682        if (dm_suspended_md(md))
2683                goto done; /* resume from nested suspend */
2684
2685        /*
2686         * NOTE: existing callers don't need to call dm_table_resume_targets
2687         * (which may fail -- so best to avoid it for now by passing NULL map)
2688         */
2689        (void) __dm_resume(md, NULL);
2690
2691done:
2692        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2693        smp_mb__after_atomic();
2694        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2695}
2696
2697void dm_internal_suspend_noflush(struct mapped_device *md)
2698{
2699        mutex_lock(&md->suspend_lock);
2700        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2701        mutex_unlock(&md->suspend_lock);
2702}
2703EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2704
2705void dm_internal_resume(struct mapped_device *md)
2706{
2707        mutex_lock(&md->suspend_lock);
2708        __dm_internal_resume(md);
2709        mutex_unlock(&md->suspend_lock);
2710}
2711EXPORT_SYMBOL_GPL(dm_internal_resume);
2712
2713/*
2714 * Fast variants of internal suspend/resume hold md->suspend_lock,
2715 * which prevents interaction with userspace-driven suspend.
2716 */
2717
2718void dm_internal_suspend_fast(struct mapped_device *md)
2719{
2720        mutex_lock(&md->suspend_lock);
2721        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2722                return;
2723
2724        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2725        synchronize_srcu(&md->io_barrier);
2726        flush_workqueue(md->wq);
2727        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2728}
2729EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2730
2731void dm_internal_resume_fast(struct mapped_device *md)
2732{
2733        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2734                goto done;
2735
2736        dm_queue_flush(md);
2737
2738done:
2739        mutex_unlock(&md->suspend_lock);
2740}
2741EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
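
/*
 * Illustrative sketch only, not part of this driver: the fast variants are
 * used as a strictly paired bracket.  Note that dm_internal_suspend_fast()
 * returns with md->suspend_lock held and dm_internal_resume_fast() drops it.
 */
#if 0	/* example usage only, never compiled */
static void example_fast_quiesce(struct mapped_device *md)
{
        dm_internal_suspend_fast(md);   /* acquires md->suspend_lock */

        /* ... operate on the quiesced device ... */

        dm_internal_resume_fast(md);    /* releases md->suspend_lock */
}
#endif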
2742
2743/*-----------------------------------------------------------------
2744 * Event notification.
2745 *---------------------------------------------------------------*/
2746int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2747                       unsigned cookie)
2748{
2749        char udev_cookie[DM_COOKIE_LENGTH];
2750        char *envp[] = { udev_cookie, NULL };
2751
2752        if (!cookie)
2753                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2754        else {
2755                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2756                         DM_COOKIE_ENV_VAR_NAME, cookie);
2757                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2758                                          action, envp);
2759        }
2760}
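
/*
 * Example of the environment produced above (the cookie value is a made-up
 * illustration): a CHANGE uevent sent with cookie 6291459 carries the string
 * "DM_COOKIE=6291459" in envp, which udev rules can match against.
 */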
2761
2762uint32_t dm_next_uevent_seq(struct mapped_device *md)
2763{
2764        return atomic_add_return(1, &md->uevent_seq);
2765}
2766
2767uint32_t dm_get_event_nr(struct mapped_device *md)
2768{
2769        return atomic_read(&md->event_nr);
2770}
2771
2772int dm_wait_event(struct mapped_device *md, int event_nr)
2773{
2774        return wait_event_interruptible(md->eventq,
2775                        (event_nr != atomic_read(&md->event_nr)));
2776}
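
/*
 * Illustrative usage note, not code from this driver: a typical caller
 * samples dm_get_event_nr() first and later passes that value to
 * dm_wait_event(), which sleeps until the event counter has moved on,
 * e.g.:
 *
 *	int ev = dm_get_event_nr(md);
 *	... issue some operation ...
 *	r = dm_wait_event(md, ev);	(returns -ERESTARTSYS if interrupted)
 */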
2777
2778void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2779{
2780        unsigned long flags;
2781
2782        spin_lock_irqsave(&md->uevent_lock, flags);
2783        list_add(elist, &md->uevent_list);
2784        spin_unlock_irqrestore(&md->uevent_lock, flags);
2785}
2786
2787/*
2788 * The gendisk is only valid as long as you have a reference
2789 * count on 'md'.
2790 */
2791struct gendisk *dm_disk(struct mapped_device *md)
2792{
2793        return md->disk;
2794}
2795EXPORT_SYMBOL_GPL(dm_disk);
2796
2797struct kobject *dm_kobject(struct mapped_device *md)
2798{
2799        return &md->kobj_holder.kobj;
2800}
2801
2802struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2803{
2804        struct mapped_device *md;
2805
2806        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2807
2808        spin_lock(&_minor_lock);
2809        if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2810                md = NULL;
2811                goto out;
2812        }
2813        dm_get(md);
2814out:
2815        spin_unlock(&_minor_lock);
2816
2817        return md;
2818}
2819
2820int dm_suspended_md(struct mapped_device *md)
2821{
2822        return test_bit(DMF_SUSPENDED, &md->flags);
2823}
2824
2825int dm_suspended_internally_md(struct mapped_device *md)
2826{
2827        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2828}
2829
2830int dm_test_deferred_remove_flag(struct mapped_device *md)
2831{
2832        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2833}
2834
2835int dm_suspended(struct dm_target *ti)
2836{
2837        return dm_suspended_md(dm_table_get_md(ti->table));
2838}
2839EXPORT_SYMBOL_GPL(dm_suspended);
2840
2841int dm_noflush_suspending(struct dm_target *ti)
2842{
2843        return __noflush_suspending(dm_table_get_md(ti->table));
2844}
2845EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2846
2847struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2848                                            unsigned integrity, unsigned per_io_data_size)
2849{
2850        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2851        struct kmem_cache *cachep = NULL;
2852        unsigned int pool_size = 0;
2853        unsigned int front_pad;
2854
2855        if (!pools)
2856                return NULL;
2857
2858        switch (type) {
2859        case DM_TYPE_BIO_BASED:
2860        case DM_TYPE_DAX_BIO_BASED:
2861                cachep = _io_cache;
2862                pool_size = dm_get_reserved_bio_based_ios();
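                /*
                 * front_pad reserves space in front of each bio allocated from
                 * the bioset: the target's per-io data (rounded up to the
                 * alignment of struct dm_target_io) plus the portion of
                 * struct dm_target_io that precedes its embedded clone bio.
                 */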
2863                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2864                break;
2865        case DM_TYPE_REQUEST_BASED:
2866                cachep = _rq_tio_cache;
2867                pool_size = dm_get_reserved_rq_based_ios();
2868                pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache);
2869                if (!pools->rq_pool)
2870                        goto out;
2871                /* fall through to setup remaining rq-based pools */
2872        case DM_TYPE_MQ_REQUEST_BASED:
2873                if (!pool_size)
2874                        pool_size = dm_get_reserved_rq_based_ios();
2875                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2876                /* per_io_data_size is used for blk-mq pdu at queue allocation */
2877                break;
2878        default:
2879                BUG();
2880        }
2881
2882        if (cachep) {
2883                pools->io_pool = mempool_create_slab_pool(pool_size, cachep);
2884                if (!pools->io_pool)
2885                        goto out;
2886        }
2887
2888        pools->bs = bioset_create(pool_size, front_pad);
2889        if (!pools->bs)
2890                goto out;
2891
2892        if (integrity && bioset_integrity_create(pools->bs, pool_size))
2893                goto out;
2894
2895        return pools;
2896
2897out:
2898        dm_free_md_mempools(pools);
2899
2900        return NULL;
2901}
2902
2903void dm_free_md_mempools(struct dm_md_mempools *pools)
2904{
2905        if (!pools)
2906                return;
2907
2908        mempool_destroy(pools->io_pool);
2909        mempool_destroy(pools->rq_pool);
2910
2911        if (pools->bs)
2912                bioset_free(pools->bs);
2913
2914        kfree(pools);
2915}
2916
2917struct dm_pr {
2918        u64     old_key;
2919        u64     new_key;
2920        u32     flags;
2921        bool    fail_early;
2922};
2923
2924static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2925                      void *data)
2926{
2927        struct mapped_device *md = bdev->bd_disk->private_data;
2928        struct dm_table *table;
2929        struct dm_target *ti;
2930        int ret = -ENOTTY, srcu_idx;
2931
2932        table = dm_get_live_table(md, &srcu_idx);
2933        if (!table || !dm_table_get_size(table))
2934                goto out;
2935
2936        /* We only support devices that have a single target */
2937        if (dm_table_get_num_targets(table) != 1)
2938                goto out;
2939        ti = dm_table_get_target(table, 0);
2940
2941        ret = -EINVAL;
2942        if (!ti->type->iterate_devices)
2943                goto out;
2944
2945        ret = ti->type->iterate_devices(ti, fn, data);
2946out:
2947        dm_put_live_table(md, srcu_idx);
2948        return ret;
2949}
2950
2951/*
2952 * For register / unregister we need to manually call out to every path.
2953 */
2954static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2955                            sector_t start, sector_t len, void *data)
2956{
2957        struct dm_pr *pr = data;
2958        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2959
2960        if (!ops || !ops->pr_register)
2961                return -EOPNOTSUPP;
2962        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2963}
2964
2965static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2966                          u32 flags)
2967{
2968        struct dm_pr pr = {
2969                .old_key        = old_key,
2970                .new_key        = new_key,
2971                .flags          = flags,
2972                .fail_early     = true,
2973        };
2974        int ret;
2975
2976        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2977        if (ret && new_key) {
2978                /* unregister all paths if we failed to register any path */
2979                pr.old_key = new_key;
2980                pr.new_key = 0;
2981                pr.flags = 0;
2982                pr.fail_early = false;
2983                dm_call_pr(bdev, __dm_pr_register, &pr);
2984        }
2985
2986        return ret;
2987}
2988
2989static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2990                         u32 flags)
2991{
2992        struct mapped_device *md = bdev->bd_disk->private_data;
2993        const struct pr_ops *ops;
2994        fmode_t mode;
2995        int r;
2996
2997        r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
2998        if (r < 0)
2999                return r;
3000
3001        ops = bdev->bd_disk->fops->pr_ops;
3002        if (ops && ops->pr_reserve)
3003                r = ops->pr_reserve(bdev, key, type, flags);
3004        else
3005                r = -EOPNOTSUPP;
3006
3007        blkdev_put(bdev, mode);
3008        return r;
3009}
3010
3011static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3012{
3013        struct mapped_device *md = bdev->bd_disk->private_data;
3014        const struct pr_ops *ops;
3015        fmode_t mode;
3016        int r;
3017
3018        r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3019        if (r < 0)
3020                return r;
3021
3022        ops = bdev->bd_disk->fops->pr_ops;
3023        if (ops && ops->pr_release)
3024                r = ops->pr_release(bdev, key, type);
3025        else
3026                r = -EOPNOTSUPP;
3027
3028        blkdev_put(bdev, mode);
3029        return r;
3030}
3031
3032static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3033                         enum pr_type type, bool abort)
3034{
3035        struct mapped_device *md = bdev->bd_disk->private_data;
3036        const struct pr_ops *ops;
3037        fmode_t mode;
3038        int r;
3039
3040        r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3041        if (r < 0)
3042                return r;
3043
3044        ops = bdev->bd_disk->fops->pr_ops;
3045        if (ops && ops->pr_preempt)
3046                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3047        else
3048                r = -EOPNOTSUPP;
3049
3050        blkdev_put(bdev, mode);
3051        return r;
3052}
3053
3054static int dm_pr_clear(struct block_device *bdev, u64 key)
3055{
3056        struct mapped_device *md = bdev->bd_disk->private_data;
3057        const struct pr_ops *ops;
3058        fmode_t mode;
3059        int r;
3060
3061        r = dm_get_bdev_for_ioctl(md, &bdev, &mode);
3062        if (r < 0)
3063                return r;
3064
3065        ops = bdev->bd_disk->fops->pr_ops;
3066        if (ops && ops->pr_clear)
3067                r = ops->pr_clear(bdev, key);
3068        else
3069                r = -EOPNOTSUPP;
3070
3071        blkdev_put(bdev, mode);
3072        return r;
3073}
3074
3075static const struct pr_ops dm_pr_ops = {
3076        .pr_register    = dm_pr_register,
3077        .pr_reserve     = dm_pr_reserve,
3078        .pr_release     = dm_pr_release,
3079        .pr_preempt     = dm_pr_preempt,
3080        .pr_clear       = dm_pr_clear,
3081};
3082
3083static const struct block_device_operations dm_blk_dops = {
3084        .open = dm_blk_open,
3085        .release = dm_blk_close,
3086        .ioctl = dm_blk_ioctl,
3087        .getgeo = dm_blk_getgeo,
3088        .pr_ops = &dm_pr_ops,
3089        .owner = THIS_MODULE
3090};
3091
3092static const struct dax_operations dm_dax_ops = {
3093        .direct_access = dm_dax_direct_access,
3094};
3095
3096/*
3097 * module hooks
3098 */
3099module_init(dm_init);
3100module_exit(dm_exit);
3101
3102module_param(major, uint, 0);
3103MODULE_PARM_DESC(major, "The major number of the device mapper");
3104
3105module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3106MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3107
3108module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3109MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3110
3111MODULE_DESCRIPTION(DM_NAME " driver");
3112MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3113MODULE_LICENSE("GPL");
3114