LXR linux/drivers/md/dm.c

   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11
  12#include <linux/init.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/sched/mm.h>
  16#include <linux/sched/signal.h>
  17#include <linux/blkpg.h>
  18#include <linux/bio.h>
  19#include <linux/mempool.h>
  20#include <linux/dax.h>
  21#include <linux/slab.h>
  22#include <linux/idr.h>
  23#include <linux/uio.h>
  24#include <linux/hdreg.h>
  25#include <linux/delay.h>
  26#include <linux/wait.h>
  27#include <linux/pr.h>
  28#include <linux/refcount.h>
  29#include <linux/part_stat.h>
  30#include <linux/blk-crypto.h>
  31#include <linux/keyslot-manager.h>
  32
  33#define DM_MSG_PREFIX "core"
  34
  35/*
  36 * Cookies are numeric values sent with CHANGE and REMOVE
  37 * uevents while resuming, removing or renaming the device.
  38 */
  39#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  40#define DM_COOKIE_LENGTH 24
  41
  42static const char *_name = DM_NAME;
  43
  44static unsigned int major = 0;
  45static unsigned int _major = 0;
  46
  47static DEFINE_IDR(_minor_idr);
  48
  49static DEFINE_SPINLOCK(_minor_lock);
  50
  51static void do_deferred_remove(struct work_struct *w);
  52
  53static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  54
  55static struct workqueue_struct *deferred_remove_workqueue;
  56
  57atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  58DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  59
  60void dm_issue_global_event(void)
  61{
  62        atomic_inc(&dm_global_event_nr);
  63        wake_up(&dm_global_eventq);
  64}
  65
  66/*
  67 * One of these is allocated (on-stack) per original bio.
  68 */
  69struct clone_info {
  70        struct dm_table *map;
  71        struct bio *bio;
  72        struct dm_io *io;
  73        sector_t sector;
  74        unsigned sector_count;
  75};
  76
  77#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
  78#define DM_IO_BIO_OFFSET \
  79        (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
  80
  81void *dm_per_bio_data(struct bio *bio, size_t data_size)
  82{
  83        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
  84        if (!tio->inside_dm_io)
  85                return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
  86        return (char *)bio - DM_IO_BIO_OFFSET - data_size;
  87}
  88EXPORT_SYMBOL_GPL(dm_per_bio_data);
  89
  90struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
  91{
  92        struct dm_io *io = (struct dm_io *)((char *)data + data_size);
  93        if (io->magic == DM_IO_MAGIC)
  94                return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
  95        BUG_ON(io->magic != DM_TIO_MAGIC);
  96        return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
  97}
  98EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
  99
 100unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 101{
 102        return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 103}
 104EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 105
 106#define MINOR_ALLOCED ((void *)-1)
 107
 108#define DM_NUMA_NODE NUMA_NO_NODE
 109static int dm_numa_node = DM_NUMA_NODE;
 110
 111#define DEFAULT_SWAP_BIOS       (8 * 1048576 / PAGE_SIZE)
 112static int swap_bios = DEFAULT_SWAP_BIOS;
 113static int get_swap_bios(void)
 114{
 115        int latch = READ_ONCE(swap_bios);
 116        if (unlikely(latch <= 0))
 117                latch = DEFAULT_SWAP_BIOS;
 118        return latch;
 119}
 120
 121/*
 122 * For mempools pre-allocation at the table loading time.
 123 */
 124struct dm_md_mempools {
 125        struct bio_set bs;
 126        struct bio_set io_bs;
 127};
 128
 129struct table_device {
 130        struct list_head list;
 131        refcount_t count;
 132        struct dm_dev dm_dev;
 133};
 134
 135/*
 136 * Bio-based DM's mempools' reserved IOs set by the user.
 137 */
 138#define RESERVED_BIO_BASED_IOS          16
 139static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 140
 141static int __dm_get_module_param_int(int *module_param, int min, int max)
 142{
 143        int param = READ_ONCE(*module_param);
 144        int modified_param = 0;
 145        bool modified = true;
 146
 147        if (param < min)
 148                modified_param = min;
 149        else if (param > max)
 150                modified_param = max;
 151        else
 152                modified = false;
 153
 154        if (modified) {
 155                (void)cmpxchg(module_param, param, modified_param);
 156                param = modified_param;
 157        }
 158
 159        return param;
 160}
 161
 162unsigned __dm_get_module_param(unsigned *module_param,
 163                               unsigned def, unsigned max)
 164{
 165        unsigned param = READ_ONCE(*module_param);
 166        unsigned modified_param = 0;
 167
 168        if (!param)
 169                modified_param = def;
 170        else if (param > max)
 171                modified_param = max;
 172
 173        if (modified_param) {
 174                (void)cmpxchg(module_param, param, modified_param);
 175                param = modified_param;
 176        }
 177
 178        return param;
 179}
 180
 181unsigned dm_get_reserved_bio_based_ios(void)
 182{
 183        return __dm_get_module_param(&reserved_bio_based_ios,
 184                                     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 185}
 186EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 187
 188static unsigned dm_get_numa_node(void)
 189{
 190        return __dm_get_module_param_int(&dm_numa_node,
 191                                         DM_NUMA_NODE, num_online_nodes() - 1);
 192}
 193
 194static int __init local_init(void)
 195{
 196        int r;
 197
 198        r = dm_uevent_init();
 199        if (r)
 200                return r;
 201
 202        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 203        if (!deferred_remove_workqueue) {
 204                r = -ENOMEM;
 205                goto out_uevent_exit;
 206        }
 207
 208        _major = major;
 209        r = register_blkdev(_major, _name);
 210        if (r < 0)
 211                goto out_free_workqueue;
 212
 213        if (!_major)
 214                _major = r;
 215
 216        return 0;
 217
 218out_free_workqueue:
 219        destroy_workqueue(deferred_remove_workqueue);
 220out_uevent_exit:
 221        dm_uevent_exit();
 222
 223        return r;
 224}
 225
 226static void local_exit(void)
 227{
 228        flush_scheduled_work();
 229        destroy_workqueue(deferred_remove_workqueue);
 230
 231        unregister_blkdev(_major, _name);
 232        dm_uevent_exit();
 233
 234        _major = 0;
 235
 236        DMINFO("cleaned up");
 237}
 238
 239static int (*_inits[])(void) __initdata = {
 240        local_init,
 241        dm_target_init,
 242        dm_linear_init,
 243        dm_stripe_init,
 244        dm_io_init,
 245        dm_kcopyd_init,
 246        dm_interface_init,
 247        dm_statistics_init,
 248};
 249
 250static void (*_exits[])(void) = {
 251        local_exit,
 252        dm_target_exit,
 253        dm_linear_exit,
 254        dm_stripe_exit,
 255        dm_io_exit,
 256        dm_kcopyd_exit,
 257        dm_interface_exit,
 258        dm_statistics_exit,
 259};
 260
 261static int __init dm_init(void)
 262{
 263        const int count = ARRAY_SIZE(_inits);
 264
 265        int r, i;
 266
 267        for (i = 0; i < count; i++) {
 268                r = _inits[i]();
 269                if (r)
 270                        goto bad;
 271        }
 272
 273        return 0;
 274
 275      bad:
 276        while (i--)
 277                _exits[i]();
 278
 279        return r;
 280}
 281
 282static void __exit dm_exit(void)
 283{
 284        int i = ARRAY_SIZE(_exits);
 285
 286        while (i--)
 287                _exits[i]();
 288
 289        /*
 290         * Should be empty by this point.
 291         */
 292        idr_destroy(&_minor_idr);
 293}
 294
 295/*
 296 * Block device functions
 297 */
 298int dm_deleting_md(struct mapped_device *md)
 299{
 300        return test_bit(DMF_DELETING, &md->flags);
 301}
 302
 303static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 304{
 305        struct mapped_device *md;
 306
 307        spin_lock(&_minor_lock);
 308
 309        md = bdev->bd_disk->private_data;
 310        if (!md)
 311                goto out;
 312
 313        if (test_bit(DMF_FREEING, &md->flags) ||
 314            dm_deleting_md(md)) {
 315                md = NULL;
 316                goto out;
 317        }
 318
 319        dm_get(md);
 320        atomic_inc(&md->open_count);
 321out:
 322        spin_unlock(&_minor_lock);
 323
 324        return md ? 0 : -ENXIO;
 325}
 326
 327static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 328{
 329        struct mapped_device *md;
 330
 331        spin_lock(&_minor_lock);
 332
 333        md = disk->private_data;
 334        if (WARN_ON(!md))
 335                goto out;
 336
 337        if (atomic_dec_and_test(&md->open_count) &&
 338            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 339                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 340
 341        dm_put(md);
 342out:
 343        spin_unlock(&_minor_lock);
 344}
 345
 346int dm_open_count(struct mapped_device *md)
 347{
 348        return atomic_read(&md->open_count);
 349}
 350
 351/*
 352 * Guarantees nothing is using the device before it's deleted.
 353 */
 354int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 355{
 356        int r = 0;
 357
 358        spin_lock(&_minor_lock);
 359
 360        if (dm_open_count(md)) {
 361                r = -EBUSY;
 362                if (mark_deferred)
 363                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 364        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 365                r = -EEXIST;
 366        else
 367                set_bit(DMF_DELETING, &md->flags);
 368
 369        spin_unlock(&_minor_lock);
 370
 371        return r;
 372}
 373
 374int dm_cancel_deferred_remove(struct mapped_device *md)
 375{
 376        int r = 0;
 377
 378        spin_lock(&_minor_lock);
 379
 380        if (test_bit(DMF_DELETING, &md->flags))
 381                r = -EBUSY;
 382        else
 383                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 384
 385        spin_unlock(&_minor_lock);
 386
 387        return r;
 388}
 389
 390static void do_deferred_remove(struct work_struct *w)
 391{
 392        dm_deferred_remove();
 393}
 394
 395static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 396{
 397        struct mapped_device *md = bdev->bd_disk->private_data;
 398
 399        return dm_get_geometry(md, geo);
 400}
 401
 402static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 403                            struct block_device **bdev)
 404{
 405        struct dm_target *tgt;
 406        struct dm_table *map;
 407        int r;
 408
 409retry:
 410        r = -ENOTTY;
 411        map = dm_get_live_table(md, srcu_idx);
 412        if (!map || !dm_table_get_size(map))
 413                return r;
 414
 415        /* We only support devices that have a single target */
 416        if (dm_table_get_num_targets(map) != 1)
 417                return r;
 418
 419        tgt = dm_table_get_target(map, 0);
 420        if (!tgt->type->prepare_ioctl)
 421                return r;
 422
 423        if (dm_suspended_md(md))
 424                return -EAGAIN;
 425
 426        r = tgt->type->prepare_ioctl(tgt, bdev);
 427        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 428                dm_put_live_table(md, *srcu_idx);
 429                msleep(10);
 430                goto retry;
 431        }
 432
 433        return r;
 434}
 435
 436static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 437{
 438        dm_put_live_table(md, srcu_idx);
 439}
 440
 441static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 442                        unsigned int cmd, unsigned long arg)
 443{
 444        struct mapped_device *md = bdev->bd_disk->private_data;
 445        int r, srcu_idx;
 446
 447        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 448        if (r < 0)
 449                goto out;
 450
 451        if (r > 0) {
 452                /*
 453                 * Target determined this ioctl is being issued against a
 454                 * subset of the parent bdev; require extra privileges.
 455                 */
 456                if (!capable(CAP_SYS_RAWIO)) {
 457                        DMDEBUG_LIMIT(
 458        "%s: sending ioctl %x to DM device without required privilege.",
 459                                current->comm, cmd);
 460                        r = -ENOIOCTLCMD;
 461                        goto out;
 462                }
 463        }
 464
 465        if (!bdev->bd_disk->fops->ioctl)
 466                r = -ENOTTY;
 467        else
 468                r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
 469out:
 470        dm_unprepare_ioctl(md, srcu_idx);
 471        return r;
 472}
 473
 474u64 dm_start_time_ns_from_clone(struct bio *bio)
 475{
 476        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 477        struct dm_io *io = tio->io;
 478
 479        return jiffies_to_nsecs(io->start_time);
 480}
 481EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 482
 483static void start_io_acct(struct dm_io *io)
 484{
 485        struct mapped_device *md = io->md;
 486        struct bio *bio = io->orig_bio;
 487
 488        io->start_time = bio_start_io_acct(bio);
 489        if (unlikely(dm_stats_used(&md->stats)))
 490                dm_stats_account_io(&md->stats, bio_data_dir(bio),
 491                                    bio->bi_iter.bi_sector, bio_sectors(bio),
 492                                    false, 0, &io->stats_aux);
 493}
 494
 495static void end_io_acct(struct dm_io *io)
 496{
 497        struct mapped_device *md = io->md;
 498        struct bio *bio = io->orig_bio;
 499        unsigned long duration = jiffies - io->start_time;
 500
 501        bio_end_io_acct(bio, io->start_time);
 502
 503        if (unlikely(dm_stats_used(&md->stats)))
 504                dm_stats_account_io(&md->stats, bio_data_dir(bio),
 505                                    bio->bi_iter.bi_sector, bio_sectors(bio),
 506                                    true, duration, &io->stats_aux);
 507
 508        /* nudge anyone waiting on suspend queue */
 509        if (unlikely(wq_has_sleeper(&md->wait)))
 510                wake_up(&md->wait);
 511}
 512
 513static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 514{
 515        struct dm_io *io;
 516        struct dm_target_io *tio;
 517        struct bio *clone;
 518
 519        clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 520        if (!clone)
 521                return NULL;
 522
 523        tio = container_of(clone, struct dm_target_io, clone);
 524        tio->inside_dm_io = true;
 525        tio->io = NULL;
 526
 527        io = container_of(tio, struct dm_io, tio);
 528        io->magic = DM_IO_MAGIC;
 529        io->status = 0;
 530        atomic_set(&io->io_count, 1);
 531        io->orig_bio = bio;
 532        io->md = md;
 533        spin_lock_init(&io->endio_lock);
 534
 535        start_io_acct(io);
 536
 537        return io;
 538}
 539
 540static void free_io(struct mapped_device *md, struct dm_io *io)
 541{
 542        bio_put(&io->tio.clone);
 543}
 544
 545static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 546                                      unsigned target_bio_nr, gfp_t gfp_mask)
 547{
 548        struct dm_target_io *tio;
 549
 550        if (!ci->io->tio.io) {
 551                /* the dm_target_io embedded in ci->io is available */
 552                tio = &ci->io->tio;
 553        } else {
 554                struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 555                if (!clone)
 556                        return NULL;
 557
 558                tio = container_of(clone, struct dm_target_io, clone);
 559                tio->inside_dm_io = false;
 560        }
 561
 562        tio->magic = DM_TIO_MAGIC;
 563        tio->io = ci->io;
 564        tio->ti = ti;
 565        tio->target_bio_nr = target_bio_nr;
 566
 567        return tio;
 568}
 569
 570static void free_tio(struct dm_target_io *tio)
 571{
 572        if (tio->inside_dm_io)
 573                return;
 574        bio_put(&tio->clone);
 575}
 576
 577/*
 578 * Add the bio to the list of deferred io.
 579 */
 580static void queue_io(struct mapped_device *md, struct bio *bio)
 581{
 582        unsigned long flags;
 583
 584        spin_lock_irqsave(&md->deferred_lock, flags);
 585        bio_list_add(&md->deferred, bio);
 586        spin_unlock_irqrestore(&md->deferred_lock, flags);
 587        queue_work(md->wq, &md->work);
 588}
 589
 590/*
 591 * Everyone (including functions in this file), should use this
 592 * function to access the md->map field, and make sure they call
 593 * dm_put_live_table() when finished.
 594 */
 595struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 596{
 597        *srcu_idx = srcu_read_lock(&md->io_barrier);
 598
 599        return srcu_dereference(md->map, &md->io_barrier);
 600}
 601
 602void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 603{
 604        srcu_read_unlock(&md->io_barrier, srcu_idx);
 605}
 606
 607void dm_sync_table(struct mapped_device *md)
 608{
 609        synchronize_srcu(&md->io_barrier);
 610        synchronize_rcu_expedited();
 611}
 612
 613/*
 614 * A fast alternative to dm_get_live_table/dm_put_live_table.
 615 * The caller must not block between these two functions.
 616 */
 617static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 618{
 619        rcu_read_lock();
 620        return rcu_dereference(md->map);
 621}
 622
 623static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 624{
 625        rcu_read_unlock();
 626}
 627
 628static char *_dm_claim_ptr = "I belong to device-mapper";
 629
 630/*
 631 * Open a table device so we can use it as a map destination.
 632 */
 633static int open_table_device(struct table_device *td, dev_t dev,
 634                             struct mapped_device *md)
 635{
 636        struct block_device *bdev;
 637
 638        int r;
 639
 640        BUG_ON(td->dm_dev.bdev);
 641
 642        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 643        if (IS_ERR(bdev))
 644                return PTR_ERR(bdev);
 645
 646        r = bd_link_disk_holder(bdev, dm_disk(md));
 647        if (r) {
 648                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 649                return r;
 650        }
 651
 652        td->dm_dev.bdev = bdev;
 653        td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 654        return 0;
 655}
 656
 657/*
 658 * Close a table device that we've been using.
 659 */
 660static void close_table_device(struct table_device *td, struct mapped_device *md)
 661{
 662        if (!td->dm_dev.bdev)
 663                return;
 664
 665        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 666        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 667        put_dax(td->dm_dev.dax_dev);
 668        td->dm_dev.bdev = NULL;
 669        td->dm_dev.dax_dev = NULL;
 670}
 671
 672static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 673                                              fmode_t mode)
 674{
 675        struct table_device *td;
 676
 677        list_for_each_entry(td, l, list)
 678                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 679                        return td;
 680
 681        return NULL;
 682}
 683
 684int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 685                        struct dm_dev **result)
 686{
 687        int r;
 688        struct table_device *td;
 689
 690        mutex_lock(&md->table_devices_lock);
 691        td = find_table_device(&md->table_devices, dev, mode);
 692        if (!td) {
 693                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 694                if (!td) {
 695                        mutex_unlock(&md->table_devices_lock);
 696                        return -ENOMEM;
 697                }
 698
 699                td->dm_dev.mode = mode;
 700                td->dm_dev.bdev = NULL;
 701
 702                if ((r = open_table_device(td, dev, md))) {
 703                        mutex_unlock(&md->table_devices_lock);
 704                        kfree(td);
 705                        return r;
 706                }
 707
 708                format_dev_t(td->dm_dev.name, dev);
 709
 710                refcount_set(&td->count, 1);
 711                list_add(&td->list, &md->table_devices);
 712        } else {
 713                refcount_inc(&td->count);
 714        }
 715        mutex_unlock(&md->table_devices_lock);
 716
 717        *result = &td->dm_dev;
 718        return 0;
 719}
 720
 721void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 722{
 723        struct table_device *td = container_of(d, struct table_device, dm_dev);
 724
 725        mutex_lock(&md->table_devices_lock);
 726        if (refcount_dec_and_test(&td->count)) {
 727                close_table_device(td, md);
 728                list_del(&td->list);
 729                kfree(td);
 730        }
 731        mutex_unlock(&md->table_devices_lock);
 732}
 733
 734static void free_table_devices(struct list_head *devices)
 735{
 736        struct list_head *tmp, *next;
 737
 738        list_for_each_safe(tmp, next, devices) {
 739                struct table_device *td = list_entry(tmp, struct table_device, list);
 740
 741                DMWARN("dm_destroy: %s still exists with %d references",
 742                       td->dm_dev.name, refcount_read(&td->count));
 743                kfree(td);
 744        }
 745}
 746
 747/*
 748 * Get the geometry associated with a dm device
 749 */
 750int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 751{
 752        *geo = md->geometry;
 753
 754        return 0;
 755}
 756
 757/*
 758 * Set the geometry of a device.
 759 */
 760int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 761{
 762        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 763
 764        if (geo->start > sz) {
 765                DMWARN("Start sector is beyond the geometry limits.");
 766                return -EINVAL;
 767        }
 768
 769        md->geometry = *geo;
 770
 771        return 0;
 772}
 773
 774static int __noflush_suspending(struct mapped_device *md)
 775{
 776        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 777}
 778
 779/*
 780 * Decrements the number of outstanding ios that a bio has been
 781 * cloned into, completing the original io if necc.
 782 */
 783void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
 784{
 785        unsigned long flags;
 786        blk_status_t io_error;
 787        struct bio *bio;
 788        struct mapped_device *md = io->md;
 789
 790        /* Push-back supersedes any I/O errors */
 791        if (unlikely(error)) {
 792                spin_lock_irqsave(&io->endio_lock, flags);
 793                if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 794                        io->status = error;
 795                spin_unlock_irqrestore(&io->endio_lock, flags);
 796        }
 797
 798        if (atomic_dec_and_test(&io->io_count)) {
 799                bio = io->orig_bio;
 800                if (io->status == BLK_STS_DM_REQUEUE) {
 801                        /*
 802                         * Target requested pushing back the I/O.
 803                         */
 804                        spin_lock_irqsave(&md->deferred_lock, flags);
 805                        if (__noflush_suspending(md) &&
 806                            !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
 807                                /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 808                                bio_list_add_head(&md->deferred, bio);
 809                        } else {
 810                                /*
 811                                 * noflush suspend was interrupted or this is
 812                                 * a write to a zoned target.
 813                                 */
 814                                io->status = BLK_STS_IOERR;
 815                        }
 816                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 817                }
 818
 819                io_error = io->status;
 820                end_io_acct(io);
 821                free_io(md, io);
 822
 823                if (io_error == BLK_STS_DM_REQUEUE)
 824                        return;
 825
 826                if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 827                        /*
 828                         * Preflush done for flush with data, reissue
 829                         * without REQ_PREFLUSH.
 830                         */
 831                        bio->bi_opf &= ~REQ_PREFLUSH;
 832                        queue_io(md, bio);
 833                } else {
 834                        /* done with normal IO or empty flush */
 835                        if (io_error)
 836                                bio->bi_status = io_error;
 837                        bio_endio(bio);
 838                }
 839        }
 840}
 841
 842void disable_discard(struct mapped_device *md)
 843{
 844        struct queue_limits *limits = dm_get_queue_limits(md);
 845
 846        /* device doesn't really support DISCARD, disable it */
 847        limits->max_discard_sectors = 0;
 848        blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 849}
 850
 851void disable_write_same(struct mapped_device *md)
 852{
 853        struct queue_limits *limits = dm_get_queue_limits(md);
 854
 855        /* device doesn't really support WRITE SAME, disable it */
 856        limits->max_write_same_sectors = 0;
 857}
 858
 859void disable_write_zeroes(struct mapped_device *md)
 860{
 861        struct queue_limits *limits = dm_get_queue_limits(md);
 862
 863        /* device doesn't really support WRITE ZEROES, disable it */
 864        limits->max_write_zeroes_sectors = 0;
 865}
 866
 867static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
 868{
 869        return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
 870}
 871
 872static void clone_endio(struct bio *bio)
 873{
 874        blk_status_t error = bio->bi_status;
 875        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 876        struct dm_io *io = tio->io;
 877        struct mapped_device *md = tio->io->md;
 878        dm_endio_fn endio = tio->ti->type->end_io;
 879        struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 880
 881        if (unlikely(error == BLK_STS_TARGET)) {
 882                if (bio_op(bio) == REQ_OP_DISCARD &&
 883                    !q->limits.max_discard_sectors)
 884                        disable_discard(md);
 885                else if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 886                         !q->limits.max_write_same_sectors)
 887                        disable_write_same(md);
 888                else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 889                         !q->limits.max_write_zeroes_sectors)
 890                        disable_write_zeroes(md);
 891        }
 892
 893        if (blk_queue_is_zoned(q))
 894                dm_zone_endio(io, bio);
 895
 896        if (endio) {
 897                int r = endio(tio->ti, bio, &error);
 898                switch (r) {
 899                case DM_ENDIO_REQUEUE:
 900                        /*
 901                         * Requeuing writes to a sequential zone of a zoned
 902                         * target will break the sequential write pattern:
 903                         * fail such IO.
 904                         */
 905                        if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
 906                                error = BLK_STS_IOERR;
 907                        else
 908                                error = BLK_STS_DM_REQUEUE;
 909                        fallthrough;
 910                case DM_ENDIO_DONE:
 911                        break;
 912                case DM_ENDIO_INCOMPLETE:
 913                        /* The target will handle the io */
 914                        return;
 915                default:
 916                        DMWARN("unimplemented target endio return value: %d", r);
 917                        BUG();
 918                }
 919        }
 920
 921        if (unlikely(swap_bios_limit(tio->ti, bio))) {
 922                struct mapped_device *md = io->md;
 923                up(&md->swap_bios_semaphore);
 924        }
 925
 926        free_tio(tio);
 927        dm_io_dec_pending(io, error);
 928}
 929
 930/*
 931 * Return maximum size of I/O possible at the supplied sector up to the current
 932 * target boundary.
 933 */
 934static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
 935                                                  sector_t target_offset)
 936{
 937        return ti->len - target_offset;
 938}
 939
 940static sector_t max_io_len(struct dm_target *ti, sector_t sector)
 941{
 942        sector_t target_offset = dm_target_offset(ti, sector);
 943        sector_t len = max_io_len_target_boundary(ti, target_offset);
 944        sector_t max_len;
 945
 946        /*
 947         * Does the target need to split IO even further?
 948         * - varied (per target) IO splitting is a tenet of DM; this
 949         *   explains why stacked chunk_sectors based splitting via
 950         *   blk_max_size_offset() isn't possible here. So pass in
 951         *   ti->max_io_len to override stacked chunk_sectors.
 952         */
 953        if (ti->max_io_len) {
 954                max_len = blk_max_size_offset(ti->table->md->queue,
 955                                              target_offset, ti->max_io_len);
 956                if (len > max_len)
 957                        len = max_len;
 958        }
 959
 960        return len;
 961}
 962
 963int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
 964{
 965        if (len > UINT_MAX) {
 966                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
 967                      (unsigned long long)len, UINT_MAX);
 968                ti->error = "Maximum size of target IO is too large";
 969                return -EINVAL;
 970        }
 971
 972        ti->max_io_len = (uint32_t) len;
 973
 974        return 0;
 975}
 976EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
 977
 978static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
 979                                                sector_t sector, int *srcu_idx)
 980        __acquires(md->io_barrier)
 981{
 982        struct dm_table *map;
 983        struct dm_target *ti;
 984
 985        map = dm_get_live_table(md, srcu_idx);
 986        if (!map)
 987                return NULL;
 988
 989        ti = dm_table_find_target(map, sector);
 990        if (!ti)
 991                return NULL;
 992
 993        return ti;
 994}
 995
 996static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
 997                                 long nr_pages, void **kaddr, pfn_t *pfn)
 998{
 999        struct mapped_device *md = dax_get_private(dax_dev);
1000        sector_t sector = pgoff * PAGE_SECTORS;

1001        struct dm_target *ti;
1002        long len, ret = -EIO;
1003        int srcu_idx;
1004
1005        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1006
1007        if (!ti)
1008                goto out;
1009        if (!ti->type->direct_access)
1010                goto out;
1011        len = max_io_len(ti, sector) / PAGE_SECTORS;
1012        if (len < 1)
1013                goto out;
1014        nr_pages = min(len, nr_pages);
1015        ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1016
1017 out:
1018        dm_put_live_table(md, srcu_idx);
1019
1020        return ret;
1021}
1022
1023static bool dm_dax_supported(struct dax_device *dax_dev, struct block_device *bdev,
1024                int blocksize, sector_t start, sector_t len)
1025{
1026        struct mapped_device *md = dax_get_private(dax_dev);
1027        struct dm_table *map;
1028        bool ret = false;
1029        int srcu_idx;
1030
1031        map = dm_get_live_table(md, &srcu_idx);
1032        if (!map)
1033                goto out;
1034
1035        ret = dm_table_supports_dax(map, device_not_dax_capable, &blocksize);
1036
1037out:
1038        dm_put_live_table(md, srcu_idx);
1039
1040        return ret;
1041}
1042
1043static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1044                                    void *addr, size_t bytes, struct iov_iter *i)
1045{
1046        struct mapped_device *md = dax_get_private(dax_dev);
1047        sector_t sector = pgoff * PAGE_SECTORS;
1048        struct dm_target *ti;
1049        long ret = 0;
1050        int srcu_idx;
1051
1052        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1053
1054        if (!ti)
1055                goto out;
1056        if (!ti->type->dax_copy_from_iter) {
1057                ret = copy_from_iter(addr, bytes, i);
1058                goto out;
1059        }
1060        ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1061 out:
1062        dm_put_live_table(md, srcu_idx);
1063
1064        return ret;
1065}
1066
1067static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1068                void *addr, size_t bytes, struct iov_iter *i)
1069{
1070        struct mapped_device *md = dax_get_private(dax_dev);
1071        sector_t sector = pgoff * PAGE_SECTORS;
1072        struct dm_target *ti;
1073        long ret = 0;
1074        int srcu_idx;
1075
1076        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1077
1078        if (!ti)
1079                goto out;
1080        if (!ti->type->dax_copy_to_iter) {
1081                ret = copy_to_iter(addr, bytes, i);
1082                goto out;
1083        }
1084        ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1085 out:
1086        dm_put_live_table(md, srcu_idx);
1087
1088        return ret;
1089}
1090
1091static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1092                                  size_t nr_pages)
1093{
1094        struct mapped_device *md = dax_get_private(dax_dev);
1095        sector_t sector = pgoff * PAGE_SECTORS;
1096        struct dm_target *ti;
1097        int ret = -EIO;
1098        int srcu_idx;
1099
1100        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1101
1102        if (!ti)
1103                goto out;
1104        if (WARN_ON(!ti->type->dax_zero_page_range)) {
1105                /*
1106                 * ->zero_page_range() is mandatory dax operation. If we are
1107                 *  here, something is wrong.
1108                 */
1109                goto out;
1110        }
1111        ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1112 out:
1113        dm_put_live_table(md, srcu_idx);
1114
1115        return ret;
1116}
1117
1118/*
1119 * A target may call dm_accept_partial_bio only from the map routine.  It is
1120 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1121 * operations and REQ_OP_ZONE_APPEND (zone append writes).
1122 *
1123 * dm_accept_partial_bio informs the dm that the target only wants to process
1124 * additional n_sectors sectors of the bio and the rest of the data should be
1125 * sent in a next bio.
1126 *
1127 * A diagram that explains the arithmetics:
1128 * +--------------------+---------------+-------+
1129 * |         1          |       2       |   3   |
1130 * +--------------------+---------------+-------+
1131 *
1132 * <-------------- *tio->len_ptr --------------->
1133 *                      <------- bi_size ------->
1134 *                      <-- n_sectors -->
1135 *
1136 * Region 1 was already iterated over with bio_advance or similar function.
1137 *      (it may be empty if the target doesn't use bio_advance)
1138 * Region 2 is the remaining bio size that the target wants to process.
1139 *      (it may be empty if region 1 is non-empty, although there is no reason
1140 *       to make it empty)
1141 * The target requires that region 3 is to be sent in the next bio.
1142 *
1143 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1144 * the partially processed part (the sum of regions 1+2) must be the same for all
1145 * copies of the bio.
1146 */
1147void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1148{
1149        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1150        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1151
1152        BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1153        BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1154        BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1155        BUG_ON(bi_size > *tio->len_ptr);
1156        BUG_ON(n_sectors > bi_size);
1157
1158        *tio->len_ptr -= bi_size - n_sectors;
1159        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1160}
1161EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
1162
1163static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1164{
1165        mutex_lock(&md->swap_bios_lock);
1166        while (latch < md->swap_bios) {
1167                cond_resched();
1168                down(&md->swap_bios_semaphore);
1169                md->swap_bios--;
1170        }
1171        while (latch > md->swap_bios) {
1172                cond_resched();
1173                up(&md->swap_bios_semaphore);
1174                md->swap_bios++;
1175        }
1176        mutex_unlock(&md->swap_bios_lock);
1177}
1178
1179static blk_qc_t __map_bio(struct dm_target_io *tio)
1180{
1181        int r;
1182        sector_t sector;
1183        struct bio *clone = &tio->clone;
1184        struct dm_io *io = tio->io;
1185        struct dm_target *ti = tio->ti;
1186        blk_qc_t ret = BLK_QC_T_NONE;
1187
1188        clone->bi_end_io = clone_endio;
1189
1190        /*
1191         * Map the clone.  If r == 0 we don't need to do
1192         * anything, the target has assumed ownership of
1193         * this io.
1194         */
1195        dm_io_inc_pending(io);
1196        sector = clone->bi_iter.bi_sector;
1197
1198        if (unlikely(swap_bios_limit(ti, clone))) {
1199                struct mapped_device *md = io->md;
1200                int latch = get_swap_bios();
1201                if (unlikely(latch != md->swap_bios))
1202                        __set_swap_bios_limit(md, latch);
1203                down(&md->swap_bios_semaphore);
1204        }
1205
1206        /*
1207         * Check if the IO needs a special mapping due to zone append emulation
1208         * on zoned target. In this case, dm_zone_map_bio() calls the target
1209         * map operation.
1210         */
1211        if (dm_emulate_zone_append(io->md))
1212                r = dm_zone_map_bio(tio);
1213        else
1214                r = ti->type->map(ti, clone);
1215
1216        switch (r) {
1217        case DM_MAPIO_SUBMITTED:
1218                break;
1219        case DM_MAPIO_REMAPPED:
1220                /* the bio has been remapped so dispatch it */
1221                trace_block_bio_remap(clone, bio_dev(io->orig_bio), sector);
1222                ret = submit_bio_noacct(clone);
1223                break;
1224        case DM_MAPIO_KILL:
1225                if (unlikely(swap_bios_limit(ti, clone))) {
1226                        struct mapped_device *md = io->md;
1227                        up(&md->swap_bios_semaphore);
1228                }
1229                free_tio(tio);
1230                dm_io_dec_pending(io, BLK_STS_IOERR);
1231                break;
1232        case DM_MAPIO_REQUEUE:
1233                if (unlikely(swap_bios_limit(ti, clone))) {
1234                        struct mapped_device *md = io->md;
1235                        up(&md->swap_bios_semaphore);
1236                }
1237                free_tio(tio);
1238                dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
1239                break;
1240        default:
1241                DMWARN("unimplemented target map return value: %d", r);
1242                BUG();
1243        }
1244
1245        return ret;
1246}
1247
1248static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1249{
1250        bio->bi_iter.bi_sector = sector;
1251        bio->bi_iter.bi_size = to_bytes(len);
1252}
1253
1254/*
1255 * Creates a bio that consists of range of complete bvecs.
1256 */
1257static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1258                     sector_t sector, unsigned len)
1259{
1260        struct bio *clone = &tio->clone;
1261        int r;
1262
1263        __bio_clone_fast(clone, bio);
1264
1265        r = bio_crypt_clone(clone, bio, GFP_NOIO);
1266        if (r < 0)
1267                return r;
1268
1269        if (bio_integrity(bio)) {
1270                if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1271                             !dm_target_passes_integrity(tio->ti->type))) {
1272                        DMWARN("%s: the target %s doesn't support integrity data.",
1273                                dm_device_name(tio->io->md),
1274                                tio->ti->type->name);
1275                        return -EIO;
1276                }
1277
1278                r = bio_integrity_clone(clone, bio, GFP_NOIO);
1279                if (r < 0)
1280                        return r;
1281        }
1282
1283        bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1284        clone->bi_iter.bi_size = to_bytes(len);
1285
1286        if (bio_integrity(bio))
1287                bio_integrity_trim(clone);
1288
1289        return 0;
1290}
1291
1292static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1293                                struct dm_target *ti, unsigned num_bios)
1294{
1295        struct dm_target_io *tio;
1296        int try;
1297
1298        if (!num_bios)
1299                return;
1300
1301        if (num_bios == 1) {
1302                tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1303                bio_list_add(blist, &tio->clone);
1304                return;
1305        }
1306
1307        for (try = 0; try < 2; try++) {
1308                int bio_nr;
1309                struct bio *bio;
1310
1311                if (try)
1312                        mutex_lock(&ci->io->md->table_devices_lock);
1313                for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1314                        tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1315                        if (!tio)
1316                                break;
1317
1318                        bio_list_add(blist, &tio->clone);
1319                }
1320                if (try)
1321                        mutex_unlock(&ci->io->md->table_devices_lock);
1322                if (bio_nr == num_bios)
1323                        return;
1324
1325                while ((bio = bio_list_pop(blist))) {
1326                        tio = container_of(bio, struct dm_target_io, clone);
1327                        free_tio(tio);
1328                }
1329        }
1330}
1331
1332static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1333                                           struct dm_target_io *tio, unsigned *len)
1334{
1335        struct bio *clone = &tio->clone;
1336
1337        tio->len_ptr = len;
1338
1339        __bio_clone_fast(clone, ci->bio);
1340        if (len)
1341                bio_setup_sector(clone, ci->sector, *len);
1342
1343        return __map_bio(tio);
1344}
1345
1346static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1347                                  unsigned num_bios, unsigned *len)
1348{
1349        struct bio_list blist = BIO_EMPTY_LIST;
1350        struct bio *bio;
1351        struct dm_target_io *tio;
1352
1353        alloc_multiple_bios(&blist, ci, ti, num_bios);
1354
1355        while ((bio = bio_list_pop(&blist))) {
1356                tio = container_of(bio, struct dm_target_io, clone);
1357                (void) __clone_and_map_simple_bio(ci, tio, len);
1358        }
1359}
1360
1361static int __send_empty_flush(struct clone_info *ci)
1362{
1363        unsigned target_nr = 0;
1364        struct dm_target *ti;
1365        struct bio flush_bio;
1366
1367        /*
1368         * Use an on-stack bio for this, it's safe since we don't
1369         * need to reference it after submit. It's just used as
1370         * the basis for the clone(s).
1371         */
1372        bio_init(&flush_bio, NULL, 0);
1373        flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1374        bio_set_dev(&flush_bio, ci->io->md->disk->part0);
1375
1376        ci->bio = &flush_bio;
1377        ci->sector_count = 0;
1378
1379        BUG_ON(bio_has_data(ci->bio));
1380        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1381                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1382
1383        bio_uninit(ci->bio);
1384        return 0;
1385}
1386
1387static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1388                                    sector_t sector, unsigned *len)
1389{
1390        struct bio *bio = ci->bio;
1391        struct dm_target_io *tio;
1392        int r;
1393
1394        tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1395        tio->len_ptr = len;
1396        r = clone_bio(tio, bio, sector, *len);
1397        if (r < 0) {
1398                free_tio(tio);
1399                return r;
1400        }
1401        (void) __map_bio(tio);
1402
1403        return 0;
1404}
1405
1406static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1407                                       unsigned num_bios)
1408{
1409        unsigned len;
1410
1411        /*
1412         * Even though the device advertised support for this type of
1413         * request, that does not mean every target supports it, and
1414         * reconfiguration might also have changed that since the
1415         * check was performed.
1416         */
1417        if (!num_bios)
1418                return -EOPNOTSUPP;
1419
1420        len = min_t(sector_t, ci->sector_count,
1421                    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
1422
1423        __send_duplicate_bios(ci, ti, num_bios, &len);
1424
1425        ci->sector += len;
1426        ci->sector_count -= len;
1427
1428        return 0;
1429}
1430
1431static bool is_abnormal_io(struct bio *bio)
1432{
1433        bool r = false;
1434
1435        switch (bio_op(bio)) {
1436        case REQ_OP_DISCARD:
1437        case REQ_OP_SECURE_ERASE:
1438        case REQ_OP_WRITE_SAME:
1439        case REQ_OP_WRITE_ZEROES:
1440                r = true;
1441                break;
1442        }
1443
1444        return r;
1445}
1446
1447static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1448                                  int *result)
1449{
1450        struct bio *bio = ci->bio;
1451        unsigned num_bios = 0;
1452
1453        switch (bio_op(bio)) {
1454        case REQ_OP_DISCARD:
1455                num_bios = ti->num_discard_bios;
1456                break;
1457        case REQ_OP_SECURE_ERASE:
1458                num_bios = ti->num_secure_erase_bios;
1459                break;
1460        case REQ_OP_WRITE_SAME:
1461                num_bios = ti->num_write_same_bios;
1462                break;
1463        case REQ_OP_WRITE_ZEROES:
1464                num_bios = ti->num_write_zeroes_bios;
1465                break;
1466        default:
1467                return false;
1468        }
1469
1470        *result = __send_changing_extent_only(ci, ti, num_bios);
1471        return true;
1472}
1473
1474/*
1475 * Select the correct strategy for processing a non-flush bio.
1476 */
1477static int __split_and_process_non_flush(struct clone_info *ci)
1478{
1479        struct dm_target *ti;
1480        unsigned len;
1481        int r;
1482
1483        ti = dm_table_find_target(ci->map, ci->sector);
1484        if (!ti)
1485                return -EIO;
1486
1487        if (__process_abnormal_io(ci, ti, &r))
1488                return r;
1489
1490        len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1491
1492        r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1493        if (r < 0)
1494                return r;
1495
1496        ci->sector += len;
1497        ci->sector_count -= len;
1498
1499        return 0;
1500}
1501
1502static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1503                            struct dm_table *map, struct bio *bio)
1504{
1505        ci->map = map;
1506        ci->io = alloc_io(md, bio);
1507        ci->sector = bio->bi_iter.bi_sector;
1508}
1509
1510#define __dm_part_stat_sub(part, field, subnd)  \
1511        (part_stat_get(part, field) -= (subnd))
1512
1513/*
1514 * Entry point to split a bio into clones and submit them to the targets.
1515 */
1516static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1517                                        struct dm_table *map, struct bio *bio)
1518{
1519        struct clone_info ci;
1520        blk_qc_t ret = BLK_QC_T_NONE;
1521        int error = 0;
1522
1523        init_clone_info(&ci, md, map, bio);
1524
1525        if (bio->bi_opf & REQ_PREFLUSH) {
1526                error = __send_empty_flush(&ci);
1527                /* dm_io_dec_pending submits any data associated with flush */
1528        } else if (op_is_zone_mgmt(bio_op(bio))) {
1529                ci.bio = bio;
1530                ci.sector_count = 0;
1531                error = __split_and_process_non_flush(&ci);
1532        } else {
1533                ci.bio = bio;
1534                ci.sector_count = bio_sectors(bio);
1535                error = __split_and_process_non_flush(&ci);
1536                if (ci.sector_count && !error) {
1537                        /*
1538                         * Remainder must be passed to submit_bio_noacct()
1539                         * so that it gets handled *after* bios already submitted
1540                         * have been completely processed.
1541                         * We take a clone of the original to store in
1542                         * ci.io->orig_bio to be used by end_io_acct() and
1543                         * for dec_pending to use for completion handling.
1544                         */
1545                        struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1546                                                  GFP_NOIO, &md->queue->bio_split);
1547                        ci.io->orig_bio = b;
1548
1549                        /*
1550                         * Adjust IO stats for each split, otherwise upon queue
1551                         * reentry there will be redundant IO accounting.
1552                         * NOTE: this is a stop-gap fix, a proper fix involves
1553                         * significant refactoring of DM core's bio splitting
1554                         * (by eliminating DM's splitting and just using bio_split)
1555                         */
1556                        part_stat_lock();
1557                        __dm_part_stat_sub(dm_disk(md)->part0,
1558                                           sectors[op_stat_group(bio_op(bio))], ci.sector_count);
1559                        part_stat_unlock();
1560
1561                        bio_chain(b, bio);
1562                        trace_block_split(b, bio->bi_iter.bi_sector);
1563                        ret = submit_bio_noacct(bio);
1564                }
1565        }
1566
1567        /* drop the extra reference count */
1568        dm_io_dec_pending(ci.io, errno_to_blk_status(error));
1569        return ret;
1570}
1571
1572static blk_qc_t dm_submit_bio(struct bio *bio)
1573{
1574        struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
1575        blk_qc_t ret = BLK_QC_T_NONE;
1576        int srcu_idx;
1577        struct dm_table *map;
1578
1579        map = dm_get_live_table(md, &srcu_idx);
1580        if (unlikely(!map)) {
1581                DMERR_LIMIT("%s: mapping table unavailable, erroring io",
1582                            dm_device_name(md));
1583                bio_io_error(bio);
1584                goto out;
1585        }
1586
1587        /* If suspended, queue this IO for later */
1588        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1589                if (bio->bi_opf & REQ_NOWAIT)
1590                        bio_wouldblock_error(bio);
1591                else if (bio->bi_opf & REQ_RAHEAD)
1592                        bio_io_error(bio);
1593                else
1594                        queue_io(md, bio);
1595                goto out;
1596        }
1597
1598        /*
1599         * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1600         * otherwise associated queue_limits won't be imposed.
1601         */
1602        if (is_abnormal_io(bio))
1603                blk_queue_split(&bio);
1604
1605        ret = __split_and_process_bio(md, map, bio);
1606out:
1607        dm_put_live_table(md, srcu_idx);
1608        return ret;
1609}
1610
1611/*-----------------------------------------------------------------
1612 * An IDR is used to keep track of allocated minor numbers.
1613 *---------------------------------------------------------------*/
1614static void free_minor(int minor)
1615{
1616        spin_lock(&_minor_lock);
1617        idr_remove(&_minor_idr, minor);
1618        spin_unlock(&_minor_lock);
1619}
1620
1621/*
1622 * See if the device with a specific minor # is free.
1623 */
1624static int specific_minor(int minor)
1625{
1626        int r;
1627
1628        if (minor >= (1 << MINORBITS))
1629                return -EINVAL;
1630
1631        idr_preload(GFP_KERNEL);
1632        spin_lock(&_minor_lock);
1633
1634        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1635
1636        spin_unlock(&_minor_lock);
1637        idr_preload_end();
1638        if (r < 0)
1639                return r == -ENOSPC ? -EBUSY : r;
1640        return 0;
1641}
1642
1643static int next_free_minor(int *minor)
1644{
1645        int r;
1646
1647        idr_preload(GFP_KERNEL);
1648        spin_lock(&_minor_lock);
1649
1650        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1651
1652        spin_unlock(&_minor_lock);
1653        idr_preload_end();
1654        if (r < 0)
1655                return r;
1656        *minor = r;
1657        return 0;
1658}
1659
1660static const struct block_device_operations dm_blk_dops;
1661static const struct block_device_operations dm_rq_blk_dops;
1662static const struct dax_operations dm_dax_ops;
1663
1664static void dm_wq_work(struct work_struct *work);
1665
1666#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1667static void dm_queue_destroy_keyslot_manager(struct request_queue *q)
1668{
1669        dm_destroy_keyslot_manager(q->ksm);
1670}
1671
1672#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1673
1674static inline void dm_queue_destroy_keyslot_manager(struct request_queue *q)
1675{
1676}
1677#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1678
1679static void cleanup_mapped_device(struct mapped_device *md)
1680{
1681        if (md->wq)
1682                destroy_workqueue(md->wq);
1683        bioset_exit(&md->bs);
1684        bioset_exit(&md->io_bs);
1685
1686        if (md->dax_dev) {
1687                kill_dax(md->dax_dev);
1688                put_dax(md->dax_dev);
1689                md->dax_dev = NULL;
1690        }
1691
1692        if (md->disk) {
1693                spin_lock(&_minor_lock);
1694                md->disk->private_data = NULL;
1695                spin_unlock(&_minor_lock);
1696                del_gendisk(md->disk);
1697        }
1698
1699        if (md->queue)
1700                dm_queue_destroy_keyslot_manager(md->queue);
1701
1702        if (md->disk)
1703                blk_cleanup_disk(md->disk);
1704
1705        cleanup_srcu_struct(&md->io_barrier);
1706
1707        mutex_destroy(&md->suspend_lock);
1708        mutex_destroy(&md->type_lock);
1709        mutex_destroy(&md->table_devices_lock);
1710        mutex_destroy(&md->swap_bios_lock);
1711
1712        dm_mq_cleanup_mapped_device(md);
1713        dm_cleanup_zoned_dev(md);
1714}
1715
1716/*
1717 * Allocate and initialise a blank device with a given minor.
1718 */
1719static struct mapped_device *alloc_dev(int minor)
1720{
1721        int r, numa_node_id = dm_get_numa_node();
1722        struct mapped_device *md;
1723        void *old_md;
1724
1725        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1726        if (!md) {
1727                DMWARN("unable to allocate device, out of memory.");
1728                return NULL;
1729        }
1730
1731        if (!try_module_get(THIS_MODULE))
1732                goto bad_module_get;
1733
1734        /* get a minor number for the dev */
1735        if (minor == DM_ANY_MINOR)
1736                r = next_free_minor(&minor);
1737        else
1738                r = specific_minor(minor);
1739        if (r < 0)
1740                goto bad_minor;
1741
1742        r = init_srcu_struct(&md->io_barrier);
1743        if (r < 0)
1744                goto bad_io_barrier;
1745
1746        md->numa_node_id = numa_node_id;
1747        md->init_tio_pdu = false;
1748        md->type = DM_TYPE_NONE;
1749        mutex_init(&md->suspend_lock);
1750        mutex_init(&md->type_lock);
1751        mutex_init(&md->table_devices_lock);
1752        spin_lock_init(&md->deferred_lock);
1753        atomic_set(&md->holders, 1);
1754        atomic_set(&md->open_count, 0);
1755        atomic_set(&md->event_nr, 0);
1756        atomic_set(&md->uevent_seq, 0);
1757        INIT_LIST_HEAD(&md->uevent_list);
1758        INIT_LIST_HEAD(&md->table_devices);
1759        spin_lock_init(&md->uevent_lock);
1760
1761        /*
1762         * default to bio-based until DM table is loaded and md->type
1763         * established. If request-based table is loaded: blk-mq will
1764         * override accordingly.
1765         */
1766        md->disk = blk_alloc_disk(md->numa_node_id);
1767        if (!md->disk)
1768                goto bad;
1769        md->queue = md->disk->queue;
1770
1771        init_waitqueue_head(&md->wait);
1772        INIT_WORK(&md->work, dm_wq_work);
1773        init_waitqueue_head(&md->eventq);
1774        init_completion(&md->kobj_holder.completion);
1775
1776        md->swap_bios = get_swap_bios();
1777        sema_init(&md->swap_bios_semaphore, md->swap_bios);
1778        mutex_init(&md->swap_bios_lock);
1779
1780        md->disk->major = _major;
1781        md->disk->first_minor = minor;
1782        md->disk->minors = 1;
1783        md->disk->fops = &dm_blk_dops;
1784        md->disk->queue = md->queue;
1785        md->disk->private_data = md;
1786        sprintf(md->disk->disk_name, "dm-%d", minor);
1787
1788        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1789                md->dax_dev = alloc_dax(md, md->disk->disk_name,
1790                                        &dm_dax_ops, 0);
1791                if (IS_ERR(md->dax_dev))
1792                        goto bad;
1793        }
1794
1795        add_disk_no_queue_reg(md->disk);
1796        format_dev_t(md->name, MKDEV(_major, minor));
1797
1798        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1799        if (!md->wq)
1800                goto bad;
1801
1802        dm_stats_init(&md->stats);
1803
1804        /* Populate the mapping, nobody knows we exist yet */
1805        spin_lock(&_minor_lock);
1806        old_md = idr_replace(&_minor_idr, md, minor);
1807        spin_unlock(&_minor_lock);
1808
1809        BUG_ON(old_md != MINOR_ALLOCED);
1810
1811        return md;
1812
1813bad:
1814        cleanup_mapped_device(md);
1815bad_io_barrier:
1816        free_minor(minor);
1817bad_minor:
1818        module_put(THIS_MODULE);
1819bad_module_get:
1820        kvfree(md);
1821        return NULL;
1822}
1823
1824static void unlock_fs(struct mapped_device *md);
1825
1826static void free_dev(struct mapped_device *md)
1827{
1828        int minor = MINOR(disk_devt(md->disk));
1829
1830        unlock_fs(md);
1831
1832        cleanup_mapped_device(md);
1833
1834        free_table_devices(&md->table_devices);
1835        dm_stats_cleanup(&md->stats);
1836        free_minor(minor);
1837
1838        module_put(THIS_MODULE);
1839        kvfree(md);
1840}
1841
1842static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1843{
1844        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1845        int ret = 0;
1846
1847        if (dm_table_bio_based(t)) {
1848                /*
1849                 * The md may already have mempools that need changing.
1850                 * If so, reload bioset because front_pad may have changed
1851                 * because a different table was loaded.
1852                 */
1853                bioset_exit(&md->bs);
1854                bioset_exit(&md->io_bs);
1855
1856        } else if (bioset_initialized(&md->bs)) {
1857                /*
1858                 * There's no need to reload with request-based dm
1859                 * because the size of front_pad doesn't change.
1860                 * Note for future: If you are to reload bioset,
1861                 * prep-ed requests in the queue may refer
1862                 * to bio from the old bioset, so you must walk
1863                 * through the queue to unprep.
1864                 */
1865                goto out;
1866        }
1867
1868        BUG_ON(!p ||
1869               bioset_initialized(&md->bs) ||
1870               bioset_initialized(&md->io_bs));
1871
1872        ret = bioset_init_from_src(&md->bs, &p->bs);
1873        if (ret)
1874                goto out;
1875        ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1876        if (ret)
1877                bioset_exit(&md->bs);
1878out:
1879        /* mempool bind completed, no longer need any mempools in the table */
1880        dm_table_free_md_mempools(t);
1881        return ret;
1882}
1883
1884/*
1885 * Bind a table to the device.
1886 */
1887static void event_callback(void *context)
1888{
1889        unsigned long flags;
1890        LIST_HEAD(uevents);
1891        struct mapped_device *md = (struct mapped_device *) context;
1892
1893        spin_lock_irqsave(&md->uevent_lock, flags);
1894        list_splice_init(&md->uevent_list, &uevents);
1895        spin_unlock_irqrestore(&md->uevent_lock, flags);
1896
1897        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1898
1899        atomic_inc(&md->event_nr);
1900        wake_up(&md->eventq);
1901        dm_issue_global_event();
1902}
1903
1904/*
1905 * Returns old map, which caller must destroy.
1906 */
1907static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
1908                               struct queue_limits *limits)
1909{
1910        struct dm_table *old_map;
1911        struct request_queue *q = md->queue;
1912        bool request_based = dm_table_request_based(t);
1913        sector_t size;
1914        int ret;
1915
1916        lockdep_assert_held(&md->suspend_lock);
1917
1918        size = dm_table_get_size(t);
1919
1920        /*
1921         * Wipe any geometry if the size of the table changed.
1922         */
1923        if (size != dm_get_size(md))
1924                memset(&md->geometry, 0, sizeof(md->geometry));
1925
1926        if (!get_capacity(md->disk))
1927                set_capacity(md->disk, size);
1928        else
1929                set_capacity_and_notify(md->disk, size);
1930
1931        dm_table_event_callback(t, event_callback, md);
1932
1933        /*
1934         * The queue hasn't been stopped yet, if the old table type wasn't
1935         * for request-based during suspension.  So stop it to prevent
1936         * I/O mapping before resume.
1937         * This must be done before setting the queue restrictions,
1938         * because request-based dm may be run just after the setting.
1939         */
1940        if (request_based)
1941                dm_stop_queue(q);
1942
1943        if (request_based) {
1944                /*
1945                 * Leverage the fact that request-based DM targets are
1946                 * immutable singletons - used to optimize dm_mq_queue_rq.
1947                 */
1948                md->immutable_target = dm_table_get_immutable_target(t);
1949        }
1950
1951        ret = __bind_mempools(md, t);
1952        if (ret) {
1953                old_map = ERR_PTR(ret);
1954                goto out;
1955        }
1956
1957        ret = dm_table_set_restrictions(t, q, limits);
1958        if (ret) {
1959                old_map = ERR_PTR(ret);
1960                goto out;
1961        }
1962
1963        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
1964        rcu_assign_pointer(md->map, (void *)t);
1965        md->immutable_target_type = dm_table_get_immutable_target_type(t);
1966
1967        if (old_map)
1968                dm_sync_table(md);
1969
1970out:
1971        return old_map;
1972}
1973
1974/*
1975 * Returns unbound table for the caller to free.
1976 */
1977static struct dm_table *__unbind(struct mapped_device *md)
1978{
1979        struct dm_table *map = rcu_dereference_protected(md->map, 1);
1980
1981        if (!map)
1982                return NULL;
1983
1984        dm_table_event_callback(map, NULL, NULL);
1985        RCU_INIT_POINTER(md->map, NULL);
1986        dm_sync_table(md);
1987
1988        return map;
1989}
1990
1991/*
1992 * Constructor for a new device.
1993 */
1994int dm_create(int minor, struct mapped_device **result)
1995{
1996        int r;
1997        struct mapped_device *md;
1998
1999        md = alloc_dev(minor);
2000        if (!md)

2001                return -ENXIO;
2002
2003        r = dm_sysfs_init(md);
2004        if (r) {
2005                free_dev(md);
2006                return r;
2007        }
2008
2009        *result = md;
2010        return 0;
2011}
2012
2013/*
2014 * Functions to manage md->type.
2015 * All are required to hold md->type_lock.
2016 */
2017void dm_lock_md_type(struct mapped_device *md)
2018{
2019        mutex_lock(&md->type_lock);
2020}
2021
2022void dm_unlock_md_type(struct mapped_device *md)
2023{
2024        mutex_unlock(&md->type_lock);
2025}
2026
2027void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2028{
2029        BUG_ON(!mutex_is_locked(&md->type_lock));
2030        md->type = type;
2031}
2032
2033enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2034{
2035        return md->type;
2036}
2037
2038struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2039{
2040        return md->immutable_target_type;
2041}
2042
2043/*
2044 * The queue_limits are only valid as long as you have a reference
2045 * count on 'md'.
2046 */
2047struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2048{
2049        BUG_ON(!atomic_read(&md->holders));
2050        return &md->queue->limits;
2051}
2052EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2053
2054/*
2055 * Setup the DM device's queue based on md's type
2056 */
2057int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2058{
2059        int r;
2060        struct queue_limits limits;
2061        enum dm_queue_mode type = dm_get_md_type(md);
2062
2063        switch (type) {
2064        case DM_TYPE_REQUEST_BASED:
2065                md->disk->fops = &dm_rq_blk_dops;
2066                r = dm_mq_init_request_queue(md, t);
2067                if (r) {
2068                        DMERR("Cannot initialize queue for request-based dm mapped device");
2069                        return r;
2070                }
2071                break;
2072        case DM_TYPE_BIO_BASED:
2073        case DM_TYPE_DAX_BIO_BASED:
2074                break;
2075        case DM_TYPE_NONE:
2076                WARN_ON_ONCE(true);
2077                break;
2078        }
2079
2080        r = dm_calculate_queue_limits(t, &limits);
2081        if (r) {
2082                DMERR("Cannot calculate initial queue limits");
2083                return r;
2084        }
2085        r = dm_table_set_restrictions(t, md->queue, &limits);
2086        if (r)
2087                return r;
2088
2089        blk_register_queue(md->disk);
2090
2091        return 0;
2092}
2093
2094struct mapped_device *dm_get_md(dev_t dev)
2095{
2096        struct mapped_device *md;
2097        unsigned minor = MINOR(dev);
2098
2099        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2100                return NULL;
2101
2102        spin_lock(&_minor_lock);
2103
2104        md = idr_find(&_minor_idr, minor);
2105        if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2106            test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2107                md = NULL;
2108                goto out;
2109        }
2110        dm_get(md);
2111out:
2112        spin_unlock(&_minor_lock);
2113
2114        return md;
2115}
2116EXPORT_SYMBOL_GPL(dm_get_md);
2117
2118void *dm_get_mdptr(struct mapped_device *md)
2119{
2120        return md->interface_ptr;
2121}
2122
2123void dm_set_mdptr(struct mapped_device *md, void *ptr)
2124{
2125        md->interface_ptr = ptr;
2126}
2127
2128void dm_get(struct mapped_device *md)
2129{
2130        atomic_inc(&md->holders);
2131        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2132}
2133
2134int dm_hold(struct mapped_device *md)
2135{
2136        spin_lock(&_minor_lock);
2137        if (test_bit(DMF_FREEING, &md->flags)) {
2138                spin_unlock(&_minor_lock);
2139                return -EBUSY;
2140        }
2141        dm_get(md);
2142        spin_unlock(&_minor_lock);
2143        return 0;
2144}
2145EXPORT_SYMBOL_GPL(dm_hold);
2146
2147const char *dm_device_name(struct mapped_device *md)
2148{
2149        return md->name;
2150}
2151EXPORT_SYMBOL_GPL(dm_device_name);
2152
2153static void __dm_destroy(struct mapped_device *md, bool wait)
2154{
2155        struct dm_table *map;
2156        int srcu_idx;
2157
2158        might_sleep();
2159
2160        spin_lock(&_minor_lock);
2161        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2162        set_bit(DMF_FREEING, &md->flags);
2163        spin_unlock(&_minor_lock);
2164
2165        blk_set_queue_dying(md->queue);
2166
2167        /*
2168         * Take suspend_lock so that presuspend and postsuspend methods
2169         * do not race with internal suspend.
2170         */
2171        mutex_lock(&md->suspend_lock);
2172        map = dm_get_live_table(md, &srcu_idx);
2173        if (!dm_suspended_md(md)) {
2174                dm_table_presuspend_targets(map);
2175                set_bit(DMF_SUSPENDED, &md->flags);
2176                set_bit(DMF_POST_SUSPENDING, &md->flags);
2177                dm_table_postsuspend_targets(map);
2178        }
2179        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2180        dm_put_live_table(md, srcu_idx);
2181        mutex_unlock(&md->suspend_lock);
2182
2183        /*
2184         * Rare, but there may be I/O requests still going to complete,
2185         * for example.  Wait for all references to disappear.
2186         * No one should increment the reference count of the mapped_device,
2187         * after the mapped_device state becomes DMF_FREEING.
2188         */
2189        if (wait)
2190                while (atomic_read(&md->holders))
2191                        msleep(1);
2192        else if (atomic_read(&md->holders))
2193                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2194                       dm_device_name(md), atomic_read(&md->holders));
2195
2196        dm_sysfs_exit(md);
2197        dm_table_destroy(__unbind(md));
2198        free_dev(md);
2199}
2200
2201void dm_destroy(struct mapped_device *md)
2202{
2203        __dm_destroy(md, true);
2204}
2205
2206void dm_destroy_immediate(struct mapped_device *md)
2207{
2208        __dm_destroy(md, false);
2209}
2210
2211void dm_put(struct mapped_device *md)
2212{
2213        atomic_dec(&md->holders);
2214}
2215EXPORT_SYMBOL_GPL(dm_put);
2216
2217static bool md_in_flight_bios(struct mapped_device *md)
2218{
2219        int cpu;
2220        struct block_device *part = dm_disk(md)->part0;
2221        long sum = 0;
2222
2223        for_each_possible_cpu(cpu) {
2224                sum += part_stat_local_read_cpu(part, in_flight[0], cpu);
2225                sum += part_stat_local_read_cpu(part, in_flight[1], cpu);
2226        }
2227
2228        return sum != 0;
2229}
2230
2231static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
2232{
2233        int r = 0;
2234        DEFINE_WAIT(wait);
2235
2236        while (true) {
2237                prepare_to_wait(&md->wait, &wait, task_state);
2238
2239                if (!md_in_flight_bios(md))
2240                        break;
2241
2242                if (signal_pending_state(task_state, current)) {
2243                        r = -EINTR;
2244                        break;
2245                }
2246
2247                io_schedule();
2248        }
2249        finish_wait(&md->wait, &wait);
2250
2251        return r;
2252}
2253
2254static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
2255{
2256        int r = 0;
2257
2258        if (!queue_is_mq(md->queue))
2259                return dm_wait_for_bios_completion(md, task_state);
2260
2261        while (true) {
2262                if (!blk_mq_queue_inflight(md->queue))
2263                        break;
2264
2265                if (signal_pending_state(task_state, current)) {
2266                        r = -EINTR;
2267                        break;
2268                }
2269
2270                msleep(5);
2271        }
2272
2273        return r;
2274}
2275
2276/*
2277 * Process the deferred bios
2278 */
2279static void dm_wq_work(struct work_struct *work)
2280{
2281        struct mapped_device *md = container_of(work, struct mapped_device, work);
2282        struct bio *bio;
2283
2284        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2285                spin_lock_irq(&md->deferred_lock);
2286                bio = bio_list_pop(&md->deferred);
2287                spin_unlock_irq(&md->deferred_lock);
2288
2289                if (!bio)
2290                        break;
2291
2292                submit_bio_noacct(bio);
2293        }
2294}
2295
2296static void dm_queue_flush(struct mapped_device *md)
2297{
2298        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2299        smp_mb__after_atomic();
2300        queue_work(md->wq, &md->work);
2301}
2302
2303/*
2304 * Swap in a new table, returning the old one for the caller to destroy.
2305 */
2306struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2307{
2308        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2309        struct queue_limits limits;
2310        int r;
2311
2312        mutex_lock(&md->suspend_lock);
2313
2314        /* device must be suspended */
2315        if (!dm_suspended_md(md))
2316                goto out;
2317
2318        /*
2319         * If the new table has no data devices, retain the existing limits.
2320         * This helps multipath with queue_if_no_path if all paths disappear,
2321         * then new I/O is queued based on these limits, and then some paths
2322         * reappear.
2323         */
2324        if (dm_table_has_no_data_devices(table)) {
2325                live_map = dm_get_live_table_fast(md);
2326                if (live_map)
2327                        limits = md->queue->limits;
2328                dm_put_live_table_fast(md);
2329        }
2330
2331        if (!live_map) {
2332                r = dm_calculate_queue_limits(table, &limits);
2333                if (r) {
2334                        map = ERR_PTR(r);
2335                        goto out;
2336                }
2337        }
2338
2339        map = __bind(md, table, &limits);
2340        dm_issue_global_event();
2341
2342out:
2343        mutex_unlock(&md->suspend_lock);
2344        return map;
2345}
2346
2347/*
2348 * Functions to lock and unlock any filesystem running on the
2349 * device.
2350 */
2351static int lock_fs(struct mapped_device *md)
2352{
2353        int r;
2354
2355        WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2356
2357        r = freeze_bdev(md->disk->part0);
2358        if (!r)
2359                set_bit(DMF_FROZEN, &md->flags);
2360        return r;
2361}
2362
2363static void unlock_fs(struct mapped_device *md)
2364{
2365        if (!test_bit(DMF_FROZEN, &md->flags))
2366                return;
2367        thaw_bdev(md->disk->part0);
2368        clear_bit(DMF_FROZEN, &md->flags);
2369}
2370
2371/*
2372 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2373 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2374 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2375 *
2376 * If __dm_suspend returns 0, the device is completely quiescent
2377 * now. There is no request-processing activity. All new requests
2378 * are being added to md->deferred list.
2379 */
2380static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2381                        unsigned suspend_flags, unsigned int task_state,
2382                        int dmf_suspended_flag)
2383{
2384        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2385        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2386        int r;
2387
2388        lockdep_assert_held(&md->suspend_lock);
2389
2390        /*
2391         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2392         * This flag is cleared before dm_suspend returns.
2393         */
2394        if (noflush)
2395                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2396        else
2397                DMDEBUG("%s: suspending with flush", dm_device_name(md));
2398
2399        /*
2400         * This gets reverted if there's an error later and the targets
2401         * provide the .presuspend_undo hook.
2402         */
2403        dm_table_presuspend_targets(map);
2404
2405        /*
2406         * Flush I/O to the device.
2407         * Any I/O submitted after lock_fs() may not be flushed.
2408         * noflush takes precedence over do_lockfs.
2409         * (lock_fs() flushes I/Os and waits for them to complete.)
2410         */
2411        if (!noflush && do_lockfs) {
2412                r = lock_fs(md);
2413                if (r) {
2414                        dm_table_presuspend_undo_targets(map);
2415                        return r;
2416                }
2417        }
2418
2419        /*
2420         * Here we must make sure that no processes are submitting requests
2421         * to target drivers i.e. no one may be executing
2422         * __split_and_process_bio from dm_submit_bio.
2423         *
2424         * To get all processes out of __split_and_process_bio in dm_submit_bio,
2425         * we take the write lock. To prevent any process from reentering
2426         * __split_and_process_bio from dm_submit_bio and quiesce the thread
2427         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2428         * flush_workqueue(md->wq).
2429         */
2430        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2431        if (map)
2432                synchronize_srcu(&md->io_barrier);
2433
2434        /*
2435         * Stop md->queue before flushing md->wq in case request-based
2436         * dm defers requests to md->wq from md->queue.
2437         */
2438        if (dm_request_based(md))
2439                dm_stop_queue(md->queue);
2440
2441        flush_workqueue(md->wq);
2442
2443        /*
2444         * At this point no more requests are entering target request routines.
2445         * We call dm_wait_for_completion to wait for all existing requests
2446         * to finish.
2447         */
2448        r = dm_wait_for_completion(md, task_state);
2449        if (!r)
2450                set_bit(dmf_suspended_flag, &md->flags);
2451
2452        if (noflush)
2453                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2454        if (map)
2455                synchronize_srcu(&md->io_barrier);
2456
2457        /* were we interrupted ? */
2458        if (r < 0) {
2459                dm_queue_flush(md);
2460
2461                if (dm_request_based(md))
2462                        dm_start_queue(md->queue);
2463
2464                unlock_fs(md);
2465                dm_table_presuspend_undo_targets(map);
2466                /* pushback list is already flushed, so skip flush */
2467        }
2468
2469        return r;
2470}
2471
2472/*
2473 * We need to be able to change a mapping table under a mounted
2474 * filesystem.  For example we might want to move some data in
2475 * the background.  Before the table can be swapped with
2476 * dm_bind_table, dm_suspend must be called to flush any in
2477 * flight bios and ensure that any further io gets deferred.
2478 */
2479/*
2480 * Suspend mechanism in request-based dm.
2481 *
2482 * 1. Flush all I/Os by lock_fs() if needed.
2483 * 2. Stop dispatching any I/O by stopping the request_queue.
2484 * 3. Wait for all in-flight I/Os to be completed or requeued.
2485 *
2486 * To abort suspend, start the request_queue.
2487 */
2488int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2489{
2490        struct dm_table *map = NULL;
2491        int r = 0;
2492
2493retry:
2494        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2495
2496        if (dm_suspended_md(md)) {
2497                r = -EINVAL;
2498                goto out_unlock;
2499        }
2500
2501        if (dm_suspended_internally_md(md)) {
2502                /* already internally suspended, wait for internal resume */
2503                mutex_unlock(&md->suspend_lock);
2504                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2505                if (r)
2506                        return r;
2507                goto retry;
2508        }
2509
2510        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2511
2512        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2513        if (r)
2514                goto out_unlock;
2515
2516        set_bit(DMF_POST_SUSPENDING, &md->flags);
2517        dm_table_postsuspend_targets(map);
2518        clear_bit(DMF_POST_SUSPENDING, &md->flags);
2519
2520out_unlock:
2521        mutex_unlock(&md->suspend_lock);
2522        return r;
2523}
2524
2525static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2526{
2527        if (map) {
2528                int r = dm_table_resume_targets(map);
2529                if (r)
2530                        return r;
2531        }
2532
2533        dm_queue_flush(md);
2534
2535        /*
2536         * Flushing deferred I/Os must be done after targets are resumed
2537         * so that mapping of targets can work correctly.
2538         * Request-based dm is queueing the deferred I/Os in its request_queue.
2539         */
2540        if (dm_request_based(md))
2541                dm_start_queue(md->queue);
2542
2543        unlock_fs(md);
2544
2545        return 0;
2546}
2547
2548int dm_resume(struct mapped_device *md)
2549{
2550        int r;
2551        struct dm_table *map = NULL;
2552
2553retry:
2554        r = -EINVAL;
2555        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2556
2557        if (!dm_suspended_md(md))
2558                goto out;
2559
2560        if (dm_suspended_internally_md(md)) {
2561                /* already internally suspended, wait for internal resume */
2562                mutex_unlock(&md->suspend_lock);
2563                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2564                if (r)
2565                        return r;
2566                goto retry;
2567        }
2568
2569        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2570        if (!map || !dm_table_get_size(map))
2571                goto out;
2572
2573        r = __dm_resume(md, map);
2574        if (r)
2575                goto out;
2576
2577        clear_bit(DMF_SUSPENDED, &md->flags);
2578out:
2579        mutex_unlock(&md->suspend_lock);
2580
2581        return r;
2582}
2583
2584/*
2585 * Internal suspend/resume works like userspace-driven suspend. It waits
2586 * until all bios finish and prevents issuing new bios to the target drivers.
2587 * It may be used only from the kernel.
2588 */
2589
2590static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2591{
2592        struct dm_table *map = NULL;
2593
2594        lockdep_assert_held(&md->suspend_lock);
2595
2596        if (md->internal_suspend_count++)
2597                return; /* nested internal suspend */
2598
2599        if (dm_suspended_md(md)) {
2600                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2601                return; /* nest suspend */
2602        }
2603
2604        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2605
2606        /*
2607         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2608         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2609         * would require changing .presuspend to return an error -- avoid this
2610         * until there is a need for more elaborate variants of internal suspend.
2611         */
2612        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2613                            DMF_SUSPENDED_INTERNALLY);
2614
2615        set_bit(DMF_POST_SUSPENDING, &md->flags);
2616        dm_table_postsuspend_targets(map);
2617        clear_bit(DMF_POST_SUSPENDING, &md->flags);
2618}
2619
2620static void __dm_internal_resume(struct mapped_device *md)
2621{
2622        BUG_ON(!md->internal_suspend_count);
2623
2624        if (--md->internal_suspend_count)
2625                return; /* resume from nested internal suspend */
2626
2627        if (dm_suspended_md(md))
2628                goto done; /* resume from nested suspend */
2629
2630        /*
2631         * NOTE: existing callers don't need to call dm_table_resume_targets
2632         * (which may fail -- so best to avoid it for now by passing NULL map)
2633         */
2634        (void) __dm_resume(md, NULL);
2635
2636done:
2637        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2638        smp_mb__after_atomic();
2639        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2640}
2641
2642void dm_internal_suspend_noflush(struct mapped_device *md)
2643{
2644        mutex_lock(&md->suspend_lock);
2645        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2646        mutex_unlock(&md->suspend_lock);
2647}
2648EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2649
2650void dm_internal_resume(struct mapped_device *md)
2651{
2652        mutex_lock(&md->suspend_lock);
2653        __dm_internal_resume(md);
2654        mutex_unlock(&md->suspend_lock);
2655}
2656EXPORT_SYMBOL_GPL(dm_internal_resume);
2657
2658/*
2659 * Fast variants of internal suspend/resume hold md->suspend_lock,
2660 * which prevents interaction with userspace-driven suspend.
2661 */
2662
2663void dm_internal_suspend_fast(struct mapped_device *md)
2664{
2665        mutex_lock(&md->suspend_lock);
2666        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2667                return;
2668
2669        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2670        synchronize_srcu(&md->io_barrier);
2671        flush_workqueue(md->wq);
2672        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2673}
2674EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2675
2676void dm_internal_resume_fast(struct mapped_device *md)
2677{
2678        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2679                goto done;
2680
2681        dm_queue_flush(md);
2682
2683done:
2684        mutex_unlock(&md->suspend_lock);
2685}
2686EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2687
2688/*-----------------------------------------------------------------
2689 * Event notification.
2690 *---------------------------------------------------------------*/
2691int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2692                       unsigned cookie)
2693{
2694        int r;
2695        unsigned noio_flag;
2696        char udev_cookie[DM_COOKIE_LENGTH];
2697        char *envp[] = { udev_cookie, NULL };
2698
2699        noio_flag = memalloc_noio_save();
2700
2701        if (!cookie)
2702                r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2703        else {
2704                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2705                         DM_COOKIE_ENV_VAR_NAME, cookie);
2706                r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2707                                       action, envp);
2708        }
2709
2710        memalloc_noio_restore(noio_flag);
2711
2712        return r;
2713}
2714
2715uint32_t dm_next_uevent_seq(struct mapped_device *md)
2716{
2717        return atomic_add_return(1, &md->uevent_seq);
2718}
2719
2720uint32_t dm_get_event_nr(struct mapped_device *md)
2721{
2722        return atomic_read(&md->event_nr);
2723}
2724
2725int dm_wait_event(struct mapped_device *md, int event_nr)
2726{
2727        return wait_event_interruptible(md->eventq,
2728                        (event_nr != atomic_read(&md->event_nr)));
2729}
2730
2731void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2732{
2733        unsigned long flags;
2734
2735        spin_lock_irqsave(&md->uevent_lock, flags);
2736        list_add(elist, &md->uevent_list);
2737        spin_unlock_irqrestore(&md->uevent_lock, flags);
2738}
2739
2740/*
2741 * The gendisk is only valid as long as you have a reference
2742 * count on 'md'.
2743 */
2744struct gendisk *dm_disk(struct mapped_device *md)
2745{
2746        return md->disk;
2747}
2748EXPORT_SYMBOL_GPL(dm_disk);
2749
2750struct kobject *dm_kobject(struct mapped_device *md)
2751{
2752        return &md->kobj_holder.kobj;
2753}
2754
2755struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2756{
2757        struct mapped_device *md;
2758
2759        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2760
2761        spin_lock(&_minor_lock);
2762        if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2763                md = NULL;
2764                goto out;
2765        }
2766        dm_get(md);
2767out:
2768        spin_unlock(&_minor_lock);
2769
2770        return md;
2771}
2772
2773int dm_suspended_md(struct mapped_device *md)
2774{
2775        return test_bit(DMF_SUSPENDED, &md->flags);
2776}
2777
2778static int dm_post_suspending_md(struct mapped_device *md)
2779{
2780        return test_bit(DMF_POST_SUSPENDING, &md->flags);
2781}
2782
2783int dm_suspended_internally_md(struct mapped_device *md)
2784{
2785        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2786}
2787
2788int dm_test_deferred_remove_flag(struct mapped_device *md)
2789{
2790        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2791}
2792
2793int dm_suspended(struct dm_target *ti)
2794{
2795        return dm_suspended_md(ti->table->md);
2796}
2797EXPORT_SYMBOL_GPL(dm_suspended);
2798
2799int dm_post_suspending(struct dm_target *ti)
2800{
2801        return dm_post_suspending_md(ti->table->md);
2802}
2803EXPORT_SYMBOL_GPL(dm_post_suspending);
2804
2805int dm_noflush_suspending(struct dm_target *ti)
2806{
2807        return __noflush_suspending(ti->table->md);
2808}
2809EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2810
2811struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2812                                            unsigned integrity, unsigned per_io_data_size,
2813                                            unsigned min_pool_size)
2814{
2815        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2816        unsigned int pool_size = 0;
2817        unsigned int front_pad, io_front_pad;
2818        int ret;
2819
2820        if (!pools)
2821                return NULL;
2822
2823        switch (type) {
2824        case DM_TYPE_BIO_BASED:
2825        case DM_TYPE_DAX_BIO_BASED:
2826                pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2827                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
2828                io_front_pad = roundup(per_io_data_size,  __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
2829                ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2830                if (ret)
2831                        goto out;
2832                if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2833                        goto out;
2834                break;
2835        case DM_TYPE_REQUEST_BASED:
2836                pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2837                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2838                /* per_io_data_size is used for blk-mq pdu at queue allocation */
2839                break;
2840        default:
2841                BUG();
2842        }
2843
2844        ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2845        if (ret)
2846                goto out;
2847
2848        if (integrity && bioset_integrity_create(&pools->bs, pool_size))
2849                goto out;
2850
2851        return pools;
2852
2853out:
2854        dm_free_md_mempools(pools);
2855
2856        return NULL;
2857}
2858
2859void dm_free_md_mempools(struct dm_md_mempools *pools)
2860{
2861        if (!pools)
2862                return;
2863
2864        bioset_exit(&pools->bs);
2865        bioset_exit(&pools->io_bs);
2866
2867        kfree(pools);
2868}
2869
2870struct dm_pr {
2871        u64     old_key;
2872        u64     new_key;
2873        u32     flags;
2874        bool    fail_early;
2875};
2876
2877static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2878                      void *data)
2879{
2880        struct mapped_device *md = bdev->bd_disk->private_data;
2881        struct dm_table *table;
2882        struct dm_target *ti;
2883        int ret = -ENOTTY, srcu_idx;
2884
2885        table = dm_get_live_table(md, &srcu_idx);
2886        if (!table || !dm_table_get_size(table))
2887                goto out;
2888
2889        /* We only support devices that have a single target */
2890        if (dm_table_get_num_targets(table) != 1)
2891                goto out;
2892        ti = dm_table_get_target(table, 0);
2893
2894        ret = -EINVAL;
2895        if (!ti->type->iterate_devices)
2896                goto out;
2897
2898        ret = ti->type->iterate_devices(ti, fn, data);
2899out:
2900        dm_put_live_table(md, srcu_idx);
2901        return ret;
2902}
2903
2904/*
2905 * For register / unregister we need to manually call out to every path.
2906 */
2907static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
2908                            sector_t start, sector_t len, void *data)
2909{
2910        struct dm_pr *pr = data;
2911        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
2912
2913        if (!ops || !ops->pr_register)
2914                return -EOPNOTSUPP;
2915        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
2916}
2917
2918static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
2919                          u32 flags)
2920{
2921        struct dm_pr pr = {
2922                .old_key        = old_key,
2923                .new_key        = new_key,
2924                .flags          = flags,
2925                .fail_early     = true,
2926        };
2927        int ret;
2928
2929        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
2930        if (ret && new_key) {
2931                /* unregister all paths if we failed to register any path */
2932                pr.old_key = new_key;
2933                pr.new_key = 0;
2934                pr.flags = 0;
2935                pr.fail_early = false;
2936                dm_call_pr(bdev, __dm_pr_register, &pr);
2937        }
2938
2939        return ret;
2940}
2941
2942static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
2943                         u32 flags)
2944{
2945        struct mapped_device *md = bdev->bd_disk->private_data;
2946        const struct pr_ops *ops;
2947        int r, srcu_idx;
2948
2949        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
2950        if (r < 0)
2951                goto out;
2952
2953        ops = bdev->bd_disk->fops->pr_ops;
2954        if (ops && ops->pr_reserve)
2955                r = ops->pr_reserve(bdev, key, type, flags);
2956        else
2957                r = -EOPNOTSUPP;
2958out:
2959        dm_unprepare_ioctl(md, srcu_idx);
2960        return r;
2961}
2962
2963static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
2964{
2965        struct mapped_device *md = bdev->bd_disk->private_data;
2966        const struct pr_ops *ops;
2967        int r, srcu_idx;
2968
2969        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
2970        if (r < 0)
2971                goto out;
2972
2973        ops = bdev->bd_disk->fops->pr_ops;
2974        if (ops && ops->pr_release)
2975                r = ops->pr_release(bdev, key, type);
2976        else
2977                r = -EOPNOTSUPP;
2978out:
2979        dm_unprepare_ioctl(md, srcu_idx);
2980        return r;
2981}
2982
2983static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
2984                         enum pr_type type, bool abort)
2985{
2986        struct mapped_device *md = bdev->bd_disk->private_data;
2987        const struct pr_ops *ops;
2988        int r, srcu_idx;
2989
2990        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
2991        if (r < 0)
2992                goto out;
2993
2994        ops = bdev->bd_disk->fops->pr_ops;
2995        if (ops && ops->pr_preempt)
2996                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
2997        else
2998                r = -EOPNOTSUPP;
2999out:
3000        dm_unprepare_ioctl(md, srcu_idx);

3001        return r;
3002}
3003
3004static int dm_pr_clear(struct block_device *bdev, u64 key)
3005{
3006        struct mapped_device *md = bdev->bd_disk->private_data;
3007        const struct pr_ops *ops;
3008        int r, srcu_idx;
3009
3010        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3011        if (r < 0)
3012                goto out;
3013
3014        ops = bdev->bd_disk->fops->pr_ops;
3015        if (ops && ops->pr_clear)
3016                r = ops->pr_clear(bdev, key);
3017        else
3018                r = -EOPNOTSUPP;
3019out:
3020        dm_unprepare_ioctl(md, srcu_idx);
3021        return r;
3022}
3023
3024static const struct pr_ops dm_pr_ops = {
3025        .pr_register    = dm_pr_register,
3026        .pr_reserve     = dm_pr_reserve,
3027        .pr_release     = dm_pr_release,
3028        .pr_preempt     = dm_pr_preempt,
3029        .pr_clear       = dm_pr_clear,
3030};
3031
3032static const struct block_device_operations dm_blk_dops = {
3033        .submit_bio = dm_submit_bio,
3034        .open = dm_blk_open,
3035        .release = dm_blk_close,
3036        .ioctl = dm_blk_ioctl,
3037        .getgeo = dm_blk_getgeo,
3038        .report_zones = dm_blk_report_zones,
3039        .pr_ops = &dm_pr_ops,
3040        .owner = THIS_MODULE
3041};
3042
3043static const struct block_device_operations dm_rq_blk_dops = {
3044        .open = dm_blk_open,
3045        .release = dm_blk_close,
3046        .ioctl = dm_blk_ioctl,
3047        .getgeo = dm_blk_getgeo,
3048        .pr_ops = &dm_pr_ops,
3049        .owner = THIS_MODULE
3050};
3051
3052static const struct dax_operations dm_dax_ops = {
3053        .direct_access = dm_dax_direct_access,
3054        .dax_supported = dm_dax_supported,
3055        .copy_from_iter = dm_dax_copy_from_iter,
3056        .copy_to_iter = dm_dax_copy_to_iter,
3057        .zero_page_range = dm_dax_zero_page_range,
3058};
3059
3060/*
3061 * module hooks
3062 */
3063module_init(dm_init);
3064module_exit(dm_exit);
3065
3066module_param(major, uint, 0);
3067MODULE_PARM_DESC(major, "The major number of the device mapper");
3068
3069module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3070MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3071
3072module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3073MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3074
3075module_param(swap_bios, int, S_IRUGO | S_IWUSR);
3076MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
3077
3078MODULE_DESCRIPTION(DM_NAME " driver");
3079MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3080MODULE_LICENSE("GPL");
3081