linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11
  12#include <linux/init.h>
  13#include <linux/module.h>
  14#include <linux/mutex.h>
  15#include <linux/sched/signal.h>
  16#include <linux/blkpg.h>
  17#include <linux/bio.h>
  18#include <linux/mempool.h>
  19#include <linux/dax.h>
  20#include <linux/slab.h>
  21#include <linux/idr.h>
  22#include <linux/uio.h>
  23#include <linux/hdreg.h>
  24#include <linux/delay.h>
  25#include <linux/wait.h>
  26#include <linux/pr.h>
  27#include <linux/refcount.h>
  28
  29#define DM_MSG_PREFIX "core"
  30
  31/*
  32 * Cookies are numeric values sent with CHANGE and REMOVE
  33 * uevents while resuming, removing or renaming the device.
  34 */
  35#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  36#define DM_COOKIE_LENGTH 24
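    /*
     * Illustrative note (an assumption drawn from the macro names above, not a
     * quote of the formatting code): the cookie reaches userspace as an
     * environment string of at most DM_COOKIE_LENGTH bytes in the uevent,
     * e.g.
     *
     *	DM_COOKIE=6291462
     *
     * which udev rules and libdevmapper can use to match the uevent with the
     * ioctl that triggered it.
     */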
  37
  38static const char *_name = DM_NAME;
  39
  40static unsigned int major = 0;
  41static unsigned int _major = 0;
  42
  43static DEFINE_IDR(_minor_idr);
  44
  45static DEFINE_SPINLOCK(_minor_lock);
  46
  47static void do_deferred_remove(struct work_struct *w);
  48
  49static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  50
  51static struct workqueue_struct *deferred_remove_workqueue;
  52
  53atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  54DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  55
  56void dm_issue_global_event(void)
  57{
  58        atomic_inc(&dm_global_event_nr);
  59        wake_up(&dm_global_eventq);
  60}
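    /*
     * Illustrative sketch (not dm code): a caller that may sleep can snapshot
     * dm_global_event_nr and then wait on dm_global_eventq until a further
     * event is issued:
     *
     *	unsigned int seen = atomic_read(&dm_global_event_nr);
     *
     *	if (wait_event_interruptible(dm_global_eventq,
     *				     atomic_read(&dm_global_event_nr) != seen))
     *		return -ERESTARTSYS;
     */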
  61
  62/*
  63 * One of these is allocated (on-stack) per original bio.
  64 */
  65struct clone_info {
  66        struct dm_table *map;
  67        struct bio *bio;
  68        struct dm_io *io;
  69        sector_t sector;
  70        unsigned sector_count;
  71};
  72
  73/*
  74 * One of these is allocated per clone bio.
  75 */
  76#define DM_TIO_MAGIC 7282014
  77struct dm_target_io {
  78        unsigned magic;
  79        struct dm_io *io;
  80        struct dm_target *ti;
  81        unsigned target_bio_nr;
  82        unsigned *len_ptr;
  83        bool inside_dm_io;
  84        struct bio clone;
  85};
  86
  87/*
  88 * One of these is allocated per original bio.
  89 * It contains the first clone used for that original.
  90 */
  91#define DM_IO_MAGIC 5191977
  92struct dm_io {
  93        unsigned magic;
  94        struct mapped_device *md;
  95        blk_status_t status;
  96        atomic_t io_count;
  97        struct bio *orig_bio;
  98        unsigned long start_time;
  99        spinlock_t endio_lock;
 100        struct dm_stats_aux stats_aux;
 101        /* last member of dm_target_io is 'struct bio' */
 102        struct dm_target_io tio;
 103};
 104
 105void *dm_per_bio_data(struct bio *bio, size_t data_size)
 106{
 107        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 108        if (!tio->inside_dm_io)
 109                return (char *)bio - offsetof(struct dm_target_io, clone) - data_size;
 110        return (char *)bio - offsetof(struct dm_target_io, clone) - offsetof(struct dm_io, tio) - data_size;
 111}
 112EXPORT_SYMBOL_GPL(dm_per_bio_data);
 113
 114struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 115{
 116        struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 117        if (io->magic == DM_IO_MAGIC)
 118                return (struct bio *)((char *)io + offsetof(struct dm_io, tio) + offsetof(struct dm_target_io, clone));
 119        BUG_ON(io->magic != DM_TIO_MAGIC);
 120        return (struct bio *)((char *)io + offsetof(struct dm_target_io, clone));
 121}
 122EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
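    /*
     * Illustrative sketch (not dm code): a target that sets
     * ti->per_io_data_size in its constructor can convert between its per-bio
     * data and the clone bio with the helpers above; "struct example_ctx" is
     * a hypothetical per-bio structure:
     *
     *	struct example_ctx *ctx = dm_per_bio_data(bio, sizeof(struct example_ctx));
     *
     *	struct bio *clone = dm_bio_from_per_bio_data(ctx, sizeof(struct example_ctx));
     */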
 123
 124unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 125{
 126        return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 127}
 128EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
 129
 130#define MINOR_ALLOCED ((void *)-1)
 131
 132/*
 133 * Bits for the md->flags field.
 134 */
 135#define DMF_BLOCK_IO_FOR_SUSPEND 0
 136#define DMF_SUSPENDED 1
 137#define DMF_FROZEN 2
 138#define DMF_FREEING 3
 139#define DMF_DELETING 4
 140#define DMF_NOFLUSH_SUSPENDING 5
 141#define DMF_DEFERRED_REMOVE 6
 142#define DMF_SUSPENDED_INTERNALLY 7
 143
 144#define DM_NUMA_NODE NUMA_NO_NODE
 145static int dm_numa_node = DM_NUMA_NODE;
 146
 147/*
 148 * For mempools pre-allocation at the table loading time.
 149 */
 150struct dm_md_mempools {
 151        struct bio_set bs;
 152        struct bio_set io_bs;
 153};
 154
 155struct table_device {
 156        struct list_head list;
 157        refcount_t count;
 158        struct dm_dev dm_dev;
 159};
 160
 161static struct kmem_cache *_rq_tio_cache;
 162static struct kmem_cache *_rq_cache;
 163
 164/*
 165 * Bio-based DM's mempools' reserved IOs set by the user.
 166 */
 167#define RESERVED_BIO_BASED_IOS          16
 168static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 169
 170static int __dm_get_module_param_int(int *module_param, int min, int max)
 171{
 172        int param = READ_ONCE(*module_param);
 173        int modified_param = 0;
 174        bool modified = true;
 175
 176        if (param < min)
 177                modified_param = min;
 178        else if (param > max)
 179                modified_param = max;
 180        else
 181                modified = false;
 182
 183        if (modified) {
 184                (void)cmpxchg(module_param, param, modified_param);
 185                param = modified_param;
 186        }
 187
 188        return param;
 189}
 190
 191unsigned __dm_get_module_param(unsigned *module_param,
 192                               unsigned def, unsigned max)
 193{
 194        unsigned param = READ_ONCE(*module_param);
 195        unsigned modified_param = 0;
 196
 197        if (!param)
 198                modified_param = def;
 199        else if (param > max)
 200                modified_param = max;
 201
 202        if (modified_param) {
 203                (void)cmpxchg(module_param, param, modified_param);
 204                param = modified_param;
 205        }
 206
 207        return param;
 208}
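    /*
     * Illustrative example (not dm code) of the clamping above, for a
     * hypothetical parameter "example_ios" with default 16 and maximum
     * DM_RESERVED_MAX_IOS:
     *
     *	example_ios = 0;	__dm_get_module_param(&example_ios, 16, DM_RESERVED_MAX_IOS) returns 16
     *	example_ios = UINT_MAX;	__dm_get_module_param(&example_ios, 16, DM_RESERVED_MAX_IOS) returns DM_RESERVED_MAX_IOS
     *
     * In both cases the stored parameter is also rewritten (via cmpxchg) to
     * the clamped value, so later readers see the value actually in use.
     */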
 209
 210unsigned dm_get_reserved_bio_based_ios(void)
 211{
 212        return __dm_get_module_param(&reserved_bio_based_ios,
 213                                     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 214}
 215EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 216
 217static unsigned dm_get_numa_node(void)
 218{
 219        return __dm_get_module_param_int(&dm_numa_node,
 220                                         DM_NUMA_NODE, num_online_nodes() - 1);
 221}
 222
 223static int __init local_init(void)
 224{
 225        int r = -ENOMEM;
 226
 227        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 228        if (!_rq_tio_cache)
 229                return r;
 230
 231        _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
 232                                      __alignof__(struct request), 0, NULL);
 233        if (!_rq_cache)
 234                goto out_free_rq_tio_cache;
 235
 236        r = dm_uevent_init();
 237        if (r)
 238                goto out_free_rq_cache;
 239
 240        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 241        if (!deferred_remove_workqueue) {
 242                r = -ENOMEM;
 243                goto out_uevent_exit;
 244        }
 245
 246        _major = major;
 247        r = register_blkdev(_major, _name);
 248        if (r < 0)
 249                goto out_free_workqueue;
 250
 251        if (!_major)
 252                _major = r;
 253
 254        return 0;
 255
 256out_free_workqueue:
 257        destroy_workqueue(deferred_remove_workqueue);
 258out_uevent_exit:
 259        dm_uevent_exit();
 260out_free_rq_cache:
 261        kmem_cache_destroy(_rq_cache);
 262out_free_rq_tio_cache:
 263        kmem_cache_destroy(_rq_tio_cache);
 264
 265        return r;
 266}
 267
 268static void local_exit(void)
 269{
 270        flush_scheduled_work();
 271        destroy_workqueue(deferred_remove_workqueue);
 272
 273        kmem_cache_destroy(_rq_cache);
 274        kmem_cache_destroy(_rq_tio_cache);
 275        unregister_blkdev(_major, _name);
 276        dm_uevent_exit();
 277
 278        _major = 0;
 279
 280        DMINFO("cleaned up");
 281}
 282
 283static int (*_inits[])(void) __initdata = {
 284        local_init,
 285        dm_target_init,
 286        dm_linear_init,
 287        dm_stripe_init,
 288        dm_io_init,
 289        dm_kcopyd_init,
 290        dm_interface_init,
 291        dm_statistics_init,
 292};
 293
 294static void (*_exits[])(void) = {
 295        local_exit,
 296        dm_target_exit,
 297        dm_linear_exit,
 298        dm_stripe_exit,
 299        dm_io_exit,
 300        dm_kcopyd_exit,
 301        dm_interface_exit,
 302        dm_statistics_exit,
 303};
 304
 305static int __init dm_init(void)
 306{
 307        const int count = ARRAY_SIZE(_inits);
 308
 309        int r, i;
 310
 311        for (i = 0; i < count; i++) {
 312                r = _inits[i]();
 313                if (r)
 314                        goto bad;
 315        }
 316
 317        return 0;
 318
 319      bad:
 320        while (i--)
 321                _exits[i]();
 322
 323        return r;
 324}
 325
 326static void __exit dm_exit(void)
 327{
 328        int i = ARRAY_SIZE(_exits);
 329
 330        while (i--)
 331                _exits[i]();
 332
 333        /*
 334         * Should be empty by this point.
 335         */
 336        idr_destroy(&_minor_idr);
 337}
 338
 339/*
 340 * Block device functions
 341 */
 342int dm_deleting_md(struct mapped_device *md)
 343{
 344        return test_bit(DMF_DELETING, &md->flags);
 345}
 346
 347static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 348{
 349        struct mapped_device *md;
 350
 351        spin_lock(&_minor_lock);
 352
 353        md = bdev->bd_disk->private_data;
 354        if (!md)
 355                goto out;
 356
 357        if (test_bit(DMF_FREEING, &md->flags) ||
 358            dm_deleting_md(md)) {
 359                md = NULL;
 360                goto out;
 361        }
 362
 363        dm_get(md);
 364        atomic_inc(&md->open_count);
 365out:
 366        spin_unlock(&_minor_lock);
 367
 368        return md ? 0 : -ENXIO;
 369}
 370
 371static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 372{
 373        struct mapped_device *md;
 374
 375        spin_lock(&_minor_lock);
 376
 377        md = disk->private_data;
 378        if (WARN_ON(!md))
 379                goto out;
 380
 381        if (atomic_dec_and_test(&md->open_count) &&
 382            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 383                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 384
 385        dm_put(md);
 386out:
 387        spin_unlock(&_minor_lock);
 388}
 389
 390int dm_open_count(struct mapped_device *md)
 391{
 392        return atomic_read(&md->open_count);
 393}
 394
 395/*
 396 * Guarantees nothing is using the device before it's deleted.
 397 */
 398int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 399{
 400        int r = 0;
 401
 402        spin_lock(&_minor_lock);
 403
 404        if (dm_open_count(md)) {
 405                r = -EBUSY;
 406                if (mark_deferred)
 407                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 408        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 409                r = -EEXIST;
 410        else
 411                set_bit(DMF_DELETING, &md->flags);
 412
 413        spin_unlock(&_minor_lock);
 414
 415        return r;
 416}
 417
 418int dm_cancel_deferred_remove(struct mapped_device *md)
 419{
 420        int r = 0;
 421
 422        spin_lock(&_minor_lock);
 423
 424        if (test_bit(DMF_DELETING, &md->flags))
 425                r = -EBUSY;
 426        else
 427                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 428
 429        spin_unlock(&_minor_lock);
 430
 431        return r;
 432}
 433
 434static void do_deferred_remove(struct work_struct *w)
 435{
 436        dm_deferred_remove();
 437}
 438
 439sector_t dm_get_size(struct mapped_device *md)
 440{
 441        return get_capacity(md->disk);
 442}
 443
 444struct request_queue *dm_get_md_queue(struct mapped_device *md)
 445{
 446        return md->queue;
 447}
 448
 449struct dm_stats *dm_get_stats(struct mapped_device *md)
 450{
 451        return &md->stats;
 452}
 453
 454static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 455{
 456        struct mapped_device *md = bdev->bd_disk->private_data;
 457
 458        return dm_get_geometry(md, geo);
 459}
 460
 461static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 462                            struct block_device **bdev)
 463        __acquires(md->io_barrier)
 464{
 465        struct dm_target *tgt;
 466        struct dm_table *map;
 467        int r;
 468
 469retry:
 470        r = -ENOTTY;
 471        map = dm_get_live_table(md, srcu_idx);
 472        if (!map || !dm_table_get_size(map))
 473                return r;
 474
 475        /* We only support devices that have a single target */
 476        if (dm_table_get_num_targets(map) != 1)
 477                return r;
 478
 479        tgt = dm_table_get_target(map, 0);
 480        if (!tgt->type->prepare_ioctl)
 481                return r;
 482
 483        if (dm_suspended_md(md))
 484                return -EAGAIN;
 485
 486        r = tgt->type->prepare_ioctl(tgt, bdev);
 487        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 488                dm_put_live_table(md, *srcu_idx);
 489                msleep(10);
 490                goto retry;
 491        }
 492
 493        return r;
 494}
 495
 496static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 497        __releases(md->io_barrier)
 498{
 499        dm_put_live_table(md, srcu_idx);
 500}
 501
 502static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 503                        unsigned int cmd, unsigned long arg)
 504{
 505        struct mapped_device *md = bdev->bd_disk->private_data;
 506        int r, srcu_idx;
 507
 508        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 509        if (r < 0)
 510                goto out;
 511
 512        if (r > 0) {
 513                /*
 514                 * Target determined this ioctl is being issued against a
 515                 * subset of the parent bdev; require extra privileges.
 516                 */
 517                if (!capable(CAP_SYS_RAWIO)) {
 518                        DMWARN_LIMIT(
 519        "%s: sending ioctl %x to DM device without required privilege.",
 520                                current->comm, cmd);
 521                        r = -ENOIOCTLCMD;
 522                        goto out;
 523                }
 524        }
 525
 526        r =  __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 527out:
 528        dm_unprepare_ioctl(md, srcu_idx);
 529        return r;
 530}
 531
 532static void start_io_acct(struct dm_io *io);
 533
 534static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 535{
 536        struct dm_io *io;
 537        struct dm_target_io *tio;
 538        struct bio *clone;
 539
 540        clone = bio_alloc_bioset(GFP_NOIO, 0, &md->io_bs);
 541        if (!clone)
 542                return NULL;
 543
 544        tio = container_of(clone, struct dm_target_io, clone);
 545        tio->inside_dm_io = true;
 546        tio->io = NULL;
 547
 548        io = container_of(tio, struct dm_io, tio);
 549        io->magic = DM_IO_MAGIC;
 550        io->status = 0;
 551        atomic_set(&io->io_count, 1);
 552        io->orig_bio = bio;
 553        io->md = md;
 554        spin_lock_init(&io->endio_lock);
 555
 556        start_io_acct(io);
 557
 558        return io;
 559}
 560
 561static void free_io(struct mapped_device *md, struct dm_io *io)
 562{
 563        bio_put(&io->tio.clone);
 564}
 565
 566static struct dm_target_io *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 567                                      unsigned target_bio_nr, gfp_t gfp_mask)
 568{
 569        struct dm_target_io *tio;
 570
 571        if (!ci->io->tio.io) {
 572                /* the dm_target_io embedded in ci->io is available */
 573                tio = &ci->io->tio;
 574        } else {
 575                struct bio *clone = bio_alloc_bioset(gfp_mask, 0, &ci->io->md->bs);
 576                if (!clone)
 577                        return NULL;
 578
 579                tio = container_of(clone, struct dm_target_io, clone);
 580                tio->inside_dm_io = false;
 581        }
 582
 583        tio->magic = DM_TIO_MAGIC;
 584        tio->io = ci->io;
 585        tio->ti = ti;
 586        tio->target_bio_nr = target_bio_nr;
 587
 588        return tio;
 589}
 590
 591static void free_tio(struct dm_target_io *tio)
 592{
 593        if (tio->inside_dm_io)
 594                return;
 595        bio_put(&tio->clone);
 596}
 597
 598int md_in_flight(struct mapped_device *md)
 599{
 600        return atomic_read(&md->pending[READ]) +
 601               atomic_read(&md->pending[WRITE]);
 602}
 603
 604static void start_io_acct(struct dm_io *io)
 605{
 606        struct mapped_device *md = io->md;
 607        struct bio *bio = io->orig_bio;
 608        int rw = bio_data_dir(bio);
 609
 610        io->start_time = jiffies;
 611
 612        generic_start_io_acct(md->queue, bio_op(bio), bio_sectors(bio),
 613                              &dm_disk(md)->part0);
 614
 615        atomic_set(&dm_disk(md)->part0.in_flight[rw],
 616                   atomic_inc_return(&md->pending[rw]));
 617
 618        if (unlikely(dm_stats_used(&md->stats)))
 619                dm_stats_account_io(&md->stats, bio_data_dir(bio),
 620                                    bio->bi_iter.bi_sector, bio_sectors(bio),
 621                                    false, 0, &io->stats_aux);
 622}
 623
 624static void end_io_acct(struct dm_io *io)
 625{
 626        struct mapped_device *md = io->md;
 627        struct bio *bio = io->orig_bio;
 628        unsigned long duration = jiffies - io->start_time;
 629        int pending;
 630        int rw = bio_data_dir(bio);
 631
 632        generic_end_io_acct(md->queue, bio_op(bio), &dm_disk(md)->part0,
 633                            io->start_time);
 634
 635        if (unlikely(dm_stats_used(&md->stats)))
 636                dm_stats_account_io(&md->stats, bio_data_dir(bio),
 637                                    bio->bi_iter.bi_sector, bio_sectors(bio),
 638                                    true, duration, &io->stats_aux);
 639
 640        /*
 641         * After this is decremented the bio must not be touched if it is
 642         * a flush.
 643         */
 644        pending = atomic_dec_return(&md->pending[rw]);
 645        atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
 646        pending += atomic_read(&md->pending[rw^0x1]);
 647
 648        /* nudge anyone waiting on suspend queue */
 649        if (!pending)
 650                wake_up(&md->wait);
 651}
 652
 653/*
 654 * Add the bio to the list of deferred io.
 655 */
 656static void queue_io(struct mapped_device *md, struct bio *bio)
 657{
 658        unsigned long flags;
 659
 660        spin_lock_irqsave(&md->deferred_lock, flags);
 661        bio_list_add(&md->deferred, bio);
 662        spin_unlock_irqrestore(&md->deferred_lock, flags);
 663        queue_work(md->wq, &md->work);
 664}
 665
 666/*
 667 * Everyone (including functions in this file) should use this
 668 * function to access the md->map field, and make sure they call
 669 * dm_put_live_table() when finished.
 670 */
 671struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 672{
 673        *srcu_idx = srcu_read_lock(&md->io_barrier);
 674
 675        return srcu_dereference(md->map, &md->io_barrier);
 676}
 677
 678void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 679{
 680        srcu_read_unlock(&md->io_barrier, srcu_idx);
 681}
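    /*
     * Illustrative sketch (not dm code) of the access pattern described in
     * the comment above (dm_prepare_ioctl() earlier in this file is a real
     * example):
     *
     *	int srcu_idx;
     *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
     *
     *	if (map) {
     *		... walk or query the table ...
     *	}
     *	dm_put_live_table(md, srcu_idx);
     */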
 682
 683void dm_sync_table(struct mapped_device *md)
 684{
 685        synchronize_srcu(&md->io_barrier);
 686        synchronize_rcu_expedited();
 687}
 688
 689/*
 690 * A fast alternative to dm_get_live_table/dm_put_live_table.
 691 * The caller must not block between these two functions.
 692 */
 693static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 694{
 695        rcu_read_lock();
 696        return rcu_dereference(md->map);
 697}
 698
 699static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 700{
 701        rcu_read_unlock();
 702}
 703
 704static char *_dm_claim_ptr = "I belong to device-mapper";
 705
 706/*
 707 * Open a table device so we can use it as a map destination.
 708 */
 709static int open_table_device(struct table_device *td, dev_t dev,
 710                             struct mapped_device *md)
 711{
 712        struct block_device *bdev;
 713
 714        int r;
 715
 716        BUG_ON(td->dm_dev.bdev);
 717
 718        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 719        if (IS_ERR(bdev))
 720                return PTR_ERR(bdev);
 721
 722        r = bd_link_disk_holder(bdev, dm_disk(md));
 723        if (r) {
 724                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 725                return r;
 726        }
 727
 728        td->dm_dev.bdev = bdev;
 729        td->dm_dev.dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
 730        return 0;
 731}
 732
 733/*
 734 * Close a table device that we've been using.
 735 */
 736static void close_table_device(struct table_device *td, struct mapped_device *md)
 737{
 738        if (!td->dm_dev.bdev)
 739                return;
 740
 741        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 742        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 743        put_dax(td->dm_dev.dax_dev);
 744        td->dm_dev.bdev = NULL;
 745        td->dm_dev.dax_dev = NULL;
 746}
 747
 748static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 749                                              fmode_t mode) {
 750        struct table_device *td;
 751
 752        list_for_each_entry(td, l, list)
 753                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 754                        return td;
 755
 756        return NULL;
 757}
 758
 759int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 760                        struct dm_dev **result) {
 761        int r;
 762        struct table_device *td;
 763
 764        mutex_lock(&md->table_devices_lock);
 765        td = find_table_device(&md->table_devices, dev, mode);
 766        if (!td) {
 767                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 768                if (!td) {
 769                        mutex_unlock(&md->table_devices_lock);
 770                        return -ENOMEM;
 771                }
 772
 773                td->dm_dev.mode = mode;
 774                td->dm_dev.bdev = NULL;
 775
 776                if ((r = open_table_device(td, dev, md))) {
 777                        mutex_unlock(&md->table_devices_lock);
 778                        kfree(td);
 779                        return r;
 780                }
 781
 782                format_dev_t(td->dm_dev.name, dev);
 783
 784                refcount_set(&td->count, 1);
 785                list_add(&td->list, &md->table_devices);
 786        } else {
 787                refcount_inc(&td->count);
 788        }
 789        mutex_unlock(&md->table_devices_lock);
 790
 791        *result = &td->dm_dev;
 792        return 0;
 793}
 794EXPORT_SYMBOL_GPL(dm_get_table_device);
 795
 796void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 797{
 798        struct table_device *td = container_of(d, struct table_device, dm_dev);
 799
 800        mutex_lock(&md->table_devices_lock);
 801        if (refcount_dec_and_test(&td->count)) {
 802                close_table_device(td, md);
 803                list_del(&td->list);
 804                kfree(td);
 805        }
 806        mutex_unlock(&md->table_devices_lock);
 807}
 808EXPORT_SYMBOL(dm_put_table_device);
 809
 810static void free_table_devices(struct list_head *devices)
 811{
 812        struct list_head *tmp, *next;
 813
 814        list_for_each_safe(tmp, next, devices) {
 815                struct table_device *td = list_entry(tmp, struct table_device, list);
 816
 817                DMWARN("dm_destroy: %s still exists with %d references",
 818                       td->dm_dev.name, refcount_read(&td->count));
 819                kfree(td);
 820        }
 821}
 822
 823/*
 824 * Get the geometry associated with a dm device
 825 */
 826int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 827{
 828        *geo = md->geometry;
 829
 830        return 0;
 831}
 832
 833/*
 834 * Set the geometry of a device.
 835 */
 836int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 837{
 838        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 839
 840        if (geo->start > sz) {
 841                DMWARN("Start sector is beyond the geometry limits.");
 842                return -EINVAL;
 843        }
 844
 845        md->geometry = *geo;
 846
 847        return 0;
 848}
 849
 850static int __noflush_suspending(struct mapped_device *md)
 851{
 852        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 853}
 854
 855/*
 856 * Decrements the number of outstanding ios that a bio has been
 857 * cloned into, completing the original io if necessary.
 858 */
 859static void dec_pending(struct dm_io *io, blk_status_t error)
 860{
 861        unsigned long flags;
 862        blk_status_t io_error;
 863        struct bio *bio;
 864        struct mapped_device *md = io->md;
 865
 866        /* Push-back supersedes any I/O errors */
 867        if (unlikely(error)) {
 868                spin_lock_irqsave(&io->endio_lock, flags);
 869                if (!(io->status == BLK_STS_DM_REQUEUE && __noflush_suspending(md)))
 870                        io->status = error;
 871                spin_unlock_irqrestore(&io->endio_lock, flags);
 872        }
 873
 874        if (atomic_dec_and_test(&io->io_count)) {
 875                if (io->status == BLK_STS_DM_REQUEUE) {
 876                        /*
 877                         * Target requested pushing back the I/O.
 878                         */
 879                        spin_lock_irqsave(&md->deferred_lock, flags);
 880                        if (__noflush_suspending(md))
 881                                /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 882                                bio_list_add_head(&md->deferred, io->orig_bio);
 883                        else
 884                                /* noflush suspend was interrupted. */
 885                                io->status = BLK_STS_IOERR;
 886                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 887                }
 888
 889                io_error = io->status;
 890                bio = io->orig_bio;
 891                end_io_acct(io);
 892                free_io(md, io);
 893
 894                if (io_error == BLK_STS_DM_REQUEUE)
 895                        return;
 896
 897                if ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size) {
 898                        /*
 899                         * Preflush done for flush with data, reissue
 900                         * without REQ_PREFLUSH.
 901                         */
 902                        bio->bi_opf &= ~REQ_PREFLUSH;
 903                        queue_io(md, bio);
 904                } else {
 905                        /* done with normal IO or empty flush */
 906                        if (io_error)
 907                                bio->bi_status = io_error;
 908                        bio_endio(bio);
 909                }
 910        }
 911}
 912
 913void disable_write_same(struct mapped_device *md)
 914{
 915        struct queue_limits *limits = dm_get_queue_limits(md);
 916
 917        /* device doesn't really support WRITE SAME, disable it */
 918        limits->max_write_same_sectors = 0;
 919}
 920
 921void disable_write_zeroes(struct mapped_device *md)
 922{
 923        struct queue_limits *limits = dm_get_queue_limits(md);
 924
 925        /* device doesn't really support WRITE ZEROES, disable it */
 926        limits->max_write_zeroes_sectors = 0;
 927}
 928
 929static void clone_endio(struct bio *bio)
 930{
 931        blk_status_t error = bio->bi_status;
 932        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
 933        struct dm_io *io = tio->io;
 934        struct mapped_device *md = tio->io->md;
 935        dm_endio_fn endio = tio->ti->type->end_io;
 936
 937        if (unlikely(error == BLK_STS_TARGET) && md->type != DM_TYPE_NVME_BIO_BASED) {
 938                if (bio_op(bio) == REQ_OP_WRITE_SAME &&
 939                    !bio->bi_disk->queue->limits.max_write_same_sectors)
 940                        disable_write_same(md);
 941                if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 942                    !bio->bi_disk->queue->limits.max_write_zeroes_sectors)
 943                        disable_write_zeroes(md);
 944        }
 945
 946        if (endio) {
 947                int r = endio(tio->ti, bio, &error);
 948                switch (r) {
 949                case DM_ENDIO_REQUEUE:
 950                        error = BLK_STS_DM_REQUEUE;
 951                        /*FALLTHRU*/
 952                case DM_ENDIO_DONE:
 953                        break;
 954                case DM_ENDIO_INCOMPLETE:
 955                        /* The target will handle the io */
 956                        return;
 957                default:
 958                        DMWARN("unimplemented target endio return value: %d", r);
 959                        BUG();
 960                }
 961        }
 962
 963        free_tio(tio);
 964        dec_pending(io, error);
 965}
 966
 967/*
 968 * Return maximum size of I/O possible at the supplied sector up to the current
 969 * target boundary.
 970 */
 971static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
 972{
 973        sector_t target_offset = dm_target_offset(ti, sector);
 974
 975        return ti->len - target_offset;
 976}
 977
 978static sector_t max_io_len(sector_t sector, struct dm_target *ti)
 979{
 980        sector_t len = max_io_len_target_boundary(sector, ti);
 981        sector_t offset, max_len;
 982
 983        /*
 984         * Does the target need to split even further?
 985         */
 986        if (ti->max_io_len) {
 987                offset = dm_target_offset(ti, sector);
 988                if (unlikely(ti->max_io_len & (ti->max_io_len - 1)))
 989                        max_len = sector_div(offset, ti->max_io_len);
 990                else
 991                        max_len = offset & (ti->max_io_len - 1);
 992                max_len = ti->max_io_len - max_len;
 993
 994                if (len > max_len)
 995                        len = max_len;
 996        }
 997
 998        return len;
 999}
1000
1001int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1002{
1003        if (len > UINT_MAX) {
1004                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1005                      (unsigned long long)len, UINT_MAX);
1006                ti->error = "Maximum size of target IO is too large";
1007                return -EINVAL;
1008        }
1009
1010        /*
1011 * A BIO-based queue uses its own splitting. When multipage bvecs
1012 * are switched on, the size of an incoming bio may be too big to
1013 * be handled by some targets, such as crypt.
1014         *
1015         * When these targets are ready for the big bio, we can remove
1016         * the limit.
1017         */
1018        ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
1019
1020        return 0;
1021}
1022EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
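    /*
     * Illustrative sketch (not dm code): a target whose I/O must not cross a
     * fixed boundary typically calls this from its constructor; "chunk_size"
     * (in sectors) is a hypothetical target parameter:
     *
     *	r = dm_set_target_max_io_len(ti, chunk_size);
     *	if (r)
     *		return r;
     *
     * max_io_len() above then caps each clone at the distance to the next
     * such boundary.
     */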
1023
1024static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1025                                                sector_t sector, int *srcu_idx)
1026        __acquires(md->io_barrier)
1027{
1028        struct dm_table *map;
1029        struct dm_target *ti;
1030
1031        map = dm_get_live_table(md, srcu_idx);
1032        if (!map)
1033                return NULL;
1034
1035        ti = dm_table_find_target(map, sector);
1036        if (!dm_target_is_valid(ti))
1037                return NULL;
1038
1039        return ti;
1040}
1041
1042static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1043                                 long nr_pages, void **kaddr, pfn_t *pfn)
1044{
1045        struct mapped_device *md = dax_get_private(dax_dev);
1046        sector_t sector = pgoff * PAGE_SECTORS;
1047        struct dm_target *ti;
1048        long len, ret = -EIO;
1049        int srcu_idx;
1050
1051        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1052
1053        if (!ti)
1054                goto out;
1055        if (!ti->type->direct_access)
1056                goto out;
1057        len = max_io_len(sector, ti) / PAGE_SECTORS;
1058        if (len < 1)
1059                goto out;
1060        nr_pages = min(len, nr_pages);
1061        ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1062
1063 out:
1064        dm_put_live_table(md, srcu_idx);
1065
1066        return ret;
1067}
1068
1069static size_t dm_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1070                                    void *addr, size_t bytes, struct iov_iter *i)
1071{
1072        struct mapped_device *md = dax_get_private(dax_dev);
1073        sector_t sector = pgoff * PAGE_SECTORS;
1074        struct dm_target *ti;
1075        long ret = 0;
1076        int srcu_idx;
1077
1078        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1079
1080        if (!ti)
1081                goto out;
1082        if (!ti->type->dax_copy_from_iter) {
1083                ret = copy_from_iter(addr, bytes, i);
1084                goto out;
1085        }
1086        ret = ti->type->dax_copy_from_iter(ti, pgoff, addr, bytes, i);
1087 out:
1088        dm_put_live_table(md, srcu_idx);
1089
1090        return ret;
1091}
1092
1093static size_t dm_dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff,
1094                void *addr, size_t bytes, struct iov_iter *i)
1095{
1096        struct mapped_device *md = dax_get_private(dax_dev);
1097        sector_t sector = pgoff * PAGE_SECTORS;
1098        struct dm_target *ti;
1099        long ret = 0;
1100        int srcu_idx;
1101
1102        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1103
1104        if (!ti)
1105                goto out;
1106        if (!ti->type->dax_copy_to_iter) {
1107                ret = copy_to_iter(addr, bytes, i);
1108                goto out;
1109        }
1110        ret = ti->type->dax_copy_to_iter(ti, pgoff, addr, bytes, i);
1111 out:
1112        dm_put_live_table(md, srcu_idx);
1113
1114        return ret;
1115}
1116
1117/*
1118 * A target may call dm_accept_partial_bio only from the map routine.  It is
1119 * allowed for all bio types except REQ_PREFLUSH and REQ_OP_ZONE_RESET.
1120 *
1121 * dm_accept_partial_bio informs the dm that the target only wants to process
1122 * an additional n_sectors sectors of the bio and that the rest of the data
1123 * should be sent in a subsequent bio.
1124 *
1125 * A diagram that explains the arithmetic:
1126 * +--------------------+---------------+-------+
1127 * |         1          |       2       |   3   |
1128 * +--------------------+---------------+-------+
1129 *
1130 * <-------------- *tio->len_ptr --------------->
1131 *                      <------- bi_size ------->
1132 *                      <-- n_sectors -->
1133 *
1134 * Region 1 was already iterated over with bio_advance or similar function.
1135 *      (it may be empty if the target doesn't use bio_advance)
1136 * Region 2 is the remaining bio size that the target wants to process.
1137 *      (it may be empty if region 1 is non-empty, although there is no reason
1138 *       to make it empty)
1139 * The target requires that region 3 be sent in the next bio.
1140 *
1141 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1142 * the partially processed part (the sum of regions 1+2) must be the same for all
1143 * copies of the bio.
1144 */
1145void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1146{
1147        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1148        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1149        BUG_ON(bio->bi_opf & REQ_PREFLUSH);
1150        BUG_ON(bi_size > *tio->len_ptr);
1151        BUG_ON(n_sectors > bi_size);
1152        *tio->len_ptr -= bi_size - n_sectors;
1153        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1154}
1155EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
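    /*
     * Illustrative sketch (not dm code): a target's ->map method can use
     * dm_accept_partial_bio() to take only part of a bio; dm core then sends
     * the remainder to the target in a subsequent bio.  "example_map",
     * "max_sectors" and "example_dev" are hypothetical names:
     *
     *	static int example_map(struct dm_target *ti, struct bio *bio)
     *	{
     *		if (bio_sectors(bio) > max_sectors)
     *			dm_accept_partial_bio(bio, max_sectors);
     *
     *		bio_set_dev(bio, example_dev->bdev);
     *		return DM_MAPIO_REMAPPED;
     *	}
     */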
1156
1157/*
1158 * The zone descriptors obtained with a zone report indicate zone positions
1159 * within the target backing device, regardless of whether that device is a partition
1160 * and regardless of the target mapping start sector on the device or partition.
1161 * The zone descriptors' start sector and write pointer position must be adjusted
1162 * to match their relative position within the dm device.
1163 * A target may call dm_remap_zone_report() after completion of a
1164 * REQ_OP_ZONE_REPORT bio to remap the zone descriptors obtained from the
1165 * backing device.
1166 */
1167void dm_remap_zone_report(struct dm_target *ti, struct bio *bio, sector_t start)
1168{
1169#ifdef CONFIG_BLK_DEV_ZONED
1170        struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone);
1171        struct bio *report_bio = tio->io->orig_bio;
1172        struct blk_zone_report_hdr *hdr = NULL;
1173        struct blk_zone *zone;
1174        unsigned int nr_rep = 0;
1175        unsigned int ofst;
1176        sector_t part_offset;
1177        struct bio_vec bvec;
1178        struct bvec_iter iter;
1179        void *addr;
1180
1181        if (bio->bi_status)
1182                return;
1183
1184        /*
1185         * The clone bio sector was incremented by the request size on completion.
1186         * Taking into account the original request sector, the target start offset
1187         * on the backing device and the target mapping offset (ti->begin), the
1188         * start sector (partition offset) of the backing device can be deduced.
1189         * The partition offset is always 0 if the target uses a whole device.
1190         */
1191        part_offset = bio->bi_iter.bi_sector + ti->begin - (start + bio_end_sector(report_bio));
1192
1193        /*
1194         * Remap the start sector of the reported zones. For sequential zones,
1195         * also remap the write pointer position.
1196         */
1197        bio_for_each_segment(bvec, report_bio, iter) {
1198                addr = kmap_atomic(bvec.bv_page);
1199
1200                /* Remember the report header in the first page */
1201                if (!hdr) {
1202                        hdr = addr;
1203                        ofst = sizeof(struct blk_zone_report_hdr);
1204                } else
1205                        ofst = 0;
1206
1207                /* Set zones start sector */
1208                while (hdr->nr_zones && ofst < bvec.bv_len) {
1209                        zone = addr + ofst;
1210                        zone->start -= part_offset;
1211                        if (zone->start >= start + ti->len) {
1212                                hdr->nr_zones = 0;
1213                                break;
1214                        }
1215                        zone->start = zone->start + ti->begin - start;
1216                        if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
1217                                if (zone->cond == BLK_ZONE_COND_FULL)
1218                                        zone->wp = zone->start + zone->len;
1219                                else if (zone->cond == BLK_ZONE_COND_EMPTY)
1220                                        zone->wp = zone->start;
1221                                else
1222                                        zone->wp = zone->wp + ti->begin - start - part_offset;
1223                        }
1224                        ofst += sizeof(struct blk_zone);
1225                        hdr->nr_zones--;
1226                        nr_rep++;
1227                }
1228
1229                if (addr != hdr)
1230                        kunmap_atomic(addr);
1231
1232                if (!hdr->nr_zones)
1233                        break;
1234        }
1235
1236        if (hdr) {
1237                hdr->nr_zones = nr_rep;
1238                kunmap_atomic(hdr);
1239        }
1240
1241        bio_advance(report_bio, report_bio->bi_iter.bi_size);
1242
1243#else /* !CONFIG_BLK_DEV_ZONED */
1244        bio->bi_status = BLK_STS_NOTSUPP;
1245#endif
1246}
1247EXPORT_SYMBOL_GPL(dm_remap_zone_report);
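    /*
     * Illustrative sketch (not dm code): a simple remapping target would call
     * dm_remap_zone_report() from its ->end_io method once a
     * REQ_OP_ZONE_REPORT bio has completed without error; "example_end_io"
     * and "lc->start" (the target's start sector on the backing device) are
     * hypothetical names:
     *
     *	static int example_end_io(struct dm_target *ti, struct bio *bio,
     *				  blk_status_t *error)
     *	{
     *		if (!*error && bio_op(bio) == REQ_OP_ZONE_REPORT)
     *			dm_remap_zone_report(ti, bio, lc->start);
     *
     *		return DM_ENDIO_DONE;
     *	}
     */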
1248
1249static blk_qc_t __map_bio(struct dm_target_io *tio)
1250{
1251        int r;
1252        sector_t sector;
1253        struct bio *clone = &tio->clone;
1254        struct dm_io *io = tio->io;
1255        struct mapped_device *md = io->md;
1256        struct dm_target *ti = tio->ti;
1257        blk_qc_t ret = BLK_QC_T_NONE;
1258
1259        clone->bi_end_io = clone_endio;
1260
1261        /*
1262         * Map the clone.  If r == 0 we don't need to do
1263         * anything, the target has assumed ownership of
1264         * this io.
1265         */
1266        atomic_inc(&io->io_count);
1267        sector = clone->bi_iter.bi_sector;
1268
1269        r = ti->type->map(ti, clone);
1270        switch (r) {
1271        case DM_MAPIO_SUBMITTED:
1272                break;
1273        case DM_MAPIO_REMAPPED:
1274                /* the bio has been remapped so dispatch it */
1275                trace_block_bio_remap(clone->bi_disk->queue, clone,
1276                                      bio_dev(io->orig_bio), sector);
1277                if (md->type == DM_TYPE_NVME_BIO_BASED)
1278                        ret = direct_make_request(clone);
1279                else
1280                        ret = generic_make_request(clone);
1281                break;
1282        case DM_MAPIO_KILL:
1283                free_tio(tio);
1284                dec_pending(io, BLK_STS_IOERR);
1285                break;
1286        case DM_MAPIO_REQUEUE:
1287                free_tio(tio);
1288                dec_pending(io, BLK_STS_DM_REQUEUE);
1289                break;
1290        default:
1291                DMWARN("unimplemented target map return value: %d", r);
1292                BUG();
1293        }
1294
1295        return ret;
1296}
1297
1298static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
1299{
1300        bio->bi_iter.bi_sector = sector;
1301        bio->bi_iter.bi_size = to_bytes(len);
1302}
1303
1304/*
1305 * Creates a bio that consists of a range of complete bvecs.
1306 */
1307static int clone_bio(struct dm_target_io *tio, struct bio *bio,
1308                     sector_t sector, unsigned len)
1309{
1310        struct bio *clone = &tio->clone;
1311
1312        __bio_clone_fast(clone, bio);
1313
1314        if (unlikely(bio_integrity(bio) != NULL)) {
1315                int r;
1316
1317                if (unlikely(!dm_target_has_integrity(tio->ti->type) &&
1318                             !dm_target_passes_integrity(tio->ti->type))) {
1319                        DMWARN("%s: the target %s doesn't support integrity data.",
1320                                dm_device_name(tio->io->md),
1321                                tio->ti->type->name);
1322                        return -EIO;
1323                }
1324
1325                r = bio_integrity_clone(clone, bio, GFP_NOIO);
1326                if (r < 0)
1327                        return r;
1328        }
1329
1330        if (bio_op(bio) != REQ_OP_ZONE_REPORT)
1331                bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
1332        clone->bi_iter.bi_size = to_bytes(len);
1333
1334        if (unlikely(bio_integrity(bio) != NULL))
1335                bio_integrity_trim(clone);
1336
1337        return 0;
1338}
1339
1340static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1341                                struct dm_target *ti, unsigned num_bios)
1342{
1343        struct dm_target_io *tio;
1344        int try;
1345
1346        if (!num_bios)
1347                return;
1348
1349        if (num_bios == 1) {
1350                tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1351                bio_list_add(blist, &tio->clone);
1352                return;
1353        }
1354
1355        for (try = 0; try < 2; try++) {
1356                int bio_nr;
1357                struct bio *bio;
1358
1359                if (try)
1360                        mutex_lock(&ci->io->md->table_devices_lock);
1361                for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1362                        tio = alloc_tio(ci, ti, bio_nr, try ? GFP_NOIO : GFP_NOWAIT);
1363                        if (!tio)
1364                                break;
1365
1366                        bio_list_add(blist, &tio->clone);
1367                }
1368                if (try)
1369                        mutex_unlock(&ci->io->md->table_devices_lock);
1370                if (bio_nr == num_bios)
1371                        return;
1372
1373                while ((bio = bio_list_pop(blist))) {
1374                        tio = container_of(bio, struct dm_target_io, clone);
1375                        free_tio(tio);
1376                }
1377        }
1378}
1379
1380static blk_qc_t __clone_and_map_simple_bio(struct clone_info *ci,
1381                                           struct dm_target_io *tio, unsigned *len)
1382{
1383        struct bio *clone = &tio->clone;
1384
1385        tio->len_ptr = len;
1386
1387        __bio_clone_fast(clone, ci->bio);
1388        if (len)
1389                bio_setup_sector(clone, ci->sector, *len);
1390
1391        return __map_bio(tio);
1392}
1393
1394static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1395                                  unsigned num_bios, unsigned *len)
1396{
1397        struct bio_list blist = BIO_EMPTY_LIST;
1398        struct bio *bio;
1399        struct dm_target_io *tio;
1400
1401        alloc_multiple_bios(&blist, ci, ti, num_bios);
1402
1403        while ((bio = bio_list_pop(&blist))) {
1404                tio = container_of(bio, struct dm_target_io, clone);
1405                (void) __clone_and_map_simple_bio(ci, tio, len);
1406        }
1407}
1408
1409static int __send_empty_flush(struct clone_info *ci)
1410{
1411        unsigned target_nr = 0;
1412        struct dm_target *ti;
1413
1414        BUG_ON(bio_has_data(ci->bio));
1415        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1416                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1417
1418        return 0;
1419}
1420
1421static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
1422                                    sector_t sector, unsigned *len)
1423{
1424        struct bio *bio = ci->bio;
1425        struct dm_target_io *tio;
1426        int r;
1427
1428        tio = alloc_tio(ci, ti, 0, GFP_NOIO);
1429        tio->len_ptr = len;
1430        r = clone_bio(tio, bio, sector, *len);
1431        if (r < 0) {
1432                free_tio(tio);
1433                return r;
1434        }
1435        (void) __map_bio(tio);
1436
1437        return 0;
1438}
1439
1440typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
1441
1442static unsigned get_num_discard_bios(struct dm_target *ti)
1443{
1444        return ti->num_discard_bios;
1445}
1446
1447static unsigned get_num_secure_erase_bios(struct dm_target *ti)
1448{
1449        return ti->num_secure_erase_bios;
1450}
1451
1452static unsigned get_num_write_same_bios(struct dm_target *ti)
1453{
1454        return ti->num_write_same_bios;
1455}
1456
1457static unsigned get_num_write_zeroes_bios(struct dm_target *ti)
1458{
1459        return ti->num_write_zeroes_bios;
1460}
1461
1462typedef bool (*is_split_required_fn)(struct dm_target *ti);
1463
1464static bool is_split_required_for_discard(struct dm_target *ti)
1465{
1466        return ti->split_discard_bios;
1467}
1468
1469static int __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1470                                       get_num_bios_fn get_num_bios,
1471                                       is_split_required_fn is_split_required)
1472{
1473        unsigned len;
1474        unsigned num_bios;
1475
1476        /*
1477         * Even though the device advertised support for this type of
1478         * request, that does not mean every target supports it, and
1479         * reconfiguration might also have changed that since the
1480         * check was performed.
1481         */
1482        num_bios = get_num_bios ? get_num_bios(ti) : 0;
1483        if (!num_bios)
1484                return -EOPNOTSUPP;
1485
1486        if (is_split_required && !is_split_required(ti))
1487                len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1488        else
1489                len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti));
1490
1491        __send_duplicate_bios(ci, ti, num_bios, &len);
1492
1493        ci->sector += len;
1494        ci->sector_count -= len;
1495
1496        return 0;
1497}
1498
1499static int __send_discard(struct clone_info *ci, struct dm_target *ti)
1500{
1501        return __send_changing_extent_only(ci, ti, get_num_discard_bios,
1502                                           is_split_required_for_discard);
1503}
1504
1505static int __send_secure_erase(struct clone_info *ci, struct dm_target *ti)
1506{
1507        return __send_changing_extent_only(ci, ti, get_num_secure_erase_bios, NULL);
1508}
1509
1510static int __send_write_same(struct clone_info *ci, struct dm_target *ti)
1511{
1512        return __send_changing_extent_only(ci, ti, get_num_write_same_bios, NULL);
1513}
1514
1515static int __send_write_zeroes(struct clone_info *ci, struct dm_target *ti)
1516{
1517        return __send_changing_extent_only(ci, ti, get_num_write_zeroes_bios, NULL);
1518}
1519
1520static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1521                                  int *result)
1522{
1523        struct bio *bio = ci->bio;
1524
1525        if (bio_op(bio) == REQ_OP_DISCARD)
1526                *result = __send_discard(ci, ti);
1527        else if (bio_op(bio) == REQ_OP_SECURE_ERASE)
1528                *result = __send_secure_erase(ci, ti);
1529        else if (bio_op(bio) == REQ_OP_WRITE_SAME)
1530                *result = __send_write_same(ci, ti);
1531        else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
1532                *result = __send_write_zeroes(ci, ti);
1533        else
1534                return false;
1535
1536        return true;
1537}
1538
1539/*
1540 * Select the correct strategy for processing a non-flush bio.
1541 */
1542static int __split_and_process_non_flush(struct clone_info *ci)
1543{
1544        struct bio *bio = ci->bio;
1545        struct dm_target *ti;
1546        unsigned len;
1547        int r;
1548
1549        ti = dm_table_find_target(ci->map, ci->sector);
1550        if (!dm_target_is_valid(ti))
1551                return -EIO;
1552
1553        if (unlikely(__process_abnormal_io(ci, ti, &r)))
1554                return r;
1555
1556        if (bio_op(bio) == REQ_OP_ZONE_REPORT)
1557                len = ci->sector_count;
1558        else
1559                len = min_t(sector_t, max_io_len(ci->sector, ti),
1560                            ci->sector_count);
1561
1562        r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
1563        if (r < 0)
1564                return r;
1565
1566        ci->sector += len;
1567        ci->sector_count -= len;
1568
1569        return 0;
1570}
1571
1572static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1573                            struct dm_table *map, struct bio *bio)
1574{
1575        ci->map = map;
1576        ci->io = alloc_io(md, bio);
1577        ci->sector = bio->bi_iter.bi_sector;
1578}
1579
1580/*
1581 * Entry point to split a bio into clones and submit them to the targets.
1582 */
1583static blk_qc_t __split_and_process_bio(struct mapped_device *md,
1584                                        struct dm_table *map, struct bio *bio)
1585{
1586        struct clone_info ci;
1587        blk_qc_t ret = BLK_QC_T_NONE;
1588        int error = 0;
1589
1590        if (unlikely(!map)) {
1591                bio_io_error(bio);
1592                return ret;
1593        }
1594
1595        init_clone_info(&ci, md, map, bio);
1596
1597        if (bio->bi_opf & REQ_PREFLUSH) {
1598                ci.bio = &ci.io->md->flush_bio;
1599                ci.sector_count = 0;
1600                error = __send_empty_flush(&ci);
1601                /* dec_pending submits any data associated with flush */
1602        } else if (bio_op(bio) == REQ_OP_ZONE_RESET) {
1603                ci.bio = bio;
1604                ci.sector_count = 0;
1605                error = __split_and_process_non_flush(&ci);
1606        } else {
1607                ci.bio = bio;
1608                ci.sector_count = bio_sectors(bio);
1609                while (ci.sector_count && !error) {
1610                        error = __split_and_process_non_flush(&ci);
1611                        if (current->bio_list && ci.sector_count && !error) {
1612                                /*
1613                                 * Remainder must be passed to generic_make_request()
1614                                 * so that it gets handled *after* bios already submitted
1615                                 * have been completely processed.
1616                                 * We take a clone of the original to store in
1617                                 * ci.io->orig_bio to be used by end_io_acct() and
1618                                 * for dec_pending to use for completion handling.
1619                                 * As this path is not used for REQ_OP_ZONE_REPORT,
1620                                 * the usage of io->orig_bio in dm_remap_zone_report()
1621                                 * won't be affected by this reassignment.
1622                                 */
1623                                struct bio *b = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1624                                                          GFP_NOIO, &md->queue->bio_split);
1625                                ci.io->orig_bio = b;
1626                                bio_chain(b, bio);
1627                                ret = generic_make_request(bio);
1628                                break;
1629                        }
1630                }
1631        }
1632
1633        /* drop the extra reference count */
1634        dec_pending(ci.io, errno_to_blk_status(error));
1635        return ret;
1636}
1637
1638/*
1639 * Optimized variant of __split_and_process_bio that leverages the
1640 * fact that targets that use it do _not_ have a need to split bios.
1641 */
1642static blk_qc_t __process_bio(struct mapped_device *md,
1643                              struct dm_table *map, struct bio *bio)
1644{
1645        struct clone_info ci;
1646        blk_qc_t ret = BLK_QC_T_NONE;
1647        int error = 0;
1648
1649        if (unlikely(!map)) {
1650                bio_io_error(bio);
1651                return ret;
1652        }
1653
1654        init_clone_info(&ci, md, map, bio);
1655
1656        if (bio->bi_opf & REQ_PREFLUSH) {
1657                ci.bio = &ci.io->md->flush_bio;
1658                ci.sector_count = 0;
1659                error = __send_empty_flush(&ci);
1660                /* dec_pending submits any data associated with flush */
1661        } else {
1662                struct dm_target *ti = md->immutable_target;
1663                struct dm_target_io *tio;
1664
1665                /*
1666                 * Defend against IO still getting in during teardown
1667                 * - as was seen for a time with nvme-fcloop
1668                 */
1669                if (unlikely(WARN_ON_ONCE(!ti || !dm_target_is_valid(ti)))) {
1670                        error = -EIO;
1671                        goto out;
1672                }
1673
1674                ci.bio = bio;
1675                ci.sector_count = bio_sectors(bio);
1676                if (unlikely(__process_abnormal_io(&ci, ti, &error)))
1677                        goto out;
1678
1679                tio = alloc_tio(&ci, ti, 0, GFP_NOIO);
1680                ret = __clone_and_map_simple_bio(&ci, tio, NULL);
1681        }
1682out:
1683        /* drop the extra reference count */
1684        dec_pending(ci.io, errno_to_blk_status(error));
1685        return ret;
1686}
1687
1688typedef blk_qc_t (process_bio_fn)(struct mapped_device *, struct dm_table *, struct bio *);
1689
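/*
 * Entry point shared by dm_make_request() and dm_make_request_nvme(): take
 * the live table under SRCU, defer the bio if the device is suspended
 * (readahead bios are simply failed), otherwise hand it to @process_bio.
 */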
1690static blk_qc_t __dm_make_request(struct request_queue *q, struct bio *bio,
1691                                  process_bio_fn process_bio)
1692{
1693        struct mapped_device *md = q->queuedata;
1694        blk_qc_t ret = BLK_QC_T_NONE;
1695        int srcu_idx;
1696        struct dm_table *map;
1697
1698        map = dm_get_live_table(md, &srcu_idx);
1699
1700        /* if we're suspended, we have to queue this io for later */
1701        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1702                dm_put_live_table(md, srcu_idx);
1703
1704                if (!(bio->bi_opf & REQ_RAHEAD))
1705                        queue_io(md, bio);
1706                else
1707                        bio_io_error(bio);
1708                return ret;
1709        }
1710
1711        ret = process_bio(md, map, bio);
1712
1713        dm_put_live_table(md, srcu_idx);
1714        return ret;
1715}
1716
1717/*
1718 * The request function that remaps the bio to one target and
1719 * splits off any remainder.
1720 */
1721static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio)
1722{
1723        return __dm_make_request(q, bio, __split_and_process_bio);
1724}
1725
1726static blk_qc_t dm_make_request_nvme(struct request_queue *q, struct bio *bio)
1727{
1728        return __dm_make_request(q, bio, __process_bio);
1729}
1730
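/*
 * backing_dev_info congestion callback, wired up in
 * dm_init_normal_md_queue().
 */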
1731static int dm_any_congested(void *congested_data, int bdi_bits)
1732{
1733        int r = bdi_bits;
1734        struct mapped_device *md = congested_data;
1735        struct dm_table *map;
1736
1737        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1738                if (dm_request_based(md)) {
1739                        /*
1740                         * With request-based DM we only need to check the
1741                         * top-level queue for congestion.
1742                         */
1743                        r = md->queue->backing_dev_info->wb.state & bdi_bits;
1744                } else {
1745                        map = dm_get_live_table_fast(md);
1746                        if (map)
1747                                r = dm_table_any_congested(map, bdi_bits);
1748                        dm_put_live_table_fast(md);
1749                }
1750        }
1751
1752        return r;
1753}
1754
1755/*-----------------------------------------------------------------
1756 * An IDR is used to keep track of allocated minor numbers.
1757 *---------------------------------------------------------------*/
1758static void free_minor(int minor)
1759{
1760        spin_lock(&_minor_lock);
1761        idr_remove(&_minor_idr, minor);
1762        spin_unlock(&_minor_lock);
1763}
1764
1765/*
1766 * See if the device with a specific minor # is free.
1767 */
1768static int specific_minor(int minor)
1769{
1770        int r;
1771
1772        if (minor >= (1 << MINORBITS))
1773                return -EINVAL;
1774
1775        idr_preload(GFP_KERNEL);
1776        spin_lock(&_minor_lock);
1777
1778        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1779
1780        spin_unlock(&_minor_lock);
1781        idr_preload_end();
1782        if (r < 0)
1783                return r == -ENOSPC ? -EBUSY : r;
1784        return 0;
1785}
1786
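/*
 * Allocate the lowest free minor number.
 */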
1787static int next_free_minor(int *minor)
1788{
1789        int r;
1790
1791        idr_preload(GFP_KERNEL);
1792        spin_lock(&_minor_lock);
1793
1794        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1795
1796        spin_unlock(&_minor_lock);
1797        idr_preload_end();
1798        if (r < 0)
1799                return r;
1800        *minor = r;
1801        return 0;
1802}
1803
1804static const struct block_device_operations dm_blk_dops;
1805static const struct dax_operations dm_dax_ops;
1806
1807static void dm_wq_work(struct work_struct *work);
1808
1809static void dm_init_normal_md_queue(struct mapped_device *md)
1810{
1811        md->use_blk_mq = false;
1812
1813        /*
1814         * Initialize aspects of queue that aren't relevant for blk-mq
1815         */
1816        md->queue->backing_dev_info->congested_fn = dm_any_congested;
1817}
1818
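/*
 * Release everything alloc_dev() may have set up.  Also used on
 * alloc_dev()'s error path, so each resource is checked before it is
 * torn down.
 */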
1819static void cleanup_mapped_device(struct mapped_device *md)
1820{
1821        if (md->wq)
1822                destroy_workqueue(md->wq);
1823        if (md->kworker_task)
1824                kthread_stop(md->kworker_task);
1825        bioset_exit(&md->bs);
1826        bioset_exit(&md->io_bs);
1827
1828        if (md->dax_dev) {
1829                kill_dax(md->dax_dev);
1830                put_dax(md->dax_dev);
1831                md->dax_dev = NULL;
1832        }
1833
1834        if (md->disk) {
1835                spin_lock(&_minor_lock);
1836                md->disk->private_data = NULL;
1837                spin_unlock(&_minor_lock);
1838                del_gendisk(md->disk);
1839                put_disk(md->disk);
1840        }
1841
1842        if (md->queue)
1843                blk_cleanup_queue(md->queue);
1844
1845        cleanup_srcu_struct(&md->io_barrier);
1846
1847        if (md->bdev) {
1848                bdput(md->bdev);
1849                md->bdev = NULL;
1850        }
1851
1852        mutex_destroy(&md->suspend_lock);
1853        mutex_destroy(&md->type_lock);
1854        mutex_destroy(&md->table_devices_lock);
1855
1856        dm_mq_cleanup_mapped_device(md);
1857}
1858
1859/*
1860 * Allocate and initialise a blank device with a given minor.
1861 */
1862static struct mapped_device *alloc_dev(int minor)
1863{
1864        int r, numa_node_id = dm_get_numa_node();
1865        struct dax_device *dax_dev = NULL;
1866        struct mapped_device *md;
1867        void *old_md;
1868
1869        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1870        if (!md) {
1871                DMWARN("unable to allocate device, out of memory.");
1872                return NULL;
1873        }
1874
1875        if (!try_module_get(THIS_MODULE))
1876                goto bad_module_get;
1877
1878        /* get a minor number for the dev */
1879        if (minor == DM_ANY_MINOR)
1880                r = next_free_minor(&minor);
1881        else
1882                r = specific_minor(minor);
1883        if (r < 0)
1884                goto bad_minor;
1885
1886        r = init_srcu_struct(&md->io_barrier);
1887        if (r < 0)
1888                goto bad_io_barrier;
1889
1890        md->numa_node_id = numa_node_id;
1891        md->use_blk_mq = dm_use_blk_mq_default();
1892        md->init_tio_pdu = false;
1893        md->type = DM_TYPE_NONE;
1894        mutex_init(&md->suspend_lock);
1895        mutex_init(&md->type_lock);
1896        mutex_init(&md->table_devices_lock);
1897        spin_lock_init(&md->deferred_lock);
1898        atomic_set(&md->holders, 1);
1899        atomic_set(&md->open_count, 0);
1900        atomic_set(&md->event_nr, 0);
1901        atomic_set(&md->uevent_seq, 0);
1902        INIT_LIST_HEAD(&md->uevent_list);
1903        INIT_LIST_HEAD(&md->table_devices);
1904        spin_lock_init(&md->uevent_lock);
1905
1906        md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id, NULL);
1907        if (!md->queue)
1908                goto bad;
1909        md->queue->queuedata = md;
1910        md->queue->backing_dev_info->congested_data = md;
1911
1912        md->disk = alloc_disk_node(1, md->numa_node_id);
1913        if (!md->disk)
1914                goto bad;
1915
1916        atomic_set(&md->pending[0], 0);
1917        atomic_set(&md->pending[1], 0);
1918        init_waitqueue_head(&md->wait);
1919        INIT_WORK(&md->work, dm_wq_work);
1920        init_waitqueue_head(&md->eventq);
1921        init_completion(&md->kobj_holder.completion);
1922        md->kworker_task = NULL;
1923
1924        md->disk->major = _major;
1925        md->disk->first_minor = minor;
1926        md->disk->fops = &dm_blk_dops;
1927        md->disk->queue = md->queue;
1928        md->disk->private_data = md;
1929        sprintf(md->disk->disk_name, "dm-%d", minor);
1930
1931        if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
1932                dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
1933                if (!dax_dev)
1934                        goto bad;
1935        }
1936        md->dax_dev = dax_dev;
1937
1938        add_disk_no_queue_reg(md->disk);
1939        format_dev_t(md->name, MKDEV(_major, minor));
1940
1941        md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
1942        if (!md->wq)
1943                goto bad;
1944
1945        md->bdev = bdget_disk(md->disk, 0);
1946        if (!md->bdev)
1947                goto bad;
1948
1949        bio_init(&md->flush_bio, NULL, 0);
1950        bio_set_dev(&md->flush_bio, md->bdev);
1951        md->flush_bio.bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC;
1952
1953        dm_stats_init(&md->stats);
1954
1955        /* Populate the mapping, nobody knows we exist yet */
1956        spin_lock(&_minor_lock);
1957        old_md = idr_replace(&_minor_idr, md, minor);
1958        spin_unlock(&_minor_lock);
1959
1960        BUG_ON(old_md != MINOR_ALLOCED);
1961
1962        return md;
1963
1964bad:
1965        cleanup_mapped_device(md);
1966bad_io_barrier:
1967        free_minor(minor);
1968bad_minor:
1969        module_put(THIS_MODULE);
1970bad_module_get:
1971        kvfree(md);
1972        return NULL;
1973}
1974
1975static void unlock_fs(struct mapped_device *md);
1976
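/*
 * Final teardown: release the device's resources, return its minor
 * number and drop the module reference taken in alloc_dev().
 */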
1977static void free_dev(struct mapped_device *md)
1978{
1979        int minor = MINOR(disk_devt(md->disk));
1980
1981        unlock_fs(md);
1982
1983        cleanup_mapped_device(md);
1984
1985        free_table_devices(&md->table_devices);
1986        dm_stats_cleanup(&md->stats);
1987        free_minor(minor);
1988
1989        module_put(THIS_MODULE);
1990        kvfree(md);
1991}
1992
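/*
 * Take over the biosets prepared in the table's mempools.  Bio-based
 * tables always reload them because front_pad may have changed;
 * request-based tables keep the biosets they already have.
 */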
1993static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1994{
1995        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1996        int ret = 0;
1997
1998        if (dm_table_bio_based(t)) {
1999                /*
2000                 * The md may already have mempools that need changing.
2001                 * If so, reload bioset because front_pad may have changed
2002                 * because a different table was loaded.
2003                 */
2004                bioset_exit(&md->bs);
2005                bioset_exit(&md->io_bs);
2006
2007        } else if (bioset_initialized(&md->bs)) {
2008                /*
2009                 * There's no need to reload with request-based dm because
2010                 * the size of front_pad doesn't change.
2011                 * Note for the future: if you do reload the bioset, prep-ed
2012                 * requests in the queue may still refer to bios from the old
2013                 * bioset, so you must walk through the queue and unprep them
2014                 * first.
2015                 */
2016                goto out;
2017        }
2018
2019        BUG_ON(!p ||
2020               bioset_initialized(&md->bs) ||
2021               bioset_initialized(&md->io_bs));
2022
2023        ret = bioset_init_from_src(&md->bs, &p->bs);
2024        if (ret)
2025                goto out;
2026        ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
2027        if (ret)
2028                bioset_exit(&md->bs);
2029out:
2030        /* mempool bind completed, no longer need any mempools in the table */
2031        dm_table_free_md_mempools(t);
2032        return ret;
2033}
2034
2035/*
2036 * Table event callback: forward queued uevents and wake up event waiters.
2037 */
2038static void event_callback(void *context)
2039{
2040        unsigned long flags;
2041        LIST_HEAD(uevents);
2042        struct mapped_device *md = (struct mapped_device *) context;
2043
2044        spin_lock_irqsave(&md->uevent_lock, flags);
2045        list_splice_init(&md->uevent_list, &uevents);
2046        spin_unlock_irqrestore(&md->uevent_lock, flags);
2047
2048        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2049
2050        atomic_inc(&md->event_nr);
2051        wake_up(&md->eventq);
2052        dm_issue_global_event();
2053}
2054
2055/*
2056 * Protected by md->suspend_lock obtained by dm_swap_table().
2057 */
2058static void __set_size(struct mapped_device *md, sector_t size)
2059{
2060        lockdep_assert_held(&md->suspend_lock);
2061
2062        set_capacity(md->disk, size);
2063
2064        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2065}
2066
2067/*
2068 * Returns old map, which caller must destroy.
2069 */
2070static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2071                               struct queue_limits *limits)
2072{
2073        struct dm_table *old_map;
2074        struct request_queue *q = md->queue;
2075        bool request_based = dm_table_request_based(t);
2076        sector_t size;
2077        int ret;
2078
2079        lockdep_assert_held(&md->suspend_lock);
2080
2081        size = dm_table_get_size(t);
2082
2083        /*
2084         * Wipe any geometry if the size of the table changed.
2085         */
2086        if (size != dm_get_size(md))
2087                memset(&md->geometry, 0, sizeof(md->geometry));
2088
2089        __set_size(md, size);
2090
2091        dm_table_event_callback(t, event_callback, md);
2092
2093        /*
2094         * The queue hasn't been stopped yet if the old table type wasn't
2095         * request-based during suspension, so stop it now to prevent I/O
2096         * from being mapped before resume.
2097         * This must be done before setting the queue restrictions, because
2098         * request-based dm may start running right after they are set.
2099         */
2100        if (request_based)
2101                dm_stop_queue(q);
2102
2103        if (request_based || md->type == DM_TYPE_NVME_BIO_BASED) {
2104                /*
2105                 * Leverage the fact that request-based DM targets and
2106                 * NVMe bio-based targets are immutable singletons, which
2107                 * is used to optimize dm_request_fn, dm_mq_queue_rq and
2108                 * __process_bio.
2109                 */
2110                md->immutable_target = dm_table_get_immutable_target(t);
2111        }
2112
2113        ret = __bind_mempools(md, t);
2114        if (ret) {
2115                old_map = ERR_PTR(ret);
2116                goto out;
2117        }
2118
2119        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2120        rcu_assign_pointer(md->map, (void *)t);
2121        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2122
2123        dm_table_set_restrictions(t, q, limits);
2124        if (old_map)
2125                dm_sync_table(md);
2126
2127out:
2128        return old_map;
2129}
2130
2131/*
2132 * Returns unbound table for the caller to free.
2133 */
2134static struct dm_table *__unbind(struct mapped_device *md)
2135{
2136        struct dm_table *map = rcu_dereference_protected(md->map, 1);
2137
2138        if (!map)
2139                return NULL;
2140
2141        dm_table_event_callback(map, NULL, NULL);
2142        RCU_INIT_POINTER(md->map, NULL);
2143        dm_sync_table(md);
2144
2145        return map;
2146}
2147
2148/*
2149 * Constructor for a new device.
2150 */
2151int dm_create(int minor, struct mapped_device **result)
2152{
2153        int r;
2154        struct mapped_device *md;
2155
2156        md = alloc_dev(minor);
2157        if (!md)
2158                return -ENXIO;
2159
2160        r = dm_sysfs_init(md);
2161        if (r) {
2162                free_dev(md);
2163                return r;
2164        }
2165
2166        *result = md;
2167        return 0;
2168}
2169
2170/*
2171 * Functions to manage md->type.
2172 * All are required to hold md->type_lock.
2173 */
2174void dm_lock_md_type(struct mapped_device *md)
2175{
2176        mutex_lock(&md->type_lock);
2177}
2178
2179void dm_unlock_md_type(struct mapped_device *md)
2180{
2181        mutex_unlock(&md->type_lock);
2182}
2183
2184void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2185{
2186        BUG_ON(!mutex_is_locked(&md->type_lock));
2187        md->type = type;
2188}
2189
2190enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2191{
2192        return md->type;
2193}
2194
2195struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2196{
2197        return md->immutable_target_type;
2198}
2199
2200/*
2201 * The queue_limits are only valid as long as you have a reference
2202 * count on 'md'.
2203 */
2204struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2205{
2206        BUG_ON(!atomic_read(&md->holders));
2207        return &md->queue->limits;
2208}
2209EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2210
2211/*
2212 * Setup the DM device's queue based on md's type
2213 */
2214int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2215{
2216        int r;
2217        struct queue_limits limits;
2218        enum dm_queue_mode type = dm_get_md_type(md);
2219
2220        switch (type) {
2221        case DM_TYPE_REQUEST_BASED:
2222                dm_init_normal_md_queue(md);
2223                r = dm_old_init_request_queue(md, t);
2224                if (r) {
2225                        DMERR("Cannot initialize queue for request-based mapped device");
2226                        return r;
2227                }
2228                break;
2229        case DM_TYPE_MQ_REQUEST_BASED:
2230                r = dm_mq_init_request_queue(md, t);
2231                if (r) {
2232                        DMERR("Cannot initialize queue for request-based dm-mq mapped device");
2233                        return r;
2234                }
2235                break;
2236        case DM_TYPE_BIO_BASED:
2237        case DM_TYPE_DAX_BIO_BASED:
2238                dm_init_normal_md_queue(md);
2239                blk_queue_make_request(md->queue, dm_make_request);
2240                break;
2241        case DM_TYPE_NVME_BIO_BASED:
2242                dm_init_normal_md_queue(md);
2243                blk_queue_make_request(md->queue, dm_make_request_nvme);
2244                break;
2245        case DM_TYPE_NONE:
2246                WARN_ON_ONCE(true);
2247                break;
2248        }
2249
2250        r = dm_calculate_queue_limits(t, &limits);
2251        if (r) {
2252                DMERR("Cannot calculate initial queue limits");
2253                return r;
2254        }
2255        dm_table_set_restrictions(t, md->queue, &limits);
2256        blk_register_queue(md->disk);
2257
2258        return 0;
2259}
2260
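/*
 * Look up a mapped_device by dev_t and take a reference on it.  Returns
 * NULL if the device does not exist or is being freed or deleted.
 */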
2261struct mapped_device *dm_get_md(dev_t dev)
2262{
2263        struct mapped_device *md;
2264        unsigned minor = MINOR(dev);
2265
2266        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2267                return NULL;
2268
2269        spin_lock(&_minor_lock);
2270
2271        md = idr_find(&_minor_idr, minor);
2272        if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2273            test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2274                md = NULL;
2275                goto out;
2276        }
2277        dm_get(md);
2278out:
2279        spin_unlock(&_minor_lock);
2280
2281        return md;
2282}
2283EXPORT_SYMBOL_GPL(dm_get_md);
2284
2285void *dm_get_mdptr(struct mapped_device *md)
2286{
2287        return md->interface_ptr;
2288}
2289
2290void dm_set_mdptr(struct mapped_device *md, void *ptr)
2291{
2292        md->interface_ptr = ptr;
2293}
2294
2295void dm_get(struct mapped_device *md)
2296{
2297        atomic_inc(&md->holders);
2298        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2299}
2300
2301int dm_hold(struct mapped_device *md)
2302{
2303        spin_lock(&_minor_lock);
2304        if (test_bit(DMF_FREEING, &md->flags)) {
2305                spin_unlock(&_minor_lock);
2306                return -EBUSY;
2307        }
2308        dm_get(md);
2309        spin_unlock(&_minor_lock);
2310        return 0;
2311}
2312EXPORT_SYMBOL_GPL(dm_hold);
2313
2314const char *dm_device_name(struct mapped_device *md)
2315{
2316        return md->name;
2317}
2318EXPORT_SYMBOL_GPL(dm_device_name);
2319
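/*
 * Common teardown for dm_destroy() and dm_destroy_immediate(): mark the
 * device DMF_FREEING, run the target suspend hooks if needed and, when
 * @wait is true, wait for all holders to drop their references.
 */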
2320static void __dm_destroy(struct mapped_device *md, bool wait)
2321{
2322        struct dm_table *map;
2323        int srcu_idx;
2324
2325        might_sleep();
2326
2327        spin_lock(&_minor_lock);
2328        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2329        set_bit(DMF_FREEING, &md->flags);
2330        spin_unlock(&_minor_lock);
2331
2332        blk_set_queue_dying(md->queue);
2333
2334        if (dm_request_based(md) && md->kworker_task)
2335                kthread_flush_worker(&md->kworker);
2336
2337        /*
2338         * Take suspend_lock so that presuspend and postsuspend methods
2339         * do not race with internal suspend.
2340         */
2341        mutex_lock(&md->suspend_lock);
2342        map = dm_get_live_table(md, &srcu_idx);
2343        if (!dm_suspended_md(md)) {
2344                dm_table_presuspend_targets(map);
2345                dm_table_postsuspend_targets(map);
2346        }
2347        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2348        dm_put_live_table(md, srcu_idx);
2349        mutex_unlock(&md->suspend_lock);
2350
2351        /*
2352         * Rare, but there may be I/O requests that are still completing,
2353         * for example.  Wait for all references to disappear.
2354         * No one should increment the reference count of the mapped_device
2355         * after its state becomes DMF_FREEING.
2356         */
2357        if (wait)
2358                while (atomic_read(&md->holders))
2359                        msleep(1);
2360        else if (atomic_read(&md->holders))
2361                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2362                       dm_device_name(md), atomic_read(&md->holders));
2363
2364        dm_sysfs_exit(md);
2365        dm_table_destroy(__unbind(md));
2366        free_dev(md);
2367}
2368
2369void dm_destroy(struct mapped_device *md)
2370{
2371        __dm_destroy(md, true);
2372}
2373
2374void dm_destroy_immediate(struct mapped_device *md)
2375{
2376        __dm_destroy(md, false);
2377}
2378
2379void dm_put(struct mapped_device *md)
2380{
2381        atomic_dec(&md->holders);
2382}
2383EXPORT_SYMBOL_GPL(dm_put);
2384
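/*
 * Wait until no I/O is in flight on this device.  @task_state controls
 * whether the wait may be interrupted by a signal, in which case -EINTR
 * is returned.
 */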
2385static int dm_wait_for_completion(struct mapped_device *md, long task_state)
2386{
2387        int r = 0;
2388        DEFINE_WAIT(wait);
2389
2390        while (1) {
2391                prepare_to_wait(&md->wait, &wait, task_state);
2392
2393                if (!md_in_flight(md))
2394                        break;
2395
2396                if (signal_pending_state(task_state, current)) {
2397                        r = -EINTR;
2398                        break;
2399                }
2400
2401                io_schedule();
2402        }
2403        finish_wait(&md->wait, &wait);
2404
2405        return r;
2406}
2407
2408/*
2409 * Process the deferred bios
2410 */
2411static void dm_wq_work(struct work_struct *work)
2412{
2413        struct mapped_device *md = container_of(work, struct mapped_device,
2414                                                work);
2415        struct bio *c;
2416        int srcu_idx;
2417        struct dm_table *map;
2418
2419        map = dm_get_live_table(md, &srcu_idx);
2420
2421        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2422                spin_lock_irq(&md->deferred_lock);
2423                c = bio_list_pop(&md->deferred);
2424                spin_unlock_irq(&md->deferred_lock);
2425
2426                if (!c)
2427                        break;
2428
2429                if (dm_request_based(md))
2430                        generic_make_request(c);
2431                else
2432                        __split_and_process_bio(md, map, c);
2433        }
2434
2435        dm_put_live_table(md, srcu_idx);
2436}
2437
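/*
 * Re-enable bio submission and kick the worker so that bios deferred
 * while I/O was blocked for suspend get reissued.
 */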
2438static void dm_queue_flush(struct mapped_device *md)
2439{
2440        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2441        smp_mb__after_atomic();
2442        queue_work(md->wq, &md->work);
2443}
2444
2445/*
2446 * Swap in a new table, returning the old one for the caller to destroy.
2447 */
2448struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2449{
2450        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2451        struct queue_limits limits;
2452        int r;
2453
2454        mutex_lock(&md->suspend_lock);
2455
2456        /* device must be suspended */
2457        if (!dm_suspended_md(md))
2458                goto out;
2459
2460        /*
2461         * If the new table has no data devices, retain the existing limits.
2462         * This helps multipath with queue_if_no_path: if all paths
2463         * disappear, new I/O is queued based on these limits until some
2464         * paths reappear.
2465         */
2466        if (dm_table_has_no_data_devices(table)) {
2467                live_map = dm_get_live_table_fast(md);
2468                if (live_map)
2469                        limits = md->queue->limits;
2470                dm_put_live_table_fast(md);
2471        }
2472
2473        if (!live_map) {
2474                r = dm_calculate_queue_limits(table, &limits);
2475                if (r) {
2476                        map = ERR_PTR(r);
2477                        goto out;
2478                }
2479        }
2480
2481        map = __bind(md, table, &limits);
2482        dm_issue_global_event();
2483
2484out:
2485        mutex_unlock(&md->suspend_lock);
2486        return map;
2487}
2488
2489/*
2490 * Functions to lock and unlock any filesystem running on the
2491 * device.
2492 */
2493static int lock_fs(struct mapped_device *md)
2494{
2495        int r;
2496
2497        WARN_ON(md->frozen_sb);
2498
2499        md->frozen_sb = freeze_bdev(md->bdev);
2500        if (IS_ERR(md->frozen_sb)) {
2501                r = PTR_ERR(md->frozen_sb);
2502                md->frozen_sb = NULL;
2503                return r;
2504        }
2505
2506        set_bit(DMF_FROZEN, &md->flags);
2507
2508        return 0;
2509}
2510
2511static void unlock_fs(struct mapped_device *md)
2512{
2513        if (!test_bit(DMF_FROZEN, &md->flags))
2514                return;
2515
2516        thaw_bdev(md->bdev, md->frozen_sb);
2517        md->frozen_sb = NULL;
2518        clear_bit(DMF_FROZEN, &md->flags);
2519}
2520
2521/*
2522 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2523 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2524 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2525 *
2526 * If __dm_suspend returns 0, the device is completely quiescent
2527 * now. There is no request-processing activity. All new requests
2528 * are being added to md->deferred list.
2529 */
2530static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2531                        unsigned suspend_flags, long task_state,
2532                        int dmf_suspended_flag)
2533{
2534        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2535        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2536        int r;
2537
2538        lockdep_assert_held(&md->suspend_lock);
2539
2540        /*
2541         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2542         * This flag is cleared before dm_suspend returns.
2543         */
2544        if (noflush)
2545                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2546        else
2547                pr_debug("%s: suspending with flush\n", dm_device_name(md));
2548
2549        /*
2550         * This gets reverted if there's an error later and the targets
2551         * provide the .presuspend_undo hook.
2552         */
2553        dm_table_presuspend_targets(map);
2554
2555        /*
2556         * Flush I/O to the device.
2557         * Any I/O submitted after lock_fs() may not be flushed.
2558         * noflush takes precedence over do_lockfs.
2559         * (lock_fs() flushes I/Os and waits for them to complete.)
2560         */
2561        if (!noflush && do_lockfs) {
2562                r = lock_fs(md);
2563                if (r) {
2564                        dm_table_presuspend_undo_targets(map);
2565                        return r;
2566                }
2567        }
2568
2569        /*
2570         * Here we must make sure that no processes are submitting requests
2571         * to target drivers i.e. no one may be executing
2572         * __split_and_process_bio. This is called from dm_request and
2573         * dm_wq_work.
2574         *
2575         * To get all processes out of __split_and_process_bio in dm_request,
2576         * we take the write lock. To prevent any process from reentering
2577         * __split_and_process_bio from dm_request and quiesce the thread
2578         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2579         * flush_workqueue(md->wq).
2580         */
2581        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2582        if (map)
2583                synchronize_srcu(&md->io_barrier);
2584
2585        /*
2586         * Stop md->queue before flushing md->wq in case request-based
2587         * dm defers requests to md->wq from md->queue.
2588         */
2589        if (dm_request_based(md)) {
2590                dm_stop_queue(md->queue);
2591                if (md->kworker_task)
2592                        kthread_flush_worker(&md->kworker);
2593        }
2594
2595        flush_workqueue(md->wq);
2596
2597        /*
2598         * At this point no more requests are entering target request routines.
2599         * We call dm_wait_for_completion to wait for all existing requests
2600         * to finish.
2601         */
2602        r = dm_wait_for_completion(md, task_state);
2603        if (!r)
2604                set_bit(dmf_suspended_flag, &md->flags);
2605
2606        if (noflush)
2607                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2608        if (map)
2609                synchronize_srcu(&md->io_barrier);
2610
2611        /* were we interrupted ? */
2612        if (r < 0) {
2613                dm_queue_flush(md);
2614
2615                if (dm_request_based(md))
2616                        dm_start_queue(md->queue);
2617
2618                unlock_fs(md);
2619                dm_table_presuspend_undo_targets(map);
2620                /* pushback list is already flushed, so skip flush */
2621        }
2622
2623        return r;
2624}
2625
2626/*
2627 * We need to be able to change a mapping table under a mounted
2628 * filesystem.  For example we might want to move some data in
2629 * the background.  Before the table can be swapped with
2630 * dm_bind_table, dm_suspend must be called to flush any in-flight
2631 * bios and ensure that any further I/O gets deferred.
2632 */
2633/*
2634 * Suspend mechanism in request-based dm.
2635 *
2636 * 1. Flush all I/Os by lock_fs() if needed.
2637 * 2. Stop dispatching any I/O by stopping the request_queue.
2638 * 3. Wait for all in-flight I/Os to be completed or requeued.
2639 *
2640 * To abort suspend, start the request_queue.
2641 */
2642int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2643{
2644        struct dm_table *map = NULL;
2645        int r = 0;
2646
2647retry:
2648        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2649
2650        if (dm_suspended_md(md)) {
2651                r = -EINVAL;
2652                goto out_unlock;
2653        }
2654
2655        if (dm_suspended_internally_md(md)) {
2656                /* already internally suspended, wait for internal resume */
2657                mutex_unlock(&md->suspend_lock);
2658                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2659                if (r)
2660                        return r;
2661                goto retry;
2662        }
2663
2664        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2665
2666        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2667        if (r)
2668                goto out_unlock;
2669
2670        dm_table_postsuspend_targets(map);
2671
2672out_unlock:
2673        mutex_unlock(&md->suspend_lock);
2674        return r;
2675}
2676
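/*
 * Counterpart of __dm_suspend(): resume the targets, replay deferred bios,
 * restart the queue for request-based dm and thaw the filesystem.
 */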
2677static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2678{
2679        if (map) {
2680                int r = dm_table_resume_targets(map);
2681                if (r)
2682                        return r;
2683        }
2684
2685        dm_queue_flush(md);
2686
2687        /*
2688         * Flushing deferred I/Os must be done after targets are resumed
2689         * so that mapping of targets can work correctly.
2690         * Request-based dm is queueing the deferred I/Os in its request_queue.
2691         */
2692        if (dm_request_based(md))
2693                dm_start_queue(md->queue);
2694
2695        unlock_fs(md);
2696
2697        return 0;
2698}
2699
2700int dm_resume(struct mapped_device *md)
2701{
2702        int r;
2703        struct dm_table *map = NULL;
2704
2705retry:
2706        r = -EINVAL;
2707        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2708
2709        if (!dm_suspended_md(md))
2710                goto out;
2711
2712        if (dm_suspended_internally_md(md)) {
2713                /* already internally suspended, wait for internal resume */
2714                mutex_unlock(&md->suspend_lock);
2715                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2716                if (r)
2717                        return r;
2718                goto retry;
2719        }
2720
2721        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2722        if (!map || !dm_table_get_size(map))
2723                goto out;
2724
2725        r = __dm_resume(md, map);
2726        if (r)
2727                goto out;
2728
2729        clear_bit(DMF_SUSPENDED, &md->flags);
2730out:
2731        mutex_unlock(&md->suspend_lock);
2732
2733        return r;
2734}
2735
2736/*
2737 * Internal suspend/resume works like userspace-driven suspend. It waits
2738 * until all bios finish and prevents issuing new bios to the target drivers.
2739 * It may be used only from the kernel.
2740 */
2741
2742static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2743{
2744        struct dm_table *map = NULL;
2745
2746        lockdep_assert_held(&md->suspend_lock);
2747
2748        if (md->internal_suspend_count++)
2749                return; /* nested internal suspend */
2750
2751        if (dm_suspended_md(md)) {
2752                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2753                return; /* nest suspend */
2754        }
2755
2756        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2757
2758        /*
2759         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2760         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2761         * would require changing .presuspend to return an error -- avoid this
2762         * until there is a need for more elaborate variants of internal suspend.
2763         */
2764        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2765                            DMF_SUSPENDED_INTERNALLY);
2766
2767        dm_table_postsuspend_targets(map);
2768}
2769
2770static void __dm_internal_resume(struct mapped_device *md)
2771{
2772        BUG_ON(!md->internal_suspend_count);
2773
2774        if (--md->internal_suspend_count)
2775                return; /* resume from nested internal suspend */
2776
2777        if (dm_suspended_md(md))
2778                goto done; /* resume from nested suspend */
2779
2780        /*
2781         * NOTE: existing callers don't need to call dm_table_resume_targets
2782         * (which may fail -- so best to avoid it for now by passing NULL map)
2783         */
2784        (void) __dm_resume(md, NULL);
2785
2786done:
2787        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2788        smp_mb__after_atomic();
2789        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2790}
2791
2792void dm_internal_suspend_noflush(struct mapped_device *md)
2793{
2794        mutex_lock(&md->suspend_lock);
2795        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2796        mutex_unlock(&md->suspend_lock);
2797}
2798EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2799
2800void dm_internal_resume(struct mapped_device *md)
2801{
2802        mutex_lock(&md->suspend_lock);
2803        __dm_internal_resume(md);
2804        mutex_unlock(&md->suspend_lock);
2805}
2806EXPORT_SYMBOL_GPL(dm_internal_resume);
2807
2808/*
2809 * Fast variants of internal suspend/resume hold md->suspend_lock,
2810 * which prevents interaction with userspace-driven suspend.
2811 */
2812
2813void dm_internal_suspend_fast(struct mapped_device *md)
2814{
2815        mutex_lock(&md->suspend_lock);
2816        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2817                return;
2818
2819        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2820        synchronize_srcu(&md->io_barrier);
2821        flush_workqueue(md->wq);
2822        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2823}
2824EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2825
2826void dm_internal_resume_fast(struct mapped_device *md)
2827{
2828        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2829                goto done;
2830
2831        dm_queue_flush(md);
2832
2833done:
2834        mutex_unlock(&md->suspend_lock);
2835}
2836EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2837
2838/*-----------------------------------------------------------------
2839 * Event notification.
2840 *---------------------------------------------------------------*/
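/*
 * Send a uevent for this device.  A non-zero cookie is exported to udev in
 * the event environment, e.g. cookie 0xdeadbeef results in the string
 * "DM_COOKIE=3735928559".
 */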
2841int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2842                       unsigned cookie)
2843{
2844        char udev_cookie[DM_COOKIE_LENGTH];
2845        char *envp[] = { udev_cookie, NULL };
2846
2847        if (!cookie)
2848                return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2849        else {
2850                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2851                         DM_COOKIE_ENV_VAR_NAME, cookie);
2852                return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2853                                          action, envp);
2854        }
2855}
2856
2857uint32_t dm_next_uevent_seq(struct mapped_device *md)
2858{
2859        return atomic_add_return(1, &md->uevent_seq);
2860}
2861
2862uint32_t dm_get_event_nr(struct mapped_device *md)
2863{
2864        return atomic_read(&md->event_nr);
2865}
2866
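/*
 * Block until the device's event counter differs from @event_nr (or a
 * signal is received).  A typical caller pattern is, roughly:
 *
 *	ev = dm_get_event_nr(md);
 *	...start the operation that is expected to generate an event...
 *	dm_wait_event(md, ev);
 */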
2867int dm_wait_event(struct mapped_device *md, int event_nr)
2868{
2869        return wait_event_interruptible(md->eventq,
2870                        (event_nr != atomic_read(&md->event_nr)));
2871}
2872
2873void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2874{
2875        unsigned long flags;
2876
2877        spin_lock_irqsave(&md->uevent_lock, flags);
2878        list_add(elist, &md->uevent_list);
2879        spin_unlock_irqrestore(&md->uevent_lock, flags);
2880}
2881
2882/*
2883 * The gendisk is only valid as long as you have a reference
2884 * count on 'md'.
2885 */
2886struct gendisk *dm_disk(struct mapped_device *md)
2887{
2888        return md->disk;
2889}
2890EXPORT_SYMBOL_GPL(dm_disk);
2891
2892struct kobject *dm_kobject(struct mapped_device *md)
2893{
2894        return &md->kobj_holder.kobj;
2895}
2896
2897struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2898{
2899        struct mapped_device *md;
2900
2901        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2902
2903        spin_lock(&_minor_lock);
2904        if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2905                md = NULL;
2906                goto out;
2907        }
2908        dm_get(md);
2909out:
2910        spin_unlock(&_minor_lock);
2911
2912        return md;
2913}
2914
2915int dm_suspended_md(struct mapped_device *md)
2916{
2917        return test_bit(DMF_SUSPENDED, &md->flags);
2918}
2919
2920int dm_suspended_internally_md(struct mapped_device *md)
2921{
2922        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2923}
2924
2925int dm_test_deferred_remove_flag(struct mapped_device *md)
2926{
2927        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2928}
2929
2930int dm_suspended(struct dm_target *ti)
2931{
2932        return dm_suspended_md(dm_table_get_md(ti->table));
2933}
2934EXPORT_SYMBOL_GPL(dm_suspended);
2935
2936int dm_noflush_suspending(struct dm_target *ti)
2937{
2938        return __noflush_suspending(dm_table_get_md(ti->table));
2939}
2940EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2941
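/*
 * Build the biosets a table will need.  For bio-based types, front_pad
 * reserves room ahead of each clone bio for the target's per-bio data plus
 * struct dm_target_io (io_bs additionally reserves struct dm_io); for
 * request-based types only struct dm_rq_clone_bio_info is reserved.
 */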
2942struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2943                                            unsigned integrity, unsigned per_io_data_size,
2944                                            unsigned min_pool_size)
2945{
2946        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2947        unsigned int pool_size = 0;
2948        unsigned int front_pad, io_front_pad;
2949        int ret;
2950
2951        if (!pools)
2952                return NULL;
2953
2954        switch (type) {
2955        case DM_TYPE_BIO_BASED:
2956        case DM_TYPE_DAX_BIO_BASED:
2957        case DM_TYPE_NVME_BIO_BASED:
2958                pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2959                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
2960                io_front_pad = roundup(front_pad,  __alignof__(struct dm_io)) + offsetof(struct dm_io, tio);
2961                ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2962                if (ret)
2963                        goto out;
2964                if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2965                        goto out;
2966                break;
2967        case DM_TYPE_REQUEST_BASED:
2968        case DM_TYPE_MQ_REQUEST_BASED:
2969                pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2970                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2971                /* per_io_data_size is used for blk-mq pdu at queue allocation */
2972                break;
2973        default:
2974                BUG();
2975        }
2976
2977        ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2978        if (ret)
2979                goto out;
2980
2981        if (integrity && bioset_integrity_create(&pools->bs, pool_size))
2982                goto out;
2983
2984        return pools;
2985
2986out:
2987        dm_free_md_mempools(pools);
2988
2989        return NULL;
2990}
2991
2992void dm_free_md_mempools(struct dm_md_mempools *pools)
2993{
2994        if (!pools)
2995                return;
2996
2997        bioset_exit(&pools->bs);
2998        bioset_exit(&pools->io_bs);
2999
3000        kfree(pools);
3001}
3002
3003struct dm_pr {
3004        u64     old_key;
3005        u64     new_key;
3006        u32     flags;
3007        bool    fail_early;
3008};
3009
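/*
 * Run @fn over the underlying device(s) of the live table.  Persistent
 * reservations are only supported on devices built from a single target.
 */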
3010static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
3011                      void *data)
3012{
3013        struct mapped_device *md = bdev->bd_disk->private_data;
3014        struct dm_table *table;
3015        struct dm_target *ti;
3016        int ret = -ENOTTY, srcu_idx;
3017
3018        table = dm_get_live_table(md, &srcu_idx);
3019        if (!table || !dm_table_get_size(table))
3020                goto out;
3021
3022        /* We only support devices that have a single target */
3023        if (dm_table_get_num_targets(table) != 1)
3024                goto out;
3025        ti = dm_table_get_target(table, 0);
3026
3027        ret = -EINVAL;
3028        if (!ti->type->iterate_devices)
3029                goto out;
3030
3031        ret = ti->type->iterate_devices(ti, fn, data);
3032out:
3033        dm_put_live_table(md, srcu_idx);
3034        return ret;
3035}
3036
3037/*
3038 * For register / unregister we need to manually call out to every path.
3039 */
3040static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3041                            sector_t start, sector_t len, void *data)
3042{
3043        struct dm_pr *pr = data;
3044        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3045
3046        if (!ops || !ops->pr_register)
3047                return -EOPNOTSUPP;
3048        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3049}
3050
3051static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3052                          u32 flags)
3053{
3054        struct dm_pr pr = {
3055                .old_key        = old_key,
3056                .new_key        = new_key,
3057                .flags          = flags,
3058                .fail_early     = true,
3059        };
3060        int ret;
3061
3062        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3063        if (ret && new_key) {
3064                /* unregister all paths if we failed to register any path */
3065                pr.old_key = new_key;
3066                pr.new_key = 0;
3067                pr.flags = 0;
3068                pr.fail_early = false;
3069                dm_call_pr(bdev, __dm_pr_register, &pr);
3070        }
3071
3072        return ret;
3073}
3074
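/*
 * The remaining persistent reservation operations resolve the single
 * underlying device via dm_prepare_ioctl() and forward to its pr_ops.
 */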
3075static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3076                         u32 flags)
3077{
3078        struct mapped_device *md = bdev->bd_disk->private_data;
3079        const struct pr_ops *ops;
3080        int r, srcu_idx;
3081
3082        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3083        if (r < 0)
3084                goto out;
3085
3086        ops = bdev->bd_disk->fops->pr_ops;
3087        if (ops && ops->pr_reserve)
3088                r = ops->pr_reserve(bdev, key, type, flags);
3089        else
3090                r = -EOPNOTSUPP;
3091out:
3092        dm_unprepare_ioctl(md, srcu_idx);
3093        return r;
3094}
3095
3096static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3097{
3098        struct mapped_device *md = bdev->bd_disk->private_data;
3099        const struct pr_ops *ops;
3100        int r, srcu_idx;
3101
3102        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3103        if (r < 0)
3104                goto out;
3105
3106        ops = bdev->bd_disk->fops->pr_ops;
3107        if (ops && ops->pr_release)
3108                r = ops->pr_release(bdev, key, type);
3109        else
3110                r = -EOPNOTSUPP;
3111out:
3112        dm_unprepare_ioctl(md, srcu_idx);
3113        return r;
3114}
3115
3116static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3117                         enum pr_type type, bool abort)
3118{
3119        struct mapped_device *md = bdev->bd_disk->private_data;
3120        const struct pr_ops *ops;
3121        int r, srcu_idx;
3122
3123        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3124        if (r < 0)
3125                goto out;
3126
3127        ops = bdev->bd_disk->fops->pr_ops;
3128        if (ops && ops->pr_preempt)
3129                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3130        else
3131                r = -EOPNOTSUPP;
3132out:
3133        dm_unprepare_ioctl(md, srcu_idx);
3134        return r;
3135}
3136
3137static int dm_pr_clear(struct block_device *bdev, u64 key)
3138{
3139        struct mapped_device *md = bdev->bd_disk->private_data;
3140        const struct pr_ops *ops;
3141        int r, srcu_idx;
3142
3143        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3144        if (r < 0)
3145                goto out;
3146
3147        ops = bdev->bd_disk->fops->pr_ops;
3148        if (ops && ops->pr_clear)
3149                r = ops->pr_clear(bdev, key);
3150        else
3151                r = -EOPNOTSUPP;
3152out:
3153        dm_unprepare_ioctl(md, srcu_idx);
3154        return r;
3155}
3156
3157static const struct pr_ops dm_pr_ops = {
3158        .pr_register    = dm_pr_register,
3159        .pr_reserve     = dm_pr_reserve,
3160        .pr_release     = dm_pr_release,
3161        .pr_preempt     = dm_pr_preempt,
3162        .pr_clear       = dm_pr_clear,
3163};
3164
3165static const struct block_device_operations dm_blk_dops = {
3166        .open = dm_blk_open,
3167        .release = dm_blk_close,
3168        .ioctl = dm_blk_ioctl,
3169        .getgeo = dm_blk_getgeo,
3170        .pr_ops = &dm_pr_ops,
3171        .owner = THIS_MODULE
3172};
3173
3174static const struct dax_operations dm_dax_ops = {
3175        .direct_access = dm_dax_direct_access,
3176        .copy_from_iter = dm_dax_copy_from_iter,
3177        .copy_to_iter = dm_dax_copy_to_iter,
3178};
3179
3180/*
3181 * module hooks
3182 */
3183module_init(dm_init);
3184module_exit(dm_exit);
3185
3186module_param(major, uint, 0);
3187MODULE_PARM_DESC(major, "The major number of the device mapper");
3188
3189module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3190MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3191
3192module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3193MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3194
3195MODULE_DESCRIPTION(DM_NAME " driver");
3196MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3197MODULE_LICENSE("GPL");
3198