linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-core.h"
   9#include "dm-rq.h"
  10#include "dm-uevent.h"
  11#include "dm-ima.h"
  12
  13#include <linux/init.h>
  14#include <linux/module.h>
  15#include <linux/mutex.h>
  16#include <linux/sched/mm.h>
  17#include <linux/sched/signal.h>
  18#include <linux/blkpg.h>
  19#include <linux/bio.h>
  20#include <linux/mempool.h>
  21#include <linux/dax.h>
  22#include <linux/slab.h>
  23#include <linux/idr.h>
  24#include <linux/uio.h>
  25#include <linux/hdreg.h>
  26#include <linux/delay.h>
  27#include <linux/wait.h>
  28#include <linux/pr.h>
  29#include <linux/refcount.h>
  30#include <linux/part_stat.h>
  31#include <linux/blk-crypto.h>
  32#include <linux/blk-crypto-profile.h>
  33
  34#define DM_MSG_PREFIX "core"
  35
  36/*
  37 * Cookies are numeric values sent with CHANGE and REMOVE
  38 * uevents while resuming, removing or renaming the device.
  39 */
  40#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  41#define DM_COOKIE_LENGTH 24
  42
  43/*
  44 * For a REQ_POLLED fs bio, this flag is set if we link the mapped underlying
  45 * dm_io instances into one list and reuse bio->bi_private as the list head.
  46 * Before ending the fs bio, we restore its ->bi_private.
  47 */
  48#define REQ_DM_POLL_LIST        REQ_DRV
  49
  50static const char *_name = DM_NAME;
  51
  52static unsigned int major = 0;
  53static unsigned int _major = 0;
  54
  55static DEFINE_IDR(_minor_idr);
  56
  57static DEFINE_SPINLOCK(_minor_lock);
  58
  59static void do_deferred_remove(struct work_struct *w);
  60
  61static DECLARE_WORK(deferred_remove_work, do_deferred_remove);
  62
  63static struct workqueue_struct *deferred_remove_workqueue;
  64
  65atomic_t dm_global_event_nr = ATOMIC_INIT(0);
  66DECLARE_WAIT_QUEUE_HEAD(dm_global_eventq);
  67
  68void dm_issue_global_event(void)
  69{
  70        atomic_inc(&dm_global_event_nr);
  71        wake_up(&dm_global_eventq);
  72}
  73
  74/*
  75 * One of these is allocated (on-stack) per original bio.
  76 */
  77struct clone_info {
  78        struct dm_table *map;
  79        struct bio *bio;
  80        struct dm_io *io;
  81        sector_t sector;
  82        unsigned sector_count;
  83        bool submit_as_polled;
  84};
  85
  86#define DM_TARGET_IO_BIO_OFFSET (offsetof(struct dm_target_io, clone))
  87#define DM_IO_BIO_OFFSET \
  88        (offsetof(struct dm_target_io, clone) + offsetof(struct dm_io, tio))
  89
  90static inline struct dm_target_io *clone_to_tio(struct bio *clone)
  91{
  92        return container_of(clone, struct dm_target_io, clone);
  93}
  94
  95void *dm_per_bio_data(struct bio *bio, size_t data_size)
  96{
  97        if (!dm_tio_flagged(clone_to_tio(bio), DM_TIO_INSIDE_DM_IO))
  98                return (char *)bio - DM_TARGET_IO_BIO_OFFSET - data_size;
  99        return (char *)bio - DM_IO_BIO_OFFSET - data_size;
 100}
 101EXPORT_SYMBOL_GPL(dm_per_bio_data);
 102
 103struct bio *dm_bio_from_per_bio_data(void *data, size_t data_size)
 104{
 105        struct dm_io *io = (struct dm_io *)((char *)data + data_size);
 106        if (io->magic == DM_IO_MAGIC)
 107                return (struct bio *)((char *)io + DM_IO_BIO_OFFSET);
 108        BUG_ON(io->magic != DM_TIO_MAGIC);
 109        return (struct bio *)((char *)io + DM_TARGET_IO_BIO_OFFSET);
 110}
 111EXPORT_SYMBOL_GPL(dm_bio_from_per_bio_data);
 112
 113unsigned dm_bio_get_target_bio_nr(const struct bio *bio)
 114{
 115        return container_of(bio, struct dm_target_io, clone)->target_bio_nr;
 116}
 117EXPORT_SYMBOL_GPL(dm_bio_get_target_bio_nr);
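/*
 * Illustrative sketch, not part of this file: a hypothetical bio-based
 * target that reserves per-bio data (assumed here via ti->per_io_data_size
 * in its .ctr) could use the helpers above from its .map hook.  The names
 * struct my_ctx, my_target_map() and my_dev are made up for illustration.
 *
 *	struct my_ctx {
 *		sector_t orig_sector;
 *	};
 *
 *	static int my_target_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		struct my_ctx *ctx = dm_per_bio_data(bio, sizeof(struct my_ctx));
 *
 *		ctx->orig_sector = bio->bi_iter.bi_sector;
 *		bio_set_dev(bio, my_dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 *
 * Later (e.g. in a completion path) the clone can be recovered with
 * dm_bio_from_per_bio_data(ctx, sizeof(struct my_ctx)), and
 * dm_bio_get_target_bio_nr() distinguishes the duplicate bios a target
 * receives via num_flush_bios and friends.
 */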
 118
 119#define MINOR_ALLOCED ((void *)-1)
 120
 121#define DM_NUMA_NODE NUMA_NO_NODE
 122static int dm_numa_node = DM_NUMA_NODE;
 123
 124#define DEFAULT_SWAP_BIOS       (8 * 1048576 / PAGE_SIZE)
 125static int swap_bios = DEFAULT_SWAP_BIOS;
 126static int get_swap_bios(void)
 127{
 128        int latch = READ_ONCE(swap_bios);
 129        if (unlikely(latch <= 0))
 130                latch = DEFAULT_SWAP_BIOS;
 131        return latch;
 132}
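/*
 * Illustrative note, not part of this file: the swap_bios throttle above
 * only applies to targets that opt in, which (as an assumed example) a
 * target would do from its .ctr:
 *
 *	ti->limit_swap_bios = true;
 *
 * __map_bio() below then gates REQ_SWAP bios on md->swap_bios_semaphore,
 * resizing the semaphore whenever the module parameter changes.
 */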
 133
 134/*
 135 * For mempools pre-allocation at the table loading time.
 136 */
 137struct dm_md_mempools {
 138        struct bio_set bs;
 139        struct bio_set io_bs;
 140};
 141
 142struct table_device {
 143        struct list_head list;
 144        refcount_t count;
 145        struct dm_dev dm_dev;
 146};
 147
 148/*
 149 * Bio-based DM's mempools' reserved IOs set by the user.
 150 */
 151#define RESERVED_BIO_BASED_IOS          16
 152static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
 153
 154static int __dm_get_module_param_int(int *module_param, int min, int max)
 155{
 156        int param = READ_ONCE(*module_param);
 157        int modified_param = 0;
 158        bool modified = true;
 159
 160        if (param < min)
 161                modified_param = min;
 162        else if (param > max)
 163                modified_param = max;
 164        else
 165                modified = false;
 166
 167        if (modified) {
 168                (void)cmpxchg(module_param, param, modified_param);
 169                param = modified_param;
 170        }
 171
 172        return param;
 173}
 174
 175unsigned __dm_get_module_param(unsigned *module_param,
 176                               unsigned def, unsigned max)
 177{
 178        unsigned param = READ_ONCE(*module_param);
 179        unsigned modified_param = 0;
 180
 181        if (!param)
 182                modified_param = def;
 183        else if (param > max)
 184                modified_param = max;
 185
 186        if (modified_param) {
 187                (void)cmpxchg(module_param, param, modified_param);
 188                param = modified_param;
 189        }
 190
 191        return param;
 192}
 193
 194unsigned dm_get_reserved_bio_based_ios(void)
 195{
 196        return __dm_get_module_param(&reserved_bio_based_ios,
 197                                     RESERVED_BIO_BASED_IOS, DM_RESERVED_MAX_IOS);
 198}
 199EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios);
 200
 201static unsigned dm_get_numa_node(void)
 202{
 203        return __dm_get_module_param_int(&dm_numa_node,
 204                                         DM_NUMA_NODE, num_online_nodes() - 1);
 205}
 206
 207static int __init local_init(void)
 208{
 209        int r;
 210
 211        r = dm_uevent_init();
 212        if (r)
 213                return r;
 214
 215        deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1);
 216        if (!deferred_remove_workqueue) {
 217                r = -ENOMEM;
 218                goto out_uevent_exit;
 219        }
 220
 221        _major = major;
 222        r = register_blkdev(_major, _name);
 223        if (r < 0)
 224                goto out_free_workqueue;
 225
 226        if (!_major)
 227                _major = r;
 228
 229        return 0;
 230
 231out_free_workqueue:
 232        destroy_workqueue(deferred_remove_workqueue);
 233out_uevent_exit:
 234        dm_uevent_exit();
 235
 236        return r;
 237}
 238
 239static void local_exit(void)
 240{
 241        flush_scheduled_work();
 242        destroy_workqueue(deferred_remove_workqueue);
 243
 244        unregister_blkdev(_major, _name);
 245        dm_uevent_exit();
 246
 247        _major = 0;
 248
 249        DMINFO("cleaned up");
 250}
 251
 252static int (*_inits[])(void) __initdata = {
 253        local_init,
 254        dm_target_init,
 255        dm_linear_init,
 256        dm_stripe_init,
 257        dm_io_init,
 258        dm_kcopyd_init,
 259        dm_interface_init,
 260        dm_statistics_init,
 261};
 262
 263static void (*_exits[])(void) = {
 264        local_exit,
 265        dm_target_exit,
 266        dm_linear_exit,
 267        dm_stripe_exit,
 268        dm_io_exit,
 269        dm_kcopyd_exit,
 270        dm_interface_exit,
 271        dm_statistics_exit,
 272};
 273
 274static int __init dm_init(void)
 275{
 276        const int count = ARRAY_SIZE(_inits);
 277        int r, i;
 278
 279#if (IS_ENABLED(CONFIG_IMA) && !IS_ENABLED(CONFIG_IMA_DISABLE_HTABLE))
 280        DMWARN("CONFIG_IMA_DISABLE_HTABLE is disabled."
 281               " Duplicate IMA measurements will not be recorded in the IMA log.");
 282#endif
 283
 284        for (i = 0; i < count; i++) {
 285                r = _inits[i]();
 286                if (r)
 287                        goto bad;
 288        }
 289
 290        return 0;
 291bad:
 292        while (i--)
 293                _exits[i]();
 294
 295        return r;
 296}
 297
 298static void __exit dm_exit(void)
 299{
 300        int i = ARRAY_SIZE(_exits);
 301
 302        while (i--)
 303                _exits[i]();
 304
 305        /*
 306         * Should be empty by this point.
 307         */
 308        idr_destroy(&_minor_idr);
 309}
 310
 311/*
 312 * Block device functions
 313 */
 314int dm_deleting_md(struct mapped_device *md)
 315{
 316        return test_bit(DMF_DELETING, &md->flags);
 317}
 318
 319static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 320{
 321        struct mapped_device *md;
 322
 323        spin_lock(&_minor_lock);
 324
 325        md = bdev->bd_disk->private_data;
 326        if (!md)
 327                goto out;
 328
 329        if (test_bit(DMF_FREEING, &md->flags) ||
 330            dm_deleting_md(md)) {
 331                md = NULL;
 332                goto out;
 333        }
 334
 335        dm_get(md);
 336        atomic_inc(&md->open_count);
 337out:
 338        spin_unlock(&_minor_lock);
 339
 340        return md ? 0 : -ENXIO;
 341}
 342
 343static void dm_blk_close(struct gendisk *disk, fmode_t mode)
 344{
 345        struct mapped_device *md;
 346
 347        spin_lock(&_minor_lock);
 348
 349        md = disk->private_data;
 350        if (WARN_ON(!md))
 351                goto out;
 352
 353        if (atomic_dec_and_test(&md->open_count) &&
 354            (test_bit(DMF_DEFERRED_REMOVE, &md->flags)))
 355                queue_work(deferred_remove_workqueue, &deferred_remove_work);
 356
 357        dm_put(md);
 358out:
 359        spin_unlock(&_minor_lock);
 360}
 361
 362int dm_open_count(struct mapped_device *md)
 363{
 364        return atomic_read(&md->open_count);
 365}
 366
 367/*
 368 * Guarantees nothing is using the device before it's deleted.
 369 */
 370int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred)
 371{
 372        int r = 0;
 373
 374        spin_lock(&_minor_lock);
 375
 376        if (dm_open_count(md)) {
 377                r = -EBUSY;
 378                if (mark_deferred)
 379                        set_bit(DMF_DEFERRED_REMOVE, &md->flags);
 380        } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags))
 381                r = -EEXIST;
 382        else
 383                set_bit(DMF_DELETING, &md->flags);
 384
 385        spin_unlock(&_minor_lock);
 386
 387        return r;
 388}
 389
 390int dm_cancel_deferred_remove(struct mapped_device *md)
 391{
 392        int r = 0;
 393
 394        spin_lock(&_minor_lock);
 395
 396        if (test_bit(DMF_DELETING, &md->flags))
 397                r = -EBUSY;
 398        else
 399                clear_bit(DMF_DEFERRED_REMOVE, &md->flags);
 400
 401        spin_unlock(&_minor_lock);
 402
 403        return r;
 404}
 405
 406static void do_deferred_remove(struct work_struct *w)
 407{
 408        dm_deferred_remove();
 409}
 410
 411static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 412{
 413        struct mapped_device *md = bdev->bd_disk->private_data;
 414
 415        return dm_get_geometry(md, geo);
 416}
 417
 418static int dm_prepare_ioctl(struct mapped_device *md, int *srcu_idx,
 419                            struct block_device **bdev)
 420{
 421        struct dm_target *tgt;
 422        struct dm_table *map;
 423        int r;
 424
 425retry:
 426        r = -ENOTTY;
 427        map = dm_get_live_table(md, srcu_idx);
 428        if (!map || !dm_table_get_size(map))
 429                return r;
 430
 431        /* We only support devices that have a single target */
 432        if (dm_table_get_num_targets(map) != 1)
 433                return r;
 434
 435        tgt = dm_table_get_target(map, 0);
 436        if (!tgt->type->prepare_ioctl)
 437                return r;
 438
 439        if (dm_suspended_md(md))
 440                return -EAGAIN;
 441
 442        r = tgt->type->prepare_ioctl(tgt, bdev);
 443        if (r == -ENOTCONN && !fatal_signal_pending(current)) {
 444                dm_put_live_table(md, *srcu_idx);
 445                msleep(10);
 446                goto retry;
 447        }
 448
 449        return r;
 450}
 451
 452static void dm_unprepare_ioctl(struct mapped_device *md, int srcu_idx)
 453{
 454        dm_put_live_table(md, srcu_idx);
 455}
 456
 457static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 458                        unsigned int cmd, unsigned long arg)
 459{
 460        struct mapped_device *md = bdev->bd_disk->private_data;
 461        int r, srcu_idx;
 462
 463        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
 464        if (r < 0)
 465                goto out;
 466
 467        if (r > 0) {
 468                /*
 469                 * Target determined this ioctl is being issued against a
 470                 * subset of the parent bdev; require extra privileges.
 471                 */
 472                if (!capable(CAP_SYS_RAWIO)) {
 473                        DMDEBUG_LIMIT(
 474        "%s: sending ioctl %x to DM device without required privilege.",
 475                                current->comm, cmd);
 476                        r = -ENOIOCTLCMD;
 477                        goto out;
 478                }
 479        }
 480
 481        if (!bdev->bd_disk->fops->ioctl)
 482                r = -ENOTTY;
 483        else
 484                r = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
 485out:
 486        dm_unprepare_ioctl(md, srcu_idx);
 487        return r;
 488}
 489
 490u64 dm_start_time_ns_from_clone(struct bio *bio)
 491{
 492        return jiffies_to_nsecs(clone_to_tio(bio)->io->start_time);
 493}
 494EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
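/*
 * Illustrative sketch, not part of this file: a hypothetical target's
 * .end_io hook could use dm_start_time_ns_from_clone() to measure how long
 * the original DM io has been in flight.  Both values below are
 * jiffies-based (as the implementation above shows), so the subtraction is
 * consistent.
 *
 *	static int my_target_end_io(struct dm_target *ti, struct bio *bio,
 *				    blk_status_t *error)
 *	{
 *		u64 age_ns = jiffies_to_nsecs(jiffies) -
 *			     dm_start_time_ns_from_clone(bio);
 *
 *		return DM_ENDIO_DONE;
 *	}
 *
 * where age_ns would feed the target's own statistics.
 */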
 495
 496static bool bio_is_flush_with_data(struct bio *bio)
 497{
 498        return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
 499}
 500
 501static void dm_io_acct(bool end, struct mapped_device *md, struct bio *bio,
 502                       unsigned long start_time, struct dm_stats_aux *stats_aux)
 503{
 504        bool is_flush_with_data;
 505        unsigned int bi_size;
 506
 507        /* If REQ_PREFLUSH is set, save any payload but do not account it */
 508        is_flush_with_data = bio_is_flush_with_data(bio);
 509        if (is_flush_with_data) {
 510                bi_size = bio->bi_iter.bi_size;
 511                bio->bi_iter.bi_size = 0;
 512        }
 513
 514        if (!end)
 515                bio_start_io_acct_time(bio, start_time);
 516        else
 517                bio_end_io_acct(bio, start_time);
 518
 519        if (unlikely(dm_stats_used(&md->stats)))
 520                dm_stats_account_io(&md->stats, bio_data_dir(bio),
 521                                    bio->bi_iter.bi_sector, bio_sectors(bio),
 522                                    end, start_time, stats_aux);
 523
 524        /* Restore bio's payload so it does get accounted upon requeue */
 525        if (is_flush_with_data)
 526                bio->bi_iter.bi_size = bi_size;
 527}
 528
 529static void __dm_start_io_acct(struct dm_io *io, struct bio *bio)
 530{
 531        dm_io_acct(false, io->md, bio, io->start_time, &io->stats_aux);
 532}
 533
 534static void dm_start_io_acct(struct dm_io *io, struct bio *clone)
 535{
 536        /* Must account IO to DM device in terms of orig_bio */
 537        struct bio *bio = io->orig_bio;
 538
 539        /*
 540         * Ensure IO accounting is only ever started once.
 541         * Expect no possibility for race unless DM_TIO_IS_DUPLICATE_BIO.
 542         */
 543        if (!clone ||
 544            likely(!dm_tio_flagged(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO))) {
 545                if (WARN_ON_ONCE(dm_io_flagged(io, DM_IO_ACCOUNTED)))
 546                        return;
 547                dm_io_set_flag(io, DM_IO_ACCOUNTED);
 548        } else {
 549                unsigned long flags;
 550                if (dm_io_flagged(io, DM_IO_ACCOUNTED))
 551                        return;
 552                /* Can afford locking given DM_TIO_IS_DUPLICATE_BIO */
 553                spin_lock_irqsave(&io->lock, flags);
 554                dm_io_set_flag(io, DM_IO_ACCOUNTED);
 555                spin_unlock_irqrestore(&io->lock, flags);
 556        }
 557
 558        __dm_start_io_acct(io, bio);
 559}
 560
 561static void dm_end_io_acct(struct dm_io *io, struct bio *bio)
 562{
 563        dm_io_acct(true, io->md, bio, io->start_time, &io->stats_aux);
 564}
 565
 566static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio)
 567{
 568        struct dm_io *io;
 569        struct dm_target_io *tio;
 570        struct bio *clone;
 571
 572        clone = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOIO, &md->io_bs);
 573
 574        tio = clone_to_tio(clone);
 575        tio->flags = 0;
 576        dm_tio_set_flag(tio, DM_TIO_INSIDE_DM_IO);
 577        tio->io = NULL;
 578
 579        io = container_of(tio, struct dm_io, tio);
 580        io->magic = DM_IO_MAGIC;
 581        io->status = 0;
 582        atomic_set(&io->io_count, 1);
 583        this_cpu_inc(*md->pending_io);
 584        io->orig_bio = NULL;
 585        io->md = md;
 586        io->map_task = current;
 587        spin_lock_init(&io->lock);
 588        io->start_time = jiffies;
 589        io->flags = 0;
 590
 591        dm_stats_record_start(&md->stats, &io->stats_aux);
 592
 593        return io;
 594}
 595
 596static void free_io(struct dm_io *io)
 597{
 598        bio_put(&io->tio.clone);
 599}
 600
 601static struct bio *alloc_tio(struct clone_info *ci, struct dm_target *ti,
 602                unsigned target_bio_nr, unsigned *len, gfp_t gfp_mask)
 603{
 604        struct dm_target_io *tio;
 605        struct bio *clone;
 606
 607        if (!ci->io->tio.io) {
 608                /* the dm_target_io embedded in ci->io is available */
 609                tio = &ci->io->tio;
 610                /* alloc_io() already initialized embedded clone */
 611                clone = &tio->clone;
 612        } else {
 613                clone = bio_alloc_clone(ci->bio->bi_bdev, ci->bio,
 614                                        gfp_mask, &ci->io->md->bs);
 615                if (!clone)
 616                        return NULL;
 617
 618                /* REQ_DM_POLL_LIST shouldn't be inherited */
 619                clone->bi_opf &= ~REQ_DM_POLL_LIST;
 620
 621                tio = clone_to_tio(clone);
 622                tio->flags = 0; /* also clears DM_TIO_INSIDE_DM_IO */
 623        }
 624
 625        tio->magic = DM_TIO_MAGIC;
 626        tio->io = ci->io;
 627        tio->ti = ti;
 628        tio->target_bio_nr = target_bio_nr;
 629        tio->len_ptr = len;
 630        tio->old_sector = 0;
 631
 632        if (len) {
 633                clone->bi_iter.bi_size = to_bytes(*len);
 634                if (bio_integrity(clone))
 635                        bio_integrity_trim(clone);
 636        }
 637
 638        return clone;
 639}
 640
 641static void free_tio(struct bio *clone)
 642{
 643        if (dm_tio_flagged(clone_to_tio(clone), DM_TIO_INSIDE_DM_IO))
 644                return;
 645        bio_put(clone);
 646}
 647
 648/*
 649 * Add the bio to the list of deferred io.
 650 */
 651static void queue_io(struct mapped_device *md, struct bio *bio)
 652{
 653        unsigned long flags;
 654
 655        spin_lock_irqsave(&md->deferred_lock, flags);
 656        bio_list_add(&md->deferred, bio);
 657        spin_unlock_irqrestore(&md->deferred_lock, flags);
 658        queue_work(md->wq, &md->work);
 659}
 660
 661/*
 662 * Everyone (including functions in this file) should use this
 663 * function to access the md->map field, and make sure they call
 664 * dm_put_live_table() when finished.
 665 */
 666struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier)
 667{
 668        *srcu_idx = srcu_read_lock(&md->io_barrier);
 669
 670        return srcu_dereference(md->map, &md->io_barrier);
 671}
 672
 673void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier)
 674{
 675        srcu_read_unlock(&md->io_barrier, srcu_idx);
 676}
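/*
 * Illustrative sketch, not part of this file, of the pattern described in
 * the comment above dm_get_live_table(); any reader of md->map brackets the
 * access with the SRCU pair (dm_submit_bio() below is a real example):
 *
 *	int srcu_idx;
 *	struct dm_table *map = dm_get_live_table(md, &srcu_idx);
 *
 *	if (map) {
 *		(inspect or walk the table, e.g. dm_table_get_size(map))
 *	}
 *	dm_put_live_table(md, srcu_idx);
 */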
 677
 678void dm_sync_table(struct mapped_device *md)
 679{
 680        synchronize_srcu(&md->io_barrier);
 681        synchronize_rcu_expedited();
 682}
 683
 684/*
 685 * A fast alternative to dm_get_live_table/dm_put_live_table.
 686 * The caller must not block between these two functions.
 687 */
 688static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU)
 689{
 690        rcu_read_lock();
 691        return rcu_dereference(md->map);
 692}
 693
 694static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU)
 695{
 696        rcu_read_unlock();
 697}
 698
 699static char *_dm_claim_ptr = "I belong to device-mapper";
 700
 701/*
 702 * Open a table device so we can use it as a map destination.
 703 */
 704static int open_table_device(struct table_device *td, dev_t dev,
 705                             struct mapped_device *md)
 706{
 707        struct block_device *bdev;
 708        u64 part_off;
 709        int r;
 710
 711        BUG_ON(td->dm_dev.bdev);
 712
 713        bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
 714        if (IS_ERR(bdev))
 715                return PTR_ERR(bdev);
 716
 717        r = bd_link_disk_holder(bdev, dm_disk(md));
 718        if (r) {
 719                blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
 720                return r;
 721        }
 722
 723        td->dm_dev.bdev = bdev;
 724        td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off);
 725        return 0;
 726}
 727
 728/*
 729 * Close a table device that we've been using.
 730 */
 731static void close_table_device(struct table_device *td, struct mapped_device *md)
 732{
 733        if (!td->dm_dev.bdev)
 734                return;
 735
 736        bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
 737        blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
 738        put_dax(td->dm_dev.dax_dev);
 739        td->dm_dev.bdev = NULL;
 740        td->dm_dev.dax_dev = NULL;
 741}
 742
 743static struct table_device *find_table_device(struct list_head *l, dev_t dev,
 744                                              fmode_t mode)
 745{
 746        struct table_device *td;
 747
 748        list_for_each_entry(td, l, list)
 749                if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode)
 750                        return td;
 751
 752        return NULL;
 753}
 754
 755int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
 756                        struct dm_dev **result)
 757{
 758        int r;
 759        struct table_device *td;
 760
 761        mutex_lock(&md->table_devices_lock);
 762        td = find_table_device(&md->table_devices, dev, mode);
 763        if (!td) {
 764                td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
 765                if (!td) {
 766                        mutex_unlock(&md->table_devices_lock);
 767                        return -ENOMEM;
 768                }
 769
 770                td->dm_dev.mode = mode;
 771                td->dm_dev.bdev = NULL;
 772
 773                if ((r = open_table_device(td, dev, md))) {
 774                        mutex_unlock(&md->table_devices_lock);
 775                        kfree(td);
 776                        return r;
 777                }
 778
 779                format_dev_t(td->dm_dev.name, dev);
 780
 781                refcount_set(&td->count, 1);
 782                list_add(&td->list, &md->table_devices);
 783        } else {
 784                refcount_inc(&td->count);
 785        }
 786        mutex_unlock(&md->table_devices_lock);
 787
 788        *result = &td->dm_dev;
 789        return 0;
 790}
 791
 792void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
 793{
 794        struct table_device *td = container_of(d, struct table_device, dm_dev);
 795
 796        mutex_lock(&md->table_devices_lock);
 797        if (refcount_dec_and_test(&td->count)) {
 798                close_table_device(td, md);
 799                list_del(&td->list);
 800                kfree(td);
 801        }
 802        mutex_unlock(&md->table_devices_lock);
 803}
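/*
 * Illustrative note, not part of this file: targets normally reach
 * dm_get_table_device()/dm_put_table_device() indirectly through
 * dm_get_device()/dm_put_device() (implemented in dm-table.c), e.g. in a
 * hypothetical .ctr:
 *
 *	struct dm_dev *dev;
 *
 *	if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &dev))
 *		return -EINVAL;
 *
 * with a matching dm_put_device(ti, dev) in the .dtr path.
 */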
 804
 805static void free_table_devices(struct list_head *devices)
 806{
 807        struct list_head *tmp, *next;
 808
 809        list_for_each_safe(tmp, next, devices) {
 810                struct table_device *td = list_entry(tmp, struct table_device, list);
 811
 812                DMWARN("dm_destroy: %s still exists with %d references",
 813                       td->dm_dev.name, refcount_read(&td->count));
 814                kfree(td);
 815        }
 816}
 817
 818/*
 819 * Get the geometry associated with a dm device
 820 */
 821int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 822{
 823        *geo = md->geometry;
 824
 825        return 0;
 826}
 827
 828/*
 829 * Set the geometry of a device.
 830 */
 831int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 832{
 833        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 834
 835        if (geo->start > sz) {
 836                DMWARN("Start sector is beyond the geometry limits.");
 837                return -EINVAL;
 838        }
 839
 840        md->geometry = *geo;
 841
 842        return 0;
 843}
 844
 845static int __noflush_suspending(struct mapped_device *md)
 846{
 847        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 848}
 849
 850static void dm_io_complete(struct dm_io *io)
 851{
 852        blk_status_t io_error;
 853        struct mapped_device *md = io->md;
 854        struct bio *bio = io->orig_bio;
 855
 856        if (io->status == BLK_STS_DM_REQUEUE) {
 857                unsigned long flags;
 858                /*
 859                 * Target requested pushing back the I/O.
 860                 */
 861                spin_lock_irqsave(&md->deferred_lock, flags);
 862                if (__noflush_suspending(md) &&
 863                    !WARN_ON_ONCE(dm_is_zone_write(md, bio))) {
 864                        /* NOTE early return due to BLK_STS_DM_REQUEUE below */
 865                        bio_list_add_head(&md->deferred, bio);
 866                } else {
 867                        /*
 868                         * noflush suspend was interrupted or this is
 869                         * a write to a zoned target.
 870                         */
 871                        io->status = BLK_STS_IOERR;
 872                }
 873                spin_unlock_irqrestore(&md->deferred_lock, flags);
 874        }
 875
 876        io_error = io->status;
 877        if (dm_io_flagged(io, DM_IO_ACCOUNTED))
 878                dm_end_io_acct(io, bio);
 879        else if (!io_error) {
 880                /*
 881                 * Must handle a target that returned DM_MAPIO_SUBMITTED
 882                 * but then called bio_endio() rather than dm_submit_bio_remap()
 883                 */
 884                __dm_start_io_acct(io, bio);
 885                dm_end_io_acct(io, bio);
 886        }
 887        free_io(io);
 888        smp_wmb();
 889        this_cpu_dec(*md->pending_io);
 890
 891        /* nudge anyone waiting on suspend queue */
 892        if (unlikely(wq_has_sleeper(&md->wait)))
 893                wake_up(&md->wait);
 894
 895        if (io_error == BLK_STS_DM_REQUEUE || io_error == BLK_STS_AGAIN) {
 896                if (bio->bi_opf & REQ_POLLED) {
 897                        /*
 898                         * Upper layer won't help us poll split bio (io->orig_bio
 899                         * may only reflect a subset of the pre-split original)
 900                         * so clear REQ_POLLED in case of requeue.
 901                         */
 902                        bio->bi_opf &= ~REQ_POLLED;
 903                        if (io_error == BLK_STS_AGAIN) {
 904                                /* io_uring doesn't handle BLK_STS_AGAIN (yet) */
 905                                queue_io(md, bio);
 906                        }
 907                }
 908                return;
 909        }
 910
 911        if (bio_is_flush_with_data(bio)) {
 912                /*
 913                 * Preflush done for flush with data, reissue
 914                 * without REQ_PREFLUSH.
 915                 */
 916                bio->bi_opf &= ~REQ_PREFLUSH;
 917                queue_io(md, bio);
 918        } else {
 919                /* done with normal IO or empty flush */
 920                if (io_error)
 921                        bio->bi_status = io_error;
 922                bio_endio(bio);
 923        }
 924}
 925
 926static inline bool dm_tio_is_normal(struct dm_target_io *tio)
 927{
 928        return (dm_tio_flagged(tio, DM_TIO_INSIDE_DM_IO) &&
 929                !dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
 930}
 931
 932/*
 933 * Decrements the number of outstanding ios that a bio has been
 934 * cloned into, completing the original io if necessary.
 935 */
 936void dm_io_dec_pending(struct dm_io *io, blk_status_t error)
 937{
 938        /* Push-back supersedes any I/O errors */
 939        if (unlikely(error)) {
 940                unsigned long flags;
 941                spin_lock_irqsave(&io->lock, flags);
 942                if (!(io->status == BLK_STS_DM_REQUEUE &&
 943                      __noflush_suspending(io->md)))
 944                        io->status = error;
 945                spin_unlock_irqrestore(&io->lock, flags);
 946        }
 947
 948        if (atomic_dec_and_test(&io->io_count))
 949                dm_io_complete(io);
 950}
 951
 952void disable_discard(struct mapped_device *md)
 953{
 954        struct queue_limits *limits = dm_get_queue_limits(md);
 955
 956        /* device doesn't really support DISCARD, disable it */
 957        limits->max_discard_sectors = 0;
 958        blk_queue_flag_clear(QUEUE_FLAG_DISCARD, md->queue);
 959}
 960
 961void disable_write_zeroes(struct mapped_device *md)
 962{
 963        struct queue_limits *limits = dm_get_queue_limits(md);
 964
 965        /* device doesn't really support WRITE ZEROES, disable it */
 966        limits->max_write_zeroes_sectors = 0;
 967}
 968
 969static bool swap_bios_limit(struct dm_target *ti, struct bio *bio)
 970{
 971        return unlikely((bio->bi_opf & REQ_SWAP) != 0) && unlikely(ti->limit_swap_bios);
 972}
 973
 974static void clone_endio(struct bio *bio)
 975{
 976        blk_status_t error = bio->bi_status;
 977        struct dm_target_io *tio = clone_to_tio(bio);
 978        struct dm_io *io = tio->io;
 979        struct mapped_device *md = tio->io->md;
 980        dm_endio_fn endio = tio->ti->type->end_io;
 981        struct request_queue *q = bio->bi_bdev->bd_disk->queue;
 982
 983        if (unlikely(error == BLK_STS_TARGET)) {
 984                if (bio_op(bio) == REQ_OP_DISCARD &&
 985                    !q->limits.max_discard_sectors)
 986                        disable_discard(md);
 987                else if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
 988                         !q->limits.max_write_zeroes_sectors)
 989                        disable_write_zeroes(md);
 990        }
 991
 992        if (blk_queue_is_zoned(q))
 993                dm_zone_endio(io, bio);
 994
 995        if (endio) {
 996                int r = endio(tio->ti, bio, &error);
 997                switch (r) {
 998                case DM_ENDIO_REQUEUE:
 999                        /*
1000                         * Requeuing writes to a sequential zone of a zoned
1001                         * target will break the sequential write pattern:
1002                         * fail such IO.
1003                         */
1004                        if (WARN_ON_ONCE(dm_is_zone_write(md, bio)))
1005                                error = BLK_STS_IOERR;
1006                        else
1007                                error = BLK_STS_DM_REQUEUE;
1008                        fallthrough;
1009                case DM_ENDIO_DONE:
1010                        break;
1011                case DM_ENDIO_INCOMPLETE:
1012                        /* The target will handle the io */
1013                        return;
1014                default:
1015                        DMWARN("unimplemented target endio return value: %d", r);
1016                        BUG();
1017                }
1018        }
1019
1020        if (unlikely(swap_bios_limit(tio->ti, bio))) {
1021                struct mapped_device *md = io->md;
1022                up(&md->swap_bios_semaphore);
1023        }
1024
1025        free_tio(bio);
1026        dm_io_dec_pending(io, error);
1027}
1028
1029/*
1030 * Return maximum size of I/O possible at the supplied sector up to the current
1031 * target boundary.
1032 */
1033static inline sector_t max_io_len_target_boundary(struct dm_target *ti,
1034                                                  sector_t target_offset)
1035{
1036        return ti->len - target_offset;
1037}
1038
1039static sector_t max_io_len(struct dm_target *ti, sector_t sector)
1040{
1041        sector_t target_offset = dm_target_offset(ti, sector);
1042        sector_t len = max_io_len_target_boundary(ti, target_offset);
1043        sector_t max_len;
1044
1045        /*
1046         * Does the target need to split IO even further?
1047         * - varied (per target) IO splitting is a tenet of DM; this
1048         *   explains why stacked chunk_sectors based splitting via
1049         *   blk_max_size_offset() isn't possible here. So pass in
1050         *   ti->max_io_len to override stacked chunk_sectors.
1051         */
1052        if (ti->max_io_len) {
1053                max_len = blk_max_size_offset(ti->table->md->queue,
1054                                              target_offset, ti->max_io_len);
1055                if (len > max_len)
1056                        len = max_len;
1057        }
1058
1059        return len;
1060}
1061
1062int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
1063{
1064        if (len > UINT_MAX) {
1065                DMERR("Specified maximum size of target IO (%llu) exceeds limit (%u)",
1066                      (unsigned long long)len, UINT_MAX);
1067                ti->error = "Maximum size of target IO is too large";
1068                return -EINVAL;
1069        }
1070
1071        ti->max_io_len = (uint32_t) len;
1072
1073        return 0;
1074}
1075EXPORT_SYMBOL_GPL(dm_set_target_max_io_len);
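/*
 * Illustrative sketch, not part of this file: a hypothetical target whose
 * IO must not cross a fixed "chunk_size" boundary (made-up variable) would
 * typically call this from its .ctr:
 *
 *	r = dm_set_target_max_io_len(ti, chunk_size);
 *	if (r)
 *		return r;
 *
 * On failure ti->error has already been set, as seen above, so the .ctr can
 * simply propagate the error code.
 */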
1076
1077static struct dm_target *dm_dax_get_live_target(struct mapped_device *md,
1078                                                sector_t sector, int *srcu_idx)
1079        __acquires(md->io_barrier)
1080{
1081        struct dm_table *map;
1082        struct dm_target *ti;
1083
1084        map = dm_get_live_table(md, srcu_idx);
1085        if (!map)
1086                return NULL;
1087
1088        ti = dm_table_find_target(map, sector);
1089        if (!ti)
1090                return NULL;
1091
1092        return ti;
1093}
1094
1095static long dm_dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff,
1096                                 long nr_pages, void **kaddr, pfn_t *pfn)
1097{
1098        struct mapped_device *md = dax_get_private(dax_dev);
1099        sector_t sector = pgoff * PAGE_SECTORS;
1100        struct dm_target *ti;
1101        long len, ret = -EIO;
1102        int srcu_idx;
1103
1104        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1105
1106        if (!ti)
1107                goto out;
1108        if (!ti->type->direct_access)
1109                goto out;
1110        len = max_io_len(ti, sector) / PAGE_SECTORS;
1111        if (len < 1)
1112                goto out;
1113        nr_pages = min(len, nr_pages);
1114        ret = ti->type->direct_access(ti, pgoff, nr_pages, kaddr, pfn);
1115
1116 out:
1117        dm_put_live_table(md, srcu_idx);
1118
1119        return ret;
1120}
1121
1122static int dm_dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
1123                                  size_t nr_pages)
1124{
1125        struct mapped_device *md = dax_get_private(dax_dev);
1126        sector_t sector = pgoff * PAGE_SECTORS;
1127        struct dm_target *ti;
1128        int ret = -EIO;
1129        int srcu_idx;
1130
1131        ti = dm_dax_get_live_target(md, sector, &srcu_idx);
1132
1133        if (!ti)
1134                goto out;
1135        if (WARN_ON(!ti->type->dax_zero_page_range)) {
1136                /*
1137                 * ->zero_page_range() is a mandatory dax operation. If we
1138                 * are here, something is wrong.
1139                 */
1140                goto out;
1141        }
1142        ret = ti->type->dax_zero_page_range(ti, pgoff, nr_pages);
1143 out:
1144        dm_put_live_table(md, srcu_idx);
1145
1146        return ret;
1147}
1148
1149/*
1150 * A target may call dm_accept_partial_bio only from the map routine.  It is
1151 * allowed for all bio types except REQ_PREFLUSH, REQ_OP_ZONE_* zone management
1152 * operations, REQ_OP_ZONE_APPEND (zone append writes) and any bio serviced by
1153 * __send_duplicate_bios().
1154 *
1155 * dm_accept_partial_bio informs DM that the target only wants to process an
1156 * additional n_sectors sectors of the bio and that the rest of the data should
1157 * be sent in a subsequent bio.
1158 *
1159 * A diagram that explains the arithmetic:
1160 * +--------------------+---------------+-------+
1161 * |         1          |       2       |   3   |
1162 * +--------------------+---------------+-------+
1163 *
1164 * <-------------- *tio->len_ptr --------------->
1165 *                      <------- bi_size ------->
1166 *                      <-- n_sectors -->
1167 *
1168 * Region 1 was already iterated over with bio_advance or similar function.
1169 *      (it may be empty if the target doesn't use bio_advance)
1170 * Region 2 is the remaining bio size that the target wants to process.
1171 *      (it may be empty if region 1 is non-empty, although there is no reason
1172 *       to make it empty)
1173 * The target requires that region 3 is to be sent in the next bio.
1174 *
1175 * If the target wants to receive multiple copies of the bio (via num_*bios, etc),
1176 * the partially processed part (the sum of regions 1+2) must be the same for all
1177 * copies of the bio.
1178 */
1179void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors)
1180{
1181        struct dm_target_io *tio = clone_to_tio(bio);
1182        unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
1183
1184        BUG_ON(dm_tio_flagged(tio, DM_TIO_IS_DUPLICATE_BIO));
1185        BUG_ON(op_is_zone_mgmt(bio_op(bio)));
1186        BUG_ON(bio_op(bio) == REQ_OP_ZONE_APPEND);
1187        BUG_ON(bi_size > *tio->len_ptr);
1188        BUG_ON(n_sectors > bi_size);
1189
1190        *tio->len_ptr -= bi_size - n_sectors;
1191        bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT;
1192}
1193EXPORT_SYMBOL_GPL(dm_accept_partial_bio);
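/*
 * Illustrative sketch, not part of this file: a hypothetical .map hook that
 * can only handle IO up to an internal "boundary_sectors" limit accepts the
 * leading part and lets DM resubmit the remainder, per the description
 * above ("boundary_sectors" and "my_dev" are made-up names).
 *
 *	static int my_target_map(struct dm_target *ti, struct bio *bio)
 *	{
 *		unsigned int n_sectors =
 *			min_t(unsigned int, bio_sectors(bio), boundary_sectors);
 *
 *		if (n_sectors < bio_sectors(bio))
 *			dm_accept_partial_bio(bio, n_sectors);
 *
 *		bio_set_dev(bio, my_dev->bdev);
 *		return DM_MAPIO_REMAPPED;
 *	}
 */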
1194
1195static inline void __dm_submit_bio_remap(struct bio *clone,
1196                                         dev_t dev, sector_t old_sector)
1197{
1198        trace_block_bio_remap(clone, dev, old_sector);
1199        submit_bio_noacct(clone);
1200}
1201
1202/*
1203 * @clone: clone bio that DM core passed to target's .map function
1204 * @tgt_clone: clone of @clone bio that target needs submitted
1205 *
1206 * Targets should use this interface to submit bios they take
1207 * ownership of when returning DM_MAPIO_SUBMITTED.
1208 *
1209 * Target should also enable ti->accounts_remapped_io
1210 */
1211void dm_submit_bio_remap(struct bio *clone, struct bio *tgt_clone)
1212{
1213        struct dm_target_io *tio = clone_to_tio(clone);
1214        struct dm_io *io = tio->io;
1215
1216        WARN_ON_ONCE(!tio->ti->accounts_remapped_io);
1217
1218        /* establish bio that will get submitted */
1219        if (!tgt_clone)
1220                tgt_clone = clone;
1221
1222        /*
1223         * Account io->orig_bio to DM dev on behalf of target
1224         * that took ownership of IO with DM_MAPIO_SUBMITTED.
1225         */
1226        if (io->map_task == current) {
1227                /* Still in target's map function */
1228                dm_io_set_flag(io, DM_IO_START_ACCT);
1229        } else {
1230                /*
1231                 * Called by another thread, managed by DM target,
1232                 * wait for dm_split_and_process_bio() to store
1233                 * io->orig_bio
1234                 */
1235                while (unlikely(!smp_load_acquire(&io->orig_bio)))
1236                        msleep(1);
1237                dm_start_io_acct(io, clone);
1238        }
1239
1240        __dm_submit_bio_remap(tgt_clone, disk_devt(io->md->disk),
1241                              tio->old_sector);
1242}
1243EXPORT_SYMBOL_GPL(dm_submit_bio_remap);
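/*
 * Illustrative sketch, not part of this file: a hypothetical target that
 * returns DM_MAPIO_SUBMITTED from .map, sets ti->accounts_remapped_io in
 * its .ctr, and defers clones to a workqueue could later hand them back to
 * DM core like this (struct my_io and its fields are made up; mio->clone
 * stands for the clone bio DM core passed to .map):
 *
 *	static void my_target_do_work(struct work_struct *work)
 *	{
 *		struct my_io *mio = container_of(work, struct my_io, work);
 *
 *		dm_submit_bio_remap(mio->clone, NULL);
 *	}
 *
 * Passing NULL as @tgt_clone submits @clone itself, as the code above shows.
 */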
1244
1245static noinline void __set_swap_bios_limit(struct mapped_device *md, int latch)
1246{
1247        mutex_lock(&md->swap_bios_lock);
1248        while (latch < md->swap_bios) {
1249                cond_resched();
1250                down(&md->swap_bios_semaphore);
1251                md->swap_bios--;
1252        }
1253        while (latch > md->swap_bios) {
1254                cond_resched();
1255                up(&md->swap_bios_semaphore);
1256                md->swap_bios++;
1257        }
1258        mutex_unlock(&md->swap_bios_lock);
1259}
1260
1261static void __map_bio(struct bio *clone)
1262{
1263        struct dm_target_io *tio = clone_to_tio(clone);
1264        int r;
1265        struct dm_io *io = tio->io;
1266        struct dm_target *ti = tio->ti;
1267
1268        clone->bi_end_io = clone_endio;
1269
1270        /*
1271         * Map the clone.
1272         */
1273        dm_io_inc_pending(io);
1274        tio->old_sector = clone->bi_iter.bi_sector;
1275
1276        if (unlikely(swap_bios_limit(ti, clone))) {
1277                struct mapped_device *md = io->md;
1278                int latch = get_swap_bios();
1279                if (unlikely(latch != md->swap_bios))
1280                        __set_swap_bios_limit(md, latch);
1281                down(&md->swap_bios_semaphore);
1282        }
1283
1284        /*
1285         * Check if the IO needs a special mapping due to zone append emulation
1286         * on zoned target. In this case, dm_zone_map_bio() calls the target
1287         * map operation.
1288         */
1289        if (dm_emulate_zone_append(io->md))
1290                r = dm_zone_map_bio(tio);
1291        else
1292                r = ti->type->map(ti, clone);
1293
1294        switch (r) {
1295        case DM_MAPIO_SUBMITTED:
1296                /* target has assumed ownership of this io */
1297                if (!ti->accounts_remapped_io)
1298                        dm_io_set_flag(io, DM_IO_START_ACCT);
1299                break;
1300        case DM_MAPIO_REMAPPED:
1301                /*
1302                 * the bio has been remapped so dispatch it, but defer
1303                 * dm_start_io_acct() until after possible bio_split().
1304                 */
1305                __dm_submit_bio_remap(clone, disk_devt(io->md->disk),
1306                                      tio->old_sector);
1307                dm_io_set_flag(io, DM_IO_START_ACCT);
1308                break;
1309        case DM_MAPIO_KILL:
1310        case DM_MAPIO_REQUEUE:
1311                if (unlikely(swap_bios_limit(ti, clone)))
1312                        up(&io->md->swap_bios_semaphore);
1313                free_tio(clone);
1314                if (r == DM_MAPIO_KILL)
1315                        dm_io_dec_pending(io, BLK_STS_IOERR);
1316                else
1317                        dm_io_dec_pending(io, BLK_STS_DM_REQUEUE);
1318                break;
1319        default:
1320                DMWARN("unimplemented target map return value: %d", r);
1321                BUG();
1322        }
1323}
1324
1325static void alloc_multiple_bios(struct bio_list *blist, struct clone_info *ci,
1326                                struct dm_target *ti, unsigned num_bios)
1327{
1328        struct bio *bio;
1329        int try;
1330
1331        for (try = 0; try < 2; try++) {
1332                int bio_nr;
1333
1334                if (try)
1335                        mutex_lock(&ci->io->md->table_devices_lock);
1336                for (bio_nr = 0; bio_nr < num_bios; bio_nr++) {
1337                        bio = alloc_tio(ci, ti, bio_nr, NULL,
1338                                        try ? GFP_NOIO : GFP_NOWAIT);
1339                        if (!bio)
1340                                break;
1341
1342                        bio_list_add(blist, bio);
1343                }
1344                if (try)
1345                        mutex_unlock(&ci->io->md->table_devices_lock);
1346                if (bio_nr == num_bios)
1347                        return;
1348
1349                while ((bio = bio_list_pop(blist)))
1350                        free_tio(bio);
1351        }
1352}
1353
1354static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti,
1355                                  unsigned num_bios, unsigned *len)
1356{
1357        struct bio_list blist = BIO_EMPTY_LIST;
1358        struct bio *clone;
1359
1360        switch (num_bios) {
1361        case 0:
1362                break;
1363        case 1:
1364                clone = alloc_tio(ci, ti, 0, len, GFP_NOIO);
1365                __map_bio(clone);
1366                break;
1367        default:
1368                /* dm_accept_partial_bio() is not supported with shared tio->len_ptr */
1369                alloc_multiple_bios(&blist, ci, ti, num_bios);
1370                while ((clone = bio_list_pop(&blist))) {
1371                        dm_tio_set_flag(clone_to_tio(clone), DM_TIO_IS_DUPLICATE_BIO);
1372                        __map_bio(clone);
1373                }
1374                break;
1375        }
1376}
1377
1378static void __send_empty_flush(struct clone_info *ci)
1379{
1380        unsigned target_nr = 0;
1381        struct dm_target *ti;
1382        struct bio flush_bio;
1383
1384        /*
1385         * Use an on-stack bio for this, it's safe since we don't
1386         * need to reference it after submit. It's just used as
1387         * the basis for the clone(s).
1388         */
1389        bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0,
1390                 REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC);
1391
1392        ci->bio = &flush_bio;
1393        ci->sector_count = 0;
1394        ci->io->tio.clone.bi_iter.bi_size = 0;
1395
1396        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1397                __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL);
1398
1399        bio_uninit(ci->bio);
1400}
1401
1402static void __send_changing_extent_only(struct clone_info *ci, struct dm_target *ti,
1403                                        unsigned num_bios)
1404{
1405        unsigned len;
1406
1407        len = min_t(sector_t, ci->sector_count,
1408                    max_io_len_target_boundary(ti, dm_target_offset(ti, ci->sector)));
1409
1410        __send_duplicate_bios(ci, ti, num_bios, &len);
1411
1412        ci->sector += len;
1413        ci->sector_count -= len;
1414}
1415
1416static bool is_abnormal_io(struct bio *bio)
1417{
1418        bool r = false;
1419
1420        switch (bio_op(bio)) {
1421        case REQ_OP_DISCARD:
1422        case REQ_OP_SECURE_ERASE:
1423        case REQ_OP_WRITE_ZEROES:
1424                r = true;
1425                break;
1426        }
1427
1428        return r;
1429}
1430
1431static bool __process_abnormal_io(struct clone_info *ci, struct dm_target *ti,
1432                                  int *result)
1433{
1434        unsigned num_bios = 0;
1435
1436        switch (bio_op(ci->bio)) {
1437        case REQ_OP_DISCARD:
1438                num_bios = ti->num_discard_bios;
1439                break;
1440        case REQ_OP_SECURE_ERASE:
1441                num_bios = ti->num_secure_erase_bios;
1442                break;
1443        case REQ_OP_WRITE_ZEROES:
1444                num_bios = ti->num_write_zeroes_bios;
1445                break;
1446        default:
1447                return false;
1448        }
1449
1450        /*
1451         * Even though the device advertised support for this type of
1452         * request, that does not mean every target supports it, and
1453         * reconfiguration might also have changed that since the
1454         * check was performed.
1455         */
1456        if (!num_bios)
1457                *result = -EOPNOTSUPP;
1458        else {
1459                __send_changing_extent_only(ci, ti, num_bios);
1460                *result = 0;
1461        }
1462        return true;
1463}
1464
1465/*
1466 * Reuse ->bi_private as hlist head for storing all dm_io instances
1467 * associated with this bio, and this bio's bi_private needs to be
1468 * stored in dm_io->data before the reuse.
1469 *
1470 * bio->bi_private is owned by fs or upper layer, so block layer won't
1471 * touch it after splitting. Meantime it won't be changed by anyone after
1472 * bio is submitted. So this reuse is safe.
1473 */
1474static inline struct hlist_head *dm_get_bio_hlist_head(struct bio *bio)
1475{
1476        return (struct hlist_head *)&bio->bi_private;
1477}
1478
1479static void dm_queue_poll_io(struct bio *bio, struct dm_io *io)
1480{
1481        struct hlist_head *head = dm_get_bio_hlist_head(bio);
1482
1483        if (!(bio->bi_opf & REQ_DM_POLL_LIST)) {
1484                bio->bi_opf |= REQ_DM_POLL_LIST;
1485                /*
1486                 * Save .bi_private into dm_io, so that we can reuse
1487                 * .bi_private as hlist head for storing dm_io list
1488                 */
1489                io->data = bio->bi_private;
1490
1491                INIT_HLIST_HEAD(head);
1492
1493                /* tell block layer to poll for completion */
1494                bio->bi_cookie = ~BLK_QC_T_NONE;
1495        } else {
1496                /*
1497                 * bio recursed due to split, reuse original poll list,
1498                 * and save bio->bi_private too.
1499                 */
1500                io->data = hlist_entry(head->first, struct dm_io, node)->data;
1501        }
1502
1503        hlist_add_head(&io->node, head);
1504}
1505
1506/*
1507 * Select the correct strategy for processing a non-flush bio.
1508 */
1509static int __split_and_process_bio(struct clone_info *ci)
1510{
1511        struct bio *clone;
1512        struct dm_target *ti;
1513        unsigned len;
1514        int r;
1515
1516        ti = dm_table_find_target(ci->map, ci->sector);
1517        if (!ti)
1518                return -EIO;
1519
1520        if (__process_abnormal_io(ci, ti, &r))
1521                return r;
1522
1523        /*
1524         * Only support bio polling for normal IO, and the target io is
1525         * exactly inside the dm_io instance (verified in dm_poll_dm_io)
1526         */
1527        ci->submit_as_polled = ci->bio->bi_opf & REQ_POLLED;
1528
1529        len = min_t(sector_t, max_io_len(ti, ci->sector), ci->sector_count);
1530        clone = alloc_tio(ci, ti, 0, &len, GFP_NOIO);
1531        __map_bio(clone);
1532
1533        ci->sector += len;
1534        ci->sector_count -= len;
1535
1536        return 0;
1537}
1538
1539static void init_clone_info(struct clone_info *ci, struct mapped_device *md,
1540                            struct dm_table *map, struct bio *bio)
1541{
1542        ci->map = map;
1543        ci->io = alloc_io(md, bio);
1544        ci->bio = bio;
1545        ci->submit_as_polled = false;
1546        ci->sector = bio->bi_iter.bi_sector;
1547        ci->sector_count = bio_sectors(bio);
1548
1549        /* Shouldn't happen but sector_count was being set to 0 so... */
1550        if (WARN_ON_ONCE(op_is_zone_mgmt(bio_op(bio)) && ci->sector_count))
1551                ci->sector_count = 0;
1552}
1553
1554/*
1555 * Entry point to split a bio into clones and submit them to the targets.
1556 */
1557static void dm_split_and_process_bio(struct mapped_device *md,
1558                                     struct dm_table *map, struct bio *bio)
1559{
1560        struct clone_info ci;
1561        struct bio *orig_bio = NULL;
1562        int error = 0;
1563
1564        init_clone_info(&ci, md, map, bio);
1565
1566        if (bio->bi_opf & REQ_PREFLUSH) {
1567                __send_empty_flush(&ci);
1568                /* dm_io_complete submits any data associated with flush */
1569                goto out;
1570        }
1571
1572        error = __split_and_process_bio(&ci);
1573        ci.io->map_task = NULL;
1574        if (error || !ci.sector_count)
1575                goto out;
1576
1577        /*
1578         * Remainder must be passed to submit_bio_noacct() so it gets handled
1579         * *after* bios already submitted have been completely processed.
1580         * We take a clone of the original to store in ci.io->orig_bio to be
1581         * used by dm_end_io_acct() and for dm_io_complete() to use for
1582         * completion handling.
1583         */
1584        orig_bio = bio_split(bio, bio_sectors(bio) - ci.sector_count,
1585                             GFP_NOIO, &md->queue->bio_split);
1586        bio_chain(orig_bio, bio);
1587        trace_block_split(orig_bio, bio->bi_iter.bi_sector);
1588        submit_bio_noacct(bio);
1589out:
1590        if (!orig_bio)
1591                orig_bio = bio;
1592        smp_store_release(&ci.io->orig_bio, orig_bio);
1593        if (dm_io_flagged(ci.io, DM_IO_START_ACCT))
1594                dm_start_io_acct(ci.io, NULL);
1595
1596        /*
1597         * Drop the extra reference count for non-POLLED bio, and hold one
1598         * reference for POLLED bio, which will be released in dm_poll_bio
1599         *
1600         * Add every dm_io instance into the hlist_head which is stored in
1601         * bio->bi_private, so that dm_poll_bio can poll them all.
1602         */
1603        if (error || !ci.submit_as_polled)
1604                dm_io_dec_pending(ci.io, errno_to_blk_status(error));
1605        else
1606                dm_queue_poll_io(bio, ci.io);
1607}
1608
1609static void dm_submit_bio(struct bio *bio)
1610{
1611        struct mapped_device *md = bio->bi_bdev->bd_disk->private_data;
1612        int srcu_idx;
1613        struct dm_table *map;
1614
1615        map = dm_get_live_table(md, &srcu_idx);
1616
1617        /* If suspended, or map not yet available, queue this IO for later */
1618        if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) ||
1619            unlikely(!map)) {
1620                if (bio->bi_opf & REQ_NOWAIT)
1621                        bio_wouldblock_error(bio);
1622                else if (bio->bi_opf & REQ_RAHEAD)
1623                        bio_io_error(bio);
1624                else
1625                        queue_io(md, bio);
1626                goto out;
1627        }
1628
1629        /*
1630         * Use blk_queue_split() for abnormal IO (e.g. discard, writesame, etc)
1631         * otherwise associated queue_limits won't be imposed.
1632         */
1633        if (is_abnormal_io(bio))
1634                blk_queue_split(&bio);
1635
1636        dm_split_and_process_bio(md, map, bio);
1637out:
1638        dm_put_live_table(md, srcu_idx);
1639}
1640
1641static bool dm_poll_dm_io(struct dm_io *io, struct io_comp_batch *iob,
1642                          unsigned int flags)
1643{
1644        WARN_ON_ONCE(!dm_tio_is_normal(&io->tio));
1645
1646        /* don't poll if the mapped io is done */
1647        if (atomic_read(&io->io_count) > 1)
1648                bio_poll(&io->tio.clone, iob, flags);
1649
1650        /* bio_poll holds the last reference */
1651        return atomic_read(&io->io_count) == 1;
1652}
1653
1654static int dm_poll_bio(struct bio *bio, struct io_comp_batch *iob,
1655                       unsigned int flags)
1656{
1657        struct hlist_head *head = dm_get_bio_hlist_head(bio);
1658        struct hlist_head tmp = HLIST_HEAD_INIT;
1659        struct hlist_node *next;
1660        struct dm_io *io;
1661
1662        /* Only poll normal bios that were marked with REQ_DM_POLL_LIST */
1663        if (!(bio->bi_opf & REQ_DM_POLL_LIST))
1664                return 0;
1665
1666        WARN_ON_ONCE(hlist_empty(head));
1667
1668        hlist_move_list(head, &tmp);
1669
1670        /*
1671         * Restore .bi_private before possibly completing dm_io.
1672         *
1673         * bio_poll() is only possible once @bio has been completely
1674         * submitted via submit_bio_noacct()'s depth-first submission.
1675         * So there is no dm_queue_poll_io() race associated with
1676         * clearing REQ_DM_POLL_LIST here.
1677         */
1678        bio->bi_opf &= ~REQ_DM_POLL_LIST;
1679        bio->bi_private = hlist_entry(tmp.first, struct dm_io, node)->data;
1680
1681        hlist_for_each_entry_safe(io, next, &tmp, node) {
1682                if (dm_poll_dm_io(io, iob, flags)) {
1683                        hlist_del_init(&io->node);
1684                        /*
1685                         * clone_endio() has already occurred, so passing
1686                         * error as 0 here doesn't override io->status
1687                         */
1688                        dm_io_dec_pending(io, 0);
1689                }
1690        }
1691
1692        /* Not done? */
1693        if (!hlist_empty(&tmp)) {
1694                bio->bi_opf |= REQ_DM_POLL_LIST;
1695                /* Reset bio->bi_private to dm_io list head */
1696                hlist_move_list(&tmp, head);
1697                return 0;
1698        }
1699        return 1;
1700}
1701
1702/*-----------------------------------------------------------------
1703 * An IDR is used to keep track of allocated minor numbers.
1704 *---------------------------------------------------------------*/
1705static void free_minor(int minor)
1706{
1707        spin_lock(&_minor_lock);
1708        idr_remove(&_minor_idr, minor);
1709        spin_unlock(&_minor_lock);
1710}
1711
1712/*
1713 * Allocate the requested minor number if it is still free.
1714 */
1715static int specific_minor(int minor)
1716{
1717        int r;
1718
1719        if (minor >= (1 << MINORBITS))
1720                return -EINVAL;
1721
1722        idr_preload(GFP_KERNEL);
1723        spin_lock(&_minor_lock);
1724
1725        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, minor, minor + 1, GFP_NOWAIT);
1726
1727        spin_unlock(&_minor_lock);
1728        idr_preload_end();
1729        if (r < 0)
1730                return r == -ENOSPC ? -EBUSY : r;
1731        return 0;
1732}
1733
1734static int next_free_minor(int *minor)
1735{
1736        int r;
1737
1738        idr_preload(GFP_KERNEL);
1739        spin_lock(&_minor_lock);
1740
1741        r = idr_alloc(&_minor_idr, MINOR_ALLOCED, 0, 1 << MINORBITS, GFP_NOWAIT);
1742
1743        spin_unlock(&_minor_lock);
1744        idr_preload_end();
1745        if (r < 0)
1746                return r;
1747        *minor = r;
1748        return 0;
1749}
1750
1751static const struct block_device_operations dm_blk_dops;
1752static const struct block_device_operations dm_rq_blk_dops;
1753static const struct dax_operations dm_dax_ops;
1754
1755static void dm_wq_work(struct work_struct *work);
1756
1757#ifdef CONFIG_BLK_INLINE_ENCRYPTION
1758static void dm_queue_destroy_crypto_profile(struct request_queue *q)
1759{
1760        dm_destroy_crypto_profile(q->crypto_profile);
1761}
1762
1763#else /* CONFIG_BLK_INLINE_ENCRYPTION */
1764
1765static inline void dm_queue_destroy_crypto_profile(struct request_queue *q)
1766{
1767}
1768#endif /* !CONFIG_BLK_INLINE_ENCRYPTION */
1769
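/*
 * Tear down everything alloc_dev() set up: workqueue, biosets, DAX device,
 * gendisk/queue, per-cpu counters, SRCU state and mutexes.
 */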
1770static void cleanup_mapped_device(struct mapped_device *md)
1771{
1772        if (md->wq)
1773                destroy_workqueue(md->wq);
1774        bioset_exit(&md->bs);
1775        bioset_exit(&md->io_bs);
1776
1777        if (md->dax_dev) {
1778                dax_remove_host(md->disk);
1779                kill_dax(md->dax_dev);
1780                put_dax(md->dax_dev);
1781                md->dax_dev = NULL;
1782        }
1783
1784        dm_cleanup_zoned_dev(md);
1785        if (md->disk) {
1786                spin_lock(&_minor_lock);
1787                md->disk->private_data = NULL;
1788                spin_unlock(&_minor_lock);
1789                if (dm_get_md_type(md) != DM_TYPE_NONE) {
1790                        dm_sysfs_exit(md);
1791                        del_gendisk(md->disk);
1792                }
1793                dm_queue_destroy_crypto_profile(md->queue);
1794                blk_cleanup_disk(md->disk);
1795        }
1796
1797        if (md->pending_io) {
1798                free_percpu(md->pending_io);
1799                md->pending_io = NULL;
1800        }
1801
1802        cleanup_srcu_struct(&md->io_barrier);
1803
1804        mutex_destroy(&md->suspend_lock);
1805        mutex_destroy(&md->type_lock);
1806        mutex_destroy(&md->table_devices_lock);
1807        mutex_destroy(&md->swap_bios_lock);
1808
1809        dm_mq_cleanup_mapped_device(md);
1810}
1811
1812/*
1813 * Allocate and initialise a blank device with a given minor.
1814 */
1815static struct mapped_device *alloc_dev(int minor)
1816{
1817        int r, numa_node_id = dm_get_numa_node();
1818        struct mapped_device *md;
1819        void *old_md;
1820
1821        md = kvzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
1822        if (!md) {
1823                DMWARN("unable to allocate device, out of memory.");
1824                return NULL;
1825        }
1826
1827        if (!try_module_get(THIS_MODULE))
1828                goto bad_module_get;
1829
1830        /* get a minor number for the dev */
1831        if (minor == DM_ANY_MINOR)
1832                r = next_free_minor(&minor);
1833        else
1834                r = specific_minor(minor);
1835        if (r < 0)
1836                goto bad_minor;
1837
1838        r = init_srcu_struct(&md->io_barrier);
1839        if (r < 0)
1840                goto bad_io_barrier;
1841
1842        md->numa_node_id = numa_node_id;
1843        md->init_tio_pdu = false;
1844        md->type = DM_TYPE_NONE;
1845        mutex_init(&md->suspend_lock);
1846        mutex_init(&md->type_lock);
1847        mutex_init(&md->table_devices_lock);
1848        spin_lock_init(&md->deferred_lock);
1849        atomic_set(&md->holders, 1);
1850        atomic_set(&md->open_count, 0);
1851        atomic_set(&md->event_nr, 0);
1852        atomic_set(&md->uevent_seq, 0);
1853        INIT_LIST_HEAD(&md->uevent_list);
1854        INIT_LIST_HEAD(&md->table_devices);
1855        spin_lock_init(&md->uevent_lock);
1856
1857        /*
1858         * Default to bio-based until the DM table is loaded and md->type is
1859         * established. If a request-based table is loaded, blk-mq will
1860         * override accordingly.
1861         */
1862        md->disk = blk_alloc_disk(md->numa_node_id);
1863        if (!md->disk)
1864                goto bad;
1865        md->queue = md->disk->queue;
1866
1867        init_waitqueue_head(&md->wait);
1868        INIT_WORK(&md->work, dm_wq_work);
1869        init_waitqueue_head(&md->eventq);
1870        init_completion(&md->kobj_holder.completion);
1871
1872        md->swap_bios = get_swap_bios();
1873        sema_init(&md->swap_bios_semaphore, md->swap_bios);
1874        mutex_init(&md->swap_bios_lock);
1875
1876        md->disk->major = _major;
1877        md->disk->first_minor = minor;
1878        md->disk->minors = 1;
1879        md->disk->flags |= GENHD_FL_NO_PART;
1880        md->disk->fops = &dm_blk_dops;
1881        md->disk->queue = md->queue;
1882        md->disk->private_data = md;
1883        sprintf(md->disk->disk_name, "dm-%d", minor);
1884
1885        if (IS_ENABLED(CONFIG_FS_DAX)) {
1886                md->dax_dev = alloc_dax(md, &dm_dax_ops);
1887                if (IS_ERR(md->dax_dev)) {
1888                        md->dax_dev = NULL;
1889                        goto bad;
1890                }
1891                set_dax_nocache(md->dax_dev);
1892                set_dax_nomc(md->dax_dev);
1893                if (dax_add_host(md->dax_dev, md->disk))
1894                        goto bad;
1895        }
1896
1897        format_dev_t(md->name, MKDEV(_major, minor));
1898
1899        md->wq = alloc_workqueue("kdmflush/%s", WQ_MEM_RECLAIM, 0, md->name);
1900        if (!md->wq)
1901                goto bad;
1902
1903        md->pending_io = alloc_percpu(unsigned long);
1904        if (!md->pending_io)
1905                goto bad;
1906
1907        dm_stats_init(&md->stats);
1908
1909        /* Populate the mapping, nobody knows we exist yet */
1910        spin_lock(&_minor_lock);
1911        old_md = idr_replace(&_minor_idr, md, minor);
1912        spin_unlock(&_minor_lock);
1913
1914        BUG_ON(old_md != MINOR_ALLOCED);
1915
1916        return md;
1917
1918bad:
1919        cleanup_mapped_device(md);
1920bad_io_barrier:
1921        free_minor(minor);
1922bad_minor:
1923        module_put(THIS_MODULE);
1924bad_module_get:
1925        kvfree(md);
1926        return NULL;
1927}
1928
1929static void unlock_fs(struct mapped_device *md);
1930
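/*
 * Final teardown of a mapped_device: thaw the filesystem if frozen, release
 * all resources, return the minor number and drop the module reference.
 */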
1931static void free_dev(struct mapped_device *md)
1932{
1933        int minor = MINOR(disk_devt(md->disk));
1934
1935        unlock_fs(md);
1936
1937        cleanup_mapped_device(md);
1938
1939        free_table_devices(&md->table_devices);
1940        dm_stats_cleanup(&md->stats);
1941        free_minor(minor);
1942
1943        module_put(THIS_MODULE);
1944        kvfree(md);
1945}
1946
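/*
 * Adopt the mempools that were sized for table @t.  Bio-based tables always
 * reload md's biosets because the required front_pad may differ between
 * tables; request-based reuses any existing bioset.
 */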
1947static int __bind_mempools(struct mapped_device *md, struct dm_table *t)
1948{
1949        struct dm_md_mempools *p = dm_table_get_md_mempools(t);
1950        int ret = 0;
1951
1952        if (dm_table_bio_based(t)) {
1953                /*
1954                 * The md may already have mempools that need changing.
1955                 * If so, reload the biosets because front_pad may have
1956                 * changed with the newly loaded table.
1957                 */
1958                bioset_exit(&md->bs);
1959                bioset_exit(&md->io_bs);
1960
1961        } else if (bioset_initialized(&md->bs)) {
1962                /*
1963                 * There's no need to reload with request-based dm
1964                 * because the size of front_pad doesn't change.
1965                 * Note for the future: if the bioset is ever reloaded,
1966                 * prepped requests in the queue may still refer to
1967                 * bios from the old bioset, so the queue must be
1968                 * walked to unprep them first.
1969                 */
1970                goto out;
1971        }
1972
1973        BUG_ON(!p ||
1974               bioset_initialized(&md->bs) ||
1975               bioset_initialized(&md->io_bs));
1976
1977        ret = bioset_init_from_src(&md->bs, &p->bs);
1978        if (ret)
1979                goto out;
1980        ret = bioset_init_from_src(&md->io_bs, &p->io_bs);
1981        if (ret)
1982                bioset_exit(&md->bs);
1983out:
1984        /* mempool bind completed, no longer need any mempools in the table */
1985        dm_table_free_md_mempools(t);
1986        return ret;
1987}
1988
1989/*
1990 * Table event callback: deliver any queued uevents and wake event waiters.
1991 */
1992static void event_callback(void *context)
1993{
1994        unsigned long flags;
1995        LIST_HEAD(uevents);
1996        struct mapped_device *md = (struct mapped_device *) context;
1997
1998        spin_lock_irqsave(&md->uevent_lock, flags);
1999        list_splice_init(&md->uevent_list, &uevents);
2000        spin_unlock_irqrestore(&md->uevent_lock, flags);
2001
2002        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2003
2004        atomic_inc(&md->event_nr);
2005        wake_up(&md->eventq);
2006        dm_issue_global_event();
2007}
2008
2009/*
2010 * Returns old map, which caller must destroy.
2011 */
2012static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2013                               struct queue_limits *limits)
2014{
2015        struct dm_table *old_map;
2016        sector_t size;
2017        int ret;
2018
2019        lockdep_assert_held(&md->suspend_lock);
2020
2021        size = dm_table_get_size(t);
2022
2023        /*
2024         * Wipe any geometry if the size of the table changed.
2025         */
2026        if (size != dm_get_size(md))
2027                memset(&md->geometry, 0, sizeof(md->geometry));
2028
2029        if (!get_capacity(md->disk))
2030                set_capacity(md->disk, size);
2031        else
2032                set_capacity_and_notify(md->disk, size);
2033
2034        dm_table_event_callback(t, event_callback, md);
2035
2036        if (dm_table_request_based(t)) {
2037                /*
2038                 * Leverage the fact that request-based DM targets are
2039                 * immutable singletons - used to optimize dm_mq_queue_rq.
2040                 */
2041                md->immutable_target = dm_table_get_immutable_target(t);
2042        }
2043
2044        ret = __bind_mempools(md, t);
2045        if (ret) {
2046                old_map = ERR_PTR(ret);
2047                goto out;
2048        }
2049
2050        ret = dm_table_set_restrictions(t, md->queue, limits);
2051        if (ret) {
2052                old_map = ERR_PTR(ret);
2053                goto out;
2054        }
2055
2056        old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2057        rcu_assign_pointer(md->map, (void *)t);
2058        md->immutable_target_type = dm_table_get_immutable_target_type(t);
2059
2060        if (old_map)
2061                dm_sync_table(md);
2062out:
2063        return old_map;
2064}
2065
2066/*
2067 * Returns unbound table for the caller to free.
2068 */
2069static struct dm_table *__unbind(struct mapped_device *md)
2070{
2071        struct dm_table *map = rcu_dereference_protected(md->map, 1);
2072
2073        if (!map)
2074                return NULL;
2075
2076        dm_table_event_callback(map, NULL, NULL);
2077        RCU_INIT_POINTER(md->map, NULL);
2078        dm_sync_table(md);
2079
2080        return map;
2081}
2082
2083/*
2084 * Constructor for a new device.
2085 */
2086int dm_create(int minor, struct mapped_device **result)
2087{
2088        struct mapped_device *md;
2089
2090        md = alloc_dev(minor);
2091        if (!md)
2092                return -ENXIO;
2093
2094        dm_ima_reset_data(md);
2095
2096        *result = md;
2097        return 0;
2098}
2099
2100/*
2101 * Functions to manage md->type.
2102 * All are required to hold md->type_lock.
2103 */
2104void dm_lock_md_type(struct mapped_device *md)
2105{
2106        mutex_lock(&md->type_lock);
2107}
2108
2109void dm_unlock_md_type(struct mapped_device *md)
2110{
2111        mutex_unlock(&md->type_lock);
2112}
2113
2114void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type)
2115{
2116        BUG_ON(!mutex_is_locked(&md->type_lock));
2117        md->type = type;
2118}
2119
2120enum dm_queue_mode dm_get_md_type(struct mapped_device *md)
2121{
2122        return md->type;
2123}
2124
2125struct target_type *dm_get_immutable_target_type(struct mapped_device *md)
2126{
2127        return md->immutable_target_type;
2128}
2129
2130/*
2131 * The queue_limits are only valid as long as you have a reference
2132 * count on 'md'.
2133 */
2134struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
2135{
2136        BUG_ON(!atomic_read(&md->holders));
2137        return &md->queue->limits;
2138}
2139EXPORT_SYMBOL_GPL(dm_get_queue_limits);
2140
2141/*
2142 * Set up the DM device's queue based on md's type.
2143 */
2144int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
2145{
2146        enum dm_queue_mode type = dm_table_get_type(t);
2147        struct queue_limits limits;
2148        int r;
2149
2150        switch (type) {
2151        case DM_TYPE_REQUEST_BASED:
2152                md->disk->fops = &dm_rq_blk_dops;
2153                r = dm_mq_init_request_queue(md, t);
2154                if (r) {
2155                        DMERR("Cannot initialize queue for request-based dm mapped device");
2156                        return r;
2157                }
2158                break;
2159        case DM_TYPE_BIO_BASED:
2160        case DM_TYPE_DAX_BIO_BASED:
2161                break;
2162        case DM_TYPE_NONE:
2163                WARN_ON_ONCE(true);
2164                break;
2165        }
2166
2167        r = dm_calculate_queue_limits(t, &limits);
2168        if (r) {
2169                DMERR("Cannot calculate initial queue limits");
2170                return r;
2171        }
2172        r = dm_table_set_restrictions(t, md->queue, &limits);
2173        if (r)
2174                return r;
2175
2176        r = add_disk(md->disk);
2177        if (r)
2178                return r;
2179
2180        r = dm_sysfs_init(md);
2181        if (r) {
2182                del_gendisk(md->disk);
2183                return r;
2184        }
2185        md->type = type;
2186        return 0;
2187}
2188
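/*
 * Look up a mapped_device by dev_t and take a reference on it.  Returns
 * NULL if the device does not exist or is being freed/deleted.
 */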
2189struct mapped_device *dm_get_md(dev_t dev)
2190{
2191        struct mapped_device *md;
2192        unsigned minor = MINOR(dev);
2193
2194        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2195                return NULL;
2196
2197        spin_lock(&_minor_lock);
2198
2199        md = idr_find(&_minor_idr, minor);
2200        if (!md || md == MINOR_ALLOCED || (MINOR(disk_devt(dm_disk(md))) != minor) ||
2201            test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2202                md = NULL;
2203                goto out;
2204        }
2205        dm_get(md);
2206out:
2207        spin_unlock(&_minor_lock);
2208
2209        return md;
2210}
2211EXPORT_SYMBOL_GPL(dm_get_md);
2212
2213void *dm_get_mdptr(struct mapped_device *md)
2214{
2215        return md->interface_ptr;
2216}
2217
2218void dm_set_mdptr(struct mapped_device *md, void *ptr)
2219{
2220        md->interface_ptr = ptr;
2221}
2222
2223void dm_get(struct mapped_device *md)
2224{
2225        atomic_inc(&md->holders);
2226        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2227}
2228
2229int dm_hold(struct mapped_device *md)
2230{
2231        spin_lock(&_minor_lock);
2232        if (test_bit(DMF_FREEING, &md->flags)) {
2233                spin_unlock(&_minor_lock);
2234                return -EBUSY;
2235        }
2236        dm_get(md);
2237        spin_unlock(&_minor_lock);
2238        return 0;
2239}
2240EXPORT_SYMBOL_GPL(dm_hold);
2241
2242const char *dm_device_name(struct mapped_device *md)
2243{
2244        return md->name;
2245}
2246EXPORT_SYMBOL_GPL(dm_device_name);
2247
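/*
 * Common teardown for dm_destroy()/dm_destroy_immediate(): mark the device
 * as freeing, suspend the targets if needed, optionally wait for all holders
 * to go away, then destroy the table and free the device.
 */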
2248static void __dm_destroy(struct mapped_device *md, bool wait)
2249{
2250        struct dm_table *map;
2251        int srcu_idx;
2252
2253        might_sleep();
2254
2255        spin_lock(&_minor_lock);
2256        idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2257        set_bit(DMF_FREEING, &md->flags);
2258        spin_unlock(&_minor_lock);
2259
2260        blk_mark_disk_dead(md->disk);
2261
2262        /*
2263         * Take suspend_lock so that presuspend and postsuspend methods
2264         * do not race with internal suspend.
2265         */
2266        mutex_lock(&md->suspend_lock);
2267        map = dm_get_live_table(md, &srcu_idx);
2268        if (!dm_suspended_md(md)) {
2269                dm_table_presuspend_targets(map);
2270                set_bit(DMF_SUSPENDED, &md->flags);
2271                set_bit(DMF_POST_SUSPENDING, &md->flags);
2272                dm_table_postsuspend_targets(map);
2273        }
2274        /* dm_put_live_table must be before msleep, otherwise deadlock is possible */
2275        dm_put_live_table(md, srcu_idx);
2276        mutex_unlock(&md->suspend_lock);
2277
2278        /*
2279         * Rare, but there may still be I/O requests that have yet to
2280         * complete.  Wait for all references to disappear.
2281         * No one should increment the reference count of the mapped_device
2282         * after the mapped_device state becomes DMF_FREEING.
2283         */
2284        if (wait)
2285                while (atomic_read(&md->holders))
2286                        msleep(1);
2287        else if (atomic_read(&md->holders))
2288                DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2289                       dm_device_name(md), atomic_read(&md->holders));
2290
2291        dm_table_destroy(__unbind(md));
2292        free_dev(md);
2293}
2294
2295void dm_destroy(struct mapped_device *md)
2296{
2297        __dm_destroy(md, true);
2298}
2299
2300void dm_destroy_immediate(struct mapped_device *md)
2301{
2302        __dm_destroy(md, false);
2303}
2304
2305void dm_put(struct mapped_device *md)
2306{
2307        atomic_dec(&md->holders);
2308}
2309EXPORT_SYMBOL_GPL(dm_put);
2310
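/*
 * Returns true while any bio-based I/O is still pending, by summing the
 * per-cpu md->pending_io counters.
 */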
2311static bool dm_in_flight_bios(struct mapped_device *md)
2312{
2313        int cpu;
2314        unsigned long sum = 0;
2315
2316        for_each_possible_cpu(cpu)
2317                sum += *per_cpu_ptr(md->pending_io, cpu);
2318
2319        return sum != 0;
2320}
2321
2322static int dm_wait_for_bios_completion(struct mapped_device *md, unsigned int task_state)
2323{
2324        int r = 0;
2325        DEFINE_WAIT(wait);
2326
2327        while (true) {
2328                prepare_to_wait(&md->wait, &wait, task_state);
2329
2330                if (!dm_in_flight_bios(md))
2331                        break;
2332
2333                if (signal_pending_state(task_state, current)) {
2334                        r = -EINTR;
2335                        break;
2336                }
2337
2338                io_schedule();
2339        }
2340        finish_wait(&md->wait, &wait);
2341
2342        smp_rmb();
2343
2344        return r;
2345}
2346
2347static int dm_wait_for_completion(struct mapped_device *md, unsigned int task_state)
2348{
2349        int r = 0;
2350
2351        if (!queue_is_mq(md->queue))
2352                return dm_wait_for_bios_completion(md, task_state);
2353
2354        while (true) {
2355                if (!blk_mq_queue_inflight(md->queue))
2356                        break;
2357
2358                if (signal_pending_state(task_state, current)) {
2359                        r = -EINTR;
2360                        break;
2361                }
2362
2363                msleep(5);
2364        }
2365
2366        return r;
2367}
2368
2369/*
2370 * Process the deferred bios
2371 */
2372static void dm_wq_work(struct work_struct *work)
2373{
2374        struct mapped_device *md = container_of(work, struct mapped_device, work);
2375        struct bio *bio;
2376
2377        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2378                spin_lock_irq(&md->deferred_lock);
2379                bio = bio_list_pop(&md->deferred);
2380                spin_unlock_irq(&md->deferred_lock);
2381
2382                if (!bio)
2383                        break;
2384
2385                submit_bio_noacct(bio);
2386        }
2387}
2388
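/*
 * Re-enable bio submission and kick dm_wq_work() to resubmit any bios that
 * were deferred while DMF_BLOCK_IO_FOR_SUSPEND was set.
 */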
2389static void dm_queue_flush(struct mapped_device *md)
2390{
2391        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2392        smp_mb__after_atomic();
2393        queue_work(md->wq, &md->work);
2394}
2395
2396/*
2397 * Swap in a new table, returning the old one for the caller to destroy.
2398 */
2399struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2400{
2401        struct dm_table *live_map = NULL, *map = ERR_PTR(-EINVAL);
2402        struct queue_limits limits;
2403        int r;
2404
2405        mutex_lock(&md->suspend_lock);
2406
2407        /* device must be suspended */
2408        if (!dm_suspended_md(md))
2409                goto out;
2410
2411        /*
2412         * If the new table has no data devices, retain the existing limits.
2413         * This helps multipath with queue_if_no_path: if all paths disappear,
2414         * new I/O is queued based on these limits, which still apply when
2415         * some paths later reappear.
2416         */
2417        if (dm_table_has_no_data_devices(table)) {
2418                live_map = dm_get_live_table_fast(md);
2419                if (live_map)
2420                        limits = md->queue->limits;
2421                dm_put_live_table_fast(md);
2422        }
2423
2424        if (!live_map) {
2425                r = dm_calculate_queue_limits(table, &limits);
2426                if (r) {
2427                        map = ERR_PTR(r);
2428                        goto out;
2429                }
2430        }
2431
2432        map = __bind(md, table, &limits);
2433        dm_issue_global_event();
2434
2435out:
2436        mutex_unlock(&md->suspend_lock);
2437        return map;
2438}
2439
2440/*
2441 * Functions to lock and unlock any filesystem running on the
2442 * device.
2443 */
2444static int lock_fs(struct mapped_device *md)
2445{
2446        int r;
2447
2448        WARN_ON(test_bit(DMF_FROZEN, &md->flags));
2449
2450        r = freeze_bdev(md->disk->part0);
2451        if (!r)
2452                set_bit(DMF_FROZEN, &md->flags);
2453        return r;
2454}
2455
2456static void unlock_fs(struct mapped_device *md)
2457{
2458        if (!test_bit(DMF_FROZEN, &md->flags))
2459                return;
2460        thaw_bdev(md->disk->part0);
2461        clear_bit(DMF_FROZEN, &md->flags);
2462}
2463
2464/*
2465 * @suspend_flags: DM_SUSPEND_LOCKFS_FLAG and/or DM_SUSPEND_NOFLUSH_FLAG
2466 * @task_state: e.g. TASK_INTERRUPTIBLE or TASK_UNINTERRUPTIBLE
2467 * @dmf_suspended_flag: DMF_SUSPENDED or DMF_SUSPENDED_INTERNALLY
2468 *
2469 * If __dm_suspend returns 0, the device is completely quiescent
2470 * now. There is no request-processing activity. All new requests
2471 * are being added to md->deferred list.
2472 */
2473static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
2474                        unsigned suspend_flags, unsigned int task_state,
2475                        int dmf_suspended_flag)
2476{
2477        bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
2478        bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
2479        int r;
2480
2481        lockdep_assert_held(&md->suspend_lock);
2482
2483        /*
2484         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2485         * This flag is cleared before dm_suspend returns.
2486         */
2487        if (noflush)
2488                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2489        else
2490                DMDEBUG("%s: suspending with flush", dm_device_name(md));
2491
2492        /*
2493         * This gets reverted if there's an error later and the targets
2494         * provide the .presuspend_undo hook.
2495         */
2496        dm_table_presuspend_targets(map);
2497
2498        /*
2499         * Flush I/O to the device.
2500         * Any I/O submitted after lock_fs() may not be flushed.
2501         * noflush takes precedence over do_lockfs.
2502         * (lock_fs() flushes I/Os and waits for them to complete.)
2503         */
2504        if (!noflush && do_lockfs) {
2505                r = lock_fs(md);
2506                if (r) {
2507                        dm_table_presuspend_undo_targets(map);
2508                        return r;
2509                }
2510        }
2511
2512        /*
2513         * Here we must make sure that no processes are submitting requests
2514         * to target drivers i.e. no one may be executing
2515         * dm_split_and_process_bio from dm_submit_bio.
2516         *
2517         * To get all processes out of dm_split_and_process_bio in dm_submit_bio,
2518         * we take the write lock. To prevent any process from reentering
2519         * dm_split_and_process_bio from dm_submit_bio and quiesce the thread
2520         * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2521         * flush_workqueue(md->wq).
2522         */
2523        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2524        if (map)
2525                synchronize_srcu(&md->io_barrier);
2526
2527        /*
2528         * Stop md->queue before flushing md->wq in case request-based
2529         * dm defers requests to md->wq from md->queue.
2530         */
2531        if (dm_request_based(md))
2532                dm_stop_queue(md->queue);
2533
2534        flush_workqueue(md->wq);
2535
2536        /*
2537         * At this point no more requests are entering target request routines.
2538         * We call dm_wait_for_completion to wait for all existing requests
2539         * to finish.
2540         */
2541        r = dm_wait_for_completion(md, task_state);
2542        if (!r)
2543                set_bit(dmf_suspended_flag, &md->flags);
2544
2545        if (noflush)
2546                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2547        if (map)
2548                synchronize_srcu(&md->io_barrier);
2549
2550        /* were we interrupted? */
2551        if (r < 0) {
2552                dm_queue_flush(md);
2553
2554                if (dm_request_based(md))
2555                        dm_start_queue(md->queue);
2556
2557                unlock_fs(md);
2558                dm_table_presuspend_undo_targets(map);
2559                /* pushback list is already flushed, so skip flush */
2560        }
2561
2562        return r;
2563}
2564
2565/*
2566 * We need to be able to change a mapping table under a mounted
2567 * filesystem.  For example we might want to move some data in
2568 * the background.  Before the table can be swapped with
2569 * dm_bind_table, dm_suspend must be called to flush any in-flight
2570 * bios and ensure that any further I/O gets deferred.
2571 */
2572/*
2573 * Suspend mechanism in request-based dm.
2574 *
2575 * 1. Flush all I/Os by lock_fs() if needed.
2576 * 2. Stop dispatching any I/O by stopping the request_queue.
2577 * 3. Wait for all in-flight I/Os to be completed or requeued.
2578 *
2579 * To abort suspend, start the request_queue.
2580 */
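/*
 * Typical table-swap sequence as driven by the ioctl layer (a simplified
 * sketch, error handling omitted):
 *
 *	dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *	old_map = dm_swap_table(md, new_table);
 *	if (!IS_ERR(old_map))
 *		dm_table_destroy(old_map);
 *	dm_resume(md);
 */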
2581int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2582{
2583        struct dm_table *map = NULL;
2584        int r = 0;
2585
2586retry:
2587        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2588
2589        if (dm_suspended_md(md)) {
2590                r = -EINVAL;
2591                goto out_unlock;
2592        }
2593
2594        if (dm_suspended_internally_md(md)) {
2595                /* already internally suspended, wait for internal resume */
2596                mutex_unlock(&md->suspend_lock);
2597                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2598                if (r)
2599                        return r;
2600                goto retry;
2601        }
2602
2603        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2604
2605        r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED);
2606        if (r)
2607                goto out_unlock;
2608
2609        set_bit(DMF_POST_SUSPENDING, &md->flags);
2610        dm_table_postsuspend_targets(map);
2611        clear_bit(DMF_POST_SUSPENDING, &md->flags);
2612
2613out_unlock:
2614        mutex_unlock(&md->suspend_lock);
2615        return r;
2616}
2617
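/*
 * Counterpart of __dm_suspend(): resume the targets, restart deferred and
 * request-based I/O and thaw the filesystem.
 */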
2618static int __dm_resume(struct mapped_device *md, struct dm_table *map)
2619{
2620        if (map) {
2621                int r = dm_table_resume_targets(map);
2622                if (r)
2623                        return r;
2624        }
2625
2626        dm_queue_flush(md);
2627
2628        /*
2629         * Flushing deferred I/Os must be done after targets are resumed
2630         * so that mapping of targets can work correctly.
2631         * Request-based dm is queueing the deferred I/Os in its request_queue.
2632         */
2633        if (dm_request_based(md))
2634                dm_start_queue(md->queue);
2635
2636        unlock_fs(md);
2637
2638        return 0;
2639}
2640
2641int dm_resume(struct mapped_device *md)
2642{
2643        int r;
2644        struct dm_table *map = NULL;
2645
2646retry:
2647        r = -EINVAL;
2648        mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING);
2649
2650        if (!dm_suspended_md(md))
2651                goto out;
2652
2653        if (dm_suspended_internally_md(md)) {
2654                /* already internally suspended, wait for internal resume */
2655                mutex_unlock(&md->suspend_lock);
2656                r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE);
2657                if (r)
2658                        return r;
2659                goto retry;
2660        }
2661
2662        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2663        if (!map || !dm_table_get_size(map))
2664                goto out;
2665
2666        r = __dm_resume(md, map);
2667        if (r)
2668                goto out;
2669
2670        clear_bit(DMF_SUSPENDED, &md->flags);
2671out:
2672        mutex_unlock(&md->suspend_lock);
2673
2674        return r;
2675}
2676
2677/*
2678 * Internal suspend/resume works like userspace-driven suspend. It waits
2679 * until all bios finish and prevents issuing new bios to the target drivers.
2680 * It may be used only from the kernel.
2681 */
2682
2683static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags)
2684{
2685        struct dm_table *map = NULL;
2686
2687        lockdep_assert_held(&md->suspend_lock);
2688
2689        if (md->internal_suspend_count++)
2690                return; /* nested internal suspend */
2691
2692        if (dm_suspended_md(md)) {
2693                set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2694                return; /* nest suspend */
2695        }
2696
2697        map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
2698
2699        /*
2700         * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is
2701         * supported.  Properly supporting a TASK_INTERRUPTIBLE internal suspend
2702         * would require changing .presuspend to return an error -- avoid this
2703         * until there is a need for more elaborate variants of internal suspend.
2704         */
2705        (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE,
2706                            DMF_SUSPENDED_INTERNALLY);
2707
2708        set_bit(DMF_POST_SUSPENDING, &md->flags);
2709        dm_table_postsuspend_targets(map);
2710        clear_bit(DMF_POST_SUSPENDING, &md->flags);
2711}
2712
2713static void __dm_internal_resume(struct mapped_device *md)
2714{
2715        BUG_ON(!md->internal_suspend_count);
2716
2717        if (--md->internal_suspend_count)
2718                return; /* resume from nested internal suspend */
2719
2720        if (dm_suspended_md(md))
2721                goto done; /* resume from nested suspend */
2722
2723        /*
2724         * NOTE: existing callers don't need to call dm_table_resume_targets
2725         * (which may fail -- so best to avoid it for now by passing NULL map)
2726         */
2727        (void) __dm_resume(md, NULL);
2728
2729done:
2730        clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2731        smp_mb__after_atomic();
2732        wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY);
2733}
2734
2735void dm_internal_suspend_noflush(struct mapped_device *md)
2736{
2737        mutex_lock(&md->suspend_lock);
2738        __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG);
2739        mutex_unlock(&md->suspend_lock);
2740}
2741EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush);
2742
2743void dm_internal_resume(struct mapped_device *md)
2744{
2745        mutex_lock(&md->suspend_lock);
2746        __dm_internal_resume(md);
2747        mutex_unlock(&md->suspend_lock);
2748}
2749EXPORT_SYMBOL_GPL(dm_internal_resume);
2750
2751/*
2752 * Fast variants of internal suspend/resume hold md->suspend_lock,
2753 * which prevents interaction with userspace-driven suspend.
2754 */
2755
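/*
 * A minimal usage sketch: the two calls must be paired, since the lock
 * taken in dm_internal_suspend_fast() is only released by
 * dm_internal_resume_fast():
 *
 *	dm_internal_suspend_fast(md);
 *	... operate on the quiesced device ...
 *	dm_internal_resume_fast(md);
 */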
2756void dm_internal_suspend_fast(struct mapped_device *md)
2757{
2758        mutex_lock(&md->suspend_lock);
2759        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2760                return;
2761
2762        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2763        synchronize_srcu(&md->io_barrier);
2764        flush_workqueue(md->wq);
2765        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2766}
2767EXPORT_SYMBOL_GPL(dm_internal_suspend_fast);
2768
2769void dm_internal_resume_fast(struct mapped_device *md)
2770{
2771        if (dm_suspended_md(md) || dm_suspended_internally_md(md))
2772                goto done;
2773
2774        dm_queue_flush(md);
2775
2776done:
2777        mutex_unlock(&md->suspend_lock);
2778}
2779EXPORT_SYMBOL_GPL(dm_internal_resume_fast);
2780
2781/*-----------------------------------------------------------------
2782 * Event notification.
2783 *---------------------------------------------------------------*/
2784int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2785                       unsigned cookie)
2786{
2787        int r;
2788        unsigned noio_flag;
2789        char udev_cookie[DM_COOKIE_LENGTH];
2790        char *envp[] = { udev_cookie, NULL };
2791
2792        noio_flag = memalloc_noio_save();
2793
2794        if (!cookie)
2795                r = kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2796        else {
2797                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2798                         DM_COOKIE_ENV_VAR_NAME, cookie);
2799                r = kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2800                                       action, envp);
2801        }
2802
2803        memalloc_noio_restore(noio_flag);
2804
2805        return r;
2806}
2807
2808uint32_t dm_next_uevent_seq(struct mapped_device *md)
2809{
2810        return atomic_add_return(1, &md->uevent_seq);
2811}
2812
2813uint32_t dm_get_event_nr(struct mapped_device *md)
2814{
2815        return atomic_read(&md->event_nr);
2816}
2817
2818int dm_wait_event(struct mapped_device *md, int event_nr)
2819{
2820        return wait_event_interruptible(md->eventq,
2821                        (event_nr != atomic_read(&md->event_nr)));
2822}
2823
2824void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2825{
2826        unsigned long flags;
2827
2828        spin_lock_irqsave(&md->uevent_lock, flags);
2829        list_add(elist, &md->uevent_list);
2830        spin_unlock_irqrestore(&md->uevent_lock, flags);
2831}
2832
2833/*
2834 * The gendisk is only valid as long as you have a reference
2835 * count on 'md'.
2836 */
2837struct gendisk *dm_disk(struct mapped_device *md)
2838{
2839        return md->disk;
2840}
2841EXPORT_SYMBOL_GPL(dm_disk);
2842
2843struct kobject *dm_kobject(struct mapped_device *md)
2844{
2845        return &md->kobj_holder.kobj;
2846}
2847
2848struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2849{
2850        struct mapped_device *md;
2851
2852        md = container_of(kobj, struct mapped_device, kobj_holder.kobj);
2853
2854        spin_lock(&_minor_lock);
2855        if (test_bit(DMF_FREEING, &md->flags) || dm_deleting_md(md)) {
2856                md = NULL;
2857                goto out;
2858        }
2859        dm_get(md);
2860out:
2861        spin_unlock(&_minor_lock);
2862
2863        return md;
2864}
2865
2866int dm_suspended_md(struct mapped_device *md)
2867{
2868        return test_bit(DMF_SUSPENDED, &md->flags);
2869}
2870
2871static int dm_post_suspending_md(struct mapped_device *md)
2872{
2873        return test_bit(DMF_POST_SUSPENDING, &md->flags);
2874}
2875
2876int dm_suspended_internally_md(struct mapped_device *md)
2877{
2878        return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags);
2879}
2880
2881int dm_test_deferred_remove_flag(struct mapped_device *md)
2882{
2883        return test_bit(DMF_DEFERRED_REMOVE, &md->flags);
2884}
2885
2886int dm_suspended(struct dm_target *ti)
2887{
2888        return dm_suspended_md(ti->table->md);
2889}
2890EXPORT_SYMBOL_GPL(dm_suspended);
2891
2892int dm_post_suspending(struct dm_target *ti)
2893{
2894        return dm_post_suspending_md(ti->table->md);
2895}
2896EXPORT_SYMBOL_GPL(dm_post_suspending);
2897
2898int dm_noflush_suspending(struct dm_target *ti)
2899{
2900        return __noflush_suspending(ti->table->md);
2901}
2902EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2903
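/*
 * Allocate the biosets a table of the given type needs.  For bio-based
 * tables the front_pad reserves room for the per-io data plus the embedded
 * dm_target_io/dm_io; request-based only needs dm_rq_clone_bio_info.
 */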
2904struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, enum dm_queue_mode type,
2905                                            unsigned integrity, unsigned per_io_data_size,
2906                                            unsigned min_pool_size)
2907{
2908        struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
2909        unsigned int pool_size = 0;
2910        unsigned int front_pad, io_front_pad;
2911        int ret;
2912
2913        if (!pools)
2914                return NULL;
2915
2916        switch (type) {
2917        case DM_TYPE_BIO_BASED:
2918        case DM_TYPE_DAX_BIO_BASED:
2919                pool_size = max(dm_get_reserved_bio_based_ios(), min_pool_size);
2920                front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + DM_TARGET_IO_BIO_OFFSET;
2921                io_front_pad = roundup(per_io_data_size,  __alignof__(struct dm_io)) + DM_IO_BIO_OFFSET;
2922                ret = bioset_init(&pools->io_bs, pool_size, io_front_pad, 0);
2923                if (ret)
2924                        goto out;
2925                if (integrity && bioset_integrity_create(&pools->io_bs, pool_size))
2926                        goto out;
2927                break;
2928        case DM_TYPE_REQUEST_BASED:
2929                pool_size = max(dm_get_reserved_rq_based_ios(), min_pool_size);
2930                front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
2931                /* per_io_data_size is used for blk-mq pdu at queue allocation */
2932                break;
2933        default:
2934                BUG();
2935        }
2936
2937        ret = bioset_init(&pools->bs, pool_size, front_pad, 0);
2938        if (ret)
2939                goto out;
2940
2941        if (integrity && bioset_integrity_create(&pools->bs, pool_size))
2942                goto out;
2943
2944        return pools;
2945
2946out:
2947        dm_free_md_mempools(pools);
2948
2949        return NULL;
2950}
2951
2952void dm_free_md_mempools(struct dm_md_mempools *pools)
2953{
2954        if (!pools)
2955                return;
2956
2957        bioset_exit(&pools->bs);
2958        bioset_exit(&pools->io_bs);
2959
2960        kfree(pools);
2961}
2962
2963struct dm_pr {
2964        u64     old_key;
2965        u64     new_key;
2966        u32     flags;
2967        bool    fail_early;
2968};
2969
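/*
 * Run @fn against the devices of the live table's single target.  Used by
 * the persistent-reservation register/unregister paths below.
 */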
2970static int dm_call_pr(struct block_device *bdev, iterate_devices_callout_fn fn,
2971                      void *data)
2972{
2973        struct mapped_device *md = bdev->bd_disk->private_data;
2974        struct dm_table *table;
2975        struct dm_target *ti;
2976        int ret = -ENOTTY, srcu_idx;
2977
2978        table = dm_get_live_table(md, &srcu_idx);
2979        if (!table || !dm_table_get_size(table))
2980                goto out;
2981
2982        /* We only support devices that have a single target */
2983        if (dm_table_get_num_targets(table) != 1)
2984                goto out;
2985        ti = dm_table_get_target(table, 0);
2986
2987        ret = -EINVAL;
2988        if (!ti->type->iterate_devices)
2989                goto out;
2990
2991        ret = ti->type->iterate_devices(ti, fn, data);
2992out:
2993        dm_put_live_table(md, srcu_idx);
2994        return ret;
2995}
2996
2997/*
2998 * For register / unregister we need to manually call out to every path.
2999 */
3000static int __dm_pr_register(struct dm_target *ti, struct dm_dev *dev,
3001                            sector_t start, sector_t len, void *data)
3002{
3003        struct dm_pr *pr = data;
3004        const struct pr_ops *ops = dev->bdev->bd_disk->fops->pr_ops;
3005
3006        if (!ops || !ops->pr_register)
3007                return -EOPNOTSUPP;
3008        return ops->pr_register(dev->bdev, pr->old_key, pr->new_key, pr->flags);
3009}
3010
3011static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
3012                          u32 flags)
3013{
3014        struct dm_pr pr = {
3015                .old_key        = old_key,
3016                .new_key        = new_key,
3017                .flags          = flags,
3018                .fail_early     = true,
3019        };
3020        int ret;
3021
3022        ret = dm_call_pr(bdev, __dm_pr_register, &pr);
3023        if (ret && new_key) {
3024                /* unregister all paths if we failed to register any path */
3025                pr.old_key = new_key;
3026                pr.new_key = 0;
3027                pr.flags = 0;
3028                pr.fail_early = false;
3029                dm_call_pr(bdev, __dm_pr_register, &pr);
3030        }
3031
3032        return ret;
3033}
3034
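/*
 * The remaining PR operations are passed straight through to the single
 * underlying device resolved by dm_prepare_ioctl().
 */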
3035static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
3036                         u32 flags)
3037{
3038        struct mapped_device *md = bdev->bd_disk->private_data;
3039        const struct pr_ops *ops;
3040        int r, srcu_idx;
3041
3042        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3043        if (r < 0)
3044                goto out;
3045
3046        ops = bdev->bd_disk->fops->pr_ops;
3047        if (ops && ops->pr_reserve)
3048                r = ops->pr_reserve(bdev, key, type, flags);
3049        else
3050                r = -EOPNOTSUPP;
3051out:
3052        dm_unprepare_ioctl(md, srcu_idx);
3053        return r;
3054}
3055
3056static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
3057{
3058        struct mapped_device *md = bdev->bd_disk->private_data;
3059        const struct pr_ops *ops;
3060        int r, srcu_idx;
3061
3062        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3063        if (r < 0)
3064                goto out;
3065
3066        ops = bdev->bd_disk->fops->pr_ops;
3067        if (ops && ops->pr_release)
3068                r = ops->pr_release(bdev, key, type);
3069        else
3070                r = -EOPNOTSUPP;
3071out:
3072        dm_unprepare_ioctl(md, srcu_idx);
3073        return r;
3074}
3075
3076static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
3077                         enum pr_type type, bool abort)
3078{
3079        struct mapped_device *md = bdev->bd_disk->private_data;
3080        const struct pr_ops *ops;
3081        int r, srcu_idx;
3082
3083        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3084        if (r < 0)
3085                goto out;
3086
3087        ops = bdev->bd_disk->fops->pr_ops;
3088        if (ops && ops->pr_preempt)
3089                r = ops->pr_preempt(bdev, old_key, new_key, type, abort);
3090        else
3091                r = -EOPNOTSUPP;
3092out:
3093        dm_unprepare_ioctl(md, srcu_idx);
3094        return r;
3095}
3096
3097static int dm_pr_clear(struct block_device *bdev, u64 key)
3098{
3099        struct mapped_device *md = bdev->bd_disk->private_data;
3100        const struct pr_ops *ops;
3101        int r, srcu_idx;
3102
3103        r = dm_prepare_ioctl(md, &srcu_idx, &bdev);
3104        if (r < 0)
3105                goto out;
3106
3107        ops = bdev->bd_disk->fops->pr_ops;
3108        if (ops && ops->pr_clear)
3109                r = ops->pr_clear(bdev, key);
3110        else
3111                r = -EOPNOTSUPP;
3112out:
3113        dm_unprepare_ioctl(md, srcu_idx);
3114        return r;
3115}
3116
3117static const struct pr_ops dm_pr_ops = {
3118        .pr_register    = dm_pr_register,
3119        .pr_reserve     = dm_pr_reserve,
3120        .pr_release     = dm_pr_release,
3121        .pr_preempt     = dm_pr_preempt,
3122        .pr_clear       = dm_pr_clear,
3123};
3124
3125static const struct block_device_operations dm_blk_dops = {
3126        .submit_bio = dm_submit_bio,
3127        .poll_bio = dm_poll_bio,
3128        .open = dm_blk_open,
3129        .release = dm_blk_close,
3130        .ioctl = dm_blk_ioctl,
3131        .getgeo = dm_blk_getgeo,
3132        .report_zones = dm_blk_report_zones,
3133        .pr_ops = &dm_pr_ops,
3134        .owner = THIS_MODULE
3135};
3136
3137static const struct block_device_operations dm_rq_blk_dops = {
3138        .open = dm_blk_open,
3139        .release = dm_blk_close,
3140        .ioctl = dm_blk_ioctl,
3141        .getgeo = dm_blk_getgeo,
3142        .pr_ops = &dm_pr_ops,
3143        .owner = THIS_MODULE
3144};
3145
3146static const struct dax_operations dm_dax_ops = {
3147        .direct_access = dm_dax_direct_access,
3148        .zero_page_range = dm_dax_zero_page_range,
3149};
3150
3151/*
3152 * module hooks
3153 */
3154module_init(dm_init);
3155module_exit(dm_exit);
3156
3157module_param(major, uint, 0);
3158MODULE_PARM_DESC(major, "The major number of the device mapper");
3159
3160module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR);
3161MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools");
3162
3163module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
3164MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
3165
3166module_param(swap_bios, int, S_IRUGO | S_IWUSR);
3167MODULE_PARM_DESC(swap_bios, "Maximum allowed inflight swap IOs");
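/*
 * The parameters above can be set at load time, e.g. (a sketch, assuming
 * the usual dm_mod module name):
 *	modprobe dm_mod dm_numa_node=0 swap_bios=64
 * Those declared with S_IWUSR are also writable at runtime under
 * /sys/module/dm_mod/parameters/.
 */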
3168
3169MODULE_DESCRIPTION(DM_NAME " driver");
3170MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
3171MODULE_LICENSE("GPL");
3172