linux/drivers/md/dm.c
   1/*
   2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
   3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm.h"
   9#include "dm-uevent.h"
  10
  11#include <linux/init.h>
  12#include <linux/module.h>
  13#include <linux/mutex.h>
  14#include <linux/moduleparam.h>
  15#include <linux/blkpg.h>
  16#include <linux/bio.h>
  17#include <linux/buffer_head.h>
  18#include <linux/mempool.h>
  19#include <linux/slab.h>
  20#include <linux/idr.h>
  21#include <linux/hdreg.h>
  22
  23#include <trace/events/block.h>
  24
  25#define DM_MSG_PREFIX "core"
  26
  27/*
  28 * Cookies are numeric values sent with CHANGE and REMOVE
  29 * uevents while resuming, removing or renaming the device.
  30 */
  31#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
  32#define DM_COOKIE_LENGTH 24
  33
  34static const char *_name = DM_NAME;
  35
  36static unsigned int major = 0;
  37static unsigned int _major = 0;
  38
  39static DEFINE_SPINLOCK(_minor_lock);
  40/*
  41 * For bio-based dm.
  42 * One of these is allocated per bio.
  43 */
  44struct dm_io {
  45        struct mapped_device *md;
  46        int error;
  47        atomic_t io_count;
  48        struct bio *bio;
  49        unsigned long start_time;
  50        spinlock_t endio_lock;
  51};
  52
  53/*
  54 * For bio-based dm.
  55 * One of these is allocated per target within a bio.  Hopefully
  56 * this will be simplified out one day.
  57 */
  58struct dm_target_io {
  59        struct dm_io *io;
  60        struct dm_target *ti;
  61        union map_info info;
  62};
  63
  64/*
  65 * For request-based dm.
  66 * One of these is allocated per request.
  67 */
  68struct dm_rq_target_io {
  69        struct mapped_device *md;
  70        struct dm_target *ti;
  71        struct request *orig, clone;
  72        int error;
  73        union map_info info;
  74};
  75
  76/*
  77 * For request-based dm.
  78 * One of these is allocated per bio.
  79 */
  80struct dm_rq_clone_bio_info {
  81        struct bio *orig;
  82        struct dm_rq_target_io *tio;
  83};
  84
  85union map_info *dm_get_mapinfo(struct bio *bio)
  86{
  87        if (bio && bio->bi_private)
  88                return &((struct dm_target_io *)bio->bi_private)->info;
  89        return NULL;
  90}
  91
  92union map_info *dm_get_rq_mapinfo(struct request *rq)
  93{
  94        if (rq && rq->end_io_data)
  95                return &((struct dm_rq_target_io *)rq->end_io_data)->info;
  96        return NULL;
  97}
  98EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
  99
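/*
 * Illustrative sketch only, hypothetical and not part of dm itself:
 * dm_get_mapinfo() and dm_get_rq_mapinfo() let target code recover the
 * per-clone map_info that device-mapper keeps next to each cloned
 * bio/request.  struct example_per_io and both functions below are
 * invented; they only show the usual store-in-info->ptr pattern for a
 * request-based target (the actual remapping of the clone is omitted).
 */
#if 0	/* illustration only */
struct example_per_io {
        unsigned retries;                       /* private per-request state */
};

static int example_map_rq(struct dm_target *ti, struct request *clone,
                          union map_info *map_context)
{
        struct example_per_io *pio = kzalloc(sizeof(*pio), GFP_ATOMIC);

        if (!pio)
                return DM_MAPIO_REQUEUE;        /* retry when memory allows */

        map_context->ptr = pio;                 /* found again at completion */

        /* a real target would also remap the clone to an underlying queue */
        return DM_MAPIO_REMAPPED;
}

static void example_note_retry(struct request *clone)
{
        union map_info *info = dm_get_rq_mapinfo(clone);

        if (info && info->ptr)
                ((struct example_per_io *)info->ptr)->retries++;
}
#endif
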
 100#define MINOR_ALLOCED ((void *)-1)
 101
 102/*
 103 * Bits for the md->flags field.
 104 */
 105#define DMF_BLOCK_IO_FOR_SUSPEND 0
 106#define DMF_SUSPENDED 1
 107#define DMF_FROZEN 2
 108#define DMF_FREEING 3
 109#define DMF_DELETING 4
 110#define DMF_NOFLUSH_SUSPENDING 5
 111#define DMF_QUEUE_IO_TO_THREAD 6
 112
 113/*
 114 * Work processed by per-device workqueue.
 115 */
 116struct mapped_device {
 117        struct rw_semaphore io_lock;
 118        struct mutex suspend_lock;
 119        rwlock_t map_lock;
 120        atomic_t holders;
 121        atomic_t open_count;
 122
 123        unsigned long flags;
 124
 125        struct request_queue *queue;
 126        struct gendisk *disk;
 127        char name[16];
 128
 129        void *interface_ptr;
 130
 131        /*
 132         * A list of ios that arrived while we were suspended.
 133         */
 134        atomic_t pending[2];
 135        wait_queue_head_t wait;
 136        struct work_struct work;
 137        struct bio_list deferred;
 138        spinlock_t deferred_lock;
 139
 140        /*
 141         * An error from the barrier request currently being processed.
 142         */
 143        int barrier_error;
 144
 145        /*
 146         * Processing queue (flush/barriers)
 147         */
 148        struct workqueue_struct *wq;
 149
 150        /*
 151         * The current mapping.
 152         */
 153        struct dm_table *map;
 154
 155        /*
 156         * io objects are allocated from here.
 157         */
 158        mempool_t *io_pool;
 159        mempool_t *tio_pool;
 160
 161        struct bio_set *bs;
 162
 163        /*
 164         * Event handling.
 165         */
 166        atomic_t event_nr;
 167        wait_queue_head_t eventq;
 168        atomic_t uevent_seq;
 169        struct list_head uevent_list;
 170        spinlock_t uevent_lock; /* Protect access to uevent_list */
 171
 172        /*
 173         * freeze/thaw support require holding onto a super block
 174         */
 175        struct super_block *frozen_sb;
 176        struct block_device *bdev;
 177
 178        /* forced geometry settings */
 179        struct hd_geometry geometry;
 180
 181        /* marker of flush suspend for request-based dm */
 182        struct request suspend_rq;
 183
 184        /* For saving the address of __make_request for request based dm */
 185        make_request_fn *saved_make_request_fn;
 186
 187        /* sysfs handle */
 188        struct kobject kobj;
 189
 190        /* zero-length barrier that will be cloned and submitted to targets */
 191        struct bio barrier_bio;
 192};
 193
 194/*
 195 * For mempools pre-allocation at the table loading time.
 196 */
 197struct dm_md_mempools {
 198        mempool_t *io_pool;
 199        mempool_t *tio_pool;
 200        struct bio_set *bs;
 201};
 202
 203#define MIN_IOS 256
 204static struct kmem_cache *_io_cache;
 205static struct kmem_cache *_tio_cache;
 206static struct kmem_cache *_rq_tio_cache;
 207static struct kmem_cache *_rq_bio_info_cache;
 208
 209static int __init local_init(void)
 210{
 211        int r = -ENOMEM;
 212
 213        /* allocate a slab for the dm_ios */
 214        _io_cache = KMEM_CACHE(dm_io, 0);
 215        if (!_io_cache)
 216                return r;
 217
 218        /* allocate a slab for the target ios */
 219        _tio_cache = KMEM_CACHE(dm_target_io, 0);
 220        if (!_tio_cache)
 221                goto out_free_io_cache;
 222
 223        _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
 224        if (!_rq_tio_cache)
 225                goto out_free_tio_cache;
 226
 227        _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
 228        if (!_rq_bio_info_cache)
 229                goto out_free_rq_tio_cache;
 230
 231        r = dm_uevent_init();
 232        if (r)
 233                goto out_free_rq_bio_info_cache;
 234
 235        _major = major;
 236        r = register_blkdev(_major, _name);
 237        if (r < 0)
 238                goto out_uevent_exit;
 239
 240        if (!_major)
 241                _major = r;
 242
 243        return 0;
 244
 245out_uevent_exit:
 246        dm_uevent_exit();
 247out_free_rq_bio_info_cache:
 248        kmem_cache_destroy(_rq_bio_info_cache);
 249out_free_rq_tio_cache:
 250        kmem_cache_destroy(_rq_tio_cache);
 251out_free_tio_cache:
 252        kmem_cache_destroy(_tio_cache);
 253out_free_io_cache:
 254        kmem_cache_destroy(_io_cache);
 255
 256        return r;
 257}
 258
 259static void local_exit(void)
 260{
 261        kmem_cache_destroy(_rq_bio_info_cache);
 262        kmem_cache_destroy(_rq_tio_cache);
 263        kmem_cache_destroy(_tio_cache);
 264        kmem_cache_destroy(_io_cache);
 265        unregister_blkdev(_major, _name);
 266        dm_uevent_exit();
 267
 268        _major = 0;
 269
 270        DMINFO("cleaned up");
 271}
 272
 273static int (*_inits[])(void) __initdata = {
 274        local_init,
 275        dm_target_init,
 276        dm_linear_init,
 277        dm_stripe_init,
 278        dm_kcopyd_init,
 279        dm_interface_init,
 280};
 281
 282static void (*_exits[])(void) = {
 283        local_exit,
 284        dm_target_exit,
 285        dm_linear_exit,
 286        dm_stripe_exit,
 287        dm_kcopyd_exit,
 288        dm_interface_exit,
 289};
 290
 291static int __init dm_init(void)
 292{
 293        const int count = ARRAY_SIZE(_inits);
 294
 295        int r, i;
 296
 297        for (i = 0; i < count; i++) {
 298                r = _inits[i]();
 299                if (r)
 300                        goto bad;
 301        }
 302
 303        return 0;
 304
  305bad:
 306        while (i--)
 307                _exits[i]();
 308
 309        return r;
 310}
 311
 312static void __exit dm_exit(void)
 313{
 314        int i = ARRAY_SIZE(_exits);
 315
 316        while (i--)
 317                _exits[i]();
 318}
 319
 320/*
 321 * Block device functions
 322 */
 323static int dm_blk_open(struct block_device *bdev, fmode_t mode)
 324{
 325        struct mapped_device *md;
 326
 327        spin_lock(&_minor_lock);
 328
 329        md = bdev->bd_disk->private_data;
 330        if (!md)
 331                goto out;
 332
 333        if (test_bit(DMF_FREEING, &md->flags) ||
 334            test_bit(DMF_DELETING, &md->flags)) {
 335                md = NULL;
 336                goto out;
 337        }
 338
 339        dm_get(md);
 340        atomic_inc(&md->open_count);
 341
 342out:
 343        spin_unlock(&_minor_lock);
 344
 345        return md ? 0 : -ENXIO;
 346}
 347
 348static int dm_blk_close(struct gendisk *disk, fmode_t mode)
 349{
 350        struct mapped_device *md = disk->private_data;
 351        atomic_dec(&md->open_count);
 352        dm_put(md);
 353        return 0;
 354}
 355
 356int dm_open_count(struct mapped_device *md)
 357{
 358        return atomic_read(&md->open_count);
 359}
 360
 361/*
 362 * Guarantees nothing is using the device before it's deleted.
 363 */
 364int dm_lock_for_deletion(struct mapped_device *md)
 365{
 366        int r = 0;
 367
 368        spin_lock(&_minor_lock);
 369
 370        if (dm_open_count(md))
 371                r = -EBUSY;
 372        else
 373                set_bit(DMF_DELETING, &md->flags);
 374
 375        spin_unlock(&_minor_lock);
 376
 377        return r;
 378}
 379
 380static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
 381{
 382        struct mapped_device *md = bdev->bd_disk->private_data;
 383
 384        return dm_get_geometry(md, geo);
 385}
 386
 387static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
 388                        unsigned int cmd, unsigned long arg)
 389{
 390        struct mapped_device *md = bdev->bd_disk->private_data;
 391        struct dm_table *map = dm_get_table(md);
 392        struct dm_target *tgt;
 393        int r = -ENOTTY;
 394
 395        if (!map || !dm_table_get_size(map))
 396                goto out;
 397
 398        /* We only support devices that have a single target */
 399        if (dm_table_get_num_targets(map) != 1)
 400                goto out;
 401
 402        tgt = dm_table_get_target(map, 0);
 403
 404        if (dm_suspended(md)) {
 405                r = -EAGAIN;
 406                goto out;
 407        }
 408
 409        if (tgt->type->ioctl)
 410                r = tgt->type->ioctl(tgt, cmd, arg);
 411
 412out:
 413        dm_table_put(map);
 414
 415        return r;
 416}
 417
 418static struct dm_io *alloc_io(struct mapped_device *md)
 419{
 420        return mempool_alloc(md->io_pool, GFP_NOIO);
 421}
 422
 423static void free_io(struct mapped_device *md, struct dm_io *io)
 424{
 425        mempool_free(io, md->io_pool);
 426}
 427
 428static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 429{
 430        mempool_free(tio, md->tio_pool);
 431}
 432
 433static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
 434{
 435        return mempool_alloc(md->tio_pool, GFP_ATOMIC);
 436}
 437
 438static void free_rq_tio(struct dm_rq_target_io *tio)
 439{
 440        mempool_free(tio, tio->md->tio_pool);
 441}
 442
 443static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
 444{
 445        return mempool_alloc(md->io_pool, GFP_ATOMIC);
 446}
 447
 448static void free_bio_info(struct dm_rq_clone_bio_info *info)
 449{
 450        mempool_free(info, info->tio->md->io_pool);
 451}
 452
 453static void start_io_acct(struct dm_io *io)
 454{
 455        struct mapped_device *md = io->md;
 456        int cpu;
 457        int rw = bio_data_dir(io->bio);
 458
 459        io->start_time = jiffies;
 460
 461        cpu = part_stat_lock();
 462        part_round_stats(cpu, &dm_disk(md)->part0);
 463        part_stat_unlock();
 464        dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
 465}
 466
 467static void end_io_acct(struct dm_io *io)
 468{
 469        struct mapped_device *md = io->md;
 470        struct bio *bio = io->bio;
 471        unsigned long duration = jiffies - io->start_time;
 472        int pending, cpu;
 473        int rw = bio_data_dir(bio);
 474
 475        cpu = part_stat_lock();
 476        part_round_stats(cpu, &dm_disk(md)->part0);
 477        part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
 478        part_stat_unlock();
 479
 480        /*
 481         * After this is decremented the bio must not be touched if it is
 482         * a barrier.
 483         */
 484        dm_disk(md)->part0.in_flight[rw] = pending =
 485                atomic_dec_return(&md->pending[rw]);
 486        pending += atomic_read(&md->pending[rw^0x1]);
 487
 488        /* nudge anyone waiting on suspend queue */
 489        if (!pending)
 490                wake_up(&md->wait);
 491}
 492
 493/*
 494 * Add the bio to the list of deferred io.
 495 */
 496static void queue_io(struct mapped_device *md, struct bio *bio)
 497{
 498        down_write(&md->io_lock);
 499
 500        spin_lock_irq(&md->deferred_lock);
 501        bio_list_add(&md->deferred, bio);
 502        spin_unlock_irq(&md->deferred_lock);
 503
 504        if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
 505                queue_work(md->wq, &md->work);
 506
 507        up_write(&md->io_lock);
 508}
 509
 510/*
  511 * Everyone (including functions in this file) should use this
 512 * function to access the md->map field, and make sure they call
 513 * dm_table_put() when finished.
 514 */
 515struct dm_table *dm_get_table(struct mapped_device *md)
 516{
 517        struct dm_table *t;
 518        unsigned long flags;
 519
 520        read_lock_irqsave(&md->map_lock, flags);
 521        t = md->map;
 522        if (t)
 523                dm_table_get(t);
 524        read_unlock_irqrestore(&md->map_lock, flags);
 525
 526        return t;
 527}
 528
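/*
 * Illustrative sketch only, not part of dm itself: the comment above asks
 * every caller to pair dm_get_table() with dm_table_put().  A minimal,
 * hypothetical helper showing that pattern (table_says_busy() is invented;
 * dm_table_any_busy_target() is reused from this file's context):
 */
#if 0	/* illustration only */
static int table_says_busy(struct mapped_device *md)
{
        struct dm_table *map = dm_get_table(md);        /* takes a reference */
        int busy = 0;

        if (map) {
                busy = dm_table_any_busy_target(map);
                dm_table_put(map);                      /* always drop it */
        }

        return busy;
}
#endif
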
 529/*
 530 * Get the geometry associated with a dm device
 531 */
 532int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
 533{
 534        *geo = md->geometry;
 535
 536        return 0;
 537}
 538
 539/*
 540 * Set the geometry of a device.
 541 */
 542int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
 543{
 544        sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
 545
 546        if (geo->start > sz) {
 547                DMWARN("Start sector is beyond the geometry limits.");
 548                return -EINVAL;
 549        }
 550
 551        md->geometry = *geo;
 552
 553        return 0;
 554}
 555
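/*
 * Worked example (hypothetical numbers, not from the original source): for
 * a geometry of 1024 cylinders, 255 heads and 63 sectors/track,
 * sz = 1024 * 255 * 63 = 16,450,560 sectors (~7.8 GiB), so any geo->start
 * beyond 16,450,560 is rejected above with -EINVAL.
 */
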
 556/*-----------------------------------------------------------------
 557 * CRUD START:
 558 *   A more elegant soln is in the works that uses the queue
  559 *   merge fn; unfortunately there are a couple of changes to
 560 *   the block layer that I want to make for this.  So in the
 561 *   interests of getting something for people to use I give
 562 *   you this clearly demarcated crap.
 563 *---------------------------------------------------------------*/
 564
 565static int __noflush_suspending(struct mapped_device *md)
 566{
 567        return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
 568}
 569
 570/*
 571 * Decrements the number of outstanding ios that a bio has been
  572 * cloned into, completing the original io if necessary.
 573 */
 574static void dec_pending(struct dm_io *io, int error)
 575{
 576        unsigned long flags;
 577        int io_error;
 578        struct bio *bio;
 579        struct mapped_device *md = io->md;
 580
 581        /* Push-back supersedes any I/O errors */
 582        if (unlikely(error)) {
 583                spin_lock_irqsave(&io->endio_lock, flags);
 584                if (!(io->error > 0 && __noflush_suspending(md)))
 585                        io->error = error;
 586                spin_unlock_irqrestore(&io->endio_lock, flags);
 587        }
 588
 589        if (atomic_dec_and_test(&io->io_count)) {
 590                if (io->error == DM_ENDIO_REQUEUE) {
 591                        /*
 592                         * Target requested pushing back the I/O.
 593                         */
 594                        spin_lock_irqsave(&md->deferred_lock, flags);
 595                        if (__noflush_suspending(md)) {
 596                                if (!bio_rw_flagged(io->bio, BIO_RW_BARRIER))
 597                                        bio_list_add_head(&md->deferred,
 598                                                          io->bio);
 599                        } else
 600                                /* noflush suspend was interrupted. */
 601                                io->error = -EIO;
 602                        spin_unlock_irqrestore(&md->deferred_lock, flags);
 603                }
 604
 605                io_error = io->error;
 606                bio = io->bio;
 607
 608                if (bio_rw_flagged(bio, BIO_RW_BARRIER)) {
 609                        /*
 610                         * There can be just one barrier request so we use
 611                         * a per-device variable for error reporting.
 612                         * Note that you can't touch the bio after end_io_acct
 613                         */
 614                        if (!md->barrier_error && io_error != -EOPNOTSUPP)
 615                                md->barrier_error = io_error;
 616                        end_io_acct(io);
 617                } else {
 618                        end_io_acct(io);
 619
 620                        if (io_error != DM_ENDIO_REQUEUE) {
 621                                trace_block_bio_complete(md->queue, bio);
 622
 623                                bio_endio(bio, io_error);
 624                        }
 625                }
 626
 627                free_io(md, io);
 628        }
 629}
 630
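/*
 * Illustration (hypothetical numbers, not from the original source):
 * io_count traced for a bio split into three clones.
 * __split_and_process_bio() starts the count at 1 (its own reference),
 * each __map_bio() call adds one per clone (1 -> 4), each clone_endio()
 * and the submitter's final dec_pending() each drop one, and whichever
 * call takes the count to zero reports the result with bio_endio()
 * (or records it in md->barrier_error for a barrier).
 */
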
 631static void clone_endio(struct bio *bio, int error)
 632{
 633        int r = 0;
 634        struct dm_target_io *tio = bio->bi_private;
 635        struct dm_io *io = tio->io;
 636        struct mapped_device *md = tio->io->md;
 637        dm_endio_fn endio = tio->ti->type->end_io;
 638
 639        if (!bio_flagged(bio, BIO_UPTODATE) && !error)
 640                error = -EIO;
 641
 642        if (endio) {
 643                r = endio(tio->ti, bio, error, &tio->info);
 644                if (r < 0 || r == DM_ENDIO_REQUEUE)
 645                        /*
 646                         * error and requeue request are handled
 647                         * in dec_pending().
 648                         */
 649                        error = r;
 650                else if (r == DM_ENDIO_INCOMPLETE)
 651                        /* The target will handle the io */
 652                        return;
 653                else if (r) {
 654                        DMWARN("unimplemented target endio return value: %d", r);
 655                        BUG();
 656                }
 657        }
 658
 659        /*
 660         * Store md for cleanup instead of tio which is about to get freed.
 661         */
 662        bio->bi_private = md->bs;
 663
 664        free_tio(md, tio);
 665        bio_put(bio);
 666        dec_pending(io, error);
 667}
 668
 669/*
 670 * Partial completion handling for request-based dm
 671 */
 672static void end_clone_bio(struct bio *clone, int error)
 673{
 674        struct dm_rq_clone_bio_info *info = clone->bi_private;
 675        struct dm_rq_target_io *tio = info->tio;
 676        struct bio *bio = info->orig;
 677        unsigned int nr_bytes = info->orig->bi_size;
 678
 679        bio_put(clone);
 680
 681        if (tio->error)
 682                /*
 683                 * An error has already been detected on the request.
  684                 * Once an error has occurred, just let clone->end_io() handle
 685                 * the remainder.
 686                 */
 687                return;
 688        else if (error) {
 689                /*
  690                 * Don't notify the upper layer of the error yet.
 691                 * The error handling decision is made by the target driver,
 692                 * when the request is completed.
 693                 */
 694                tio->error = error;
 695                return;
 696        }
 697
 698        /*
 699         * I/O for the bio successfully completed.
  700         * Notify the upper layer of the data completion.
 701         */
 702
 703        /*
 704         * bios are processed from the head of the list.
 705         * So the completing bio should always be rq->bio.
 706         * If it's not, something wrong is happening.
 707         */
 708        if (tio->orig->bio != bio)
 709                DMERR("bio completion is going in the middle of the request");
 710
 711        /*
 712         * Update the original request.
 713         * Do not use blk_end_request() here, because it may complete
 714         * the original request before the clone, and break the ordering.
 715         */
 716        blk_update_request(tio->orig, 0, nr_bytes);
 717}
 718
 719/*
 720 * Don't touch any member of the md after calling this function because
 721 * the md may be freed in dm_put() at the end of this function.
 722 * Or do dm_get() before calling this function and dm_put() later.
 723 */
 724static void rq_completed(struct mapped_device *md, int run_queue)
 725{
 726        int wakeup_waiters = 0;
 727        struct request_queue *q = md->queue;
 728        unsigned long flags;
 729
 730        spin_lock_irqsave(q->queue_lock, flags);
 731        if (!queue_in_flight(q))
 732                wakeup_waiters = 1;
 733        spin_unlock_irqrestore(q->queue_lock, flags);
 734
 735        /* nudge anyone waiting on suspend queue */
 736        if (wakeup_waiters)
 737                wake_up(&md->wait);
 738
 739        if (run_queue)
 740                blk_run_queue(q);
 741
 742        /*
 743         * dm_put() must be at the end of this function. See the comment above
 744         */
 745        dm_put(md);
 746}
 747
 748static void free_rq_clone(struct request *clone)
 749{
 750        struct dm_rq_target_io *tio = clone->end_io_data;
 751
 752        blk_rq_unprep_clone(clone);
 753        free_rq_tio(tio);
 754}
 755
 756static void dm_unprep_request(struct request *rq)
 757{
 758        struct request *clone = rq->special;
 759
 760        rq->special = NULL;
 761        rq->cmd_flags &= ~REQ_DONTPREP;
 762
 763        free_rq_clone(clone);
 764}
 765
 766/*
 767 * Requeue the original request of a clone.
 768 */
 769void dm_requeue_unmapped_request(struct request *clone)
 770{
 771        struct dm_rq_target_io *tio = clone->end_io_data;
 772        struct mapped_device *md = tio->md;
 773        struct request *rq = tio->orig;
 774        struct request_queue *q = rq->q;
 775        unsigned long flags;
 776
 777        dm_unprep_request(rq);
 778
 779        spin_lock_irqsave(q->queue_lock, flags);
 780        if (elv_queue_empty(q))
 781                blk_plug_device(q);
 782        blk_requeue_request(q, rq);
 783        spin_unlock_irqrestore(q->queue_lock, flags);
 784
 785        rq_completed(md, 0);
 786}
 787EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
 788
 789static void __stop_queue(struct request_queue *q)
 790{
 791        blk_stop_queue(q);
 792}
 793
 794static void stop_queue(struct request_queue *q)
 795{
 796        unsigned long flags;
 797
 798        spin_lock_irqsave(q->queue_lock, flags);
 799        __stop_queue(q);
 800        spin_unlock_irqrestore(q->queue_lock, flags);
 801}
 802
 803static void __start_queue(struct request_queue *q)
 804{
 805        if (blk_queue_stopped(q))
 806                blk_start_queue(q);
 807}
 808
 809static void start_queue(struct request_queue *q)
 810{
 811        unsigned long flags;
 812
 813        spin_lock_irqsave(q->queue_lock, flags);
 814        __start_queue(q);
 815        spin_unlock_irqrestore(q->queue_lock, flags);
 816}
 817
 818/*
 819 * Complete the clone and the original request.
 820 * Must be called without queue lock.
 821 */
 822static void dm_end_request(struct request *clone, int error)
 823{
 824        struct dm_rq_target_io *tio = clone->end_io_data;
 825        struct mapped_device *md = tio->md;
 826        struct request *rq = tio->orig;
 827
 828        if (blk_pc_request(rq)) {
 829                rq->errors = clone->errors;
 830                rq->resid_len = clone->resid_len;
 831
 832                if (rq->sense)
 833                        /*
 834                         * We are using the sense buffer of the original
 835                         * request.
 836                         * So setting the length of the sense data is enough.
 837                         */
 838                        rq->sense_len = clone->sense_len;
 839        }
 840
 841        free_rq_clone(clone);
 842
 843        blk_end_request_all(rq, error);
 844
 845        rq_completed(md, 1);
 846}
 847
 848/*
 849 * Request completion handler for request-based dm
 850 */
 851static void dm_softirq_done(struct request *rq)
 852{
 853        struct request *clone = rq->completion_data;
 854        struct dm_rq_target_io *tio = clone->end_io_data;
 855        dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
 856        int error = tio->error;
 857
 858        if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
 859                error = rq_end_io(tio->ti, clone, error, &tio->info);
 860
 861        if (error <= 0)
 862                /* The target wants to complete the I/O */
 863                dm_end_request(clone, error);
 864        else if (error == DM_ENDIO_INCOMPLETE)
 865                /* The target will handle the I/O */
 866                return;
 867        else if (error == DM_ENDIO_REQUEUE)
 868                /* The target wants to requeue the I/O */
 869                dm_requeue_unmapped_request(clone);
 870        else {
 871                DMWARN("unimplemented target endio return value: %d", error);
 872                BUG();
 873        }
 874}
 875
 876/*
 877 * Complete the clone and the original request with the error status
 878 * through softirq context.
 879 */
 880static void dm_complete_request(struct request *clone, int error)
 881{
 882        struct dm_rq_target_io *tio = clone->end_io_data;
 883        struct request *rq = tio->orig;
 884
 885        tio->error = error;
 886        rq->completion_data = clone;
 887        blk_complete_request(rq);
 888}
 889
 890/*
 891 * Complete the not-mapped clone and the original request with the error status
 892 * through softirq context.
 893 * Target's rq_end_io() function isn't called.
 894 * This may be used when the target's map_rq() function fails.
 895 */
 896void dm_kill_unmapped_request(struct request *clone, int error)
 897{
 898        struct dm_rq_target_io *tio = clone->end_io_data;
 899        struct request *rq = tio->orig;
 900
 901        rq->cmd_flags |= REQ_FAILED;
 902        dm_complete_request(clone, error);
 903}
 904EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
 905
 906/*
 907 * Called with the queue lock held
 908 */
 909static void end_clone_request(struct request *clone, int error)
 910{
 911        /*
  912         * This just cleans up the bookkeeping of the queue in which
  913         * the clone was dispatched.
  914         * The clone is *NOT* actually freed here, because it was allocated
  915         * from dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
 916         */
 917        __blk_put_request(clone->q, clone);
 918
 919        /*
 920         * Actual request completion is done in a softirq context which doesn't
 921         * hold the queue lock.  Otherwise, deadlock could occur because:
 922         *     - another request may be submitted by the upper level driver
 923         *       of the stacking during the completion
 924         *     - the submission which requires queue lock may be done
 925         *       against this queue
 926         */
 927        dm_complete_request(clone, error);
 928}
 929
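/*
 * Summary sketch (editor's reading of the code above, hedged accordingly):
 * the request-based completion path runs end to end as follows.  The clone
 * finishes with end_clone_request() (queue lock held), which calls
 * dm_complete_request(); that records the error in the tio and calls
 * blk_complete_request() on the original request, deferring the real work
 * to softirq context; dm_softirq_done() then runs the target's rq_end_io()
 * (if any) and finally dm_end_request() or dm_requeue_unmapped_request()
 * completes or requeues the original request.
 */
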
 930static sector_t max_io_len(struct mapped_device *md,
 931                           sector_t sector, struct dm_target *ti)
 932{
 933        sector_t offset = sector - ti->begin;
 934        sector_t len = ti->len - offset;
 935
 936        /*
  937         * Does the target need to split even further?
 938         */
 939        if (ti->split_io) {
 940                sector_t boundary;
 941                boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
 942                           - offset;
 943                if (len > boundary)
 944                        len = boundary;
 945        }
 946
 947        return len;
 948}
 949
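/*
 * Worked example (hypothetical numbers, not from the original source): for
 * ti->begin = 100, ti->len = 1000, ti->split_io = 64 and sector = 230,
 * offset = 130 and len = 870; boundary = ((130 + 64) & ~63) - 130
 * = 192 - 130 = 62, so the I/O is clipped to 62 sectors to stay inside the
 * current 64-sector chunk.
 */
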
 950static void __map_bio(struct dm_target *ti, struct bio *clone,
 951                      struct dm_target_io *tio)
 952{
 953        int r;
 954        sector_t sector;
 955        struct mapped_device *md;
 956
 957        clone->bi_end_io = clone_endio;
 958        clone->bi_private = tio;
 959
 960        /*
 961         * Map the clone.  If r == 0 we don't need to do
 962         * anything, the target has assumed ownership of
 963         * this io.
 964         */
 965        atomic_inc(&tio->io->io_count);
 966        sector = clone->bi_sector;
 967        r = ti->type->map(ti, clone, &tio->info);
 968        if (r == DM_MAPIO_REMAPPED) {
 969                /* the bio has been remapped so dispatch it */
 970
 971                trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
 972                                    tio->io->bio->bi_bdev->bd_dev, sector);
 973
 974                generic_make_request(clone);
 975        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
 976                /* error the io and bail out, or requeue it if needed */
 977                md = tio->io->md;
 978                dec_pending(tio->io, r);
 979                /*
 980                 * Store bio_set for cleanup.
 981                 */
 982                clone->bi_private = md->bs;
 983                bio_put(clone);
 984                free_tio(md, tio);
 985        } else if (r) {
 986                DMWARN("unimplemented target map return value: %d", r);
 987                BUG();
 988        }
 989}
 990
 991struct clone_info {
 992        struct mapped_device *md;
 993        struct dm_table *map;
 994        struct bio *bio;
 995        struct dm_io *io;
 996        sector_t sector;
 997        sector_t sector_count;
 998        unsigned short idx;
 999};
1000
1001static void dm_bio_destructor(struct bio *bio)
1002{
1003        struct bio_set *bs = bio->bi_private;
1004
1005        bio_free(bio, bs);
1006}
1007
1008/*
 1009 * Creates a little bio that just does part of a bvec.
1010 */
1011static struct bio *split_bvec(struct bio *bio, sector_t sector,
1012                              unsigned short idx, unsigned int offset,
1013                              unsigned int len, struct bio_set *bs)
1014{
1015        struct bio *clone;
1016        struct bio_vec *bv = bio->bi_io_vec + idx;
1017
1018        clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1019        clone->bi_destructor = dm_bio_destructor;
1020        *clone->bi_io_vec = *bv;
1021
1022        clone->bi_sector = sector;
1023        clone->bi_bdev = bio->bi_bdev;
1024        clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
1025        clone->bi_vcnt = 1;
1026        clone->bi_size = to_bytes(len);
1027        clone->bi_io_vec->bv_offset = offset;
1028        clone->bi_io_vec->bv_len = clone->bi_size;
1029        clone->bi_flags |= 1 << BIO_CLONED;
1030
1031        if (bio_integrity(bio)) {
1032                bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1033                bio_integrity_trim(clone,
1034                                   bio_sector_offset(bio, idx, offset), len);
1035        }
1036
1037        return clone;
1038}
1039
1040/*
 1041 * Creates a bio that consists of a range of complete bvecs.
1042 */
1043static struct bio *clone_bio(struct bio *bio, sector_t sector,
1044                             unsigned short idx, unsigned short bv_count,
1045                             unsigned int len, struct bio_set *bs)
1046{
1047        struct bio *clone;
1048
1049        clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1050        __bio_clone(clone, bio);
1051        clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
1052        clone->bi_destructor = dm_bio_destructor;
1053        clone->bi_sector = sector;
1054        clone->bi_idx = idx;
1055        clone->bi_vcnt = idx + bv_count;
1056        clone->bi_size = to_bytes(len);
1057        clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1058
1059        if (bio_integrity(bio)) {
1060                bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1061
1062                if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1063                        bio_integrity_trim(clone,
1064                                           bio_sector_offset(bio, idx, 0), len);
1065        }
1066
1067        return clone;
1068}
1069
1070static struct dm_target_io *alloc_tio(struct clone_info *ci,
1071                                      struct dm_target *ti)
1072{
1073        struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1074
1075        tio->io = ci->io;
1076        tio->ti = ti;
1077        memset(&tio->info, 0, sizeof(tio->info));
1078
1079        return tio;
1080}
1081
1082static void __flush_target(struct clone_info *ci, struct dm_target *ti,
1083                          unsigned flush_nr)
1084{
1085        struct dm_target_io *tio = alloc_tio(ci, ti);
1086        struct bio *clone;
1087
1088        tio->info.flush_request = flush_nr;
1089
1090        clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
1091        __bio_clone(clone, ci->bio);
1092        clone->bi_destructor = dm_bio_destructor;
1093
1094        __map_bio(ti, clone, tio);
1095}
1096
1097static int __clone_and_map_empty_barrier(struct clone_info *ci)
1098{
1099        unsigned target_nr = 0, flush_nr;
1100        struct dm_target *ti;
1101
1102        while ((ti = dm_table_get_target(ci->map, target_nr++)))
1103                for (flush_nr = 0; flush_nr < ti->num_flush_requests;
1104                     flush_nr++)
1105                        __flush_target(ci, ti, flush_nr);
1106
1107        ci->sector_count = 0;
1108
1109        return 0;
1110}
1111
1112static int __clone_and_map(struct clone_info *ci)
1113{
1114        struct bio *clone, *bio = ci->bio;
1115        struct dm_target *ti;
1116        sector_t len = 0, max;
1117        struct dm_target_io *tio;
1118
1119        if (unlikely(bio_empty_barrier(bio)))
1120                return __clone_and_map_empty_barrier(ci);
1121
1122        ti = dm_table_find_target(ci->map, ci->sector);
1123        if (!dm_target_is_valid(ti))
1124                return -EIO;
1125
1126        max = max_io_len(ci->md, ci->sector, ti);
1127
1128        /*
1129         * Allocate a target io object.
1130         */
1131        tio = alloc_tio(ci, ti);
1132
1133        if (ci->sector_count <= max) {
1134                /*
1135                 * Optimise for the simple case where we can do all of
1136                 * the remaining io with a single clone.
1137                 */
1138                clone = clone_bio(bio, ci->sector, ci->idx,
1139                                  bio->bi_vcnt - ci->idx, ci->sector_count,
1140                                  ci->md->bs);
1141                __map_bio(ti, clone, tio);
1142                ci->sector_count = 0;
1143
1144        } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1145                /*
1146                 * There are some bvecs that don't span targets.
1147                 * Do as many of these as possible.
1148                 */
1149                int i;
1150                sector_t remaining = max;
1151                sector_t bv_len;
1152
1153                for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1154                        bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1155
1156                        if (bv_len > remaining)
1157                                break;
1158
1159                        remaining -= bv_len;
1160                        len += bv_len;
1161                }
1162
1163                clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1164                                  ci->md->bs);
1165                __map_bio(ti, clone, tio);
1166
1167                ci->sector += len;
1168                ci->sector_count -= len;
1169                ci->idx = i;
1170
1171        } else {
1172                /*
1173                 * Handle a bvec that must be split between two or more targets.
1174                 */
1175                struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1176                sector_t remaining = to_sector(bv->bv_len);
1177                unsigned int offset = 0;
1178
1179                do {
1180                        if (offset) {
1181                                ti = dm_table_find_target(ci->map, ci->sector);
1182                                if (!dm_target_is_valid(ti))
1183                                        return -EIO;
1184
1185                                max = max_io_len(ci->md, ci->sector, ti);
1186
1187                                tio = alloc_tio(ci, ti);
1188                        }
1189
1190                        len = min(remaining, max);
1191
1192                        clone = split_bvec(bio, ci->sector, ci->idx,
1193                                           bv->bv_offset + offset, len,
1194                                           ci->md->bs);
1195
1196                        __map_bio(ti, clone, tio);
1197
1198                        ci->sector += len;
1199                        ci->sector_count -= len;
1200                        offset += to_bytes(len);
1201                } while (remaining -= len);
1202
1203                ci->idx++;
1204        }
1205
1206        return 0;
1207}
1208
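/*
 * Walkthrough (hypothetical numbers, not from the original source) of the
 * three cases above.  Suppose the remaining bio has four 4-sector bvecs
 * (sector_count = 16) and max_io_len() allows 6 sectors on the current
 * target.  Case 1 (16 <= 6) fails.  Case 2 applies because the first bvec
 * (4 sectors) fits; the loop packs only that bvec (the next one would exceed
 * the remaining 2 sectors), so one 4-sector clone is issued and the loop in
 * __split_and_process_bio() comes back for the rest.  If instead the first
 * bvec were 8 sectors, case 3 would split that single bvec with split_bvec()
 * into a 6-sector piece for this target and a 2-sector piece for the next.
 */
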
1209/*
1210 * Split the bio into several clones and submit it to targets.
1211 */
1212static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1213{
1214        struct clone_info ci;
1215        int error = 0;
1216
1217        ci.map = dm_get_table(md);
1218        if (unlikely(!ci.map)) {
1219                if (!bio_rw_flagged(bio, BIO_RW_BARRIER))
1220                        bio_io_error(bio);
1221                else
1222                        if (!md->barrier_error)
1223                                md->barrier_error = -EIO;
1224                return;
1225        }
1226
1227        ci.md = md;
1228        ci.bio = bio;
1229        ci.io = alloc_io(md);
1230        ci.io->error = 0;
1231        atomic_set(&ci.io->io_count, 1);
1232        ci.io->bio = bio;
1233        ci.io->md = md;
1234        spin_lock_init(&ci.io->endio_lock);
1235        ci.sector = bio->bi_sector;
1236        ci.sector_count = bio_sectors(bio);
1237        if (unlikely(bio_empty_barrier(bio)))
1238                ci.sector_count = 1;
1239        ci.idx = bio->bi_idx;
1240
1241        start_io_acct(ci.io);
1242        while (ci.sector_count && !error)
1243                error = __clone_and_map(&ci);
1244
1245        /* drop the extra reference count */
1246        dec_pending(ci.io, error);
1247        dm_table_put(ci.map);
1248}
1249/*-----------------------------------------------------------------
1250 * CRUD END
1251 *---------------------------------------------------------------*/
1252
1253static int dm_merge_bvec(struct request_queue *q,
1254                         struct bvec_merge_data *bvm,
1255                         struct bio_vec *biovec)
1256{
1257        struct mapped_device *md = q->queuedata;
1258        struct dm_table *map = dm_get_table(md);
1259        struct dm_target *ti;
1260        sector_t max_sectors;
1261        int max_size = 0;
1262
1263        if (unlikely(!map))
1264                goto out;
1265
1266        ti = dm_table_find_target(map, bvm->bi_sector);
1267        if (!dm_target_is_valid(ti))
1268                goto out_table;
1269
1270        /*
1271         * Find maximum amount of I/O that won't need splitting
1272         */
1273        max_sectors = min(max_io_len(md, bvm->bi_sector, ti),
1274                          (sector_t) BIO_MAX_SECTORS);
1275        max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1276        if (max_size < 0)
1277                max_size = 0;
1278
1279        /*
 1280         * merge_bvec_fn() returns the number of bytes
 1281         * it can accept at this offset;
 1282         * max_size is the precomputed maximal io size
1283         */
1284        if (max_size && ti->type->merge)
1285                max_size = ti->type->merge(ti, bvm, biovec, max_size);
1286        /*
1287         * If the target doesn't support merge method and some of the devices
1288         * provided their merge_bvec method (we know this by looking at
1289         * queue_max_hw_sectors), then we can't allow bios with multiple vector
1290         * entries.  So always set max_size to 0, and the code below allows
1291         * just one page.
1292         */
1293        else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1295                max_size = 0;
1296
1297out_table:
1298        dm_table_put(map);
1299
1300out:
1301        /*
1302         * Always allow an entire first page
1303         */
1304        if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1305                max_size = biovec->bv_len;
1306
1307        return max_size;
1308}
1309
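/*
 * Worked example (hypothetical numbers, not from the original source): if
 * max_io_len() reports 16 sectors left before the target boundary and the
 * bio being built already holds 4096 bytes (bvm->bi_size), then
 * max_size = (16 << 9) - 4096 = 4096, so the target's merge method may
 * accept at most another 4096 bytes at this offset.
 */
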
1310/*
1311 * The request function that just remaps the bio built up by
1312 * dm_merge_bvec.
1313 */
1314static int _dm_request(struct request_queue *q, struct bio *bio)
1315{
1316        int rw = bio_data_dir(bio);
1317        struct mapped_device *md = q->queuedata;
1318        int cpu;
1319
1320        down_read(&md->io_lock);
1321
1322        cpu = part_stat_lock();
1323        part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1324        part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1325        part_stat_unlock();
1326
1327        /*
 1328         * If we're suspended or the thread is processing barriers,
1329         * we have to queue this io for later.
1330         */
1331        if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1332            unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1333                up_read(&md->io_lock);
1334
1335                if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
1336                    bio_rw(bio) == READA) {
1337                        bio_io_error(bio);
1338                        return 0;
1339                }
1340
1341                queue_io(md, bio);
1342
1343                return 0;
1344        }
1345
1346        __split_and_process_bio(md, bio);
1347        up_read(&md->io_lock);
1348        return 0;
1349}
1350
1351static int dm_make_request(struct request_queue *q, struct bio *bio)
1352{
1353        struct mapped_device *md = q->queuedata;
1354
1355        if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
1356                bio_endio(bio, -EOPNOTSUPP);
1357                return 0;
1358        }
1359
1360        return md->saved_make_request_fn(q, bio); /* call __make_request() */
1361}
1362
1363static int dm_request_based(struct mapped_device *md)
1364{
1365        return blk_queue_stackable(md->queue);
1366}
1367
1368static int dm_request(struct request_queue *q, struct bio *bio)
1369{
1370        struct mapped_device *md = q->queuedata;
1371
1372        if (dm_request_based(md))
1373                return dm_make_request(q, bio);
1374
1375        return _dm_request(q, bio);
1376}
1377
1378void dm_dispatch_request(struct request *rq)
1379{
1380        int r;
1381
1382        if (blk_queue_io_stat(rq->q))
1383                rq->cmd_flags |= REQ_IO_STAT;
1384
1385        rq->start_time = jiffies;
1386        r = blk_insert_cloned_request(rq->q, rq);
1387        if (r)
1388                dm_complete_request(rq, r);
1389}
1390EXPORT_SYMBOL_GPL(dm_dispatch_request);
1391
1392static void dm_rq_bio_destructor(struct bio *bio)
1393{
1394        struct dm_rq_clone_bio_info *info = bio->bi_private;
1395        struct mapped_device *md = info->tio->md;
1396
1397        free_bio_info(info);
1398        bio_free(bio, md->bs);
1399}
1400
1401static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1402                                 void *data)
1403{
1404        struct dm_rq_target_io *tio = data;
1405        struct mapped_device *md = tio->md;
1406        struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1407
1408        if (!info)
1409                return -ENOMEM;
1410
1411        info->orig = bio_orig;
1412        info->tio = tio;
1413        bio->bi_end_io = end_clone_bio;
1414        bio->bi_private = info;
1415        bio->bi_destructor = dm_rq_bio_destructor;
1416
1417        return 0;
1418}
1419
1420static int setup_clone(struct request *clone, struct request *rq,
1421                       struct dm_rq_target_io *tio)
1422{
1423        int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1424                                  dm_rq_bio_constructor, tio);
1425
1426        if (r)
1427                return r;
1428
1429        clone->cmd = rq->cmd;
1430        clone->cmd_len = rq->cmd_len;
1431        clone->sense = rq->sense;
1432        clone->buffer = rq->buffer;
1433        clone->end_io = end_clone_request;
1434        clone->end_io_data = tio;
1435
1436        return 0;
1437}
1438
1439static int dm_rq_flush_suspending(struct mapped_device *md)
1440{
1441        return !md->suspend_rq.special;
1442}
1443
1444/*
1445 * Called with the queue lock held.
1446 */
1447static int dm_prep_fn(struct request_queue *q, struct request *rq)
1448{
1449        struct mapped_device *md = q->queuedata;
1450        struct dm_rq_target_io *tio;
1451        struct request *clone;
1452
1453        if (unlikely(rq == &md->suspend_rq)) {
1454                if (dm_rq_flush_suspending(md))
1455                        return BLKPREP_OK;
1456                else
1457                        /* The flush suspend was interrupted */
1458                        return BLKPREP_KILL;
1459        }
1460
1461        if (unlikely(rq->special)) {
1462                DMWARN("Already has something in rq->special.");
1463                return BLKPREP_KILL;
1464        }
1465
1466        tio = alloc_rq_tio(md); /* Only one for each original request */
1467        if (!tio)
1468                /* -ENOMEM */
1469                return BLKPREP_DEFER;
1470
1471        tio->md = md;
1472        tio->ti = NULL;
1473        tio->orig = rq;
1474        tio->error = 0;
1475        memset(&tio->info, 0, sizeof(tio->info));
1476
1477        clone = &tio->clone;
1478        if (setup_clone(clone, rq, tio)) {
1479                /* -ENOMEM */
1480                free_rq_tio(tio);
1481                return BLKPREP_DEFER;
1482        }
1483
1484        rq->special = clone;
1485        rq->cmd_flags |= REQ_DONTPREP;
1486
1487        return BLKPREP_OK;
1488}
1489
1490static void map_request(struct dm_target *ti, struct request *rq,
1491                        struct mapped_device *md)
1492{
1493        int r;
1494        struct request *clone = rq->special;
1495        struct dm_rq_target_io *tio = clone->end_io_data;
1496
1497        /*
1498         * Hold the md reference here for the in-flight I/O.
 1499         * We can't rely on the reference count taken by the device opener,
 1500         * because the device may be closed during the request completion,
 1501         * when all bios have completed.
1502         * See the comment in rq_completed() too.
1503         */
1504        dm_get(md);
1505
1506        tio->ti = ti;
1507        r = ti->type->map_rq(ti, clone, &tio->info);
1508        switch (r) {
1509        case DM_MAPIO_SUBMITTED:
1510                /* The target has taken the I/O to submit by itself later */
1511                break;
1512        case DM_MAPIO_REMAPPED:
1513                /* The target has remapped the I/O so dispatch it */
1514                dm_dispatch_request(clone);
1515                break;
1516        case DM_MAPIO_REQUEUE:
1517                /* The target wants to requeue the I/O */
1518                dm_requeue_unmapped_request(clone);
1519                break;
1520        default:
1521                if (r > 0) {
1522                        DMWARN("unimplemented target map return value: %d", r);
1523                        BUG();
1524                }
1525
1526                /* The target wants to complete the I/O */
1527                dm_kill_unmapped_request(clone, r);
1528                break;
1529        }
1530}
1531
1532/*
1533 * q->request_fn for request-based dm.
1534 * Called with the queue lock held.
1535 */
1536static void dm_request_fn(struct request_queue *q)
1537{
1538        struct mapped_device *md = q->queuedata;
1539        struct dm_table *map = dm_get_table(md);
1540        struct dm_target *ti;
1541        struct request *rq;
1542
1543        /*
1544         * For noflush suspend, check blk_queue_stopped() to immediately
1545         * quit I/O dispatching.
1546         */
1547        while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1548                rq = blk_peek_request(q);
1549                if (!rq)
1550                        goto plug_and_out;
1551
 1552                if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend marker */
1553                        if (queue_in_flight(q))
1554                                /* Not quiet yet.  Wait more */
1555                                goto plug_and_out;
1556
1557                        /* This device should be quiet now */
1558                        __stop_queue(q);
1559                        blk_start_request(rq);
1560                        __blk_end_request_all(rq, 0);
1561                        wake_up(&md->wait);
1562                        goto out;
1563                }
1564
1565                ti = dm_table_find_target(map, blk_rq_pos(rq));
1566                if (ti->type->busy && ti->type->busy(ti))
1567                        goto plug_and_out;
1568
1569                blk_start_request(rq);
1570                spin_unlock(q->queue_lock);
1571                map_request(ti, rq, md);
1572                spin_lock_irq(q->queue_lock);
1573        }
1574
1575        goto out;
1576
1577plug_and_out:
1578        if (!elv_queue_empty(q))
1579                /* Some requests still remain, retry later */
1580                blk_plug_device(q);
1581
1582out:
1583        dm_table_put(map);
1584
1585        return;
1586}
1587
1588int dm_underlying_device_busy(struct request_queue *q)
1589{
1590        return blk_lld_busy(q);
1591}
1592EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1593
1594static int dm_lld_busy(struct request_queue *q)
1595{
1596        int r;
1597        struct mapped_device *md = q->queuedata;
1598        struct dm_table *map = dm_get_table(md);
1599
1600        if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1601                r = 1;
1602        else
1603                r = dm_table_any_busy_target(map);
1604
1605        dm_table_put(map);
1606
1607        return r;
1608}
1609
1610static void dm_unplug_all(struct request_queue *q)
1611{
1612        struct mapped_device *md = q->queuedata;
1613        struct dm_table *map = dm_get_table(md);
1614
1615        if (map) {
1616                if (dm_request_based(md))
1617                        generic_unplug_device(q);
1618
1619                dm_table_unplug_all(map);
1620                dm_table_put(map);
1621        }
1622}
1623
1624static int dm_any_congested(void *congested_data, int bdi_bits)
1625{
1626        int r = bdi_bits;
1627        struct mapped_device *md = congested_data;
1628        struct dm_table *map;
1629
1630        if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1631                map = dm_get_table(md);
1632                if (map) {
1633                        /*
 1634                         * Request-based dm cares only about its own queue
 1635                         * for the request_queue congestion status query
1636                         */
1637                        if (dm_request_based(md))
1638                                r = md->queue->backing_dev_info.state &
1639                                    bdi_bits;
1640                        else
1641                                r = dm_table_any_congested(map, bdi_bits);
1642
1643                        dm_table_put(map);
1644                }
1645        }
1646
1647        return r;
1648}
1649
1650/*-----------------------------------------------------------------
1651 * An IDR is used to keep track of allocated minor numbers.
1652 *---------------------------------------------------------------*/
1653static DEFINE_IDR(_minor_idr);
1654
1655static void free_minor(int minor)
1656{
1657        spin_lock(&_minor_lock);
1658        idr_remove(&_minor_idr, minor);
1659        spin_unlock(&_minor_lock);
1660}
1661
1662/*
1663 * See if the device with a specific minor # is free.
1664 */
1665static int specific_minor(int minor)
1666{
1667        int r, m;
1668
1669        if (minor >= (1 << MINORBITS))
1670                return -EINVAL;
1671
1672        r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1673        if (!r)
1674                return -ENOMEM;
1675
1676        spin_lock(&_minor_lock);
1677
1678        if (idr_find(&_minor_idr, minor)) {
1679                r = -EBUSY;
1680                goto out;
1681        }
1682
1683        r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1684        if (r)
1685                goto out;
1686
1687        if (m != minor) {
1688                idr_remove(&_minor_idr, m);
1689                r = -EBUSY;
1690                goto out;
1691        }
1692
1693out:
1694        spin_unlock(&_minor_lock);
1695        return r;
1696}
1697
1698static int next_free_minor(int *minor)
1699{
1700        int r, m;
1701
1702        r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1703        if (!r)
1704                return -ENOMEM;
1705
1706        spin_lock(&_minor_lock);
1707
1708        r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1709        if (r)
1710                goto out;
1711
1712        if (m >= (1 << MINORBITS)) {
1713                idr_remove(&_minor_idr, m);
1714                r = -ENOSPC;
1715                goto out;
1716        }
1717
1718        *minor = m;
1719
1720out:
1721        spin_unlock(&_minor_lock);
1722        return r;
1723}
1724
1725static const struct block_device_operations dm_blk_dops;
1726
1727static void dm_wq_work(struct work_struct *work);
1728
1729/*
1730 * Allocate and initialise a blank device with a given minor.
1731 */
1732static struct mapped_device *alloc_dev(int minor)
1733{
1734        int r;
1735        struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1736        void *old_md;
1737
1738        if (!md) {
1739                DMWARN("unable to allocate device, out of memory.");
1740                return NULL;
1741        }
1742
1743        if (!try_module_get(THIS_MODULE))
1744                goto bad_module_get;
1745
1746        /* get a minor number for the dev */
1747        if (minor == DM_ANY_MINOR)
1748                r = next_free_minor(&minor);
1749        else
1750                r = specific_minor(minor);
1751        if (r < 0)
1752                goto bad_minor;
1753
1754        init_rwsem(&md->io_lock);
1755        mutex_init(&md->suspend_lock);
1756        spin_lock_init(&md->deferred_lock);
1757        rwlock_init(&md->map_lock);
1758        atomic_set(&md->holders, 1);
1759        atomic_set(&md->open_count, 0);
1760        atomic_set(&md->event_nr, 0);
1761        atomic_set(&md->uevent_seq, 0);
1762        INIT_LIST_HEAD(&md->uevent_list);
1763        spin_lock_init(&md->uevent_lock);
1764
1765        md->queue = blk_init_queue(dm_request_fn, NULL);
1766        if (!md->queue)
1767                goto bad_queue;
1768
1769        /*
1770         * Request-based dm devices cannot be stacked on top of bio-based dm
1771         * devices.  The type of this dm device has not been decided yet,
1772         * although we initialized the queue using blk_init_queue().
1773         * The type is decided at the first table loading time.
1774         * To prevent problematic device stacking, clear the queue flag
1775         * for request stacking support until then.
1776         *
1777         * This queue is new, so no concurrency on the queue_flags.
1778         */
1779        queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1780        md->saved_make_request_fn = md->queue->make_request_fn;
1781        md->queue->queuedata = md;
1782        md->queue->backing_dev_info.congested_fn = dm_any_congested;
1783        md->queue->backing_dev_info.congested_data = md;
1784        blk_queue_make_request(md->queue, dm_request);
1785        blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1786        md->queue->unplug_fn = dm_unplug_all;
1787        blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1788        blk_queue_softirq_done(md->queue, dm_softirq_done);
1789        blk_queue_prep_rq(md->queue, dm_prep_fn);
1790        blk_queue_lld_busy(md->queue, dm_lld_busy);
1791
1792        md->disk = alloc_disk(1);
1793        if (!md->disk)
1794                goto bad_disk;
1795
1796        atomic_set(&md->pending[0], 0);
1797        atomic_set(&md->pending[1], 0);
1798        init_waitqueue_head(&md->wait);
1799        INIT_WORK(&md->work, dm_wq_work);
1800        init_waitqueue_head(&md->eventq);
1801
1802        md->disk->major = _major;
1803        md->disk->first_minor = minor;
1804        md->disk->fops = &dm_blk_dops;
1805        md->disk->queue = md->queue;
1806        md->disk->private_data = md;
1807        sprintf(md->disk->disk_name, "dm-%d", minor);
1808        add_disk(md->disk);
1809        format_dev_t(md->name, MKDEV(_major, minor));
1810
1811        md->wq = create_singlethread_workqueue("kdmflush");
1812        if (!md->wq)
1813                goto bad_thread;
1814
1815        md->bdev = bdget_disk(md->disk, 0);
1816        if (!md->bdev)
1817                goto bad_bdev;
1818
1819        /* Populate the mapping, nobody knows we exist yet */
1820        spin_lock(&_minor_lock);
1821        old_md = idr_replace(&_minor_idr, md, minor);
1822        spin_unlock(&_minor_lock);
1823
1824        BUG_ON(old_md != MINOR_ALLOCED);
1825
1826        return md;
1827
1828bad_bdev:
1829        destroy_workqueue(md->wq);
1830bad_thread:
1831        del_gendisk(md->disk);
1832        put_disk(md->disk);
1833bad_disk:
1834        blk_cleanup_queue(md->queue);
1835bad_queue:
1836        free_minor(minor);
1837bad_minor:
1838        module_put(THIS_MODULE);
1839bad_module_get:
1840        kfree(md);
1841        return NULL;
1842}
1843
1844static void unlock_fs(struct mapped_device *md);
1845
1846static void free_dev(struct mapped_device *md)
1847{
1848        int minor = MINOR(disk_devt(md->disk));
1849
1850        unlock_fs(md);
1851        bdput(md->bdev);
1852        destroy_workqueue(md->wq);
1853        if (md->tio_pool)
1854                mempool_destroy(md->tio_pool);
1855        if (md->io_pool)
1856                mempool_destroy(md->io_pool);
1857        if (md->bs)
1858                bioset_free(md->bs);
1859        blk_integrity_unregister(md->disk);
1860        del_gendisk(md->disk);
1861        free_minor(minor);
1862
1863        spin_lock(&_minor_lock);
1864        md->disk->private_data = NULL;
1865        spin_unlock(&_minor_lock);
1866
1867        put_disk(md->disk);
1868        blk_cleanup_queue(md->queue);
1869        module_put(THIS_MODULE);
1870        kfree(md);
1871}
1872
1873static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1874{
1875        struct dm_md_mempools *p;
1876
1877        if (md->io_pool && md->tio_pool && md->bs)
1878                /* the md already has the necessary mempools */
1879                goto out;
1880
1881        p = dm_table_get_md_mempools(t);
1882        BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1883
1884        md->io_pool = p->io_pool;
1885        p->io_pool = NULL;
1886        md->tio_pool = p->tio_pool;
1887        p->tio_pool = NULL;
1888        md->bs = p->bs;
1889        p->bs = NULL;
1890
1891out:
1892        /* mempool bind completed, the table no longer needs its own mempools */
1893        dm_table_free_md_mempools(t);
1894}
1895
1896/*
1897 * Bind a table to the device.
1898 */
1899static void event_callback(void *context)
1900{
1901        unsigned long flags;
1902        LIST_HEAD(uevents);
1903        struct mapped_device *md = (struct mapped_device *) context;
1904
1905        spin_lock_irqsave(&md->uevent_lock, flags);
1906        list_splice_init(&md->uevent_list, &uevents);
1907        spin_unlock_irqrestore(&md->uevent_lock, flags);
1908
1909        dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1910
1911        atomic_inc(&md->event_nr);
1912        wake_up(&md->eventq);
1913}
1914
1915static void __set_size(struct mapped_device *md, sector_t size)
1916{
1917        set_capacity(md->disk, size);
1918
1919        mutex_lock(&md->bdev->bd_inode->i_mutex);
1920        i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1921        mutex_unlock(&md->bdev->bd_inode->i_mutex);
1922}
1923
1924static int __bind(struct mapped_device *md, struct dm_table *t,
1925                  struct queue_limits *limits)
1926{
1927        struct request_queue *q = md->queue;
1928        sector_t size;
1929        unsigned long flags;
1930
1931        size = dm_table_get_size(t);
1932
1933        /*
1934         * Wipe any geometry if the size of the table changed.
1935         */
1936        if (size != get_capacity(md->disk))
1937                memset(&md->geometry, 0, sizeof(md->geometry));
1938
1939        __set_size(md, size);
1940
1941        if (!size) {
1942                dm_table_destroy(t);
1943                return 0;
1944        }
1945
1946        dm_table_event_callback(t, event_callback, md);
1947
1948        /*
1949         * If the old table type wasn't request-based, the queue has not
1950         * been stopped during suspension.  Stop it now to prevent I/O
1951         * from being mapped before resume.
1952         * This must be done before setting the queue restrictions,
1953         * because request-based dm may start running as soon as they are set.
1954         */
1955        if (dm_table_request_based(t) && !blk_queue_stopped(q))
1956                stop_queue(q);
1957
1958        __bind_mempools(md, t);
1959
1960        write_lock_irqsave(&md->map_lock, flags);
1961        md->map = t;
1962        dm_table_set_restrictions(t, q, limits);
1963        write_unlock_irqrestore(&md->map_lock, flags);
1964
1965        return 0;
1966}
1967
1968static void __unbind(struct mapped_device *md)
1969{
1970        struct dm_table *map = md->map;
1971        unsigned long flags;
1972
1973        if (!map)
1974                return;
1975
1976        dm_table_event_callback(map, NULL, NULL);
1977        write_lock_irqsave(&md->map_lock, flags);
1978        md->map = NULL;
1979        write_unlock_irqrestore(&md->map_lock, flags);
1980        dm_table_destroy(map);
1981}
1982
1983/*
1984 * Constructor for a new device.
1985 */
1986int dm_create(int minor, struct mapped_device **result)
1987{
1988        struct mapped_device *md;
1989
1990        md = alloc_dev(minor);
1991        if (!md)
1992                return -ENXIO;
1993
1994        dm_sysfs_init(md);
1995
1996        *result = md;
1997        return 0;
1998}
1999
2000static struct mapped_device *dm_find_md(dev_t dev)
2001{
2002        struct mapped_device *md;
2003        unsigned minor = MINOR(dev);
2004
2005        if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2006                return NULL;
2007
2008        spin_lock(&_minor_lock);
2009
2010        md = idr_find(&_minor_idr, minor);
2011        if (md && (md == MINOR_ALLOCED ||
2012                   (MINOR(disk_devt(dm_disk(md))) != minor) ||
2013                   test_bit(DMF_FREEING, &md->flags))) {
2014                md = NULL;
2015                goto out;
2016        }
2017
2018out:
2019        spin_unlock(&_minor_lock);
2020
2021        return md;
2022}
2023
2024struct mapped_device *dm_get_md(dev_t dev)
2025{
2026        struct mapped_device *md = dm_find_md(dev);
2027
2028        if (md)
2029                dm_get(md);
2030
2031        return md;
2032}
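/*
 * Illustrative sketch, not part of the driver: dm_get_md() returns the
 * mapped_device with an extra holder reference taken, so every
 * successful lookup must be balanced by dm_put().  A hypothetical
 * caller (example_use_md is not a real function):
 *
 *	void example_use_md(dev_t dev)
 *	{
 *		struct mapped_device *md = dm_get_md(dev);
 *
 *		if (!md)
 *			return;
 *		DMINFO("found %s", dm_device_name(md));
 *		dm_put(md);
 *	}
 */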
2033
2034void *dm_get_mdptr(struct mapped_device *md)
2035{
2036        return md->interface_ptr;
2037}
2038
2039void dm_set_mdptr(struct mapped_device *md, void *ptr)
2040{
2041        md->interface_ptr = ptr;
2042}
2043
2044void dm_get(struct mapped_device *md)
2045{
2046        atomic_inc(&md->holders);
2047}
2048
2049const char *dm_device_name(struct mapped_device *md)
2050{
2051        return md->name;
2052}
2053EXPORT_SYMBOL_GPL(dm_device_name);
2054
2055void dm_put(struct mapped_device *md)
2056{
2057        struct dm_table *map;
2058
2059        BUG_ON(test_bit(DMF_FREEING, &md->flags));
2060
2061        if (atomic_dec_and_lock(&md->holders, &_minor_lock)) {
2062                map = dm_get_table(md);
2063                idr_replace(&_minor_idr, MINOR_ALLOCED,
2064                            MINOR(disk_devt(dm_disk(md))));
2065                set_bit(DMF_FREEING, &md->flags);
2066                spin_unlock(&_minor_lock);
2067                if (!dm_suspended(md)) {
2068                        dm_table_presuspend_targets(map);
2069                        dm_table_postsuspend_targets(map);
2070                }
2071                dm_sysfs_exit(md);
2072                dm_table_put(map);
2073                __unbind(md);
2074                free_dev(md);
2075        }
2076}
2077EXPORT_SYMBOL_GPL(dm_put);
2078
2079static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2080{
2081        int r = 0;
2082        DECLARE_WAITQUEUE(wait, current);
2083        struct request_queue *q = md->queue;
2084        unsigned long flags;
2085
2086        dm_unplug_all(md->queue);
2087
2088        add_wait_queue(&md->wait, &wait);
2089
2090        while (1) {
2091                set_current_state(interruptible);
2092
2093                smp_mb();
2094                if (dm_request_based(md)) {
2095                        spin_lock_irqsave(q->queue_lock, flags);
2096                        if (!queue_in_flight(q) && blk_queue_stopped(q)) {
2097                                spin_unlock_irqrestore(q->queue_lock, flags);
2098                                break;
2099                        }
2100                        spin_unlock_irqrestore(q->queue_lock, flags);
2101                } else if (!atomic_read(&md->pending[0]) &&
2102                                        !atomic_read(&md->pending[1]))
2103                        break;
2104
2105                if (interruptible == TASK_INTERRUPTIBLE &&
2106                    signal_pending(current)) {
2107                        r = -EINTR;
2108                        break;
2109                }
2110
2111                io_schedule();
2112        }
2113        set_current_state(TASK_RUNNING);
2114
2115        remove_wait_queue(&md->wait, &wait);
2116
2117        return r;
2118}
2119
2120static void dm_flush(struct mapped_device *md)
2121{
2122        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2123
2124        bio_init(&md->barrier_bio);
2125        md->barrier_bio.bi_bdev = md->bdev;
2126        md->barrier_bio.bi_rw = WRITE_BARRIER;
2127        __split_and_process_bio(md, &md->barrier_bio);
2128
2129        dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2130}
2131
2132static void process_barrier(struct mapped_device *md, struct bio *bio)
2133{
2134        md->barrier_error = 0;
2135
2136        dm_flush(md);
2137
2138        if (!bio_empty_barrier(bio)) {
2139                __split_and_process_bio(md, bio);
2140                dm_flush(md);
2141        }
2142
2143        if (md->barrier_error != DM_ENDIO_REQUEUE)
2144                bio_endio(bio, md->barrier_error);
2145        else {
2146                spin_lock_irq(&md->deferred_lock);
2147                bio_list_add_head(&md->deferred, bio);
2148                spin_unlock_irq(&md->deferred_lock);
2149        }
2150}
2151
2152/*
2153 * Process the deferred bios
2154 */
2155static void dm_wq_work(struct work_struct *work)
2156{
2157        struct mapped_device *md = container_of(work, struct mapped_device,
2158                                                work);
2159        struct bio *c;
2160
2161        down_write(&md->io_lock);
2162
2163        while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2164                spin_lock_irq(&md->deferred_lock);
2165                c = bio_list_pop(&md->deferred);
2166                spin_unlock_irq(&md->deferred_lock);
2167
2168                if (!c) {
2169                        clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2170                        break;
2171                }
2172
2173                up_write(&md->io_lock);
2174
2175                if (dm_request_based(md))
2176                        generic_make_request(c);
2177                else {
2178                        if (bio_rw_flagged(c, BIO_RW_BARRIER))
2179                                process_barrier(md, c);
2180                        else
2181                                __split_and_process_bio(md, c);
2182                }
2183
2184                down_write(&md->io_lock);
2185        }
2186
2187        up_write(&md->io_lock);
2188}
2189
2190static void dm_queue_flush(struct mapped_device *md)
2191{
2192        clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2193        smp_mb__after_clear_bit();
2194        queue_work(md->wq, &md->work);
2195}
2196
2197/*
2198 * Swap in a new table (destroying old one).
2199 */
2200int dm_swap_table(struct mapped_device *md, struct dm_table *table)
2201{
2202        struct queue_limits limits;
2203        int r = -EINVAL;
2204
2205        mutex_lock(&md->suspend_lock);
2206
2207        /* device must be suspended */
2208        if (!dm_suspended(md))
2209                goto out;
2210
2211        r = dm_calculate_queue_limits(table, &limits);
2212        if (r)
2213                goto out;
2214
2215        /* the device type cannot be changed once a table is bound */
2216        if (md->map &&
2217            (dm_table_get_type(md->map) != dm_table_get_type(table))) {
2218                DMWARN("can't change the device type after a table is bound");
2219                goto out;
2220        }
2221
2222        __unbind(md);
2223        r = __bind(md, table, &limits);
2224
2225out:
2226        mutex_unlock(&md->suspend_lock);
2227        return r;
2228}
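/*
 * Illustrative sketch, not part of the driver: dm_swap_table()
 * requires the device to be suspended, so the caller-side sequence
 * (roughly what the dm ioctl layer does) is suspend, swap, resume.
 * example_replace_table is hypothetical and error handling is kept
 * minimal:
 *
 *	int example_replace_table(struct mapped_device *md,
 *				  struct dm_table *new_table)
 *	{
 *		int r;
 *
 *		r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
 *		if (r)
 *			return r;
 *
 *		r = dm_swap_table(md, new_table);
 *		if (r) {
 *			dm_resume(md);
 *			return r;
 *		}
 *
 *		return dm_resume(md);
 *	}
 */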
2229
2230static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
2231{
2232        md->suspend_rq.special = (void *)0x1;
2233}
2234
2235static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
2236{
2237        struct request_queue *q = md->queue;
2238        unsigned long flags;
2239
2240        spin_lock_irqsave(q->queue_lock, flags);
2241        if (!noflush)
2242                dm_rq_invalidate_suspend_marker(md);
2243        __start_queue(q);
2244        spin_unlock_irqrestore(q->queue_lock, flags);
2245}
2246
2247static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
2248{
2249        struct request *rq = &md->suspend_rq;
2250        struct request_queue *q = md->queue;
2251
2252        if (noflush)
2253                stop_queue(q);
2254        else {
2255                blk_rq_init(q, rq);
2256                blk_insert_request(q, rq, 0, NULL);
2257        }
2258}
2259
2260static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
2261{
2262        int r = 1;
2263        struct request *rq = &md->suspend_rq;
2264        struct request_queue *q = md->queue;
2265        unsigned long flags;
2266
2267        if (noflush)
2268                return r;
2269
2270        /* The marker must be protected by queue lock if it is in use */
2271        spin_lock_irqsave(q->queue_lock, flags);
2272        if (unlikely(rq->ref_count)) {
2273                /*
2274                 * This can happen when the previous flush suspend was
2275                 * interrupted: the marker is still in the queue when this
2276                 * flush suspend is invoked, because we don't remove the
2277                 * marker when a suspend is interrupted.
2278                 * We have only one marker per mapped_device, so we can't
2279                 * start another flush suspend while it is in use.
2280                 */
2281                BUG_ON(!rq->special); /* The marker should be invalidated */
2282                DMWARN("Invalidation of the previous flush suspend is still"
2283                       " in progress.  Please retry later.");
2284                r = 0;
2285        }
2286        spin_unlock_irqrestore(q->queue_lock, flags);
2287
2288        return r;
2289}
2290
2291/*
2292 * Functions to lock and unlock any filesystem running on the
2293 * device.
2294 */
2295static int lock_fs(struct mapped_device *md)
2296{
2297        int r;
2298
2299        WARN_ON(md->frozen_sb);
2300
2301        md->frozen_sb = freeze_bdev(md->bdev);
2302        if (IS_ERR(md->frozen_sb)) {
2303                r = PTR_ERR(md->frozen_sb);
2304                md->frozen_sb = NULL;
2305                return r;
2306        }
2307
2308        set_bit(DMF_FROZEN, &md->flags);
2309
2310        return 0;
2311}
2312
2313static void unlock_fs(struct mapped_device *md)
2314{
2315        if (!test_bit(DMF_FROZEN, &md->flags))
2316                return;
2317
2318        thaw_bdev(md->bdev, md->frozen_sb);
2319        md->frozen_sb = NULL;
2320        clear_bit(DMF_FROZEN, &md->flags);
2321}
2322
2323/*
2324 * We need to be able to change a mapping table under a mounted
2325 * filesystem.  For example we might want to move some data in
2326 * the background.  Before the table can be swapped with
2327 * dm_bind_table, dm_suspend must be called to flush any in
2328 * flight bios and ensure that any further io gets deferred.
2329 */
2330/*
2331 * Suspend mechanism in request-based dm.
2332 *
2333 * After the suspend starts, further incoming requests are kept in
2334 * the request_queue and deferred.
2335 * For a flush suspend, requests remaining in the request_queue at the
2336 * start of suspend are flushed.
2337 * The suspend completes when the following conditions have been satisfied,
2338 * so wait for them:
2339 *    1. q->in_flight is 0 (which means no in_flight request)
2340 *    2. queue has been stopped (which means no request dispatching)
2341 *
2342 *
2343 * Noflush suspend
2344 * ---------------
2345 * Noflush suspend doesn't need to dispatch remaining requests.
2346 * So stop the queue immediately.  Then, wait for all in_flight requests
2347 * to be completed or requeued.
2348 *
2349 * To abort noflush suspend, start the queue.
2350 *
2351 *
2352 * Flush suspend
2353 * -------------
2354 * Flush suspend needs to dispatch the remaining requests, so the queue is
2355 * stopped only after they have completed.  (Requeued requests must also be
2356 * re-dispatched and completed; until then, we can't stop the queue.)
2357 *
2358 * While the remaining requests are being flushed, further incoming requests
2359 * are inserted into the same queue.  To distinguish which requests are to be
2360 * flushed, we insert a marker request into the queue when the flush suspend
2361 * starts, like a barrier.
2362 * Dispatching is blocked when the marker is found at the head of the queue,
2363 * and the queue is stopped once all in_flight requests have completed, since
2364 * that means the remaining requests have been completely flushed.
2365 * The marker is then removed from the queue.
2366 *
2367 * To abort a flush suspend, we also need to take care of the marker, not
2368 * just restart the queue.
2369 * We don't forcibly remove the marker from the queue, since that would go
2370 * against block-layer conventions.  Instead, we mark the marker as invalidated.
2371 * When the invalidated marker is found at the head of the queue, it is
2372 * immediately removed from the queue, so it doesn't block dispatching.
2373 * Because we have only one marker per mapped_device, we can't start another
2374 * flush suspend until the invalidated marker has been removed from the queue.
2375 * In that case we fail and return -EBUSY.
2376 */
2377int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2378{
2379        struct dm_table *map = NULL;
2380        int r = 0;
2381        int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2382        int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2383
2384        mutex_lock(&md->suspend_lock);
2385
2386        if (dm_suspended(md)) {
2387                r = -EINVAL;
2388                goto out_unlock;
2389        }
2390
2391        if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
2392                r = -EBUSY;
2393                goto out_unlock;
2394        }
2395
2396        map = dm_get_table(md);
2397
2398        /*
2399         * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2400         * This flag is cleared before dm_suspend returns.
2401         */
2402        if (noflush)
2403                set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2404
2405        /* This does not get reverted if there's an error later. */
2406        dm_table_presuspend_targets(map);
2407
2408        /*
2409         * Flush I/O to the device. noflush supersedes do_lockfs,
2410         * because lock_fs() needs to flush I/Os.
2411         */
2412        if (!noflush && do_lockfs) {
2413                r = lock_fs(md);
2414                if (r)
2415                        goto out;
2416        }
2417
2418        /*
2419         * Here we must make sure that no processes are submitting requests
2420         * to target drivers i.e. no one may be executing
2421         * __split_and_process_bio. This is called from dm_request and
2422         * dm_wq_work.
2423         *
2424         * To get all processes out of __split_and_process_bio in dm_request,
2425         * we take the write lock. To prevent any process from reentering
2426         * __split_and_process_bio from dm_request, we set
2427         * DMF_QUEUE_IO_TO_THREAD.
2428         *
2429         * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2430         * and call flush_workqueue(md->wq). flush_workqueue will wait until
2431         * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2432         * further calls to __split_and_process_bio from dm_wq_work.
2433         */
2434        down_write(&md->io_lock);
2435        set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2436        set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2437        up_write(&md->io_lock);
2438
2439        flush_workqueue(md->wq);
2440
2441        if (dm_request_based(md))
2442                dm_rq_start_suspend(md, noflush);
2443
2444        /*
2445         * At this point no more requests are entering target request routines.
2446         * We call dm_wait_for_completion to wait for all existing requests
2447         * to finish.
2448         */
2449        r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2450
2451        down_write(&md->io_lock);
2452        if (noflush)
2453                clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2454        up_write(&md->io_lock);
2455
2456        /* were we interrupted? */
2457        if (r < 0) {
2458                dm_queue_flush(md);
2459
2460                if (dm_request_based(md))
2461                        dm_rq_abort_suspend(md, noflush);
2462
2463                unlock_fs(md);
2464                goto out; /* pushback list is already flushed, so skip flush */
2465        }
2466
2467        /*
2468         * If dm_wait_for_completion returned 0, the device is completely
2469         * quiescent now. There is no request-processing activity. All new
2470         * requests are being added to md->deferred list.
2471         */
2472
2473        dm_table_postsuspend_targets(map);
2474
2475        set_bit(DMF_SUSPENDED, &md->flags);
2476
2477out:
2478        dm_table_put(map);
2479
2480out_unlock:
2481        mutex_unlock(&md->suspend_lock);
2482        return r;
2483}
2484
2485int dm_resume(struct mapped_device *md)
2486{
2487        int r = -EINVAL;
2488        struct dm_table *map = NULL;
2489
2490        mutex_lock(&md->suspend_lock);
2491        if (!dm_suspended(md))
2492                goto out;
2493
2494        map = dm_get_table(md);
2495        if (!map || !dm_table_get_size(map))
2496                goto out;
2497
2498        r = dm_table_resume_targets(map);
2499        if (r)
2500                goto out;
2501
2502        dm_queue_flush(md);
2503
2504        /*
2505         * Flushing deferred I/Os must be done after targets are resumed
2506         * so that the targets can map them correctly.
2507         * Request-based dm queues its deferred I/Os in its request_queue.
2508         */
2509        if (dm_request_based(md))
2510                start_queue(md->queue);
2511
2512        unlock_fs(md);
2513
2514        clear_bit(DMF_SUSPENDED, &md->flags);
2515
2516        dm_table_unplug_all(map);
2517        r = 0;
2518out:
2519        dm_table_put(map);
2520        mutex_unlock(&md->suspend_lock);
2521
2522        return r;
2523}
2524
2525/*-----------------------------------------------------------------
2526 * Event notification.
2527 *---------------------------------------------------------------*/
2528void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2529                       unsigned cookie)
2530{
2531        char udev_cookie[DM_COOKIE_LENGTH];
2532        char *envp[] = { udev_cookie, NULL };
2533
2534        if (!cookie)
2535                kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2536        else {
2537                snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2538                         DM_COOKIE_ENV_VAR_NAME, cookie);
2539                kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
2540        }
2541}
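/*
 * Illustrative sketch, not part of the driver: a caller that wants
 * udev to correlate the uevent with a specific operation passes a
 * non-zero cookie; with a zero cookie a plain uevent is sent.  For
 * example (the cookie value here is arbitrary):
 *
 *	dm_kobject_uevent(md, KOBJ_CHANGE, 12345);
 */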
2542
2543uint32_t dm_next_uevent_seq(struct mapped_device *md)
2544{
2545        return atomic_add_return(1, &md->uevent_seq);
2546}
2547
2548uint32_t dm_get_event_nr(struct mapped_device *md)
2549{
2550        return atomic_read(&md->event_nr);
2551}
2552
2553int dm_wait_event(struct mapped_device *md, int event_nr)
2554{
2555        return wait_event_interruptible(md->eventq,
2556                        (event_nr != atomic_read(&md->event_nr)));
2557}
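/*
 * Illustrative sketch, not part of the driver: the event counter is
 * meant to be sampled first and waited on afterwards, so an event
 * raised between the two calls is not missed.  A hypothetical waiter
 * (example_wait_for_event is not a real function):
 *
 *	int example_wait_for_event(struct mapped_device *md)
 *	{
 *		uint32_t ev = dm_get_event_nr(md);
 *
 *		return dm_wait_event(md, ev);
 *	}
 */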
2558
2559void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2560{
2561        unsigned long flags;
2562
2563        spin_lock_irqsave(&md->uevent_lock, flags);
2564        list_add(elist, &md->uevent_list);
2565        spin_unlock_irqrestore(&md->uevent_lock, flags);
2566}
2567
2568/*
2569 * The gendisk is only valid as long as you hold a reference
2570 * on 'md'.
2571 */
2572struct gendisk *dm_disk(struct mapped_device *md)
2573{
2574        return md->disk;
2575}
2576
2577struct kobject *dm_kobject(struct mapped_device *md)
2578{
2579        return &md->kobj;
2580}
2581
2582/*
2583 * struct mapped_device should not be exported outside of dm.c
2584 * so use this check to verify that kobj is embedded in an md structure
2585 */
2586struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2587{
2588        struct mapped_device *md;
2589
2590        md = container_of(kobj, struct mapped_device, kobj);
2591        if (&md->kobj != kobj)
2592                return NULL;
2593
2594        if (test_bit(DMF_FREEING, &md->flags) ||
2595            test_bit(DMF_DELETING, &md->flags))
2596                return NULL;
2597
2598        dm_get(md);
2599        return md;
2600}
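/*
 * Illustrative sketch, not part of the driver: sysfs attribute
 * handlers are handed the embedded kobject and recover and pin the
 * owning device roughly like this (example_attr_show is
 * hypothetical):
 *
 *	static ssize_t example_attr_show(struct kobject *kobj, char *buf)
 *	{
 *		struct mapped_device *md = dm_get_from_kobject(kobj);
 *		ssize_t sz;
 *
 *		if (!md)
 *			return -EINVAL;
 *
 *		sz = sprintf(buf, "%s\n", dm_device_name(md));
 *		dm_put(md);
 *
 *		return sz;
 *	}
 */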
2601
2602int dm_suspended(struct mapped_device *md)
2603{
2604        return test_bit(DMF_SUSPENDED, &md->flags);
2605}
2606
2607int dm_noflush_suspending(struct dm_target *ti)
2608{
2609        struct mapped_device *md = dm_table_get_md(ti->table);
2610        int r = __noflush_suspending(md);
2611
2612        dm_put(md);
2613
2614        return r;
2615}
2616EXPORT_SYMBOL_GPL(dm_noflush_suspending);
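/*
 * Illustrative sketch, not part of the driver: a target's end_io
 * method can use dm_noflush_suspending() to decide whether a failed
 * bio should be requeued (and retried after resume) rather than
 * failed outright.  A rough fragment under that assumption
 * (example_end_io is hypothetical):
 *
 *	static int example_end_io(struct dm_target *ti, struct bio *bio,
 *				  int error, union map_info *map_context)
 *	{
 *		if (error && dm_noflush_suspending(ti))
 *			return DM_ENDIO_REQUEUE;
 *
 *		return error;
 *	}
 */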
2617
2618struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2619{
2620        struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2621
2622        if (!pools)
2623                return NULL;
2624
2625        pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2626                         mempool_create_slab_pool(MIN_IOS, _io_cache) :
2627                         mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2628        if (!pools->io_pool)
2629                goto free_pools_and_out;
2630
2631        pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2632                          mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2633                          mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2634        if (!pools->tio_pool)
2635                goto free_io_pool_and_out;
2636
2637        pools->bs = (type == DM_TYPE_BIO_BASED) ?
2638                    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2639        if (!pools->bs)
2640                goto free_tio_pool_and_out;
2641
2642        return pools;
2643
2644free_tio_pool_and_out:
2645        mempool_destroy(pools->tio_pool);
2646
2647free_io_pool_and_out:
2648        mempool_destroy(pools->io_pool);
2649
2650free_pools_and_out:
2651        kfree(pools);
2652
2653        return NULL;
2654}
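/*
 * Illustrative sketch, not part of the driver: once __bind_mempools()
 * has handed these pools over to the mapped_device, per-I/O objects
 * are obtained and released with the standard mempool API, along
 * these lines (GFP_NOIO because the allocation happens on the I/O
 * path):
 *
 *	struct dm_io *io = mempool_alloc(md->io_pool, GFP_NOIO);
 *	...
 *	mempool_free(io, md->io_pool);
 */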
2655
2656void dm_free_md_mempools(struct dm_md_mempools *pools)
2657{
2658        if (!pools)
2659                return;
2660
2661        if (pools->io_pool)
2662                mempool_destroy(pools->io_pool);
2663
2664        if (pools->tio_pool)
2665                mempool_destroy(pools->tio_pool);
2666
2667        if (pools->bs)
2668                bioset_free(pools->bs);
2669
2670        kfree(pools);
2671}
2672
2673static const struct block_device_operations dm_blk_dops = {
2674        .open = dm_blk_open,
2675        .release = dm_blk_close,
2676        .ioctl = dm_blk_ioctl,
2677        .getgeo = dm_blk_getgeo,
2678        .owner = THIS_MODULE
2679};
2680
2681EXPORT_SYMBOL(dm_get_mapinfo);
2682
2683/*
2684 * module hooks
2685 */
2686module_init(dm_init);
2687module_exit(dm_exit);
2688
2689module_param(major, uint, 0);
2690MODULE_PARM_DESC(major, "The major number of the device mapper");
2691MODULE_DESCRIPTION(DM_NAME " driver");
2692MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2693MODULE_LICENSE("GPL");
2694