linux/drivers/md/dm-thin.c
   1/*
   2 * Copyright (C) 2011-2012 Red Hat UK.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "dm-bio-prison.h"
   9#include "dm.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/dm-kcopyd.h>
  14#include <linux/list.h>
  15#include <linux/init.h>
  16#include <linux/module.h>
  17#include <linux/slab.h>
  18
  19#define DM_MSG_PREFIX   "thin"
  20
  21/*
  22 * Tunable constants
  23 */
  24#define ENDIO_HOOK_POOL_SIZE 1024
  25#define MAPPING_POOL_SIZE 1024
  26#define PRISON_CELLS 1024
  27#define COMMIT_PERIOD HZ
  28
  29/*
  30 * The block size of the device holding pool data must be
  31 * between 64KB and 1GB.
  32 */
  33#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  34#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
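
/*
 * Illustrative only, not part of the driver: with the usual 512-byte sector
 * (SECTOR_SHIFT == 9) the limits above work out to 128 sectors for 64KB and
 * 2097152 sectors for 1GB.  A standalone sketch of the arithmetic:
 */
#if 0	/* example, never built */
#include <assert.h>
#define EX_SECTOR_SHIFT 9	/* assumption: 512-byte sectors */
int main(void)
{
	assert((64 * 1024 >> EX_SECTOR_SHIFT) == 128);			/* 64KB */
	assert((1024 * 1024 * 1024 >> EX_SECTOR_SHIFT) == 2097152);	/* 1GB  */
	return 0;
}
#endif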
  35
  36/*
  37 * Device id is restricted to 24 bits.
  38 */
  39#define MAX_DEV_ID ((1 << 24) - 1)
  40
  41/*
  42 * How do we handle breaking sharing of data blocks?
  43 * =================================================
  44 *
  45 * We use a standard copy-on-write btree to store the mappings for the
  46 * devices (note I'm talking about copy-on-write of the metadata here, not
  47 * the data).  When you take an internal snapshot you clone the root node
  48 * of the origin btree.  After this there is no concept of an origin or a
  49 * snapshot.  They are just two device trees that happen to point to the
  50 * same data blocks.
  51 *
  52 * When we get a write in we decide if it's to a shared data block using
  53 * some timestamp magic.  If it is, we have to break sharing.
  54 *
  55 * Let's say we write to a shared block in what was the origin.  The
  56 * steps are:
  57 *
   58 * i) plug further io to this physical block. (see bio_prison code).
  59 *
  60 * ii) quiesce any read io to that shared data block.  Obviously
  61 * including all devices that share this block.  (see dm_deferred_set code)
  62 *
   63 * iii) copy the data block to a newly allocated block.  This step can be
   64 * skipped if the io covers the whole block. (schedule_copy).
  65 *
  66 * iv) insert the new mapping into the origin's btree
  67 * (process_prepared_mapping).  This act of inserting breaks some
  68 * sharing of btree nodes between the two devices.  Breaking sharing only
   69 * affects the btree of that specific device.  Btrees for the other
   70 * devices that share the block never change.  The btree for the origin
   71 * device as it was after the last commit is untouched, i.e. we're using
  72 * persistent data structures in the functional programming sense.
  73 *
  74 * v) unplug io to this physical block, including the io that triggered
  75 * the breaking of sharing.
  76 *
  77 * Steps (ii) and (iii) occur in parallel.
  78 *
  79 * The metadata _doesn't_ need to be committed before the io continues.  We
  80 * get away with this because the io is always written to a _new_ block.
  81 * If there's a crash, then:
  82 *
  83 * - The origin mapping will point to the old origin block (the shared
  84 * one).  This will contain the data as it was before the io that triggered
  85 * the breaking of sharing came in.
  86 *
  87 * - The snap mapping still points to the old block.  As it would after
  88 * the commit.
  89 *
   90 * The downside of this scheme is that the timestamp magic isn't perfect:
   91 * it will continue to think the data block in the snapshot device is shared
   92 * even after the write to the origin has broken sharing.  I suspect data
  93 * blocks will typically be shared by many different devices, so we're
  94 * breaking sharing n + 1 times, rather than n, where n is the number of
  95 * devices that reference this data block.  At the moment I think the
  96 * benefits far, far outweigh the disadvantages.
  97 */
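
/*
 * Toy illustration, not part of the driver and not how the metadata is
 * actually stored: a standalone userspace sketch of steps (iii)-(v) above for
 * a single origin/snapshot pair.  Sharing is detected here by simply
 * comparing the two mappings, standing in for the timestamp magic described
 * above; all names below are invented for the example.
 */
#if 0	/* example, never built */
#include <stdio.h>
#include <string.h>

#define NBLOCKS 4

static int origin_map[NBLOCKS];			/* virtual block -> data block */
static int snap_map[NBLOCKS];
static char blocks[2 * NBLOCKS][16];		/* pretend data blocks */
static int next_free = NBLOCKS;			/* first unallocated data block */

static void write_origin(int vblock, const char *buf)
{
	if (origin_map[vblock] == snap_map[vblock]) {		/* shared? */
		int new_block = next_free++;			/* allocate */
		memcpy(blocks[new_block], blocks[origin_map[vblock]],
		       sizeof(blocks[new_block]));		/* (iii) copy */
		origin_map[vblock] = new_block;			/* (iv) new mapping */
	}
	snprintf(blocks[origin_map[vblock]], sizeof(blocks[0]), "%s", buf); /* (v) */
}

int main(void)
{
	int i;

	for (i = 0; i < NBLOCKS; i++) {
		origin_map[i] = snap_map[i] = i;	/* snapshot shares everything */
		snprintf(blocks[i], sizeof(blocks[0]), "old%d", i);
	}

	write_origin(1, "new1");

	/* The origin sees the new data, the snapshot still sees the old data. */
	printf("origin[1]=%s snap[1]=%s\n",
	       blocks[origin_map[1]], blocks[snap_map[1]]);
	return 0;
}
#endif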
  98
  99/*----------------------------------------------------------------*/
 100
 101/*
 102 * Key building.
 103 */
 104static void build_data_key(struct dm_thin_device *td,
 105                           dm_block_t b, struct dm_cell_key *key)
 106{
 107        key->virtual = 0;
 108        key->dev = dm_thin_dev_id(td);
 109        key->block = b;
 110}
 111
 112static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 113                              struct dm_cell_key *key)
 114{
 115        key->virtual = 1;
 116        key->dev = dm_thin_dev_id(td);
 117        key->block = b;
 118}
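
/*
 * Note: key->virtual selects between two key namespaces in the bio prison.
 * Virtual keys serialise work on a thin device's virtual block (e.g.
 * provisioning), while data keys serialise work on a pool data block
 * (e.g. breaking sharing, discards).
 */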
 119
 120/*----------------------------------------------------------------*/
 121
 122/*
 123 * A pool device ties together a metadata device and a data device.  It
 124 * also provides the interface for creating and destroying internal
 125 * devices.
 126 */
 127struct dm_thin_new_mapping;
 128
 129/*
  130 * The pool runs in 3 modes, ordered by increasing degradation for comparison.
 131 */
 132enum pool_mode {
 133        PM_WRITE,               /* metadata may be changed */
 134        PM_READ_ONLY,           /* metadata may not be changed */
 135        PM_FAIL,                /* all I/O fails */
 136};
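
/*
 * Illustrative use of this ordering, taken from bind_control_target() below:
 * a pool must never be silently upgraded, so the more degraded of the two
 * modes wins when a target is rebound.
 */
#if 0	/* example, never built */
	if (old_mode > new_mode)
		new_mode = old_mode;
#endif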
 137
 138struct pool_features {
 139        enum pool_mode mode;
 140
 141        bool zero_new_blocks:1;
 142        bool discard_enabled:1;
 143        bool discard_passdown:1;
 144};
 145
 146struct thin_c;
 147typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
 148typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 149
 150struct pool {
 151        struct list_head list;
 152        struct dm_target *ti;   /* Only set if a pool target is bound */
 153
 154        struct mapped_device *pool_md;
 155        struct block_device *md_dev;
 156        struct dm_pool_metadata *pmd;
 157
 158        dm_block_t low_water_blocks;
 159        uint32_t sectors_per_block;
 160        int sectors_per_block_shift;
 161
 162        struct pool_features pf;
 163        unsigned low_water_triggered:1; /* A dm event has been sent */
 164        unsigned no_free_space:1;       /* A -ENOSPC warning has been issued */
 165
 166        struct dm_bio_prison *prison;
 167        struct dm_kcopyd_client *copier;
 168
 169        struct workqueue_struct *wq;
 170        struct work_struct worker;
 171        struct delayed_work waker;
 172
 173        unsigned long last_commit_jiffies;
 174        unsigned ref_count;
 175
 176        spinlock_t lock;
 177        struct bio_list deferred_bios;
 178        struct bio_list deferred_flush_bios;
 179        struct list_head prepared_mappings;
 180        struct list_head prepared_discards;
 181
 182        struct bio_list retry_on_resume_list;
 183
 184        struct dm_deferred_set *shared_read_ds;
 185        struct dm_deferred_set *all_io_ds;
 186
 187        struct dm_thin_new_mapping *next_mapping;
 188        mempool_t *mapping_pool;
 189        mempool_t *endio_hook_pool;
 190
 191        process_bio_fn process_bio;
 192        process_bio_fn process_discard;
 193
 194        process_mapping_fn process_prepared_mapping;
 195        process_mapping_fn process_prepared_discard;
 196};
 197
 198static enum pool_mode get_pool_mode(struct pool *pool);
 199static void set_pool_mode(struct pool *pool, enum pool_mode mode);
 200
 201/*
 202 * Target context for a pool.
 203 */
 204struct pool_c {
 205        struct dm_target *ti;
 206        struct pool *pool;
 207        struct dm_dev *data_dev;
 208        struct dm_dev *metadata_dev;
 209        struct dm_target_callbacks callbacks;
 210
 211        dm_block_t low_water_blocks;
 212        struct pool_features requested_pf; /* Features requested during table load */
 213        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 214};
 215
 216/*
 217 * Target context for a thin.
 218 */
 219struct thin_c {
 220        struct dm_dev *pool_dev;
 221        struct dm_dev *origin_dev;
 222        dm_thin_id dev_id;
 223
 224        struct pool *pool;
 225        struct dm_thin_device *td;
 226};
 227
 228/*----------------------------------------------------------------*/
 229
 230/*
 231 * A global list of pools that uses a struct mapped_device as a key.
 232 */
 233static struct dm_thin_pool_table {
 234        struct mutex mutex;
 235        struct list_head pools;
 236} dm_thin_pool_table;
 237
 238static void pool_table_init(void)
 239{
 240        mutex_init(&dm_thin_pool_table.mutex);
 241        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 242}
 243
 244static void __pool_table_insert(struct pool *pool)
 245{
 246        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 247        list_add(&pool->list, &dm_thin_pool_table.pools);
 248}
 249
 250static void __pool_table_remove(struct pool *pool)
 251{
 252        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 253        list_del(&pool->list);
 254}
 255
 256static struct pool *__pool_table_lookup(struct mapped_device *md)
 257{
 258        struct pool *pool = NULL, *tmp;
 259
 260        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 261
 262        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 263                if (tmp->pool_md == md) {
 264                        pool = tmp;
 265                        break;
 266                }
 267        }
 268
 269        return pool;
 270}
 271
 272static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 273{
 274        struct pool *pool = NULL, *tmp;
 275
 276        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 277
 278        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 279                if (tmp->md_dev == md_dev) {
 280                        pool = tmp;
 281                        break;
 282                }
 283        }
 284
 285        return pool;
 286}
 287
 288/*----------------------------------------------------------------*/
 289
 290struct dm_thin_endio_hook {
 291        struct thin_c *tc;
 292        struct dm_deferred_entry *shared_read_entry;
 293        struct dm_deferred_entry *all_io_entry;
 294        struct dm_thin_new_mapping *overwrite_mapping;
 295};
 296
 297static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
 298{
 299        struct bio *bio;
 300        struct bio_list bios;
 301
 302        bio_list_init(&bios);
 303        bio_list_merge(&bios, master);
 304        bio_list_init(master);
 305
 306        while ((bio = bio_list_pop(&bios))) {
 307                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 308
 309                if (h->tc == tc)
 310                        bio_endio(bio, DM_ENDIO_REQUEUE);
 311                else
 312                        bio_list_add(master, bio);
 313        }
 314}
 315
 316static void requeue_io(struct thin_c *tc)
 317{
 318        struct pool *pool = tc->pool;
 319        unsigned long flags;
 320
 321        spin_lock_irqsave(&pool->lock, flags);
 322        __requeue_bio_list(tc, &pool->deferred_bios);
 323        __requeue_bio_list(tc, &pool->retry_on_resume_list);
 324        spin_unlock_irqrestore(&pool->lock, flags);
 325}
 326
 327/*
 328 * This section of code contains the logic for processing a thin device's IO.
 329 * Much of the code depends on pool object resources (lists, workqueues, etc)
 330 * but most is exclusively called from the thin target rather than the thin-pool
 331 * target.
 332 */
 333
 334static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 335{
 336        sector_t block_nr = bio->bi_sector;
 337
 338        if (tc->pool->sectors_per_block_shift < 0)
 339                (void) sector_div(block_nr, tc->pool->sectors_per_block);
 340        else
 341                block_nr >>= tc->pool->sectors_per_block_shift;
 342
 343        return block_nr;
 344}
 345
 346static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 347{
 348        struct pool *pool = tc->pool;
 349        sector_t bi_sector = bio->bi_sector;
 350
 351        bio->bi_bdev = tc->pool_dev->bdev;
 352        if (tc->pool->sectors_per_block_shift < 0)
 353                bio->bi_sector = (block * pool->sectors_per_block) +
 354                                 sector_div(bi_sector, pool->sectors_per_block);
 355        else
 356                bio->bi_sector = (block << pool->sectors_per_block_shift) |
 357                                (bi_sector & (pool->sectors_per_block - 1));
 358}
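
/*
 * Illustrative only, not part of the driver: the two branches above compute
 * the same thing; the shift/mask form is simply a fast path for power-of-two
 * block sizes.  A standalone sketch (block sizes of 128 and 192 sectors):
 */
#if 0	/* example, never built */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t bi_sector = 1000;

	/* Power-of-two block size: 128 sectors, so the shift is 7. */
	assert((bi_sector >> 7) == 7);			/* block number */
	assert(((7 << 7) | (bi_sector & 127)) == 1000);	/* offset within the block preserved */

	/* Non-power-of-two block size: 192 sectors. */
	assert(bi_sector / 192 == 5);			/* block number */
	assert(5 * 192 + bi_sector % 192 == 1000);	/* offset within the block preserved */

	return 0;
}
#endif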
 359
 360static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 361{
 362        bio->bi_bdev = tc->origin_dev->bdev;
 363}
 364
 365static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 366{
 367        return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
 368                dm_thin_changed_this_transaction(tc->td);
 369}
 370
 371static void issue(struct thin_c *tc, struct bio *bio)
 372{
 373        struct pool *pool = tc->pool;
 374        unsigned long flags;
 375
 376        if (!bio_triggers_commit(tc, bio)) {
 377                generic_make_request(bio);
 378                return;
 379        }
 380
 381        /*
 382         * Complete bio with an error if earlier I/O caused changes to
  383         * the metadata that can't be committed, e.g. due to I/O errors
 384         * on the metadata device.
 385         */
 386        if (dm_thin_aborted_changes(tc->td)) {
 387                bio_io_error(bio);
 388                return;
 389        }
 390
 391        /*
 392         * Batch together any bios that trigger commits and then issue a
 393         * single commit for them in process_deferred_bios().
 394         */
 395        spin_lock_irqsave(&pool->lock, flags);
 396        bio_list_add(&pool->deferred_flush_bios, bio);
 397        spin_unlock_irqrestore(&pool->lock, flags);
 398}
 399
 400static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 401{
 402        remap_to_origin(tc, bio);
 403        issue(tc, bio);
 404}
 405
 406static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 407                            dm_block_t block)
 408{
 409        remap(tc, bio, block);
 410        issue(tc, bio);
 411}
 412
 413/*
 414 * wake_worker() is used when new work is queued and when pool_resume is
 415 * ready to continue deferred IO processing.
 416 */
 417static void wake_worker(struct pool *pool)
 418{
 419        queue_work(pool->wq, &pool->worker);
 420}
 421
 422/*----------------------------------------------------------------*/
 423
 424/*
 425 * Bio endio functions.
 426 */
 427struct dm_thin_new_mapping {
 428        struct list_head list;
 429
 430        unsigned quiesced:1;
 431        unsigned prepared:1;
 432        unsigned pass_discard:1;
 433
 434        struct thin_c *tc;
 435        dm_block_t virt_block;
 436        dm_block_t data_block;
 437        struct dm_bio_prison_cell *cell, *cell2;
 438        int err;
 439
 440        /*
 441         * If the bio covers the whole area of a block then we can avoid
 442         * zeroing or copying.  Instead this bio is hooked.  The bio will
 443         * still be in the cell, so care has to be taken to avoid issuing
 444         * the bio twice.
 445         */
 446        struct bio *bio;
 447        bio_end_io_t *saved_bi_end_io;
 448};
 449
 450static void __maybe_add_mapping(struct dm_thin_new_mapping *m)
 451{
 452        struct pool *pool = m->tc->pool;
 453
 454        if (m->quiesced && m->prepared) {
 455                list_add(&m->list, &pool->prepared_mappings);
 456                wake_worker(pool);
 457        }
 458}
 459
 460static void copy_complete(int read_err, unsigned long write_err, void *context)
 461{
 462        unsigned long flags;
 463        struct dm_thin_new_mapping *m = context;
 464        struct pool *pool = m->tc->pool;
 465
 466        m->err = read_err || write_err ? -EIO : 0;
 467
 468        spin_lock_irqsave(&pool->lock, flags);
 469        m->prepared = 1;
 470        __maybe_add_mapping(m);
 471        spin_unlock_irqrestore(&pool->lock, flags);
 472}
 473
 474static void overwrite_endio(struct bio *bio, int err)
 475{
 476        unsigned long flags;
 477        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 478        struct dm_thin_new_mapping *m = h->overwrite_mapping;
 479        struct pool *pool = m->tc->pool;
 480
 481        m->err = err;
 482
 483        spin_lock_irqsave(&pool->lock, flags);
 484        m->prepared = 1;
 485        __maybe_add_mapping(m);
 486        spin_unlock_irqrestore(&pool->lock, flags);
 487}
 488
 489/*----------------------------------------------------------------*/
 490
 491/*
 492 * Workqueue.
 493 */
 494
 495/*
 496 * Prepared mapping jobs.
 497 */
 498
 499/*
 500 * This sends the bios in the cell back to the deferred_bios list.
 501 */
 502static void cell_defer(struct thin_c *tc, struct dm_bio_prison_cell *cell,
 503                       dm_block_t data_block)
 504{
 505        struct pool *pool = tc->pool;
 506        unsigned long flags;
 507
 508        spin_lock_irqsave(&pool->lock, flags);
 509        dm_cell_release(cell, &pool->deferred_bios);
  510        spin_unlock_irqrestore(&pool->lock, flags);
 511
 512        wake_worker(pool);
 513}
 514
 515/*
 516 * Same as cell_defer above, except it omits one particular detainee,
 517 * a write bio that covers the block and has already been processed.
 518 */
 519static void cell_defer_except(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 520{
  522        struct pool *pool = tc->pool;
  523        unsigned long flags;
  524
 527        spin_lock_irqsave(&pool->lock, flags);
 528        dm_cell_release_no_holder(cell, &pool->deferred_bios);
 529        spin_unlock_irqrestore(&pool->lock, flags);
 530
 531        wake_worker(pool);
 532}
 533
 534static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 535{
 536        if (m->bio)
 537                m->bio->bi_end_io = m->saved_bi_end_io;
 538        dm_cell_error(m->cell);
 539        list_del(&m->list);
 540        mempool_free(m, m->tc->pool->mapping_pool);
 541}
 542static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 543{
 544        struct thin_c *tc = m->tc;
 545        struct bio *bio;
 546        int r;
 547
 548        bio = m->bio;
 549        if (bio)
 550                bio->bi_end_io = m->saved_bi_end_io;
 551
 552        if (m->err) {
 553                dm_cell_error(m->cell);
 554                goto out;
 555        }
 556
 557        /*
 558         * Commit the prepared block into the mapping btree.
 559         * Any I/O for this block arriving after this point will get
 560         * remapped to it directly.
 561         */
 562        r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
 563        if (r) {
 564                DMERR("dm_thin_insert_block() failed");
 565                dm_cell_error(m->cell);
 566                goto out;
 567        }
 568
 569        /*
 570         * Release any bios held while the block was being provisioned.
 571         * If we are processing a write bio that completely covers the block,
 572         * we already processed it so can ignore it now when processing
 573         * the bios in the cell.
 574         */
 575        if (bio) {
 576                cell_defer_except(tc, m->cell);
 577                bio_endio(bio, 0);
 578        } else
 579                cell_defer(tc, m->cell, m->data_block);
 580
 581out:
 582        list_del(&m->list);
 583        mempool_free(m, tc->pool->mapping_pool);
 584}
 585
 586static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
 587{
 588        struct thin_c *tc = m->tc;
 589
 590        bio_io_error(m->bio);
 591        cell_defer_except(tc, m->cell);
 592        cell_defer_except(tc, m->cell2);
 593        mempool_free(m, tc->pool->mapping_pool);
 594}
 595
 596static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
 597{
 598        struct thin_c *tc = m->tc;
 599
 600        if (m->pass_discard)
 601                remap_and_issue(tc, m->bio, m->data_block);
 602        else
 603                bio_endio(m->bio, 0);
 604
 605        cell_defer_except(tc, m->cell);
 606        cell_defer_except(tc, m->cell2);
 607        mempool_free(m, tc->pool->mapping_pool);
 608}
 609
 610static void process_prepared_discard(struct dm_thin_new_mapping *m)
 611{
 612        int r;
 613        struct thin_c *tc = m->tc;
 614
 615        r = dm_thin_remove_block(tc->td, m->virt_block);
 616        if (r)
 617                DMERR("dm_thin_remove_block() failed");
 618
 619        process_prepared_discard_passdown(m);
 620}
 621
 622static void process_prepared(struct pool *pool, struct list_head *head,
 623                             process_mapping_fn *fn)
 624{
 625        unsigned long flags;
 626        struct list_head maps;
 627        struct dm_thin_new_mapping *m, *tmp;
 628
 629        INIT_LIST_HEAD(&maps);
 630        spin_lock_irqsave(&pool->lock, flags);
 631        list_splice_init(head, &maps);
 632        spin_unlock_irqrestore(&pool->lock, flags);
 633
 634        list_for_each_entry_safe(m, tmp, &maps, list)
 635                (*fn)(m);
 636}
 637
 638/*
 639 * Deferred bio jobs.
 640 */
 641static int io_overlaps_block(struct pool *pool, struct bio *bio)
 642{
 643        return bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT);
 644}
 645
 646static int io_overwrites_block(struct pool *pool, struct bio *bio)
 647{
 648        return (bio_data_dir(bio) == WRITE) &&
 649                io_overlaps_block(pool, bio);
 650}
 651
 652static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
 653                               bio_end_io_t *fn)
 654{
 655        *save = bio->bi_end_io;
 656        bio->bi_end_io = fn;
 657}
 658
 659static int ensure_next_mapping(struct pool *pool)
 660{
 661        if (pool->next_mapping)
 662                return 0;
 663
 664        pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);
 665
 666        return pool->next_mapping ? 0 : -ENOMEM;
 667}
 668
 669static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
 670{
 671        struct dm_thin_new_mapping *r = pool->next_mapping;
 672
 673        BUG_ON(!pool->next_mapping);
 674
 675        pool->next_mapping = NULL;
 676
 677        return r;
 678}
 679
 680static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
 681                          struct dm_dev *origin, dm_block_t data_origin,
 682                          dm_block_t data_dest,
 683                          struct dm_bio_prison_cell *cell, struct bio *bio)
 684{
 685        int r;
 686        struct pool *pool = tc->pool;
 687        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 688
 689        INIT_LIST_HEAD(&m->list);
 690        m->quiesced = 0;
 691        m->prepared = 0;
 692        m->tc = tc;
 693        m->virt_block = virt_block;
 694        m->data_block = data_dest;
 695        m->cell = cell;
 696        m->err = 0;
 697        m->bio = NULL;
 698
 699        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
 700                m->quiesced = 1;
 701
 702        /*
 703         * IO to pool_dev remaps to the pool target's data_dev.
 704         *
 705         * If the whole block of data is being overwritten, we can issue the
 706         * bio immediately. Otherwise we use kcopyd to clone the data first.
 707         */
 708        if (io_overwrites_block(pool, bio)) {
 709                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 710
 711                h->overwrite_mapping = m;
 712                m->bio = bio;
 713                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 714                remap_and_issue(tc, bio, data_dest);
 715        } else {
 716                struct dm_io_region from, to;
 717
 718                from.bdev = origin->bdev;
 719                from.sector = data_origin * pool->sectors_per_block;
 720                from.count = pool->sectors_per_block;
 721
 722                to.bdev = tc->pool_dev->bdev;
 723                to.sector = data_dest * pool->sectors_per_block;
 724                to.count = pool->sectors_per_block;
 725
 726                r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
 727                                   0, copy_complete, m);
 728                if (r < 0) {
 729                        mempool_free(m, pool->mapping_pool);
 730                        DMERR("dm_kcopyd_copy() failed");
 731                        dm_cell_error(cell);
 732                }
 733        }
 734}
 735
 736static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
 737                                   dm_block_t data_origin, dm_block_t data_dest,
 738                                   struct dm_bio_prison_cell *cell, struct bio *bio)
 739{
 740        schedule_copy(tc, virt_block, tc->pool_dev,
 741                      data_origin, data_dest, cell, bio);
 742}
 743
 744static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
 745                                   dm_block_t data_dest,
 746                                   struct dm_bio_prison_cell *cell, struct bio *bio)
 747{
 748        schedule_copy(tc, virt_block, tc->origin_dev,
 749                      virt_block, data_dest, cell, bio);
 750}
 751
 752static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
 753                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
 754                          struct bio *bio)
 755{
 756        struct pool *pool = tc->pool;
 757        struct dm_thin_new_mapping *m = get_next_mapping(pool);
 758
 759        INIT_LIST_HEAD(&m->list);
 760        m->quiesced = 1;
 761        m->prepared = 0;
 762        m->tc = tc;
 763        m->virt_block = virt_block;
 764        m->data_block = data_block;
 765        m->cell = cell;
 766        m->err = 0;
 767        m->bio = NULL;
 768
 769        /*
 770         * If the whole block of data is being overwritten or we are not
 771         * zeroing pre-existing data, we can issue the bio immediately.
 772         * Otherwise we use kcopyd to zero the data first.
 773         */
 774        if (!pool->pf.zero_new_blocks)
 775                process_prepared_mapping(m);
 776
 777        else if (io_overwrites_block(pool, bio)) {
 778                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 779
 780                h->overwrite_mapping = m;
 781                m->bio = bio;
 782                save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
 783                remap_and_issue(tc, bio, data_block);
 784        } else {
 785                int r;
 786                struct dm_io_region to;
 787
 788                to.bdev = tc->pool_dev->bdev;
 789                to.sector = data_block * pool->sectors_per_block;
 790                to.count = pool->sectors_per_block;
 791
 792                r = dm_kcopyd_zero(pool->copier, 1, &to, 0, copy_complete, m);
 793                if (r < 0) {
 794                        mempool_free(m, pool->mapping_pool);
 795                        DMERR("dm_kcopyd_zero() failed");
 796                        dm_cell_error(cell);
 797                }
 798        }
 799}
 800
 801static int commit(struct pool *pool)
 802{
 803        int r;
 804
 805        r = dm_pool_commit_metadata(pool->pmd);
 806        if (r)
 807                DMERR("commit failed, error = %d", r);
 808
 809        return r;
 810}
 811
 812/*
 813 * A non-zero return indicates read_only or fail_io mode.
 814 * Many callers don't care about the return value.
 815 */
 816static int commit_or_fallback(struct pool *pool)
 817{
 818        int r;
 819
 820        if (get_pool_mode(pool) != PM_WRITE)
 821                return -EINVAL;
 822
 823        r = commit(pool);
 824        if (r)
 825                set_pool_mode(pool, PM_READ_ONLY);
 826
 827        return r;
 828}
 829
 830static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 831{
 832        int r;
 833        dm_block_t free_blocks;
 834        unsigned long flags;
 835        struct pool *pool = tc->pool;
 836
 837        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 838        if (r)
 839                return r;
 840
 841        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
 842                DMWARN("%s: reached low water mark, sending event.",
 843                       dm_device_name(pool->pool_md));
 844                spin_lock_irqsave(&pool->lock, flags);
 845                pool->low_water_triggered = 1;
 846                spin_unlock_irqrestore(&pool->lock, flags);
 847                dm_table_event(pool->ti->table);
 848        }
 849
 850        if (!free_blocks) {
 851                if (pool->no_free_space)
 852                        return -ENOSPC;
 853                else {
 854                        /*
 855                         * Try to commit to see if that will free up some
 856                         * more space.
 857                         */
 858                        (void) commit_or_fallback(pool);
 859
 860                        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
 861                        if (r)
 862                                return r;
 863
 864                        /*
 865                         * If we still have no space we set a flag to avoid
 866                         * doing all this checking and return -ENOSPC.
 867                         */
 868                        if (!free_blocks) {
 869                                DMWARN("%s: no free space available.",
 870                                       dm_device_name(pool->pool_md));
 871                                spin_lock_irqsave(&pool->lock, flags);
 872                                pool->no_free_space = 1;
 873                                spin_unlock_irqrestore(&pool->lock, flags);
 874                                return -ENOSPC;
 875                        }
 876                }
 877        }
 878
 879        r = dm_pool_alloc_data_block(pool->pmd, result);
 880        if (r)
 881                return r;
 882
 883        return 0;
 884}
 885
 886/*
 887 * If we have run out of space, queue bios until the device is
 888 * resumed, presumably after having been reloaded with more space.
 889 */
 890static void retry_on_resume(struct bio *bio)
 891{
 892        struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
 893        struct thin_c *tc = h->tc;
 894        struct pool *pool = tc->pool;
 895        unsigned long flags;
 896
 897        spin_lock_irqsave(&pool->lock, flags);
 898        bio_list_add(&pool->retry_on_resume_list, bio);
 899        spin_unlock_irqrestore(&pool->lock, flags);
 900}
 901
 902static void no_space(struct dm_bio_prison_cell *cell)
 903{
 904        struct bio *bio;
 905        struct bio_list bios;
 906
 907        bio_list_init(&bios);
 908        dm_cell_release(cell, &bios);
 909
 910        while ((bio = bio_list_pop(&bios)))
 911                retry_on_resume(bio);
 912}
 913
 914static void process_discard(struct thin_c *tc, struct bio *bio)
 915{
 916        int r;
 917        unsigned long flags;
 918        struct pool *pool = tc->pool;
 919        struct dm_bio_prison_cell *cell, *cell2;
 920        struct dm_cell_key key, key2;
 921        dm_block_t block = get_bio_block(tc, bio);
 922        struct dm_thin_lookup_result lookup_result;
 923        struct dm_thin_new_mapping *m;
 924
 925        build_virtual_key(tc->td, block, &key);
 926        if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
 927                return;
 928
 929        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
 930        switch (r) {
 931        case 0:
 932                /*
 933                 * Check nobody is fiddling with this pool block.  This can
 934                 * happen if someone's in the process of breaking sharing
 935                 * on this block.
 936                 */
 937                build_data_key(tc->td, lookup_result.block, &key2);
 938                if (dm_bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
 939                        dm_cell_release_singleton(cell, bio);
 940                        break;
 941                }
 942
 943                if (io_overlaps_block(pool, bio)) {
 944                        /*
 945                         * IO may still be going to the destination block.  We must
 946                         * quiesce before we can do the removal.
 947                         */
 948                        m = get_next_mapping(pool);
 949                        m->tc = tc;
 950                        m->pass_discard = (!lookup_result.shared) && pool->pf.discard_passdown;
 951                        m->virt_block = block;
 952                        m->data_block = lookup_result.block;
 953                        m->cell = cell;
 954                        m->cell2 = cell2;
 955                        m->err = 0;
 956                        m->bio = bio;
 957
 958                        if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) {
 959                                spin_lock_irqsave(&pool->lock, flags);
 960                                list_add(&m->list, &pool->prepared_discards);
 961                                spin_unlock_irqrestore(&pool->lock, flags);
 962                                wake_worker(pool);
 963                        }
 964                } else {
 965                        /*
 966                         * The DM core makes sure that the discard doesn't span
 967                         * a block boundary.  So we submit the discard of a
 968                         * partial block appropriately.
 969                         */
 970                        dm_cell_release_singleton(cell, bio);
 971                        dm_cell_release_singleton(cell2, bio);
 972                        if ((!lookup_result.shared) && pool->pf.discard_passdown)
 973                                remap_and_issue(tc, bio, lookup_result.block);
 974                        else
 975                                bio_endio(bio, 0);
 976                }
 977                break;
 978
 979        case -ENODATA:
 980                /*
 981                 * It isn't provisioned, just forget it.
 982                 */
 983                dm_cell_release_singleton(cell, bio);
 984                bio_endio(bio, 0);
 985                break;
 986
 987        default:
 988                DMERR("discard: find block unexpectedly returned %d", r);
 989                dm_cell_release_singleton(cell, bio);
 990                bio_io_error(bio);
 991                break;
 992        }
 993}
 994
 995static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 996                          struct dm_cell_key *key,
 997                          struct dm_thin_lookup_result *lookup_result,
 998                          struct dm_bio_prison_cell *cell)
 999{
1000        int r;
1001        dm_block_t data_block;
1002
1003        r = alloc_data_block(tc, &data_block);
1004        switch (r) {
1005        case 0:
1006                schedule_internal_copy(tc, block, lookup_result->block,
1007                                       data_block, cell, bio);
1008                break;
1009
1010        case -ENOSPC:
1011                no_space(cell);
1012                break;
1013
1014        default:
1015                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1016                dm_cell_error(cell);
1017                break;
1018        }
1019}
1020
1021static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1022                               dm_block_t block,
1023                               struct dm_thin_lookup_result *lookup_result)
1024{
1025        struct dm_bio_prison_cell *cell;
1026        struct pool *pool = tc->pool;
1027        struct dm_cell_key key;
1028
1029        /*
1030         * If cell is already occupied, then sharing is already in the process
1031         * of being broken so we have nothing further to do here.
1032         */
1033        build_data_key(tc->td, lookup_result->block, &key);
1034        if (dm_bio_detain(pool->prison, &key, bio, &cell))
1035                return;
1036
1037        if (bio_data_dir(bio) == WRITE && bio->bi_size)
1038                break_sharing(tc, bio, block, &key, lookup_result, cell);
1039        else {
1040                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1041
1042                h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1043
1044                dm_cell_release_singleton(cell, bio);
1045                remap_and_issue(tc, bio, lookup_result->block);
1046        }
1047}
1048
1049static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1050                            struct dm_bio_prison_cell *cell)
1051{
1052        int r;
1053        dm_block_t data_block;
1054
1055        /*
1056         * Remap empty bios (flushes) immediately, without provisioning.
1057         */
1058        if (!bio->bi_size) {
1059                dm_cell_release_singleton(cell, bio);
1060                remap_and_issue(tc, bio, 0);
1061                return;
1062        }
1063
1064        /*
1065         * Fill read bios with zeroes and complete them immediately.
1066         */
1067        if (bio_data_dir(bio) == READ) {
1068                zero_fill_bio(bio);
1069                dm_cell_release_singleton(cell, bio);
1070                bio_endio(bio, 0);
1071                return;
1072        }
1073
1074        r = alloc_data_block(tc, &data_block);
1075        switch (r) {
1076        case 0:
1077                if (tc->origin_dev)
1078                        schedule_external_copy(tc, block, data_block, cell, bio);
1079                else
1080                        schedule_zero(tc, block, data_block, cell, bio);
1081                break;
1082
1083        case -ENOSPC:
1084                no_space(cell);
1085                break;
1086
1087        default:
1088                DMERR("%s: alloc_data_block() failed, error = %d", __func__, r);
1089                set_pool_mode(tc->pool, PM_READ_ONLY);
1090                dm_cell_error(cell);
1091                break;
1092        }
1093}
1094
1095static void process_bio(struct thin_c *tc, struct bio *bio)
1096{
1097        int r;
1098        dm_block_t block = get_bio_block(tc, bio);
1099        struct dm_bio_prison_cell *cell;
1100        struct dm_cell_key key;
1101        struct dm_thin_lookup_result lookup_result;
1102
1103        /*
1104         * If cell is already occupied, then the block is already
1105         * being provisioned so we have nothing further to do here.
1106         */
1107        build_virtual_key(tc->td, block, &key);
1108        if (dm_bio_detain(tc->pool->prison, &key, bio, &cell))
1109                return;
1110
1111        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1112        switch (r) {
1113        case 0:
1114                /*
1115                 * We can release this cell now.  This thread is the only
1116                 * one that puts bios into a cell, and we know there were
1117                 * no preceding bios.
1118                 */
1119                /*
1120                 * TODO: this will probably have to change when discard goes
1121                 * back in.
1122                 */
1123                dm_cell_release_singleton(cell, bio);
1124
1125                if (lookup_result.shared)
1126                        process_shared_bio(tc, bio, block, &lookup_result);
1127                else
1128                        remap_and_issue(tc, bio, lookup_result.block);
1129                break;
1130
1131        case -ENODATA:
1132                if (bio_data_dir(bio) == READ && tc->origin_dev) {
1133                        dm_cell_release_singleton(cell, bio);
1134                        remap_to_origin_and_issue(tc, bio);
1135                } else
1136                        provision_block(tc, bio, block, cell);
1137                break;
1138
1139        default:
1140                DMERR("dm_thin_find_block() failed, error = %d", r);
1141                dm_cell_release_singleton(cell, bio);
1142                bio_io_error(bio);
1143                break;
1144        }
1145}
1146
1147static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1148{
1149        int r;
1150        int rw = bio_data_dir(bio);
1151        dm_block_t block = get_bio_block(tc, bio);
1152        struct dm_thin_lookup_result lookup_result;
1153
1154        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1155        switch (r) {
1156        case 0:
1157                if (lookup_result.shared && (rw == WRITE) && bio->bi_size)
1158                        bio_io_error(bio);
1159                else
1160                        remap_and_issue(tc, bio, lookup_result.block);
1161                break;
1162
1163        case -ENODATA:
1164                if (rw != READ) {
1165                        bio_io_error(bio);
1166                        break;
1167                }
1168
1169                if (tc->origin_dev) {
1170                        remap_to_origin_and_issue(tc, bio);
1171                        break;
1172                }
1173
1174                zero_fill_bio(bio);
1175                bio_endio(bio, 0);
1176                break;
1177
1178        default:
1179                DMERR("dm_thin_find_block() failed, error = %d", r);
1180                bio_io_error(bio);
1181                break;
1182        }
1183}
1184
1185static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1186{
1187        bio_io_error(bio);
1188}
1189
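/*
 * Returns non-zero if more than COMMIT_PERIOD jiffies have passed since the
 * last commit (the first test also forces a commit if jiffies has wrapped).
 */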
1190static int need_commit_due_to_time(struct pool *pool)
1191{
1192        return jiffies < pool->last_commit_jiffies ||
1193               jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1194}
1195
1196static void process_deferred_bios(struct pool *pool)
1197{
1198        unsigned long flags;
1199        struct bio *bio;
1200        struct bio_list bios;
1201
1202        bio_list_init(&bios);
1203
1204        spin_lock_irqsave(&pool->lock, flags);
1205        bio_list_merge(&bios, &pool->deferred_bios);
1206        bio_list_init(&pool->deferred_bios);
1207        spin_unlock_irqrestore(&pool->lock, flags);
1208
1209        while ((bio = bio_list_pop(&bios))) {
1210                struct dm_thin_endio_hook *h = dm_get_mapinfo(bio)->ptr;
1211                struct thin_c *tc = h->tc;
1212
1213                /*
1214                 * If we've got no free new_mapping structs, and processing
1215                 * this bio might require one, we pause until there are some
1216                 * prepared mappings to process.
1217                 */
1218                if (ensure_next_mapping(pool)) {
1219                        spin_lock_irqsave(&pool->lock, flags);
1220                        bio_list_merge(&pool->deferred_bios, &bios);
1221                        spin_unlock_irqrestore(&pool->lock, flags);
1222
1223                        break;
1224                }
1225
1226                if (bio->bi_rw & REQ_DISCARD)
1227                        pool->process_discard(tc, bio);
1228                else
1229                        pool->process_bio(tc, bio);
1230        }
1231
1232        /*
1233         * If there are any deferred flush bios, we must commit
1234         * the metadata before issuing them.
1235         */
1236        bio_list_init(&bios);
1237        spin_lock_irqsave(&pool->lock, flags);
1238        bio_list_merge(&bios, &pool->deferred_flush_bios);
1239        bio_list_init(&pool->deferred_flush_bios);
1240        spin_unlock_irqrestore(&pool->lock, flags);
1241
1242        if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1243                return;
1244
1245        if (commit_or_fallback(pool)) {
1246                while ((bio = bio_list_pop(&bios)))
1247                        bio_io_error(bio);
1248                return;
1249        }
1250        pool->last_commit_jiffies = jiffies;
1251
1252        while ((bio = bio_list_pop(&bios)))
1253                generic_make_request(bio);
1254}
1255
1256static void do_worker(struct work_struct *ws)
1257{
1258        struct pool *pool = container_of(ws, struct pool, worker);
1259
1260        process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1261        process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1262        process_deferred_bios(pool);
1263}
1264
1265/*
1266 * We want to commit periodically so that not too much
1267 * unwritten data builds up.
1268 */
1269static void do_waker(struct work_struct *ws)
1270{
1271        struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1272        wake_worker(pool);
1273        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1274}
1275
1276/*----------------------------------------------------------------*/
1277
1278static enum pool_mode get_pool_mode(struct pool *pool)
1279{
1280        return pool->pf.mode;
1281}
1282
1283static void set_pool_mode(struct pool *pool, enum pool_mode mode)
1284{
1285        int r;
1286
1287        pool->pf.mode = mode;
1288
1289        switch (mode) {
1290        case PM_FAIL:
1291                DMERR("switching pool to failure mode");
1292                pool->process_bio = process_bio_fail;
1293                pool->process_discard = process_bio_fail;
1294                pool->process_prepared_mapping = process_prepared_mapping_fail;
1295                pool->process_prepared_discard = process_prepared_discard_fail;
1296                break;
1297
1298        case PM_READ_ONLY:
1299                DMERR("switching pool to read-only mode");
1300                r = dm_pool_abort_metadata(pool->pmd);
1301                if (r) {
1302                        DMERR("aborting transaction failed");
1303                        set_pool_mode(pool, PM_FAIL);
1304                } else {
1305                        dm_pool_metadata_read_only(pool->pmd);
1306                        pool->process_bio = process_bio_read_only;
1307                        pool->process_discard = process_discard;
1308                        pool->process_prepared_mapping = process_prepared_mapping_fail;
1309                        pool->process_prepared_discard = process_prepared_discard_passdown;
1310                }
1311                break;
1312
1313        case PM_WRITE:
1314                pool->process_bio = process_bio;
1315                pool->process_discard = process_discard;
1316                pool->process_prepared_mapping = process_prepared_mapping;
1317                pool->process_prepared_discard = process_prepared_discard;
1318                break;
1319        }
1320}
1321
1322/*----------------------------------------------------------------*/
1323
1324/*
1325 * Mapping functions.
1326 */
1327
1328/*
1329 * Called only while mapping a thin bio to hand it over to the workqueue.
1330 */
1331static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1332{
1333        unsigned long flags;
1334        struct pool *pool = tc->pool;
1335
1336        spin_lock_irqsave(&pool->lock, flags);
1337        bio_list_add(&pool->deferred_bios, bio);
1338        spin_unlock_irqrestore(&pool->lock, flags);
1339
1340        wake_worker(pool);
1341}
1342
1343static struct dm_thin_endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1344{
1345        struct pool *pool = tc->pool;
1346        struct dm_thin_endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1347
1348        h->tc = tc;
1349        h->shared_read_entry = NULL;
1350        h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : dm_deferred_entry_inc(pool->all_io_ds);
1351        h->overwrite_mapping = NULL;
1352
1353        return h;
1354}
1355
1356/*
1357 * Non-blocking function called from the thin target's map function.
1358 */
1359static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1360                        union map_info *map_context)
1361{
1362        int r;
1363        struct thin_c *tc = ti->private;
1364        dm_block_t block = get_bio_block(tc, bio);
1365        struct dm_thin_device *td = tc->td;
1366        struct dm_thin_lookup_result result;
1367
1368        map_context->ptr = thin_hook_bio(tc, bio);
1369
1370        if (get_pool_mode(tc->pool) == PM_FAIL) {
1371                bio_io_error(bio);
1372                return DM_MAPIO_SUBMITTED;
1373        }
1374
1375        if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1376                thin_defer_bio(tc, bio);
1377                return DM_MAPIO_SUBMITTED;
1378        }
1379
1380        r = dm_thin_find_block(td, block, 0, &result);
1381
1382        /*
1383         * Note that we defer readahead too.
1384         */
1385        switch (r) {
1386        case 0:
1387                if (unlikely(result.shared)) {
1388                        /*
1389                         * We have a race condition here between the
1390                         * result.shared value returned by the lookup and
1391                         * snapshot creation, which may cause new
1392                         * sharing.
1393                         *
1394                         * To avoid this always quiesce the origin before
1395                         * taking the snap.  You want to do this anyway to
1396                         * ensure a consistent application view
1397                         * (i.e. lockfs).
1398                         *
1399                         * More distant ancestors are irrelevant. The
1400                         * shared flag will be set in their case.
1401                         */
1402                        thin_defer_bio(tc, bio);
1403                        r = DM_MAPIO_SUBMITTED;
1404                } else {
1405                        remap(tc, bio, result.block);
1406                        r = DM_MAPIO_REMAPPED;
1407                }
1408                break;
1409
1410        case -ENODATA:
1411                if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
1412                        /*
1413                         * This block isn't provisioned, and we have no way
1414                         * of doing so.  Just error it.
1415                         */
1416                        bio_io_error(bio);
1417                        r = DM_MAPIO_SUBMITTED;
1418                        break;
1419                }
1420                /* fall through */
1421
1422        case -EWOULDBLOCK:
1423                /*
1424                 * In future, the failed dm_thin_find_block above could
1425                 * provide the hint to load the metadata into cache.
1426                 */
1427                thin_defer_bio(tc, bio);
1428                r = DM_MAPIO_SUBMITTED;
1429                break;
1430
1431        default:
1432                /*
1433                 * Must always call bio_io_error on failure.
1434                 * dm_thin_find_block can fail with -EINVAL if the
1435                 * pool is switched to fail-io mode.
1436                 */
1437                bio_io_error(bio);
1438                r = DM_MAPIO_SUBMITTED;
1439                break;
1440        }
1441
1442        return r;
1443}
1444
1445static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
1446{
1447        int r;
1448        unsigned long flags;
1449        struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
1450
1451        spin_lock_irqsave(&pt->pool->lock, flags);
1452        r = !bio_list_empty(&pt->pool->retry_on_resume_list);
1453        spin_unlock_irqrestore(&pt->pool->lock, flags);
1454
1455        if (!r) {
1456                struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1457                r = bdi_congested(&q->backing_dev_info, bdi_bits);
1458        }
1459
1460        return r;
1461}
1462
1463static void __requeue_bios(struct pool *pool)
1464{
1465        bio_list_merge(&pool->deferred_bios, &pool->retry_on_resume_list);
1466        bio_list_init(&pool->retry_on_resume_list);
1467}
1468
1469/*----------------------------------------------------------------
1470 * Binding of control targets to a pool object
1471 *--------------------------------------------------------------*/
1472static bool data_dev_supports_discard(struct pool_c *pt)
1473{
1474        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
1475
1476        return q && blk_queue_discard(q);
1477}
1478
1479/*
 1480 * If discard_passdown was enabled, verify that the data device
1481 * supports discards.  Disable discard_passdown if not.
1482 */
1483static void disable_passdown_if_not_supported(struct pool_c *pt)
1484{
1485        struct pool *pool = pt->pool;
1486        struct block_device *data_bdev = pt->data_dev->bdev;
1487        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
1488        sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
1489        const char *reason = NULL;
1490        char buf[BDEVNAME_SIZE];
1491
1492        if (!pt->adjusted_pf.discard_passdown)
1493                return;
1494
1495        if (!data_dev_supports_discard(pt))
1496                reason = "discard unsupported";
1497
1498        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
1499                reason = "max discard sectors smaller than a block";
1500
1501        else if (data_limits->discard_granularity > block_size)
1502                reason = "discard granularity larger than a block";
1503
1504        else if (block_size & (data_limits->discard_granularity - 1))
1505                reason = "discard granularity not a factor of block size";
1506
1507        if (reason) {
1508                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
1509                pt->adjusted_pf.discard_passdown = false;
1510        }
1511}
1512
1513static int bind_control_target(struct pool *pool, struct dm_target *ti)
1514{
1515        struct pool_c *pt = ti->private;
1516
1517        /*
1518         * We want to make sure that degraded pools are never upgraded.
1519         */
1520        enum pool_mode old_mode = pool->pf.mode;
1521        enum pool_mode new_mode = pt->adjusted_pf.mode;
1522
1523        if (old_mode > new_mode)
1524                new_mode = old_mode;
1525
1526        pool->ti = ti;
1527        pool->low_water_blocks = pt->low_water_blocks;
1528        pool->pf = pt->adjusted_pf;
1529
1530        set_pool_mode(pool, new_mode);
1531
1532        return 0;
1533}
1534
1535static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1536{
1537        if (pool->ti == ti)
1538                pool->ti = NULL;
1539}
1540
1541/*----------------------------------------------------------------
1542 * Pool creation
1543 *--------------------------------------------------------------*/
1544/* Initialize pool features. */
1545static void pool_features_init(struct pool_features *pf)
1546{
1547        pf->mode = PM_WRITE;
1548        pf->zero_new_blocks = true;
1549        pf->discard_enabled = true;
1550        pf->discard_passdown = true;
1551}
1552
1553static void __pool_destroy(struct pool *pool)
1554{
1555        __pool_table_remove(pool);
1556
1557        if (dm_pool_metadata_close(pool->pmd) < 0)
1558                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1559
1560        dm_bio_prison_destroy(pool->prison);
1561        dm_kcopyd_client_destroy(pool->copier);
1562
1563        if (pool->wq)
1564                destroy_workqueue(pool->wq);
1565
1566        if (pool->next_mapping)
1567                mempool_free(pool->next_mapping, pool->mapping_pool);
1568        mempool_destroy(pool->mapping_pool);
1569        mempool_destroy(pool->endio_hook_pool);
1570        dm_deferred_set_destroy(pool->shared_read_ds);
1571        dm_deferred_set_destroy(pool->all_io_ds);
1572        kfree(pool);
1573}
1574
1575static struct kmem_cache *_new_mapping_cache;
1576static struct kmem_cache *_endio_hook_cache;
1577
1578static struct pool *pool_create(struct mapped_device *pool_md,
1579                                struct block_device *metadata_dev,
1580                                unsigned long block_size,
1581                                int read_only, char **error)
1582{
1583        int r;
1584        void *err_p;
1585        struct pool *pool;
1586        struct dm_pool_metadata *pmd;
 1587        bool format_device = !read_only;
1588
1589        pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
1590        if (IS_ERR(pmd)) {
1591                *error = "Error creating metadata object";
1592                return (struct pool *)pmd;
1593        }
1594
1595        pool = kmalloc(sizeof(*pool), GFP_KERNEL);
1596        if (!pool) {
1597                *error = "Error allocating memory for pool";
1598                err_p = ERR_PTR(-ENOMEM);
1599                goto bad_pool;
1600        }
1601
1602        pool->pmd = pmd;
1603        pool->sectors_per_block = block_size;
1604        if (block_size & (block_size - 1))
1605                pool->sectors_per_block_shift = -1;
1606        else
1607                pool->sectors_per_block_shift = __ffs(block_size);
1608        pool->low_water_blocks = 0;
1609        pool_features_init(&pool->pf);
1610        pool->prison = dm_bio_prison_create(PRISON_CELLS);
1611        if (!pool->prison) {
1612                *error = "Error creating pool's bio prison";
1613                err_p = ERR_PTR(-ENOMEM);
1614                goto bad_prison;
1615        }
1616
1617        pool->copier = dm_kcopyd_client_create();
1618        if (IS_ERR(pool->copier)) {
1619                r = PTR_ERR(pool->copier);
1620                *error = "Error creating pool's kcopyd client";
1621                err_p = ERR_PTR(r);
1622                goto bad_kcopyd_client;
1623        }
1624
1625        /*
1626         * Create singlethreaded workqueue that will service all devices
1627         * that use this metadata.
1628         */
1629        pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
1630        if (!pool->wq) {
1631                *error = "Error creating pool's workqueue";
1632                err_p = ERR_PTR(-ENOMEM);
1633                goto bad_wq;
1634        }
1635
1636        INIT_WORK(&pool->worker, do_worker);
1637        INIT_DELAYED_WORK(&pool->waker, do_waker);
1638        spin_lock_init(&pool->lock);
1639        bio_list_init(&pool->deferred_bios);
1640        bio_list_init(&pool->deferred_flush_bios);
1641        INIT_LIST_HEAD(&pool->prepared_mappings);
1642        INIT_LIST_HEAD(&pool->prepared_discards);
1643        pool->low_water_triggered = 0;
1644        pool->no_free_space = 0;
1645        bio_list_init(&pool->retry_on_resume_list);
1646
1647        pool->shared_read_ds = dm_deferred_set_create();
1648        if (!pool->shared_read_ds) {
1649                *error = "Error creating pool's shared read deferred set";
1650                err_p = ERR_PTR(-ENOMEM);
1651                goto bad_shared_read_ds;
1652        }
1653
1654        pool->all_io_ds = dm_deferred_set_create();
1655        if (!pool->all_io_ds) {
1656                *error = "Error creating pool's all io deferred set";
1657                err_p = ERR_PTR(-ENOMEM);
1658                goto bad_all_io_ds;
1659        }
1660
1661        pool->next_mapping = NULL;
1662        pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
1663                                                      _new_mapping_cache);
1664        if (!pool->mapping_pool) {
1665                *error = "Error creating pool's mapping mempool";
1666                err_p = ERR_PTR(-ENOMEM);
1667                goto bad_mapping_pool;
1668        }
1669
1670        pool->endio_hook_pool = mempool_create_slab_pool(ENDIO_HOOK_POOL_SIZE,
1671                                                         _endio_hook_cache);
1672        if (!pool->endio_hook_pool) {
1673                *error = "Error creating pool's endio_hook mempool";
1674                err_p = ERR_PTR(-ENOMEM);
1675                goto bad_endio_hook_pool;
1676        }
1677        pool->ref_count = 1;
1678        pool->last_commit_jiffies = jiffies;
1679        pool->pool_md = pool_md;
1680        pool->md_dev = metadata_dev;
1681        __pool_table_insert(pool);
1682
1683        return pool;
1684
1685bad_endio_hook_pool:
1686        mempool_destroy(pool->mapping_pool);
1687bad_mapping_pool:
1688        dm_deferred_set_destroy(pool->all_io_ds);
1689bad_all_io_ds:
1690        dm_deferred_set_destroy(pool->shared_read_ds);
1691bad_shared_read_ds:
1692        destroy_workqueue(pool->wq);
1693bad_wq:
1694        dm_kcopyd_client_destroy(pool->copier);
1695bad_kcopyd_client:
1696        dm_bio_prison_destroy(pool->prison);
1697bad_prison:
1698        kfree(pool);
1699bad_pool:
1700        if (dm_pool_metadata_close(pmd))
1701                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
1702
1703        return err_p;
1704}
1705
1706static void __pool_inc(struct pool *pool)
1707{
1708        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1709        pool->ref_count++;
1710}
1711
1712static void __pool_dec(struct pool *pool)
1713{
1714        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
1715        BUG_ON(!pool->ref_count);
1716        if (!--pool->ref_count)
1717                __pool_destroy(pool);
1718}
1719
1720static struct pool *__pool_find(struct mapped_device *pool_md,
1721                                struct block_device *metadata_dev,
1722                                unsigned long block_size, int read_only,
1723                                char **error, int *created)
1724{
1725        struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1726
1727        if (pool) {
1728                if (pool->pool_md != pool_md) {
1729                        *error = "metadata device already in use by a pool";
1730                        return ERR_PTR(-EBUSY);
1731                }
1732                __pool_inc(pool);
1733
1734        } else {
1735                pool = __pool_table_lookup(pool_md);
1736                if (pool) {
1737                        if (pool->md_dev != metadata_dev) {
1738                                *error = "different pool cannot replace a pool";
1739                                return ERR_PTR(-EINVAL);
1740                        }
1741                        __pool_inc(pool);
1742
1743                } else {
1744                        pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
1745                        *created = 1;
1746                }
1747        }
1748
1749        return pool;
1750}
1751
1752/*----------------------------------------------------------------
1753 * Pool target methods
1754 *--------------------------------------------------------------*/
1755static void pool_dtr(struct dm_target *ti)
1756{
1757        struct pool_c *pt = ti->private;
1758
1759        mutex_lock(&dm_thin_pool_table.mutex);
1760
1761        unbind_control_target(pt->pool, ti);
1762        __pool_dec(pt->pool);
1763        dm_put_device(ti, pt->metadata_dev);
1764        dm_put_device(ti, pt->data_dev);
1765        kfree(pt);
1766
1767        mutex_unlock(&dm_thin_pool_table.mutex);
1768}
1769
1770static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1771                               struct dm_target *ti)
1772{
1773        int r;
1774        unsigned argc;
1775        const char *arg_name;
1776
1777        static struct dm_arg _args[] = {
1778                {0, 4, "Invalid number of pool feature arguments"},
1779        };
1780
1781        /*
1782         * No feature arguments supplied.
1783         */
1784        if (!as->argc)
1785                return 0;
1786
1787        r = dm_read_arg_group(_args, as, &argc, &ti->error);
1788        if (r)
1789                return -EINVAL;
1790
1791        while (argc && !r) {
1792                arg_name = dm_shift_arg(as);
1793                argc--;
1794
1795                if (!strcasecmp(arg_name, "skip_block_zeroing"))
1796                        pf->zero_new_blocks = false;
1797
1798                else if (!strcasecmp(arg_name, "ignore_discard"))
1799                        pf->discard_enabled = false;
1800
1801                else if (!strcasecmp(arg_name, "no_discard_passdown"))
1802                        pf->discard_passdown = false;
1803
1804                else if (!strcasecmp(arg_name, "read_only"))
1805                        pf->mode = PM_READ_ONLY;
1806
1807                else {
1808                        ti->error = "Unrecognised pool feature requested";
1809                        r = -EINVAL;
1810                        break;
1811                }
1812        }
1813
1814        return r;
1815}
1816
1817/*
1818 * thin-pool <metadata dev> <data dev>
1819 *           <data block size (sectors)>
1820 *           <low water mark (blocks)>
1821 *           [<#feature args> [<arg>]*]
1822 *
1823 * Optional feature arguments are:
1824 *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1825 *           ignore_discard: disable discard
1826 *           no_discard_passdown: don't pass discards down to the data device
1827 */
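    /*
     * A hypothetical table line matching the format above (device names
     * and sizes are purely illustrative):
     *
     *   dmsetup create pool --table \
     *     "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing"
     *
     * i.e. a 10GiB pool with 64KiB (128 sector) blocks, a low water mark
     * of 32768 blocks and block zeroing disabled.
     */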
1828static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1829{
1830        int r, pool_created = 0;
1831        struct pool_c *pt;
1832        struct pool *pool;
1833        struct pool_features pf;
1834        struct dm_arg_set as;
1835        struct dm_dev *data_dev;
1836        unsigned long block_size;
1837        dm_block_t low_water_blocks;
1838        struct dm_dev *metadata_dev;
1839        sector_t metadata_dev_size;
1840        char b[BDEVNAME_SIZE];
1841
1842        /*
1843         * FIXME Remove validation from scope of lock.
1844         */
1845        mutex_lock(&dm_thin_pool_table.mutex);
1846
1847        if (argc < 4) {
1848                ti->error = "Invalid argument count";
1849                r = -EINVAL;
1850                goto out_unlock;
1851        }
1852        as.argc = argc;
1853        as.argv = argv;
1854
1855        r = dm_get_device(ti, argv[0], FMODE_READ | FMODE_WRITE, &metadata_dev);
1856        if (r) {
1857                ti->error = "Error opening metadata block device";
1858                goto out_unlock;
1859        }
1860
1861        metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1862        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1863                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1864                       bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1865
1866        r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1867        if (r) {
1868                ti->error = "Error getting data device";
1869                goto out_metadata;
1870        }
1871
1872        if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
1873            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
1874            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
1875            block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
1876                ti->error = "Invalid block size";
1877                r = -EINVAL;
1878                goto out;
1879        }
1880
1881        if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
1882                ti->error = "Invalid low water mark";
1883                r = -EINVAL;
1884                goto out;
1885        }
1886
1887        /*
1888         * Set default pool features.
1889         */
1890        pool_features_init(&pf);
1891
1892        dm_consume_args(&as, 4);
1893        r = parse_pool_features(&as, &pf, ti);
1894        if (r)
1895                goto out;
1896
1897        pt = kzalloc(sizeof(*pt), GFP_KERNEL);
1898        if (!pt) {
1899                r = -ENOMEM;
1900                goto out;
1901        }
1902
1903        pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1904                           block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
1905        if (IS_ERR(pool)) {
1906                r = PTR_ERR(pool);
1907                goto out_free_pt;
1908        }
1909
1910        /*
1911         * 'pool_created' reflects whether this is the first table load.
1912         * Top level discard support is not allowed to be changed after the
1913         * initial load.  Allowing it to change would require a pool reload
1914         * to trigger matching changes in the thin devices.
1915         */
1916        if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1917                ti->error = "Discard support cannot be changed after the first table load";
1918                r = -EINVAL;
1919                goto out_flags_changed;
1920        }
1921
1922        pt->pool = pool;
1923        pt->ti = ti;
1924        pt->metadata_dev = metadata_dev;
1925        pt->data_dev = data_dev;
1926        pt->low_water_blocks = low_water_blocks;
1927        pt->adjusted_pf = pt->requested_pf = pf;
1928        ti->num_flush_requests = 1;
1929
1930        /*
1931         * Only need to enable discards if the pool should pass
1932         * them down to the data device.  The thin device's discard
1933         * processing will cause mappings to be removed from the btree.
1934         */
1935        if (pf.discard_enabled && pf.discard_passdown) {
1936                ti->num_discard_requests = 1;
1937
1938                /*
1939                 * Setting 'discards_supported' circumvents the normal
1940                 * stacking of discard limits (this keeps the pool and
1941                 * thin devices' discard limits consistent).
1942                 */
1943                ti->discards_supported = true;
1944                ti->discard_zeroes_data_unsupported = true;
1945        }
1946        ti->private = pt;
1947
1948        pt->callbacks.congested_fn = pool_is_congested;
1949        dm_table_add_target_callbacks(ti->table, &pt->callbacks);
1950
1951        mutex_unlock(&dm_thin_pool_table.mutex);
1952
1953        return 0;
1954
1955out_flags_changed:
1956        __pool_dec(pool);
1957out_free_pt:
1958        kfree(pt);
1959out:
1960        dm_put_device(ti, data_dev);
1961out_metadata:
1962        dm_put_device(ti, metadata_dev);
1963out_unlock:
1964        mutex_unlock(&dm_thin_pool_table.mutex);
1965
1966        return r;
1967}
1968
1969static int pool_map(struct dm_target *ti, struct bio *bio,
1970                    union map_info *map_context)
1971{
1972        int r;
1973        struct pool_c *pt = ti->private;
1974        struct pool *pool = pt->pool;
1975        unsigned long flags;
1976
1977        /*
1978         * As this is a singleton target, ti->begin is always zero.
1979         */
1980        spin_lock_irqsave(&pool->lock, flags);
1981        bio->bi_bdev = pt->data_dev->bdev;
1982        r = DM_MAPIO_REMAPPED;
1983        spin_unlock_irqrestore(&pool->lock, flags);
1984
1985        return r;
1986}
1987
1988/*
1989 * Retrieves the number of blocks of the data device from
1990 * the superblock and compares it to the actual device size,
1991 * resizing the data device if it has grown.
1992 *
1993 * This copes both with opening a preallocated data device in the ctr
1994 * followed by a resume,
1995 * -and-
1996 * with calling the resume method on its own after userspace has
1997 * grown the data device in reaction to a table event.
1998 */
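    /*
     * e.g. after userspace has grown the data device, the new size can be
     * picked up with a table reload (device name and length illustrative):
     *
     *   dmsetup suspend pool
     *   dmsetup reload pool --table "0 <new length> thin-pool ..."
     *   dmsetup resume pool
     */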
1999static int pool_preresume(struct dm_target *ti)
2000{
2001        int r;
2002        struct pool_c *pt = ti->private;
2003        struct pool *pool = pt->pool;
2004        sector_t data_size = ti->len;
2005        dm_block_t sb_data_size;
2006
2007        /*
2008         * Take control of the pool object.
2009         */
2010        r = bind_control_target(pool, ti);
2011        if (r)
2012                return r;
2013
2014        (void) sector_div(data_size, pool->sectors_per_block);
2015
2016        r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
2017        if (r) {
2018                DMERR("failed to retrieve data device size");
2019                return r;
2020        }
2021
2022        if (data_size < sb_data_size) {
2023                DMERR("pool target too small, is %llu blocks (expected %llu)",
2024                      (unsigned long long)data_size, sb_data_size);
2025                return -EINVAL;
2026
2027        } else if (data_size > sb_data_size) {
2028                r = dm_pool_resize_data_dev(pool->pmd, data_size);
2029                if (r) {
2030                        DMERR("failed to resize data device");
2031                        /* FIXME Stricter than necessary: Rollback transaction instead here */
2032                        set_pool_mode(pool, PM_READ_ONLY);
2033                        return r;
2034                }
2035
2036                (void) commit_or_fallback(pool);
2037        }
2038
2039        return 0;
2040}
2041
2042static void pool_resume(struct dm_target *ti)
2043{
2044        struct pool_c *pt = ti->private;
2045        struct pool *pool = pt->pool;
2046        unsigned long flags;
2047
2048        spin_lock_irqsave(&pool->lock, flags);
2049        pool->low_water_triggered = 0;
2050        pool->no_free_space = 0;
2051        __requeue_bios(pool);
2052        spin_unlock_irqrestore(&pool->lock, flags);
2053
2054        do_waker(&pool->waker.work);
2055}
2056
2057static void pool_postsuspend(struct dm_target *ti)
2058{
2059        struct pool_c *pt = ti->private;
2060        struct pool *pool = pt->pool;
2061
2062        cancel_delayed_work(&pool->waker);
2063        flush_workqueue(pool->wq);
2064        (void) commit_or_fallback(pool);
2065}
2066
2067static int check_arg_count(unsigned argc, unsigned args_required)
2068{
2069        if (argc != args_required) {
2070                DMWARN("Message received with %u arguments instead of %u.",
2071                       argc, args_required);
2072                return -EINVAL;
2073        }
2074
2075        return 0;
2076}
2077
2078static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
2079{
2080        if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
2081            *dev_id <= MAX_DEV_ID)
2082                return 0;
2083
2084        if (warning)
2085                DMWARN("Message received with invalid device id: %s", arg);
2086
2087        return -EINVAL;
2088}
2089
2090static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
2091{
2092        dm_thin_id dev_id;
2093        int r;
2094
2095        r = check_arg_count(argc, 2);
2096        if (r)
2097                return r;
2098
2099        r = read_dev_id(argv[1], &dev_id, 1);
2100        if (r)
2101                return r;
2102
2103        r = dm_pool_create_thin(pool->pmd, dev_id);
2104        if (r) {
2105                DMWARN("Creation of new thinly-provisioned device with id %s failed.",
2106                       argv[1]);
2107                return r;
2108        }
2109
2110        return 0;
2111}
2112
2113static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2114{
2115        dm_thin_id dev_id;
2116        dm_thin_id origin_dev_id;
2117        int r;
2118
2119        r = check_arg_count(argc, 3);
2120        if (r)
2121                return r;
2122
2123        r = read_dev_id(argv[1], &dev_id, 1);
2124        if (r)
2125                return r;
2126
2127        r = read_dev_id(argv[2], &origin_dev_id, 1);
2128        if (r)
2129                return r;
2130
2131        r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
2132        if (r) {
2133                DMWARN("Creation of new snapshot %s of device %s failed.",
2134                       argv[1], argv[2]);
2135                return r;
2136        }
2137
2138        return 0;
2139}
2140
2141static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
2142{
2143        dm_thin_id dev_id;
2144        int r;
2145
2146        r = check_arg_count(argc, 2);
2147        if (r)
2148                return r;
2149
2150        r = read_dev_id(argv[1], &dev_id, 1);
2151        if (r)
2152                return r;
2153
2154        r = dm_pool_delete_thin_device(pool->pmd, dev_id);
2155        if (r)
2156                DMWARN("Deletion of thin device %s failed.", argv[1]);
2157
2158        return r;
2159}
2160
2161static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
2162{
2163        dm_thin_id old_id, new_id;
2164        int r;
2165
2166        r = check_arg_count(argc, 3);
2167        if (r)
2168                return r;
2169
2170        if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
2171                DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
2172                return -EINVAL;
2173        }
2174
2175        if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
2176                DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
2177                return -EINVAL;
2178        }
2179
2180        r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
2181        if (r) {
2182                DMWARN("Failed to change transaction id from %s to %s.",
2183                       argv[1], argv[2]);
2184                return r;
2185        }
2186
2187        return 0;
2188}
2189
2190static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2191{
2192        int r;
2193
2194        r = check_arg_count(argc, 1);
2195        if (r)
2196                return r;
2197
2198        (void) commit_or_fallback(pool);
2199
2200        r = dm_pool_reserve_metadata_snap(pool->pmd);
2201        if (r)
2202                DMWARN("reserve_metadata_snap message failed.");
2203
2204        return r;
2205}
2206
2207static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
2208{
2209        int r;
2210
2211        r = check_arg_count(argc, 1);
2212        if (r)
2213                return r;
2214
2215        r = dm_pool_release_metadata_snap(pool->pmd);
2216        if (r)
2217                DMWARN("release_metadata_snap message failed.");
2218
2219        return r;
2220}
2221
2222/*
2223 * Messages supported:
2224 *   create_thin        <dev_id>
2225 *   create_snap        <dev_id> <origin_id>
2226 *   delete             <dev_id>
2228 *   set_transaction_id <current_trans_id> <new_trans_id>
2229 *   reserve_metadata_snap
2230 *   release_metadata_snap
2231 */
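    /*
     * Messages are sent with "dmsetup message", e.g. (pool device name
     * is illustrative):
     *
     *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
     *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
     *   dmsetup message /dev/mapper/pool 0 "delete 1"
     */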
2232static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2233{
2234        int r = -EINVAL;
2235        struct pool_c *pt = ti->private;
2236        struct pool *pool = pt->pool;
2237
2238        if (!strcasecmp(argv[0], "create_thin"))
2239                r = process_create_thin_mesg(argc, argv, pool);
2240
2241        else if (!strcasecmp(argv[0], "create_snap"))
2242                r = process_create_snap_mesg(argc, argv, pool);
2243
2244        else if (!strcasecmp(argv[0], "delete"))
2245                r = process_delete_mesg(argc, argv, pool);
2246
2247        else if (!strcasecmp(argv[0], "set_transaction_id"))
2248                r = process_set_transaction_id_mesg(argc, argv, pool);
2249
2250        else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
2251                r = process_reserve_metadata_snap_mesg(argc, argv, pool);
2252
2253        else if (!strcasecmp(argv[0], "release_metadata_snap"))
2254                r = process_release_metadata_snap_mesg(argc, argv, pool);
2255
2256        else
2257                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
2258
2259        if (!r)
2260                (void) commit_or_fallback(pool);
2261
2262        return r;
2263}
2264
2265static void emit_flags(struct pool_features *pf, char *result,
2266                       unsigned sz, unsigned maxlen)
2267{
2268        unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
2269                !pf->discard_passdown + (pf->mode == PM_READ_ONLY);
2270        DMEMIT("%u ", count);
2271
2272        if (!pf->zero_new_blocks)
2273                DMEMIT("skip_block_zeroing ");
2274
2275        if (!pf->discard_enabled)
2276                DMEMIT("ignore_discard ");
2277
2278        if (!pf->discard_passdown)
2279                DMEMIT("no_discard_passdown ");
2280
2281        if (pf->mode == PM_READ_ONLY)
2282                DMEMIT("read_only ");
2283}
2284
2285/*
2286 * Status line is:
2287 *    <transaction id> <used metadata blocks>/<total metadata blocks>
2288 *    <used data blocks>/<total data blocks> <held metadata root>
2289 */
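    /*
     * e.g. "1 286/524288 13107/2097152 - rw discard_passdown" means
     * transaction id 1, 286 of 524288 metadata blocks used, 13107 of
     * 2097152 data blocks used, no held metadata root, a read-write pool
     * and discard passdown enabled (numbers are made up).
     */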
2290static int pool_status(struct dm_target *ti, status_type_t type,
2291                       unsigned status_flags, char *result, unsigned maxlen)
2292{
2293        int r;
2294        unsigned sz = 0;
2295        uint64_t transaction_id;
2296        dm_block_t nr_free_blocks_data;
2297        dm_block_t nr_free_blocks_metadata;
2298        dm_block_t nr_blocks_data;
2299        dm_block_t nr_blocks_metadata;
2300        dm_block_t held_root;
2301        char buf[BDEVNAME_SIZE];
2302        char buf2[BDEVNAME_SIZE];
2303        struct pool_c *pt = ti->private;
2304        struct pool *pool = pt->pool;
2305
2306        switch (type) {
2307        case STATUSTYPE_INFO:
2308                if (get_pool_mode(pool) == PM_FAIL) {
2309                        DMEMIT("Fail");
2310                        break;
2311                }
2312
2313                /* Commit to ensure statistics aren't out-of-date */
2314                if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
2315                        (void) commit_or_fallback(pool);
2316
2317                r = dm_pool_get_metadata_transaction_id(pool->pmd,
2318                                                        &transaction_id);
2319                if (r)
2320                        return r;
2321
2322                r = dm_pool_get_free_metadata_block_count(pool->pmd,
2323                                                          &nr_free_blocks_metadata);
2324                if (r)
2325                        return r;
2326
2327                r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
2328                if (r)
2329                        return r;
2330
2331                r = dm_pool_get_free_block_count(pool->pmd,
2332                                                 &nr_free_blocks_data);
2333                if (r)
2334                        return r;
2335
2336                r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
2337                if (r)
2338                        return r;
2339
2340                r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
2341                if (r)
2342                        return r;
2343
2344                DMEMIT("%llu %llu/%llu %llu/%llu ",
2345                       (unsigned long long)transaction_id,
2346                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
2347                       (unsigned long long)nr_blocks_metadata,
2348                       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
2349                       (unsigned long long)nr_blocks_data);
2350
2351                if (held_root)
2352                        DMEMIT("%llu ", held_root);
2353                else
2354                        DMEMIT("- ");
2355
2356                if (pool->pf.mode == PM_READ_ONLY)
2357                        DMEMIT("ro ");
2358                else
2359                        DMEMIT("rw ");
2360
2361                if (pool->pf.discard_enabled && pool->pf.discard_passdown)
2362                        DMEMIT("discard_passdown");
2363                else
2364                        DMEMIT("no_discard_passdown");
2365
2366                break;
2367
2368        case STATUSTYPE_TABLE:
2369                DMEMIT("%s %s %lu %llu ",
2370                       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
2371                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
2372                       (unsigned long)pool->sectors_per_block,
2373                       (unsigned long long)pt->low_water_blocks);
2374                emit_flags(&pt->requested_pf, result, sz, maxlen);
2375                break;
2376        }
2377
2378        return 0;
2379}
2380
2381static int pool_iterate_devices(struct dm_target *ti,
2382                                iterate_devices_callout_fn fn, void *data)
2383{
2384        struct pool_c *pt = ti->private;
2385
2386        return fn(ti, pt->data_dev, 0, ti->len, data);
2387}
2388
2389static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2390                      struct bio_vec *biovec, int max_size)
2391{
2392        struct pool_c *pt = ti->private;
2393        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2394
2395        if (!q->merge_bvec_fn)
2396                return max_size;
2397
2398        bvm->bi_bdev = pt->data_dev->bdev;
2399
2400        return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2401}
2402
2403static bool block_size_is_power_of_two(struct pool *pool)
2404{
2405        return pool->sectors_per_block_shift >= 0;
2406}
2407
2408static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
2409{
2410        struct pool *pool = pt->pool;
2411        struct queue_limits *data_limits;
2412
2413        limits->max_discard_sectors = pool->sectors_per_block;
2414
2415        /*
2416         * discard_granularity is just a hint, and not enforced.
2417         */
2418        if (pt->adjusted_pf.discard_passdown) {
2419                data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
2420                limits->discard_granularity = data_limits->discard_granularity;
2421        } else if (block_size_is_power_of_two(pool))
2422                limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2423        else
2424                /*
2425                 * Use largest power of 2 that is a factor of sectors_per_block
2426                 * but at least DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
2427                 */
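                    /*
                     * e.g. a 384 sector (192KiB) block gives a granularity
                     * of 128 sectors (64KiB): ffs(384) - 1 == 7, and
                     * 1 << 7 == 128, which is DATA_DEV_BLOCK_SIZE_MIN_SECTORS.
                     */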
2428                limits->discard_granularity = max(1 << (ffs(pool->sectors_per_block) - 1),
2429                                                  DATA_DEV_BLOCK_SIZE_MIN_SECTORS) << SECTOR_SHIFT;
2430}
2431
2432static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2433{
2434        struct pool_c *pt = ti->private;
2435        struct pool *pool = pt->pool;
2436
2437        blk_limits_io_min(limits, 0);
2438        blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2439
2440        /*
2441         * pt->adjusted_pf is a staging area for the actual features to use.
2442         * They get transferred to the live pool in bind_control_target()
2443         * called from pool_preresume().
2444         */
2445        if (!pt->adjusted_pf.discard_enabled)
2446                return;
2447
2448        disable_passdown_if_not_supported(pt);
2449
2450        set_discard_limits(pt, limits);
2451}
2452
2453static struct target_type pool_target = {
2454        .name = "thin-pool",
2455        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2456                    DM_TARGET_IMMUTABLE,
2457        .version = {1, 5, 0},
2458        .module = THIS_MODULE,
2459        .ctr = pool_ctr,
2460        .dtr = pool_dtr,
2461        .map = pool_map,
2462        .postsuspend = pool_postsuspend,
2463        .preresume = pool_preresume,
2464        .resume = pool_resume,
2465        .message = pool_message,
2466        .status = pool_status,
2467        .merge = pool_merge,
2468        .iterate_devices = pool_iterate_devices,
2469        .io_hints = pool_io_hints,
2470};
2471
2472/*----------------------------------------------------------------
2473 * Thin target methods
2474 *--------------------------------------------------------------*/
2475static void thin_dtr(struct dm_target *ti)
2476{
2477        struct thin_c *tc = ti->private;
2478
2479        mutex_lock(&dm_thin_pool_table.mutex);
2480
2481        __pool_dec(tc->pool);
2482        dm_pool_close_thin_device(tc->td);
2483        dm_put_device(ti, tc->pool_dev);
2484        if (tc->origin_dev)
2485                dm_put_device(ti, tc->origin_dev);
2486        kfree(tc);
2487
2488        mutex_unlock(&dm_thin_pool_table.mutex);
2489}
2490
2491/*
2492 * Thin target parameters:
2493 *
2494 * <pool_dev> <dev_id> [origin_dev]
2495 *
2496 * pool_dev: the path to the pool (e.g. /dev/mapper/my_pool)
2497 * dev_id: the internal device identifier
2498 * origin_dev: a device external to the pool that should act as the origin
2499 *
2500 * If the pool device has discards disabled, they get disabled for the thin
2501 * device as well.
2502 */
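    /*
     * e.g. (illustrative names and sizes):
     *
     *   dmsetup create thin --table "0 2097152 thin /dev/mapper/pool 0"
     *
     * activates thin device 0 (previously created with a create_thin
     * message to the pool) as a 1GiB (2097152 sector) device.
     */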
2503static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2504{
2505        int r;
2506        struct thin_c *tc;
2507        struct dm_dev *pool_dev, *origin_dev;
2508        struct mapped_device *pool_md;
2509
2510        mutex_lock(&dm_thin_pool_table.mutex);
2511
2512        if (argc != 2 && argc != 3) {
2513                ti->error = "Invalid argument count";
2514                r = -EINVAL;
2515                goto out_unlock;
2516        }
2517
2518        tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
2519        if (!tc) {
2520                ti->error = "Out of memory";
2521                r = -ENOMEM;
2522                goto out_unlock;
2523        }
2524
2525        if (argc == 3) {
2526                r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2527                if (r) {
2528                        ti->error = "Error opening origin device";
2529                        goto bad_origin_dev;
2530                }
2531                tc->origin_dev = origin_dev;
2532        }
2533
2534        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2535        if (r) {
2536                ti->error = "Error opening pool device";
2537                goto bad_pool_dev;
2538        }
2539        tc->pool_dev = pool_dev;
2540
2541        if (read_dev_id(argv[1], &tc->dev_id, 0)) {
2542                ti->error = "Invalid device id";
2543                r = -EINVAL;
2544                goto bad_common;
2545        }
2546
2547        pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
2548        if (!pool_md) {
2549                ti->error = "Couldn't get pool mapped device";
2550                r = -EINVAL;
2551                goto bad_common;
2552        }
2553
2554        tc->pool = __pool_table_lookup(pool_md);
2555        if (!tc->pool) {
2556                ti->error = "Couldn't find pool object";
2557                r = -EINVAL;
2558                goto bad_pool_lookup;
2559        }
2560        __pool_inc(tc->pool);
2561
2562        if (get_pool_mode(tc->pool) == PM_FAIL) {
2563                ti->error = "Couldn't open thin device, pool is in fail mode";
                    r = -EINVAL;
2564                goto bad_thin_open;
2565        }
2566
2567        r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
2568        if (r) {
2569                ti->error = "Couldn't open thin internal device";
2570                goto bad_thin_open;
2571        }
2572
2573        r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
2574        if (r)
2575                goto bad_thin_open;
2576
2577        ti->num_flush_requests = 1;
2578        ti->flush_supported = true;
2579
2580        /* If the pool supports discards, pass them on. */
2581        if (tc->pool->pf.discard_enabled) {
2582                ti->discards_supported = true;
2583                ti->num_discard_requests = 1;
2584                ti->discard_zeroes_data_unsupported = true;
2585                /* Discard requests must be split on a block boundary */
2586                ti->split_discard_requests = true;
2587        }
2588
2589        dm_put(pool_md);
2590
2591        mutex_unlock(&dm_thin_pool_table.mutex);
2592
2593        return 0;
2594
2595bad_thin_open:
2596        __pool_dec(tc->pool);
2597bad_pool_lookup:
2598        dm_put(pool_md);
2599bad_common:
2600        dm_put_device(ti, tc->pool_dev);
2601bad_pool_dev:
2602        if (tc->origin_dev)
2603                dm_put_device(ti, tc->origin_dev);
2604bad_origin_dev:
2605        kfree(tc);
2606out_unlock:
2607        mutex_unlock(&dm_thin_pool_table.mutex);
2608
2609        return r;
2610}
2611
2612static int thin_map(struct dm_target *ti, struct bio *bio,
2613                    union map_info *map_context)
2614{
2615        bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2616
2617        return thin_bio_map(ti, bio, map_context);
2618}
2619
2620static int thin_endio(struct dm_target *ti,
2621                      struct bio *bio, int err,
2622                      union map_info *map_context)
2623{
2624        unsigned long flags;
2625        struct dm_thin_endio_hook *h = map_context->ptr;
2626        struct list_head work;
2627        struct dm_thin_new_mapping *m, *tmp;
2628        struct pool *pool = h->tc->pool;
2629
2630        if (h->shared_read_entry) {
2631                INIT_LIST_HEAD(&work);
2632                dm_deferred_entry_dec(h->shared_read_entry, &work);
2633
2634                spin_lock_irqsave(&pool->lock, flags);
2635                list_for_each_entry_safe(m, tmp, &work, list) {
2636                        list_del(&m->list);
2637                        m->quiesced = 1;
2638                        __maybe_add_mapping(m);
2639                }
2640                spin_unlock_irqrestore(&pool->lock, flags);
2641        }
2642
2643        if (h->all_io_entry) {
2644                INIT_LIST_HEAD(&work);
2645                dm_deferred_entry_dec(h->all_io_entry, &work);
2646                spin_lock_irqsave(&pool->lock, flags);
2647                list_for_each_entry_safe(m, tmp, &work, list)
2648                        list_add(&m->list, &pool->prepared_discards);
2649                spin_unlock_irqrestore(&pool->lock, flags);
2650        }
2651
2652        mempool_free(h, pool->endio_hook_pool);
2653
2654        return 0;
2655}
2656
2657static void thin_postsuspend(struct dm_target *ti)
2658{
2659        if (dm_noflush_suspending(ti))
2660                requeue_io((struct thin_c *)ti->private);
2661}
2662
2663/*
2664 * <nr mapped sectors> <highest mapped sector>
2665 */
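    /*
     * e.g. "1048576 2097151" for a thin device with 512MiB (1048576
     * sectors) mapped whose highest mapped block ends at sector 2097151
     * (made-up numbers, assuming 128 sector blocks).
     */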
2666static int thin_status(struct dm_target *ti, status_type_t type,
2667                       unsigned status_flags, char *result, unsigned maxlen)
2668{
2669        int r;
2670        ssize_t sz = 0;
2671        dm_block_t mapped, highest;
2672        char buf[BDEVNAME_SIZE];
2673        struct thin_c *tc = ti->private;
2674
2675        if (get_pool_mode(tc->pool) == PM_FAIL) {
2676                DMEMIT("Fail");
2677                return 0;
2678        }
2679
2680        if (!tc->td)
2681                DMEMIT("-");
2682        else {
2683                switch (type) {
2684                case STATUSTYPE_INFO:
2685                        r = dm_thin_get_mapped_count(tc->td, &mapped);
2686                        if (r)
2687                                return r;
2688
2689                        r = dm_thin_get_highest_mapped_block(tc->td, &highest);
2690                        if (r < 0)
2691                                return r;
2692
2693                        DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
2694                        if (r)
2695                                DMEMIT("%llu", ((highest + 1) *
2696                                                tc->pool->sectors_per_block) - 1);
2697                        else
2698                                DMEMIT("-");
2699                        break;
2700
2701                case STATUSTYPE_TABLE:
2702                        DMEMIT("%s %lu",
2703                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2704                               (unsigned long) tc->dev_id);
2705                        if (tc->origin_dev)
2706                                DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2707                        break;
2708                }
2709        }
2710
2711        return 0;
2712}
2713
2714static int thin_iterate_devices(struct dm_target *ti,
2715                                iterate_devices_callout_fn fn, void *data)
2716{
2717        sector_t blocks;
2718        struct thin_c *tc = ti->private;
2719        struct pool *pool = tc->pool;
2720
2721        /*
2722         * We can't call dm_pool_get_data_dev_size() since that blocks.  So
2723         * we follow a more convoluted path through to the pool's target.
2724         */
2725        if (!pool->ti)
2726                return 0;       /* nothing is bound */
2727
2728        blocks = pool->ti->len;
2729        (void) sector_div(blocks, pool->sectors_per_block);
2730        if (blocks)
2731                return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
2732
2733        return 0;
2734}
2735
2736/*
2737 * A thin device always inherits its queue limits from its pool.
2738 */
2739static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2740{
2741        struct thin_c *tc = ti->private;
2742
2743        *limits = bdev_get_queue(tc->pool_dev->bdev)->limits;
2744}
2745
2746static struct target_type thin_target = {
2747        .name = "thin",
2748        .version = {1, 5, 0},
2749        .module = THIS_MODULE,
2750        .ctr = thin_ctr,
2751        .dtr = thin_dtr,
2752        .map = thin_map,
2753        .end_io = thin_endio,
2754        .postsuspend = thin_postsuspend,
2755        .status = thin_status,
2756        .iterate_devices = thin_iterate_devices,
2757        .io_hints = thin_io_hints,
2758};
2759
2760/*----------------------------------------------------------------*/
2761
2762static int __init dm_thin_init(void)
2763{
2764        int r;
2765
2766        pool_table_init();
2767
2768        r = dm_register_target(&thin_target);
2769        if (r)
2770                return r;
2771
2772        r = dm_register_target(&pool_target);
2773        if (r)
2774                goto bad_pool_target;
2775
2776        r = -ENOMEM;
2777
2778        _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
2779        if (!_new_mapping_cache)
2780                goto bad_new_mapping_cache;
2781
2782        _endio_hook_cache = KMEM_CACHE(dm_thin_endio_hook, 0);
2783        if (!_endio_hook_cache)
2784                goto bad_endio_hook_cache;
2785
2786        return 0;
2787
2788bad_endio_hook_cache:
2789        kmem_cache_destroy(_new_mapping_cache);
2790bad_new_mapping_cache:
2791        dm_unregister_target(&pool_target);
2792bad_pool_target:
2793        dm_unregister_target(&thin_target);
2794
2795        return r;
2796}
2797
2798static void dm_thin_exit(void)
2799{
2800        dm_unregister_target(&thin_target);
2801        dm_unregister_target(&pool_target);
2802
2803        kmem_cache_destroy(_new_mapping_cache);
2804        kmem_cache_destroy(_endio_hook_cache);
2805}
2806
2807module_init(dm_thin_init);
2808module_exit(dm_thin_exit);
2809
2810MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
2811MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2812MODULE_LICENSE("GPL");
2813