linux/drivers/md/dm-thin.c
   1/*
   2 * Copyright (C) 2011-2012 Red Hat UK.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "dm-bio-prison-v1.h"
   9#include "dm.h"
  10
  11#include <linux/device-mapper.h>
  12#include <linux/dm-io.h>
  13#include <linux/dm-kcopyd.h>
  14#include <linux/jiffies.h>
  15#include <linux/log2.h>
  16#include <linux/list.h>
  17#include <linux/rculist.h>
  18#include <linux/init.h>
  19#include <linux/module.h>
  20#include <linux/slab.h>
  21#include <linux/vmalloc.h>
  22#include <linux/sort.h>
  23#include <linux/rbtree.h>
  24
  25#define DM_MSG_PREFIX   "thin"
  26
  27/*
  28 * Tunable constants
  29 */
  30#define ENDIO_HOOK_POOL_SIZE 1024
  31#define MAPPING_POOL_SIZE 1024
  32#define COMMIT_PERIOD HZ
  33#define NO_SPACE_TIMEOUT_SECS 60
  34
  35static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;
  36
  37DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
  38                "A percentage of time allocated for copy on write");
  39
  40/*
  41 * The block size of the device holding pool data must be
  42 * between 64KB and 1GB.
  43 */
  44#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
  45#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
  46
  47/*
  48 * Device id is restricted to 24 bits.
  49 */
  50#define MAX_DEV_ID ((1 << 24) - 1)
  51
  52/*
  53 * How do we handle breaking sharing of data blocks?
  54 * =================================================
  55 *
  56 * We use a standard copy-on-write btree to store the mappings for the
  57 * devices (note I'm talking about copy-on-write of the metadata here, not
  58 * the data).  When you take an internal snapshot you clone the root node
  59 * of the origin btree.  After this there is no concept of an origin or a
  60 * snapshot.  They are just two device trees that happen to point to the
  61 * same data blocks.
  62 *
  63 * When we get a write in we decide if it's to a shared data block using
  64 * some timestamp magic.  If it is, we have to break sharing.
  65 *
  66 * Let's say we write to a shared block in what was the origin.  The
  67 * steps are:
  68 *
  69 * i) plug further io to this physical block. (see bio_prison code).
  70 *
  71 * ii) quiesce any read io to that shared data block.  Obviously
  72 * including all devices that share this block.  (see dm_deferred_set code)
  73 *
  74 * iii) copy the data block to a newly allocated block.  This step can be
  75 * skipped if the io covers the whole block. (schedule_copy).
  76 *
  77 * iv) insert the new mapping into the origin's btree
  78 * (process_prepared_mapping).  This act of inserting breaks some
  79 * sharing of btree nodes between the two devices.  Breaking sharing only
  80 * affects the btree of that specific device.  Btrees for the other
  81 * devices that share the block never change.  The btree for the origin
  82 * device as it was after the last commit is untouched, ie. we're using
  83 * persistent data structures in the functional programming sense.
  84 *
  85 * v) unplug io to this physical block, including the io that triggered
  86 * the breaking of sharing.
  87 *
  88 * Steps (ii) and (iii) occur in parallel.
  89 *
  90 * The metadata _doesn't_ need to be committed before the io continues.  We
  91 * get away with this because the io is always written to a _new_ block.
  92 * If there's a crash, then:
  93 *
  94 * - The origin mapping will point to the old origin block (the shared
  95 * one).  This will contain the data as it was before the io that triggered
  96 * the breaking of sharing came in.
  97 *
  98 * - The snap mapping still points to the old block, as it would after
  99 * the commit.
 100 *
 101 * The downside of this scheme is that the timestamp magic isn't perfect, and
 102 * will continue to think that the data block in the snapshot device is shared
 103 * even after the write to the origin has broken sharing.  I suspect data
 104 * blocks will typically be shared by many different devices, so we're
 105 * breaking sharing n + 1 times, rather than n, where n is the number of
 106 * devices that reference this data block.  At the moment I think the
 107 * benefits far, far outweigh the disadvantages.
 108 */
 109
 110/*----------------------------------------------------------------*/
 111
 112/*
 113 * Key building.
 114 */
 115enum lock_space {
 116        VIRTUAL,
 117        PHYSICAL
 118};
 119
 120static void build_key(struct dm_thin_device *td, enum lock_space ls,
 121                      dm_block_t b, dm_block_t e, struct dm_cell_key *key)
 122{
 123        key->virtual = (ls == VIRTUAL);
 124        key->dev = dm_thin_dev_id(td);
 125        key->block_begin = b;
 126        key->block_end = e;
 127}
 128
 129static void build_data_key(struct dm_thin_device *td, dm_block_t b,
 130                           struct dm_cell_key *key)
 131{
 132        build_key(td, PHYSICAL, b, b + 1llu, key);
 133}
 134
 135static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
 136                              struct dm_cell_key *key)
 137{
 138        build_key(td, VIRTUAL, b, b + 1llu, key);
 139}
 140
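/*
 * Illustrative example (editor's addition): the helpers above lock the same
 * block number in two different spaces.  For a thin device td and block 7,
 *
 *	struct dm_cell_key vkey, dkey;
 *
 *	build_virtual_key(td, 7, &vkey);
 *	build_data_key(td, 7, &dkey);
 *
 * builds single-block keys covering [7, 8): vkey locks block 7 in td's
 * virtual (logical) space, dkey locks data block 7 in the pool's physical
 * space.  bio_detain() below uses such keys to serialise access to a block
 * while it is being provisioned, copied or discarded.
 */
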
 141/*----------------------------------------------------------------*/
 142
 143#define THROTTLE_THRESHOLD (1 * HZ)
 144
 145struct throttle {
 146        struct rw_semaphore lock;
 147        unsigned long threshold;
 148        bool throttle_applied;
 149};
 150
 151static void throttle_init(struct throttle *t)
 152{
 153        init_rwsem(&t->lock);
 154        t->throttle_applied = false;
 155}
 156
 157static void throttle_work_start(struct throttle *t)
 158{
 159        t->threshold = jiffies + THROTTLE_THRESHOLD;
 160}
 161
 162static void throttle_work_update(struct throttle *t)
 163{
 164        if (!t->throttle_applied && jiffies > t->threshold) {
 165                down_write(&t->lock);
 166                t->throttle_applied = true;
 167        }
 168}
 169
 170static void throttle_work_complete(struct throttle *t)
 171{
 172        if (t->throttle_applied) {
 173                t->throttle_applied = false;
 174                up_write(&t->lock);
 175        }
 176}
 177
 178static void throttle_lock(struct throttle *t)
 179{
 180        down_read(&t->lock);
 181}
 182
 183static void throttle_unlock(struct throttle *t)
 184{
 185        up_read(&t->lock);
 186}
 187
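/*
 * Usage sketch (editor's addition): the throttle pairs a writer side used by
 * the worker with a reader side used by IO submitters.  A worker pass is
 * expected to look roughly like
 *
 *	throttle_work_start(&pool->throttle);
 *	... first batch of work ...
 *	throttle_work_update(&pool->throttle);
 *	... more work ...
 *	throttle_work_complete(&pool->throttle);
 *
 * while submitters bracket the queueing of new work with throttle_lock() and
 * throttle_unlock().  Once a pass has run for longer than THROTTLE_THRESHOLD,
 * throttle_work_update() takes the write lock and so holds off submitters
 * until the pass completes.
 */
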
 188/*----------------------------------------------------------------*/
 189
 190/*
 191 * A pool device ties together a metadata device and a data device.  It
 192 * also provides the interface for creating and destroying internal
 193 * devices.
 194 */
 195struct dm_thin_new_mapping;
 196
 197/*
 198 * The pool runs in various modes, ordered from least to most degraded so the modes can be compared.
 199 */
 200enum pool_mode {
 201        PM_WRITE,               /* metadata may be changed */
 202        PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
 203
 204        /*
 205         * Like READ_ONLY, except it may switch back to WRITE on metadata resize. Reported as READ_ONLY.
 206         */
 207        PM_OUT_OF_METADATA_SPACE,
 208        PM_READ_ONLY,           /* metadata may not be changed */
 209
 210        PM_FAIL,                /* all I/O fails */
 211};
 212
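/*
 * Example (editor's addition): because the modes above are ordered from
 * least to most degraded, "at least this degraded" checks reduce to a simple
 * comparison, e.g.
 *
 *	if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
 *		return -EINVAL;
 *
 * as commit() does below, rather than testing each mode individually.
 */
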
 213struct pool_features {
 214        enum pool_mode mode;
 215
 216        bool zero_new_blocks:1;
 217        bool discard_enabled:1;
 218        bool discard_passdown:1;
 219        bool error_if_no_space:1;
 220};
 221
 222struct thin_c;
 223typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
 224typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
 225typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);
 226
 227#define CELL_SORT_ARRAY_SIZE 8192
 228
 229struct pool {
 230        struct list_head list;
 231        struct dm_target *ti;   /* Only set if a pool target is bound */
 232
 233        struct mapped_device *pool_md;
 234        struct block_device *data_dev;
 235        struct block_device *md_dev;
 236        struct dm_pool_metadata *pmd;
 237
 238        dm_block_t low_water_blocks;
 239        uint32_t sectors_per_block;
 240        int sectors_per_block_shift;
 241
 242        struct pool_features pf;
 243        bool low_water_triggered:1;     /* A dm event has been sent */
 244        bool suspended:1;
 245        bool out_of_data_space:1;
 246
 247        struct dm_bio_prison *prison;
 248        struct dm_kcopyd_client *copier;
 249
 250        struct work_struct worker;
 251        struct workqueue_struct *wq;
 252        struct throttle throttle;
 253        struct delayed_work waker;
 254        struct delayed_work no_space_timeout;
 255
 256        unsigned long last_commit_jiffies;
 257        unsigned ref_count;
 258
 259        spinlock_t lock;
 260        struct bio_list deferred_flush_bios;
 261        struct bio_list deferred_flush_completions;
 262        struct list_head prepared_mappings;
 263        struct list_head prepared_discards;
 264        struct list_head prepared_discards_pt2;
 265        struct list_head active_thins;
 266
 267        struct dm_deferred_set *shared_read_ds;
 268        struct dm_deferred_set *all_io_ds;
 269
 270        struct dm_thin_new_mapping *next_mapping;
 271
 272        process_bio_fn process_bio;
 273        process_bio_fn process_discard;
 274
 275        process_cell_fn process_cell;
 276        process_cell_fn process_discard_cell;
 277
 278        process_mapping_fn process_prepared_mapping;
 279        process_mapping_fn process_prepared_discard;
 280        process_mapping_fn process_prepared_discard_pt2;
 281
 282        struct dm_bio_prison_cell **cell_sort_array;
 283
 284        mempool_t mapping_pool;
 285
 286        struct bio flush_bio;
 287};
 288
 289static void metadata_operation_failed(struct pool *pool, const char *op, int r);
 290
 291static enum pool_mode get_pool_mode(struct pool *pool)
 292{
 293        return pool->pf.mode;
 294}
 295
 296static void notify_of_pool_mode_change(struct pool *pool)
 297{
 298        const char *descs[] = {
 299                "write",
 300                "out-of-data-space",
 301                "read-only",
 302                "read-only",
 303                "fail"
 304        };
 305        const char *extra_desc = NULL;
 306        enum pool_mode mode = get_pool_mode(pool);
 307
 308        if (mode == PM_OUT_OF_DATA_SPACE) {
 309                if (!pool->pf.error_if_no_space)
 310                        extra_desc = " (queue IO)";
 311                else
 312                        extra_desc = " (error IO)";
 313        }
 314
 315        dm_table_event(pool->ti->table);
 316        DMINFO("%s: switching pool to %s%s mode",
 317               dm_device_name(pool->pool_md),
 318               descs[(int)mode], extra_desc ? : "");
 319}
 320
 321/*
 322 * Target context for a pool.
 323 */
 324struct pool_c {
 325        struct dm_target *ti;
 326        struct pool *pool;
 327        struct dm_dev *data_dev;
 328        struct dm_dev *metadata_dev;
 329
 330        dm_block_t low_water_blocks;
 331        struct pool_features requested_pf; /* Features requested during table load */
 332        struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
 333};
 334
 335/*
 336 * Target context for a thin.
 337 */
 338struct thin_c {
 339        struct list_head list;
 340        struct dm_dev *pool_dev;
 341        struct dm_dev *origin_dev;
 342        sector_t origin_size;
 343        dm_thin_id dev_id;
 344
 345        struct pool *pool;
 346        struct dm_thin_device *td;
 347        struct mapped_device *thin_md;
 348
 349        bool requeue_mode:1;
 350        spinlock_t lock;
 351        struct list_head deferred_cells;
 352        struct bio_list deferred_bio_list;
 353        struct bio_list retry_on_resume_list;
 354        struct rb_root sort_bio_list; /* sorted list of deferred bios */
 355
 356        /*
 357         * Ensures the thin is not destroyed until the worker has finished
 358         * iterating the active_thins list.
 359         */
 360        refcount_t refcount;
 361        struct completion can_destroy;
 362};
 363
 364/*----------------------------------------------------------------*/
 365
 366static bool block_size_is_power_of_two(struct pool *pool)
 367{
 368        return pool->sectors_per_block_shift >= 0;
 369}
 370
 371static sector_t block_to_sectors(struct pool *pool, dm_block_t b)
 372{
 373        return block_size_is_power_of_two(pool) ?
 374                (b << pool->sectors_per_block_shift) :
 375                (b * pool->sectors_per_block);
 376}
 377
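/*
 * Worked example (editor's addition): with a 64KiB block size
 * (sectors_per_block = 128, sectors_per_block_shift = 7), block 10 starts at
 * sector 10 << 7 = 1280.  For a non-power-of-two block size such as 192KiB
 * (384 sectors) the shift is stored as a negative value (hence the >= 0 test
 * above) and the multiply path is used instead: 10 * 384 = 3840.
 */
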
 378/*----------------------------------------------------------------*/
 379
 380struct discard_op {
 381        struct thin_c *tc;
 382        struct blk_plug plug;
 383        struct bio *parent_bio;
 384        struct bio *bio;
 385};
 386
 387static void begin_discard(struct discard_op *op, struct thin_c *tc, struct bio *parent)
 388{
 389        BUG_ON(!parent);
 390
 391        op->tc = tc;
 392        blk_start_plug(&op->plug);
 393        op->parent_bio = parent;
 394        op->bio = NULL;
 395}
 396
 397static int issue_discard(struct discard_op *op, dm_block_t data_b, dm_block_t data_e)
 398{
 399        struct thin_c *tc = op->tc;
 400        sector_t s = block_to_sectors(tc->pool, data_b);
 401        sector_t len = block_to_sectors(tc->pool, data_e - data_b);
 402
 403        return __blkdev_issue_discard(tc->pool_dev->bdev, s, len,
 404                                      GFP_NOWAIT, 0, &op->bio);
 405}
 406
 407static void end_discard(struct discard_op *op, int r)
 408{
 409        if (op->bio) {
 410                /*
 411                 * Even if one of the calls to issue_discard failed, we
 412                 * need to wait for the chain to complete.
 413                 */
 414                bio_chain(op->bio, op->parent_bio);
 415                bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
 416                submit_bio(op->bio);
 417        }
 418
 419        blk_finish_plug(&op->plug);
 420
 421        /*
 422         * Even if r is set, there could be sub discards in flight that we
 423         * need to wait for.
 424         */
 425        if (r && !op->parent_bio->bi_status)
 426                op->parent_bio->bi_status = errno_to_blk_status(r);
 427        bio_endio(op->parent_bio);
 428}
 429
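/*
 * Usage sketch (editor's addition): the discard helpers above form a
 * begin/issue/end sequence against a parent bio.  For a thin device tc and a
 * parent discard bio, with two hypothetical runs of data blocks:
 *
 *	struct discard_op op;
 *	int r;
 *
 *	begin_discard(&op, tc, parent_bio);
 *	r = issue_discard(&op, 0, 16);
 *	if (!r)
 *		r = issue_discard(&op, 32, 48);
 *	end_discard(&op, r);
 *
 * end_discard() chains whatever __blkdev_issue_discard() built onto
 * parent_bio and then completes parent_bio, propagating any error.
 * passdown_double_checking_shared_status() below follows this pattern.
 */
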
 430/*----------------------------------------------------------------*/
 431
 432/*
 433 * wake_worker() is used when new work is queued and when pool_resume is
 434 * ready to continue deferred IO processing.
 435 */
 436static void wake_worker(struct pool *pool)
 437{
 438        queue_work(pool->wq, &pool->worker);
 439}
 440
 441/*----------------------------------------------------------------*/
 442
 443static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
 444                      struct dm_bio_prison_cell **cell_result)
 445{
 446        int r;
 447        struct dm_bio_prison_cell *cell_prealloc;
 448
 449        /*
 450         * Allocate a cell from the prison's mempool.
 451         * This might block but it can't fail.
 452         */
 453        cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);
 454
 455        r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
 456        if (r)
 457                /*
 458                 * We reused an old cell; we can get rid of
 459                 * the new one.
 460                 */
 461                dm_bio_prison_free_cell(pool->prison, cell_prealloc);
 462
 463        return r;
 464}
 465
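/*
 * Usage sketch (editor's addition), for a hypothetical virtual block b:
 *
 *	build_virtual_key(tc->td, b, &key);
 *	if (bio_detain(tc->pool, &key, bio, &cell))
 *		return;
 *
 * A non-zero return means an equivalent cell already existed, the bio has
 * been added to it, and it will be handled when that cell is released or
 * errored.  A zero return means we created the cell and bio is its holder,
 * so we go on to do the work and release the cell afterwards.
 */
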
 466static void cell_release(struct pool *pool,
 467                         struct dm_bio_prison_cell *cell,
 468                         struct bio_list *bios)
 469{
 470        dm_cell_release(pool->prison, cell, bios);
 471        dm_bio_prison_free_cell(pool->prison, cell);
 472}
 473
 474static void cell_visit_release(struct pool *pool,
 475                               void (*fn)(void *, struct dm_bio_prison_cell *),
 476                               void *context,
 477                               struct dm_bio_prison_cell *cell)
 478{
 479        dm_cell_visit_release(pool->prison, fn, context, cell);
 480        dm_bio_prison_free_cell(pool->prison, cell);
 481}
 482
 483static void cell_release_no_holder(struct pool *pool,
 484                                   struct dm_bio_prison_cell *cell,
 485                                   struct bio_list *bios)
 486{
 487        dm_cell_release_no_holder(pool->prison, cell, bios);
 488        dm_bio_prison_free_cell(pool->prison, cell);
 489}
 490
 491static void cell_error_with_code(struct pool *pool,
 492                struct dm_bio_prison_cell *cell, blk_status_t error_code)
 493{
 494        dm_cell_error(pool->prison, cell, error_code);
 495        dm_bio_prison_free_cell(pool->prison, cell);
 496}
 497
 498static blk_status_t get_pool_io_error_code(struct pool *pool)
 499{
 500        return pool->out_of_data_space ? BLK_STS_NOSPC : BLK_STS_IOERR;
 501}
 502
 503static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
 504{
 505        cell_error_with_code(pool, cell, get_pool_io_error_code(pool));
 506}
 507
 508static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
 509{
 510        cell_error_with_code(pool, cell, 0);
 511}
 512
 513static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
 514{
 515        cell_error_with_code(pool, cell, BLK_STS_DM_REQUEUE);
 516}
 517
 518/*----------------------------------------------------------------*/
 519
 520/*
 521 * A global list of pools that uses a struct mapped_device as a key.
 522 */
 523static struct dm_thin_pool_table {
 524        struct mutex mutex;
 525        struct list_head pools;
 526} dm_thin_pool_table;
 527
 528static void pool_table_init(void)
 529{
 530        mutex_init(&dm_thin_pool_table.mutex);
 531        INIT_LIST_HEAD(&dm_thin_pool_table.pools);
 532}
 533
 534static void pool_table_exit(void)
 535{
 536        mutex_destroy(&dm_thin_pool_table.mutex);
 537}
 538
 539static void __pool_table_insert(struct pool *pool)
 540{
 541        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 542        list_add(&pool->list, &dm_thin_pool_table.pools);
 543}
 544
 545static void __pool_table_remove(struct pool *pool)
 546{
 547        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 548        list_del(&pool->list);
 549}
 550
 551static struct pool *__pool_table_lookup(struct mapped_device *md)
 552{
 553        struct pool *pool = NULL, *tmp;
 554
 555        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 556
 557        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 558                if (tmp->pool_md == md) {
 559                        pool = tmp;
 560                        break;
 561                }
 562        }
 563
 564        return pool;
 565}
 566
 567static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
 568{
 569        struct pool *pool = NULL, *tmp;
 570
 571        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
 572
 573        list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
 574                if (tmp->md_dev == md_dev) {
 575                        pool = tmp;
 576                        break;
 577                }
 578        }
 579
 580        return pool;
 581}
 582
 583/*----------------------------------------------------------------*/
 584
 585struct dm_thin_endio_hook {
 586        struct thin_c *tc;
 587        struct dm_deferred_entry *shared_read_entry;
 588        struct dm_deferred_entry *all_io_entry;
 589        struct dm_thin_new_mapping *overwrite_mapping;
 590        struct rb_node rb_node;
 591        struct dm_bio_prison_cell *cell;
 592};
 593
 594static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
 595{
 596        bio_list_merge(bios, master);
 597        bio_list_init(master);
 598}
 599
 600static void error_bio_list(struct bio_list *bios, blk_status_t error)
 601{
 602        struct bio *bio;
 603
 604        while ((bio = bio_list_pop(bios))) {
 605                bio->bi_status = error;
 606                bio_endio(bio);
 607        }
 608}
 609
 610static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master,
 611                blk_status_t error)
 612{
 613        struct bio_list bios;
 614
 615        bio_list_init(&bios);
 616
 617        spin_lock_irq(&tc->lock);
 618        __merge_bio_list(&bios, master);
 619        spin_unlock_irq(&tc->lock);
 620
 621        error_bio_list(&bios, error);
 622}
 623
 624static void requeue_deferred_cells(struct thin_c *tc)
 625{
 626        struct pool *pool = tc->pool;
 627        struct list_head cells;
 628        struct dm_bio_prison_cell *cell, *tmp;
 629
 630        INIT_LIST_HEAD(&cells);
 631
 632        spin_lock_irq(&tc->lock);
 633        list_splice_init(&tc->deferred_cells, &cells);
 634        spin_unlock_irq(&tc->lock);
 635
 636        list_for_each_entry_safe(cell, tmp, &cells, user_list)
 637                cell_requeue(pool, cell);
 638}
 639
 640static void requeue_io(struct thin_c *tc)
 641{
 642        struct bio_list bios;
 643
 644        bio_list_init(&bios);
 645
 646        spin_lock_irq(&tc->lock);
 647        __merge_bio_list(&bios, &tc->deferred_bio_list);
 648        __merge_bio_list(&bios, &tc->retry_on_resume_list);
 649        spin_unlock_irq(&tc->lock);
 650
 651        error_bio_list(&bios, BLK_STS_DM_REQUEUE);
 652        requeue_deferred_cells(tc);
 653}
 654
 655static void error_retry_list_with_code(struct pool *pool, blk_status_t error)
 656{
 657        struct thin_c *tc;
 658
 659        rcu_read_lock();
 660        list_for_each_entry_rcu(tc, &pool->active_thins, list)
 661                error_thin_bio_list(tc, &tc->retry_on_resume_list, error);
 662        rcu_read_unlock();
 663}
 664
 665static void error_retry_list(struct pool *pool)
 666{
 667        error_retry_list_with_code(pool, get_pool_io_error_code(pool));
 668}
 669
 670/*
 671 * This section of code contains the logic for processing a thin device's IO.
 672 * Much of the code depends on pool object resources (lists, workqueues, etc.)
 673 * but most is exclusively called from the thin target rather than the thin-pool
 674 * target.
 675 */
 676
 677static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
 678{
 679        struct pool *pool = tc->pool;
 680        sector_t block_nr = bio->bi_iter.bi_sector;
 681
 682        if (block_size_is_power_of_two(pool))
 683                block_nr >>= pool->sectors_per_block_shift;
 684        else
 685                (void) sector_div(block_nr, pool->sectors_per_block);
 686
 687        return block_nr;
 688}
 689
 690/*
 691 * Returns the _complete_ blocks that this bio covers.
 692 */
 693static void get_bio_block_range(struct thin_c *tc, struct bio *bio,
 694                                dm_block_t *begin, dm_block_t *end)
 695{
 696        struct pool *pool = tc->pool;
 697        sector_t b = bio->bi_iter.bi_sector;
 698        sector_t e = b + (bio->bi_iter.bi_size >> SECTOR_SHIFT);
 699
 700        b += pool->sectors_per_block - 1ull; /* so we round up */
 701
 702        if (block_size_is_power_of_two(pool)) {
 703                b >>= pool->sectors_per_block_shift;
 704                e >>= pool->sectors_per_block_shift;
 705        } else {
 706                (void) sector_div(b, pool->sectors_per_block);
 707                (void) sector_div(e, pool->sectors_per_block);
 708        }
 709
 710        if (e < b)
 711                /* Can happen if the bio is within a single block. */
 712                e = b;
 713
 714        *begin = b;
 715        *end = e;
 716}
 717
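/*
 * Worked example (editor's addition), assuming 128-sector data blocks: a bio
 * covering sectors [100, 500) gives b = 100, e = 500.  Rounding b up,
 * (100 + 127) >> 7 = 1 and 500 >> 7 = 3, so *begin = 1 and *end = 3: only
 * blocks 1 and 2 (sectors [128, 384)) are completely covered.  A bio that
 * sits inside a single block produces e < b and is clamped to the empty
 * range begin == end.
 */
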
 718static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
 719{
 720        struct pool *pool = tc->pool;
 721        sector_t bi_sector = bio->bi_iter.bi_sector;
 722
 723        bio_set_dev(bio, tc->pool_dev->bdev);
 724        if (block_size_is_power_of_two(pool))
 725                bio->bi_iter.bi_sector =
 726                        (block << pool->sectors_per_block_shift) |
 727                        (bi_sector & (pool->sectors_per_block - 1));
 728        else
 729                bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
 730                                 sector_div(bi_sector, pool->sectors_per_block);
 731}
 732
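/*
 * Worked example (editor's addition), assuming 128-sector data blocks: a bio
 * at sector 300 being remapped to data block 5 ends up at sector
 * (5 << 7) | (300 & 127) = 640 + 44 = 684 on the pool's data device.  The
 * non-power-of-two path computes the same thing with a multiply plus
 * sector_div(), which returns the remainder, instead of shift and mask.
 */
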
 733static void remap_to_origin(struct thin_c *tc, struct bio *bio)
 734{
 735        bio_set_dev(bio, tc->origin_dev->bdev);
 736}
 737
 738static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
 739{
 740        return op_is_flush(bio->bi_opf) &&
 741                dm_thin_changed_this_transaction(tc->td);
 742}
 743
 744static void inc_all_io_entry(struct pool *pool, struct bio *bio)
 745{
 746        struct dm_thin_endio_hook *h;
 747
 748        if (bio_op(bio) == REQ_OP_DISCARD)
 749                return;
 750
 751        h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 752        h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
 753}
 754
 755static void issue(struct thin_c *tc, struct bio *bio)
 756{
 757        struct pool *pool = tc->pool;
 758
 759        if (!bio_triggers_commit(tc, bio)) {
 760                submit_bio_noacct(bio);
 761                return;
 762        }
 763
 764        /*
 765         * Complete bio with an error if earlier I/O caused changes to
 766         * the metadata that can't be committed, e.g. due to I/O errors
 767         * on the metadata device.
 768         */
 769        if (dm_thin_aborted_changes(tc->td)) {
 770                bio_io_error(bio);
 771                return;
 772        }
 773
 774        /*
 775         * Batch together any bios that trigger commits and then issue a
 776         * single commit for them in process_deferred_bios().
 777         */
 778        spin_lock_irq(&pool->lock);
 779        bio_list_add(&pool->deferred_flush_bios, bio);
 780        spin_unlock_irq(&pool->lock);
 781}
 782
 783static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
 784{
 785        remap_to_origin(tc, bio);
 786        issue(tc, bio);
 787}
 788
 789static void remap_and_issue(struct thin_c *tc, struct bio *bio,
 790                            dm_block_t block)
 791{
 792        remap(tc, bio, block);
 793        issue(tc, bio);
 794}
 795
 796/*----------------------------------------------------------------*/
 797
 798/*
 799 * Bio endio functions.
 800 */
 801struct dm_thin_new_mapping {
 802        struct list_head list;
 803
 804        bool pass_discard:1;
 805        bool maybe_shared:1;
 806
 807        /*
 808         * Track quiescing, copying and zeroing preparation actions.  When this
 809         * counter hits zero the block is prepared and can be inserted into the
 810         * btree.
 811         */
 812        atomic_t prepare_actions;
 813
 814        blk_status_t status;
 815        struct thin_c *tc;
 816        dm_block_t virt_begin, virt_end;
 817        dm_block_t data_block;
 818        struct dm_bio_prison_cell *cell;
 819
 820        /*
 821         * If the bio covers the whole area of a block then we can avoid
 822         * zeroing or copying.  Instead this bio is hooked.  The bio will
 823         * still be in the cell, so care has to be taken to avoid issuing
 824         * the bio twice.
 825         */
 826        struct bio *bio;
 827        bio_end_io_t *saved_bi_end_io;
 828};
 829
 830static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
 831{
 832        struct pool *pool = m->tc->pool;
 833
 834        if (atomic_dec_and_test(&m->prepare_actions)) {
 835                list_add_tail(&m->list, &pool->prepared_mappings);
 836                wake_worker(pool);
 837        }
 838}
 839
 840static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
 841{
 842        unsigned long flags;
 843        struct pool *pool = m->tc->pool;
 844
 845        spin_lock_irqsave(&pool->lock, flags);
 846        __complete_mapping_preparation(m);
 847        spin_unlock_irqrestore(&pool->lock, flags);
 848}
 849
 850static void copy_complete(int read_err, unsigned long write_err, void *context)
 851{
 852        struct dm_thin_new_mapping *m = context;
 853
 854        m->status = read_err || write_err ? BLK_STS_IOERR : 0;
 855        complete_mapping_preparation(m);
 856}
 857
 858static void overwrite_endio(struct bio *bio)
 859{
 860        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
 861        struct dm_thin_new_mapping *m = h->overwrite_mapping;
 862
 863        bio->bi_end_io = m->saved_bi_end_io;
 864
 865        m->status = bio->bi_status;
 866        complete_mapping_preparation(m);
 867}
 868
 869/*----------------------------------------------------------------*/
 870
 871/*
 872 * Workqueue.
 873 */
 874
 875/*
 876 * Prepared mapping jobs.
 877 */
 878
 879/*
 880 * This sends the bios in the cell, except the original holder, back
  881 * to the thin device's deferred_bio_list.
 882 */
 883static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
 884{
 885        struct pool *pool = tc->pool;
 886        unsigned long flags;
 887        int has_work;
 888
 889        spin_lock_irqsave(&tc->lock, flags);
 890        cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
 891        has_work = !bio_list_empty(&tc->deferred_bio_list);
 892        spin_unlock_irqrestore(&tc->lock, flags);
 893
 894        if (has_work)
 895                wake_worker(pool);
 896}
 897
 898static void thin_defer_bio(struct thin_c *tc, struct bio *bio);
 899
 900struct remap_info {
 901        struct thin_c *tc;
 902        struct bio_list defer_bios;
 903        struct bio_list issue_bios;
 904};
 905
 906static void __inc_remap_and_issue_cell(void *context,
 907                                       struct dm_bio_prison_cell *cell)
 908{
 909        struct remap_info *info = context;
 910        struct bio *bio;
 911
 912        while ((bio = bio_list_pop(&cell->bios))) {
 913                if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD)
 914                        bio_list_add(&info->defer_bios, bio);
 915                else {
 916                        inc_all_io_entry(info->tc->pool, bio);
 917
 918                        /*
 919                         * We can't issue the bios with the bio prison lock
 920                         * held, so we add them to a list to issue on
 921                         * return from this function.
 922                         */
 923                        bio_list_add(&info->issue_bios, bio);
 924                }
 925        }
 926}
 927
 928static void inc_remap_and_issue_cell(struct thin_c *tc,
 929                                     struct dm_bio_prison_cell *cell,
 930                                     dm_block_t block)
 931{
 932        struct bio *bio;
 933        struct remap_info info;
 934
 935        info.tc = tc;
 936        bio_list_init(&info.defer_bios);
 937        bio_list_init(&info.issue_bios);
 938
 939        /*
 940         * We have to be careful to inc any bios we're about to issue
 941         * before the cell is released, and avoid a race with new bios
 942         * being added to the cell.
 943         */
 944        cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
 945                           &info, cell);
 946
 947        while ((bio = bio_list_pop(&info.defer_bios)))
 948                thin_defer_bio(tc, bio);
 949
 950        while ((bio = bio_list_pop(&info.issue_bios)))
 951                remap_and_issue(info.tc, bio, block);
 952}
 953
 954static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 955{
 956        cell_error(m->tc->pool, m->cell);
 957        list_del(&m->list);
 958        mempool_free(m, &m->tc->pool->mapping_pool);
 959}
 960
 961static void complete_overwrite_bio(struct thin_c *tc, struct bio *bio)
 962{
 963        struct pool *pool = tc->pool;
 964
 965        /*
 966         * If the bio has the REQ_FUA flag set we must commit the metadata
 967         * before signaling its completion.
 968         */
 969        if (!bio_triggers_commit(tc, bio)) {
 970                bio_endio(bio);
 971                return;
 972        }
 973
 974        /*
 975         * Complete bio with an error if earlier I/O caused changes to the
 976         * metadata that can't be committed, e.g. due to I/O errors on the
 977         * metadata device.
 978         */
 979        if (dm_thin_aborted_changes(tc->td)) {
 980                bio_io_error(bio);
 981                return;
 982        }
 983
 984        /*
 985         * Batch together any bios that trigger commits and then issue a
 986         * single commit for them in process_deferred_bios().
 987         */
 988        spin_lock_irq(&pool->lock);
 989        bio_list_add(&pool->deferred_flush_completions, bio);
 990        spin_unlock_irq(&pool->lock);
 991}
 992
 993static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 994{
 995        struct thin_c *tc = m->tc;
 996        struct pool *pool = tc->pool;
 997        struct bio *bio = m->bio;
 998        int r;
 999
1000        if (m->status) {
1001                cell_error(pool, m->cell);
1002                goto out;
1003        }
1004
1005        /*
1006         * Commit the prepared block into the mapping btree.
1007         * Any I/O for this block arriving after this point will get
1008         * remapped to it directly.
1009         */
1010        r = dm_thin_insert_block(tc->td, m->virt_begin, m->data_block);
1011        if (r) {
1012                metadata_operation_failed(pool, "dm_thin_insert_block", r);
1013                cell_error(pool, m->cell);
1014                goto out;
1015        }
1016
1017        /*
1018         * Release any bios held while the block was being provisioned.
1019         * If we are processing a write bio that completely covers the block,
1020         * we have already processed it, so we can ignore it now when processing
1021         * the bios in the cell.
1022         */
1023        if (bio) {
1024                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1025                complete_overwrite_bio(tc, bio);
1026        } else {
1027                inc_all_io_entry(tc->pool, m->cell->holder);
1028                remap_and_issue(tc, m->cell->holder, m->data_block);
1029                inc_remap_and_issue_cell(tc, m->cell, m->data_block);
1030        }
1031
1032out:
1033        list_del(&m->list);
1034        mempool_free(m, &pool->mapping_pool);
1035}
1036
1037/*----------------------------------------------------------------*/
1038
1039static void free_discard_mapping(struct dm_thin_new_mapping *m)
1040{
1041        struct thin_c *tc = m->tc;
1042        if (m->cell)
1043                cell_defer_no_holder(tc, m->cell);
1044        mempool_free(m, &tc->pool->mapping_pool);
1045}
1046
1047static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
1048{
1049        bio_io_error(m->bio);
1050        free_discard_mapping(m);
1051}
1052
1053static void process_prepared_discard_success(struct dm_thin_new_mapping *m)
1054{
1055        bio_endio(m->bio);
1056        free_discard_mapping(m);
1057}
1058
1059static void process_prepared_discard_no_passdown(struct dm_thin_new_mapping *m)
1060{
1061        int r;
1062        struct thin_c *tc = m->tc;
1063
1064        r = dm_thin_remove_range(tc->td, m->cell->key.block_begin, m->cell->key.block_end);
1065        if (r) {
1066                metadata_operation_failed(tc->pool, "dm_thin_remove_range", r);
1067                bio_io_error(m->bio);
1068        } else
1069                bio_endio(m->bio);
1070
1071        cell_defer_no_holder(tc, m->cell);
1072        mempool_free(m, &tc->pool->mapping_pool);
1073}
1074
1075/*----------------------------------------------------------------*/
1076
1077static void passdown_double_checking_shared_status(struct dm_thin_new_mapping *m,
1078                                                   struct bio *discard_parent)
1079{
1080        /*
1081         * We've already unmapped this range of blocks, but before we
1082         * passdown we have to check that these blocks are now unused.
1083         */
1084        int r = 0;
1085        bool shared = true;
1086        struct thin_c *tc = m->tc;
1087        struct pool *pool = tc->pool;
1088        dm_block_t b = m->data_block, e, end = m->data_block + m->virt_end - m->virt_begin;
1089        struct discard_op op;
1090
1091        begin_discard(&op, tc, discard_parent);
1092        while (b != end) {
1093                /* find start of unmapped run */
1094                for (; b < end; b++) {
1095                        r = dm_pool_block_is_shared(pool->pmd, b, &shared);
1096                        if (r)
1097                                goto out;
1098
1099                        if (!shared)
1100                                break;
1101                }
1102
1103                if (b == end)
1104                        break;
1105
1106                /* find end of run */
1107                for (e = b + 1; e != end; e++) {
1108                        r = dm_pool_block_is_shared(pool->pmd, e, &shared);
1109                        if (r)
1110                                goto out;
1111
1112                        if (shared)
1113                                break;
1114                }
1115
1116                r = issue_discard(&op, b, e);
1117                if (r)
1118                        goto out;
1119
1120                b = e;
1121        }
1122out:
1123        end_discard(&op, r);
1124}
1125
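/*
 * Worked example (editor's addition): if the unmapped data range covers
 * blocks 10 to 13 and dm_pool_block_is_shared() reports them as shared,
 * unshared, unshared, shared respectively, the loop above issues a single
 * discard for the run [11, 13) and skips the blocks that other devices
 * still reference.
 */
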
1126static void queue_passdown_pt2(struct dm_thin_new_mapping *m)
1127{
1128        unsigned long flags;
1129        struct pool *pool = m->tc->pool;
1130
1131        spin_lock_irqsave(&pool->lock, flags);
1132        list_add_tail(&m->list, &pool->prepared_discards_pt2);
1133        spin_unlock_irqrestore(&pool->lock, flags);
1134        wake_worker(pool);
1135}
1136
1137static void passdown_endio(struct bio *bio)
1138{
1139        /*
1140         * It doesn't matter if the passdown discard failed; we still want
1141         * to unmap, so any error is ignored.
1142         */
1143        queue_passdown_pt2(bio->bi_private);
1144        bio_put(bio);
1145}
1146
1147static void process_prepared_discard_passdown_pt1(struct dm_thin_new_mapping *m)
1148{
1149        int r;
1150        struct thin_c *tc = m->tc;
1151        struct pool *pool = tc->pool;
1152        struct bio *discard_parent;
1153        dm_block_t data_end = m->data_block + (m->virt_end - m->virt_begin);
1154
1155        /*
1156         * Only this thread allocates blocks, so we can be sure that the
1157         * newly unmapped blocks will not be allocated before the end of
1158         * the function.
1159         */
1160        r = dm_thin_remove_range(tc->td, m->virt_begin, m->virt_end);
1161        if (r) {
1162                metadata_operation_failed(pool, "dm_thin_remove_range", r);
1163                bio_io_error(m->bio);
1164                cell_defer_no_holder(tc, m->cell);
1165                mempool_free(m, &pool->mapping_pool);
1166                return;
1167        }
1168
1169        /*
1170         * Increment the refcounts on the unmapped data blocks.  This prevents
1171         * a race between the passdown io and reallocation of freed blocks.
1172         */
1173        r = dm_pool_inc_data_range(pool->pmd, m->data_block, data_end);
1174        if (r) {
1175                metadata_operation_failed(pool, "dm_pool_inc_data_range", r);
1176                bio_io_error(m->bio);
1177                cell_defer_no_holder(tc, m->cell);
1178                mempool_free(m, &pool->mapping_pool);
1179                return;
1180        }
1181
1182        discard_parent = bio_alloc(GFP_NOIO, 1);
1183        if (!discard_parent) {
1184                DMWARN("%s: unable to allocate top level discard bio for passdown. Skipping passdown.",
1185                       dm_device_name(tc->pool->pool_md));
1186                queue_passdown_pt2(m);
1187
1188        } else {
1189                discard_parent->bi_end_io = passdown_endio;
1190                discard_parent->bi_private = m;
1191
1192                if (m->maybe_shared)
1193                        passdown_double_checking_shared_status(m, discard_parent);
1194                else {
1195                        struct discard_op op;
1196
1197                        begin_discard(&op, tc, discard_parent);
1198                        r = issue_discard(&op, m->data_block, data_end);
1199                        end_discard(&op, r);
1200                }
1201        }
1202}
1203
1204static void process_prepared_discard_passdown_pt2(struct dm_thin_new_mapping *m)
1205{
1206        int r;
1207        struct thin_c *tc = m->tc;
1208        struct pool *pool = tc->pool;
1209
1210        /*
1211         * The passdown has completed, so now we can decrement all those
1212         * unmapped blocks.
1213         */
1214        r = dm_pool_dec_data_range(pool->pmd, m->data_block,
1215                                   m->data_block + (m->virt_end - m->virt_begin));
1216        if (r) {
1217                metadata_operation_failed(pool, "dm_pool_dec_data_range", r);
1218                bio_io_error(m->bio);
1219        } else
1220                bio_endio(m->bio);
1221
1222        cell_defer_no_holder(tc, m->cell);
1223        mempool_free(m, &pool->mapping_pool);
1224}
1225
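/*
 * Summary (editor's addition) of the two-phase passdown above: pt1 removes
 * the range from the thin device's btree, takes extra references on the data
 * blocks so they cannot be reallocated while the discard is in flight, and
 * issues the passdown discard; passdown_endio() then queues pt2, which drops
 * those references, completes the original discard bio and releases the cell.
 */
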
1226static void process_prepared(struct pool *pool, struct list_head *head,
1227                             process_mapping_fn *fn)
1228{
1229        struct list_head maps;
1230        struct dm_thin_new_mapping *m, *tmp;
1231
1232        INIT_LIST_HEAD(&maps);
1233        spin_lock_irq(&pool->lock);
1234        list_splice_init(head, &maps);
1235        spin_unlock_irq(&pool->lock);
1236
1237        list_for_each_entry_safe(m, tmp, &maps, list)
1238                (*fn)(m);
1239}
1240
1241/*
1242 * Deferred bio jobs.
1243 */
1244static int io_overlaps_block(struct pool *pool, struct bio *bio)
1245{
1246        return bio->bi_iter.bi_size ==
1247                (pool->sectors_per_block << SECTOR_SHIFT);
1248}
1249
1250static int io_overwrites_block(struct pool *pool, struct bio *bio)
1251{
1252        return (bio_data_dir(bio) == WRITE) &&
1253                io_overlaps_block(pool, bio);
1254}
1255
1256static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
1257                               bio_end_io_t *fn)
1258{
1259        *save = bio->bi_end_io;
1260        bio->bi_end_io = fn;
1261}
1262
1263static int ensure_next_mapping(struct pool *pool)
1264{
1265        if (pool->next_mapping)
1266                return 0;
1267
1268        pool->next_mapping = mempool_alloc(&pool->mapping_pool, GFP_ATOMIC);
1269
1270        return pool->next_mapping ? 0 : -ENOMEM;
1271}
1272
1273static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
1274{
1275        struct dm_thin_new_mapping *m = pool->next_mapping;
1276
1277        BUG_ON(!pool->next_mapping);
1278
1279        memset(m, 0, sizeof(struct dm_thin_new_mapping));
1280        INIT_LIST_HEAD(&m->list);
1281        m->bio = NULL;
1282
1283        pool->next_mapping = NULL;
1284
1285        return m;
1286}
1287
1288static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
1289                    sector_t begin, sector_t end)
1290{
1291        struct dm_io_region to;
1292
1293        to.bdev = tc->pool_dev->bdev;
1294        to.sector = begin;
1295        to.count = end - begin;
1296
1297        dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
1298}
1299
1300static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
1301                                      dm_block_t data_begin,
1302                                      struct dm_thin_new_mapping *m)
1303{
1304        struct pool *pool = tc->pool;
1305        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1306
1307        h->overwrite_mapping = m;
1308        m->bio = bio;
1309        save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
1310        inc_all_io_entry(pool, bio);
1311        remap_and_issue(tc, bio, data_begin);
1312}
1313
1314/*
1315 * A partial copy also needs to zero the uncopied region.
1316 */
1317static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
1318                          struct dm_dev *origin, dm_block_t data_origin,
1319                          dm_block_t data_dest,
1320                          struct dm_bio_prison_cell *cell, struct bio *bio,
1321                          sector_t len)
1322{
1323        struct pool *pool = tc->pool;
1324        struct dm_thin_new_mapping *m = get_next_mapping(pool);
1325
1326        m->tc = tc;
1327        m->virt_begin = virt_block;
1328        m->virt_end = virt_block + 1u;
1329        m->data_block = data_dest;
1330        m->cell = cell;
1331
1332        /*
1333         * quiesce action + copy action + an extra reference held for the
1334         * duration of this function (we may need to inc later for a
1335         * partial zero).
1336         */
1337        atomic_set(&m->prepare_actions, 3);
1338
1339        if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
1340                complete_mapping_preparation(m); /* already quiesced */
1341
1342        /*
1343         * IO to pool_dev remaps to the pool target's data_dev.
1344         *
1345         * If the whole block of data is being overwritten, we can issue the
1346         * bio immediately. Otherwise we use kcopyd to clone the data first.
1347         */
1348        if (io_overwrites_block(pool, bio))
1349                remap_and_issue_overwrite(tc, bio, data_dest, m);
1350        else {
1351                struct dm_io_region from, to;
1352
1353                from.bdev = origin->bdev;
1354                from.sector = data_origin * pool->sectors_per_block;
1355                from.count = len;
1356
1357                to.bdev = tc->pool_dev->bdev;
1358                to.sector = data_dest * pool->sectors_per_block;
1359                to.count = len;
1360
1361                dm_kcopyd_copy(pool->copier, &from, 1, &to,
1362                               0, copy_complete, m);
1363
1364                /*
1365                 * Do we need to zero a tail region?
1366                 */
1367                if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1368                        atomic_inc(&m->prepare_actions);
1369                        ll_zero(tc, m,
1370                                data_dest * pool->sectors_per_block + len,
1371                                (data_dest + 1) * pool->sectors_per_block);
1372                }
1373        }
1374
1375        complete_mapping_preparation(m); /* drop our ref */
1376}
1377
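/*
 * Note (editor's addition) on the reference counting above: prepare_actions
 * starts at 3 (quiesce + copy + the reference held by schedule_copy() itself)
 * and gains a fourth action if a tail region has to be zeroed.  Each
 * completion path (deferred set quiesced, kcopyd copy or overwrite bio
 * finished, ll_zero() finished, and the final complete_mapping_preparation()
 * at the end of the function) drops one; when the count reaches zero the
 * mapping is queued for process_prepared_mapping().
 */
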
1378static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1379                                   dm_block_t data_origin, dm_block_t data_dest,
1380                                   struct dm_bio_prison_cell *cell, struct bio *bio)
1381{
1382        schedule_copy(tc, virt_block, tc->pool_dev,
1383                      data_origin, data_dest, cell, bio,
1384                      tc->pool->sectors_per_block);
1385}
1386
1387static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1388                          dm_block_t data_block, struct dm_bio_prison_cell *cell,
1389                          struct bio *bio)
1390{
1391        struct pool *pool = tc->pool;
1392        struct dm_thin_new_mapping *m = get_next_mapping(pool);
1393
1394        atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
1395        m->tc = tc;
1396        m->virt_begin = virt_block;
1397        m->virt_end = virt_block + 1u;
1398        m->data_block = data_block;
1399        m->cell = cell;
1400
1401        /*
1402         * If the whole block of data is being overwritten or we are not
1403         * zeroing pre-existing data, we can issue the bio immediately.
1404         * Otherwise we use kcopyd to zero the data first.
1405         */
1406        if (pool->pf.zero_new_blocks) {
1407                if (io_overwrites_block(pool, bio))
1408                        remap_and_issue_overwrite(tc, bio, data_block, m);
1409                else
1410                        ll_zero(tc, m, data_block * pool->sectors_per_block,
1411                                (data_block + 1) * pool->sectors_per_block);
1412        } else
1413                process_prepared_mapping(m);
1414}
1415
1416static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1417                                   dm_block_t data_dest,
1418                                   struct dm_bio_prison_cell *cell, struct bio *bio)
1419{
1420        struct pool *pool = tc->pool;
1421        sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1422        sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1423
1424        if (virt_block_end <= tc->origin_size)
1425                schedule_copy(tc, virt_block, tc->origin_dev,
1426                              virt_block, data_dest, cell, bio,
1427                              pool->sectors_per_block);
1428
1429        else if (virt_block_begin < tc->origin_size)
1430                schedule_copy(tc, virt_block, tc->origin_dev,
1431                              virt_block, data_dest, cell, bio,
1432                              tc->origin_size - virt_block_begin);
1433
1434        else
1435                schedule_zero(tc, virt_block, data_dest, cell, bio);
1436}
1437
1438static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1439
1440static void requeue_bios(struct pool *pool);
1441
1442static bool is_read_only_pool_mode(enum pool_mode mode)
1443{
1444        return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
1445}
1446
1447static bool is_read_only(struct pool *pool)
1448{
1449        return is_read_only_pool_mode(get_pool_mode(pool));
1450}
1451
1452static void check_for_metadata_space(struct pool *pool)
1453{
1454        int r;
1455        const char *ooms_reason = NULL;
1456        dm_block_t nr_free;
1457
1458        r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
1459        if (r)
1460                ooms_reason = "Could not get free metadata blocks";
1461        else if (!nr_free)
1462                ooms_reason = "No free metadata blocks";
1463
1464        if (ooms_reason && !is_read_only(pool)) {
1465                DMERR("%s", ooms_reason);
1466                set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
1467        }
1468}
1469
1470static void check_for_data_space(struct pool *pool)
1471{
1472        int r;
1473        dm_block_t nr_free;
1474
1475        if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1476                return;
1477
1478        r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1479        if (r)
1480                return;
1481
1482        if (nr_free) {
1483                set_pool_mode(pool, PM_WRITE);
1484                requeue_bios(pool);
1485        }
1486}
1487
1488/*
1489 * A non-zero return indicates read_only or fail_io mode.
1490 * Many callers don't care about the return value.
1491 */
1492static int commit(struct pool *pool)
1493{
1494        int r;
1495
1496        if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
1497                return -EINVAL;
1498
1499        r = dm_pool_commit_metadata(pool->pmd);
1500        if (r)
1501                metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1502        else {
1503                check_for_metadata_space(pool);
1504                check_for_data_space(pool);
1505        }
1506
1507        return r;
1508}
1509
1510static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1511{
1512        if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1513                DMWARN("%s: reached low water mark for data device: sending event.",
1514                       dm_device_name(pool->pool_md));
1515                spin_lock_irq(&pool->lock);
1516                pool->low_water_triggered = true;
1517                spin_unlock_irq(&pool->lock);
1518                dm_table_event(pool->ti->table);
1519        }
1520}
1521
1522static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1523{
1524        int r;
1525        dm_block_t free_blocks;
1526        struct pool *pool = tc->pool;
1527
1528        if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1529                return -EINVAL;
1530
1531        r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1532        if (r) {
1533                metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1534                return r;
1535        }
1536
1537        check_low_water_mark(pool, free_blocks);
1538
1539        if (!free_blocks) {
1540                /*
1541                 * Try to commit to see if that will free up some
1542                 * more space.
1543                 */
1544                r = commit(pool);
1545                if (r)
1546                        return r;
1547
1548                r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1549                if (r) {
1550                        metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1551                        return r;
1552                }
1553
1554                if (!free_blocks) {
1555                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1556                        return -ENOSPC;
1557                }
1558        }
1559
1560        r = dm_pool_alloc_data_block(pool->pmd, result);
1561        if (r) {
1562                if (r == -ENOSPC)
1563                        set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1564                else
1565                        metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1566                return r;
1567        }
1568
1569        r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
1570        if (r) {
1571                metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
1572                return r;
1573        }
1574
1575        if (!free_blocks) {
1576                /* Let's commit before we use up the metadata reserve. */
1577                r = commit(pool);
1578                if (r)
1579                        return r;
1580        }
1581
1582        return 0;
1583}
1584
1585/*
1586 * If we have run out of space, queue bios until the device is
1587 * resumed, presumably after having been reloaded with more space.
1588 */
1589static void retry_on_resume(struct bio *bio)
1590{
1591        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1592        struct thin_c *tc = h->tc;
1593
1594        spin_lock_irq(&tc->lock);
1595        bio_list_add(&tc->retry_on_resume_list, bio);
1596        spin_unlock_irq(&tc->lock);
1597}
1598
1599static blk_status_t should_error_unserviceable_bio(struct pool *pool)
1600{
1601        enum pool_mode m = get_pool_mode(pool);
1602
1603        switch (m) {
1604        case PM_WRITE:
1605                /* Shouldn't get here */
1606                DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1607                return BLK_STS_IOERR;
1608
1609        case PM_OUT_OF_DATA_SPACE:
1610                return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
1611
1612        case PM_OUT_OF_METADATA_SPACE:
1613        case PM_READ_ONLY:
1614        case PM_FAIL:
1615                return BLK_STS_IOERR;
1616        default:
1617                /* Shouldn't get here */
1618                DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1619                return BLK_STS_IOERR;
1620        }
1621}
1622
1623static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1624{
1625        blk_status_t error = should_error_unserviceable_bio(pool);
1626
1627        if (error) {
1628                bio->bi_status = error;
1629                bio_endio(bio);
1630        } else
1631                retry_on_resume(bio);
1632}
1633
1634static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1635{
1636        struct bio *bio;
1637        struct bio_list bios;
1638        blk_status_t error;
1639
1640        error = should_error_unserviceable_bio(pool);
1641        if (error) {
1642                cell_error_with_code(pool, cell, error);
1643                return;
1644        }
1645
1646        bio_list_init(&bios);
1647        cell_release(pool, cell, &bios);
1648
1649        while ((bio = bio_list_pop(&bios)))
1650                retry_on_resume(bio);
1651}
1652
1653static void process_discard_cell_no_passdown(struct thin_c *tc,
1654                                             struct dm_bio_prison_cell *virt_cell)
1655{
1656        struct pool *pool = tc->pool;
1657        struct dm_thin_new_mapping *m = get_next_mapping(pool);
1658
1659        /*
1660         * We don't need to lock the data blocks, since there's no
1661         * passdown.  We only lock data blocks for allocation and breaking sharing.
1662         */
1663        m->tc = tc;
1664        m->virt_begin = virt_cell->key.block_begin;
1665        m->virt_end = virt_cell->key.block_end;
1666        m->cell = virt_cell;
1667        m->bio = virt_cell->holder;
1668
1669        if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1670                pool->process_prepared_discard(m);
1671}
1672
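/*
 * Worked example for break_up_discard_bio() below (hypothetical numbers):
 * a discard covering virtual blocks [10, 20) of a thin device in which only
 * [12, 15) and [17, 19) are mapped produces two passdown mappings, one per
 * mapped range.  The unmapped gaps are skipped by dm_thin_find_mapped_range(),
 * and the parent bio is chained to each mapping via bio_inc_remaining(), so
 * it only completes once both sub-range discards have finished.
 */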
1673static void break_up_discard_bio(struct thin_c *tc, dm_block_t begin, dm_block_t end,
1674                                 struct bio *bio)
1675{
1676        struct pool *pool = tc->pool;
1677
1678        int r;
1679        bool maybe_shared;
1680        struct dm_cell_key data_key;
1681        struct dm_bio_prison_cell *data_cell;
1682        struct dm_thin_new_mapping *m;
1683        dm_block_t virt_begin, virt_end, data_begin;
1684
1685        while (begin != end) {
1686                r = ensure_next_mapping(pool);
1687                if (r)
1688                        /* we did our best */
1689                        return;
1690
1691                r = dm_thin_find_mapped_range(tc->td, begin, end, &virt_begin, &virt_end,
1692                                              &data_begin, &maybe_shared);
1693                if (r)
1694                        /*
1695                         * Silently fail, letting any mappings we've
1696                         * created complete.
1697                         */
1698                        break;
1699
1700                build_key(tc->td, PHYSICAL, data_begin, data_begin + (virt_end - virt_begin), &data_key);
1701                if (bio_detain(tc->pool, &data_key, NULL, &data_cell)) {
1702                        /* contention, we'll give up on this range */
1703                        begin = virt_end;
1704                        continue;
1705                }
1706
1707                /*
1708                 * IO may still be going to the destination block.  We must
1709                 * quiesce before we can do the removal.
1710                 */
1711                m = get_next_mapping(pool);
1712                m->tc = tc;
1713                m->maybe_shared = maybe_shared;
1714                m->virt_begin = virt_begin;
1715                m->virt_end = virt_end;
1716                m->data_block = data_begin;
1717                m->cell = data_cell;
1718                m->bio = bio;
1719
1720                /*
1721                 * The parent bio must not complete before sub discard bios are
1722                 * chained to it (see end_discard's bio_chain)!
1723                 *
1724                 * This per-mapping bi_remaining increment is paired with
1725                 * the implicit decrement that occurs via bio_endio() in
1726                 * end_discard().
1727                 */
1728                bio_inc_remaining(bio);
1729                if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1730                        pool->process_prepared_discard(m);
1731
1732                begin = virt_end;
1733        }
1734}
1735
1736static void process_discard_cell_passdown(struct thin_c *tc, struct dm_bio_prison_cell *virt_cell)
1737{
1738        struct bio *bio = virt_cell->holder;
1739        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1740
1741        /*
1742         * The virt_cell will only get freed once the origin bio completes.
1743         * This means it will remain locked while all the individual
1744         * passdown bios are in flight.
1745         */
1746        h->cell = virt_cell;
1747        break_up_discard_bio(tc, virt_cell->key.block_begin, virt_cell->key.block_end, bio);
1748
1749        /*
1750         * We complete the bio now, knowing that the bi_remaining field
1751         * will prevent completion until the sub range discards have
1752         * completed.
1753         */
1754        bio_endio(bio);
1755}
1756
1757static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1758{
1759        dm_block_t begin, end;
1760        struct dm_cell_key virt_key;
1761        struct dm_bio_prison_cell *virt_cell;
1762
1763        get_bio_block_range(tc, bio, &begin, &end);
1764        if (begin == end) {
1765                /*
1766                 * The discard covers less than a block.
1767                 */
1768                bio_endio(bio);
1769                return;
1770        }
1771
1772        build_key(tc->td, VIRTUAL, begin, end, &virt_key);
1773        if (bio_detain(tc->pool, &virt_key, bio, &virt_cell))
1774                /*
1775                 * Potential starvation issue: We're relying on the
1776                 * fs/application being well behaved, and not trying to
1777                 * send IO to a region at the same time as discarding it.
1778                 * If they do this persistently then it's possible this
1779                 * cell will never be granted.
1780                 */
1781                return;
1782
1783        tc->pool->process_discard_cell(tc, virt_cell);
1784}
1785
1786static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1787                          struct dm_cell_key *key,
1788                          struct dm_thin_lookup_result *lookup_result,
1789                          struct dm_bio_prison_cell *cell)
1790{
1791        int r;
1792        dm_block_t data_block;
1793        struct pool *pool = tc->pool;
1794
1795        r = alloc_data_block(tc, &data_block);
1796        switch (r) {
1797        case 0:
1798                schedule_internal_copy(tc, block, lookup_result->block,
1799                                       data_block, cell, bio);
1800                break;
1801
1802        case -ENOSPC:
1803                retry_bios_on_resume(pool, cell);
1804                break;
1805
1806        default:
1807                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1808                            __func__, r);
1809                cell_error(pool, cell);
1810                break;
1811        }
1812}
1813
1814static void __remap_and_issue_shared_cell(void *context,
1815                                          struct dm_bio_prison_cell *cell)
1816{
1817        struct remap_info *info = context;
1818        struct bio *bio;
1819
1820        while ((bio = bio_list_pop(&cell->bios))) {
1821                if (bio_data_dir(bio) == WRITE || op_is_flush(bio->bi_opf) ||
1822                    bio_op(bio) == REQ_OP_DISCARD)
1823                        bio_list_add(&info->defer_bios, bio);
1824                else {
1825                        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1826
1827                        h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1828                        inc_all_io_entry(info->tc->pool, bio);
1829                        bio_list_add(&info->issue_bios, bio);
1830                }
1831        }
1832}
1833
1834static void remap_and_issue_shared_cell(struct thin_c *tc,
1835                                        struct dm_bio_prison_cell *cell,
1836                                        dm_block_t block)
1837{
1838        struct bio *bio;
1839        struct remap_info info;
1840
1841        info.tc = tc;
1842        bio_list_init(&info.defer_bios);
1843        bio_list_init(&info.issue_bios);
1844
1845        cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1846                           &info, cell);
1847
1848        while ((bio = bio_list_pop(&info.defer_bios)))
1849                thin_defer_bio(tc, bio);
1850
1851        while ((bio = bio_list_pop(&info.issue_bios)))
1852                remap_and_issue(tc, bio, block);
1853}
1854
1855static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1856                               dm_block_t block,
1857                               struct dm_thin_lookup_result *lookup_result,
1858                               struct dm_bio_prison_cell *virt_cell)
1859{
1860        struct dm_bio_prison_cell *data_cell;
1861        struct pool *pool = tc->pool;
1862        struct dm_cell_key key;
1863
1864        /*
1865         * If cell is already occupied, then sharing is already in the process
1866         * of being broken, so we have nothing further to do here.
1867         */
1868        build_data_key(tc->td, lookup_result->block, &key);
1869        if (bio_detain(pool, &key, bio, &data_cell)) {
1870                cell_defer_no_holder(tc, virt_cell);
1871                return;
1872        }
1873
1874        if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1875                break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1876                cell_defer_no_holder(tc, virt_cell);
1877        } else {
1878                struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1879
1880                h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1881                inc_all_io_entry(pool, bio);
1882                remap_and_issue(tc, bio, lookup_result->block);
1883
1884                remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1885                remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1886        }
1887}
1888
1889static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1890                            struct dm_bio_prison_cell *cell)
1891{
1892        int r;
1893        dm_block_t data_block;
1894        struct pool *pool = tc->pool;
1895
1896        /*
1897         * Remap empty bios (flushes) immediately, without provisioning.
1898         */
1899        if (!bio->bi_iter.bi_size) {
1900                inc_all_io_entry(pool, bio);
1901                cell_defer_no_holder(tc, cell);
1902
1903                remap_and_issue(tc, bio, 0);
1904                return;
1905        }
1906
1907        /*
1908         * Fill read bios with zeroes and complete them immediately.
1909         */
1910        if (bio_data_dir(bio) == READ) {
1911                zero_fill_bio(bio);
1912                cell_defer_no_holder(tc, cell);
1913                bio_endio(bio);
1914                return;
1915        }
1916
1917        r = alloc_data_block(tc, &data_block);
1918        switch (r) {
1919        case 0:
1920                if (tc->origin_dev)
1921                        schedule_external_copy(tc, block, data_block, cell, bio);
1922                else
1923                        schedule_zero(tc, block, data_block, cell, bio);
1924                break;
1925
1926        case -ENOSPC:
1927                retry_bios_on_resume(pool, cell);
1928                break;
1929
1930        default:
1931                DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1932                            __func__, r);
1933                cell_error(pool, cell);
1934                break;
1935        }
1936}
1937
1938static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1939{
1940        int r;
1941        struct pool *pool = tc->pool;
1942        struct bio *bio = cell->holder;
1943        dm_block_t block = get_bio_block(tc, bio);
1944        struct dm_thin_lookup_result lookup_result;
1945
1946        if (tc->requeue_mode) {
1947                cell_requeue(pool, cell);
1948                return;
1949        }
1950
1951        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1952        switch (r) {
1953        case 0:
1954                if (lookup_result.shared)
1955                        process_shared_bio(tc, bio, block, &lookup_result, cell);
1956                else {
1957                        inc_all_io_entry(pool, bio);
1958                        remap_and_issue(tc, bio, lookup_result.block);
1959                        inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1960                }
1961                break;
1962
1963        case -ENODATA:
1964                if (bio_data_dir(bio) == READ && tc->origin_dev) {
1965                        inc_all_io_entry(pool, bio);
1966                        cell_defer_no_holder(tc, cell);
1967
1968                        if (bio_end_sector(bio) <= tc->origin_size)
1969                                remap_to_origin_and_issue(tc, bio);
1970
1971                        else if (bio->bi_iter.bi_sector < tc->origin_size) {
1972                                zero_fill_bio(bio);
1973                                bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1974                                remap_to_origin_and_issue(tc, bio);
1975
1976                        } else {
1977                                zero_fill_bio(bio);
1978                                bio_endio(bio);
1979                        }
1980                } else
1981                        provision_block(tc, bio, block, cell);
1982                break;
1983
1984        default:
1985                DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1986                            __func__, r);
1987                cell_defer_no_holder(tc, cell);
1988                bio_io_error(bio);
1989                break;
1990        }
1991}
1992
1993static void process_bio(struct thin_c *tc, struct bio *bio)
1994{
1995        struct pool *pool = tc->pool;
1996        dm_block_t block = get_bio_block(tc, bio);
1997        struct dm_bio_prison_cell *cell;
1998        struct dm_cell_key key;
1999
2000        /*
2001         * If cell is already occupied, then the block is already
2002         * being provisioned, so we have nothing further to do here.
2003         */
2004        build_virtual_key(tc->td, block, &key);
2005        if (bio_detain(pool, &key, bio, &cell))
2006                return;
2007
2008        process_cell(tc, cell);
2009}
2010
2011static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
2012                                    struct dm_bio_prison_cell *cell)
2013{
2014        int r;
2015        int rw = bio_data_dir(bio);
2016        dm_block_t block = get_bio_block(tc, bio);
2017        struct dm_thin_lookup_result lookup_result;
2018
2019        r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
2020        switch (r) {
2021        case 0:
2022                if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
2023                        handle_unserviceable_bio(tc->pool, bio);
2024                        if (cell)
2025                                cell_defer_no_holder(tc, cell);
2026                } else {
2027                        inc_all_io_entry(tc->pool, bio);
2028                        remap_and_issue(tc, bio, lookup_result.block);
2029                        if (cell)
2030                                inc_remap_and_issue_cell(tc, cell, lookup_result.block);
2031                }
2032                break;
2033
2034        case -ENODATA:
2035                if (cell)
2036                        cell_defer_no_holder(tc, cell);
2037                if (rw != READ) {
2038                        handle_unserviceable_bio(tc->pool, bio);
2039                        break;
2040                }
2041
2042                if (tc->origin_dev) {
2043                        inc_all_io_entry(tc->pool, bio);
2044                        remap_to_origin_and_issue(tc, bio);
2045                        break;
2046                }
2047
2048                zero_fill_bio(bio);
2049                bio_endio(bio);
2050                break;
2051
2052        default:
2053                DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
2054                            __func__, r);
2055                if (cell)
2056                        cell_defer_no_holder(tc, cell);
2057                bio_io_error(bio);
2058                break;
2059        }
2060}
2061
2062static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
2063{
2064        __process_bio_read_only(tc, bio, NULL);
2065}
2066
2067static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2068{
2069        __process_bio_read_only(tc, cell->holder, cell);
2070}
2071
2072static void process_bio_success(struct thin_c *tc, struct bio *bio)
2073{
2074        bio_endio(bio);
2075}
2076
2077static void process_bio_fail(struct thin_c *tc, struct bio *bio)
2078{
2079        bio_io_error(bio);
2080}
2081
2082static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2083{
2084        cell_success(tc->pool, cell);
2085}
2086
2087static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2088{
2089        cell_error(tc->pool, cell);
2090}
2091
2092/*
2093 * FIXME: should we also commit due to size of transaction, measured in
2094 * metadata blocks?
2095 */
2096static int need_commit_due_to_time(struct pool *pool)
2097{
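        /*
         * time_in_range() uses wrap-safe jiffies comparisons, so this
         * returns true once COMMIT_PERIOD has elapsed since the last
         * commit, even across a jiffies wrap.
         */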
2098        return !time_in_range(jiffies, pool->last_commit_jiffies,
2099                              pool->last_commit_jiffies + COMMIT_PERIOD);
2100}
2101
2102#define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
2103#define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
2104
2105static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
2106{
2107        struct rb_node **rbp, *parent;
2108        struct dm_thin_endio_hook *pbd;
2109        sector_t bi_sector = bio->bi_iter.bi_sector;
2110
2111        rbp = &tc->sort_bio_list.rb_node;
2112        parent = NULL;
2113        while (*rbp) {
2114                parent = *rbp;
2115                pbd = thin_pbd(parent);
2116
2117                if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
2118                        rbp = &(*rbp)->rb_left;
2119                else
2120                        rbp = &(*rbp)->rb_right;
2121        }
2122
2123        pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2124        rb_link_node(&pbd->rb_node, parent, rbp);
2125        rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
2126}
2127
2128static void __extract_sorted_bios(struct thin_c *tc)
2129{
2130        struct rb_node *node;
2131        struct dm_thin_endio_hook *pbd;
2132        struct bio *bio;
2133
2134        for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
2135                pbd = thin_pbd(node);
2136                bio = thin_bio(pbd);
2137
2138                bio_list_add(&tc->deferred_bio_list, bio);
2139                rb_erase(&pbd->rb_node, &tc->sort_bio_list);
2140        }
2141
2142        WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
2143}
2144
2145static void __sort_thin_deferred_bios(struct thin_c *tc)
2146{
2147        struct bio *bio;
2148        struct bio_list bios;
2149
2150        bio_list_init(&bios);
2151        bio_list_merge(&bios, &tc->deferred_bio_list);
2152        bio_list_init(&tc->deferred_bio_list);
2153
2154        /* Sort deferred_bio_list using rb-tree */
2155        while ((bio = bio_list_pop(&bios)))
2156                __thin_bio_rb_add(tc, bio);
2157
2158        /*
2159         * Transfer the sorted bios in sort_bio_list back to
2160         * deferred_bio_list to allow lockless submission of
2161         * all bios.
2162         */
2163        __extract_sorted_bios(tc);
2164}
2165
2166static void process_thin_deferred_bios(struct thin_c *tc)
2167{
2168        struct pool *pool = tc->pool;
2169        struct bio *bio;
2170        struct bio_list bios;
2171        struct blk_plug plug;
2172        unsigned count = 0;
2173
2174        if (tc->requeue_mode) {
2175                error_thin_bio_list(tc, &tc->deferred_bio_list,
2176                                BLK_STS_DM_REQUEUE);
2177                return;
2178        }
2179
2180        bio_list_init(&bios);
2181
2182        spin_lock_irq(&tc->lock);
2183
2184        if (bio_list_empty(&tc->deferred_bio_list)) {
2185                spin_unlock_irq(&tc->lock);
2186                return;
2187        }
2188
2189        __sort_thin_deferred_bios(tc);
2190
2191        bio_list_merge(&bios, &tc->deferred_bio_list);
2192        bio_list_init(&tc->deferred_bio_list);
2193
2194        spin_unlock_irq(&tc->lock);
2195
2196        blk_start_plug(&plug);
2197        while ((bio = bio_list_pop(&bios))) {
2198                /*
2199                 * If we've got no free new_mapping structs, and processing
2200                 * this bio might require one, we pause until there are some
2201                 * prepared mappings to process.
2202                 */
2203                if (ensure_next_mapping(pool)) {
2204                        spin_lock_irq(&tc->lock);
2205                        bio_list_add(&tc->deferred_bio_list, bio);
2206                        bio_list_merge(&tc->deferred_bio_list, &bios);
2207                        spin_unlock_irq(&tc->lock);
2208                        break;
2209                }
2210
2211                if (bio_op(bio) == REQ_OP_DISCARD)
2212                        pool->process_discard(tc, bio);
2213                else
2214                        pool->process_bio(tc, bio);
2215
2216                if ((count++ & 127) == 0) {
2217                        throttle_work_update(&pool->throttle);
2218                        dm_pool_issue_prefetches(pool->pmd);
2219                }
2220        }
2221        blk_finish_plug(&plug);
2222}
2223
2224static int cmp_cells(const void *lhs, const void *rhs)
2225{
2226        struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
2227        struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
2228
2229        BUG_ON(!lhs_cell->holder);
2230        BUG_ON(!rhs_cell->holder);
2231
2232        if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
2233                return -1;
2234
2235        if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
2236                return 1;
2237
2238        return 0;
2239}
2240
2241static unsigned sort_cells(struct pool *pool, struct list_head *cells)
2242{
2243        unsigned count = 0;
2244        struct dm_bio_prison_cell *cell, *tmp;
2245
2246        list_for_each_entry_safe(cell, tmp, cells, user_list) {
2247                if (count >= CELL_SORT_ARRAY_SIZE)
2248                        break;
2249
2250                pool->cell_sort_array[count++] = cell;
2251                list_del(&cell->user_list);
2252        }
2253
2254        sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
2255
2256        return count;
2257}
2258
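/*
 * The comparator above orders cells by their holder's starting sector so
 * that deferred IO is issued in roughly ascending LBA order.  A minimal,
 * hypothetical userspace sketch of the same "sort an array of pointers
 * with a comparator" technique (using qsort() instead of the kernel's
 * sort(); not part of this driver and kept out of the build):
 */
#if 0
#include <stdio.h>
#include <stdlib.h>

struct fake_cell {
        unsigned long long bi_sector;   /* stand-in for holder->bi_iter.bi_sector */
};

static int cmp_fake_cells(const void *lhs, const void *rhs)
{
        const struct fake_cell *l = *(const struct fake_cell * const *)lhs;
        const struct fake_cell *r = *(const struct fake_cell * const *)rhs;

        if (l->bi_sector < r->bi_sector)
                return -1;
        if (l->bi_sector > r->bi_sector)
                return 1;
        return 0;
}

int main(void)
{
        struct fake_cell a = { 300 }, b = { 100 }, c = { 200 };
        struct fake_cell *sort_array[] = { &a, &b, &c };        /* like pool->cell_sort_array */
        int i;

        /* Sort the array of pointers by starting sector. */
        qsort(sort_array, 3, sizeof(sort_array[0]), cmp_fake_cells);

        for (i = 0; i < 3; i++)
                printf("%llu\n", sort_array[i]->bi_sector);     /* prints 100, 200, 300 */
        return 0;
}
#endif
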
2259static void process_thin_deferred_cells(struct thin_c *tc)
2260{
2261        struct pool *pool = tc->pool;
2262        struct list_head cells;
2263        struct dm_bio_prison_cell *cell;
2264        unsigned i, j, count;
2265
2266        INIT_LIST_HEAD(&cells);
2267
2268        spin_lock_irq(&tc->lock);
2269        list_splice_init(&tc->deferred_cells, &cells);
2270        spin_unlock_irq(&tc->lock);
2271
2272        if (list_empty(&cells))
2273                return;
2274
2275        do {
2276                count = sort_cells(tc->pool, &cells);
2277
2278                for (i = 0; i < count; i++) {
2279                        cell = pool->cell_sort_array[i];
2280                        BUG_ON(!cell->holder);
2281
2282                        /*
2283                         * If we've got no free new_mapping structs, and processing
2284                         * this bio might require one, we pause until there are some
2285                         * prepared mappings to process.
2286                         */
2287                        if (ensure_next_mapping(pool)) {
2288                                for (j = i; j < count; j++)
2289                                        list_add(&pool->cell_sort_array[j]->user_list, &cells);
2290
2291                                spin_lock_irq(&tc->lock);
2292                                list_splice(&cells, &tc->deferred_cells);
2293                                spin_unlock_irq(&tc->lock);
2294                                return;
2295                        }
2296
2297                        if (bio_op(cell->holder) == REQ_OP_DISCARD)
2298                                pool->process_discard_cell(tc, cell);
2299                        else
2300                                pool->process_cell(tc, cell);
2301                }
2302        } while (!list_empty(&cells));
2303}
2304
2305static void thin_get(struct thin_c *tc);
2306static void thin_put(struct thin_c *tc);
2307
2308/*
2309 * We can't hold rcu_read_lock() around code that can block.  So we
2310 * find a thin with the rcu lock held; bump a refcount; then drop
2311 * the lock.
2312 */
2313static struct thin_c *get_first_thin(struct pool *pool)
2314{
2315        struct thin_c *tc = NULL;
2316
2317        rcu_read_lock();
2318        if (!list_empty(&pool->active_thins)) {
2319                tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
2320                thin_get(tc);
2321        }
2322        rcu_read_unlock();
2323
2324        return tc;
2325}
2326
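/*
 * Advance the cursor: take a reference on the next thin before dropping
 * the reference on the current one, so the list position stays valid even
 * though the caller may sleep between calls.
 */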
2327static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
2328{
2329        struct thin_c *old_tc = tc;
2330
2331        rcu_read_lock();
2332        list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
2333                thin_get(tc);
2334                thin_put(old_tc);
2335                rcu_read_unlock();
2336                return tc;
2337        }
2338        thin_put(old_tc);
2339        rcu_read_unlock();
2340
2341        return NULL;
2342}
2343
2344static void process_deferred_bios(struct pool *pool)
2345{
2346        struct bio *bio;
2347        struct bio_list bios, bio_completions;
2348        struct thin_c *tc;
2349
2350        tc = get_first_thin(pool);
2351        while (tc) {
2352                process_thin_deferred_cells(tc);
2353                process_thin_deferred_bios(tc);
2354                tc = get_next_thin(pool, tc);
2355        }
2356
2357        /*
2358         * If there are any deferred flush bios, we must commit the metadata
2359         * before issuing them or signaling their completion.
2360         */
2361        bio_list_init(&bios);
2362        bio_list_init(&bio_completions);
2363
2364        spin_lock_irq(&pool->lock);
2365        bio_list_merge(&bios, &pool->deferred_flush_bios);
2366        bio_list_init(&pool->deferred_flush_bios);
2367
2368        bio_list_merge(&bio_completions, &pool->deferred_flush_completions);
2369        bio_list_init(&pool->deferred_flush_completions);
2370        spin_unlock_irq(&pool->lock);
2371
2372        if (bio_list_empty(&bios) && bio_list_empty(&bio_completions) &&
2373            !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
2374                return;
2375
2376        if (commit(pool)) {
2377                bio_list_merge(&bios, &bio_completions);
2378
2379                while ((bio = bio_list_pop(&bios)))
2380                        bio_io_error(bio);
2381                return;
2382        }
2383        pool->last_commit_jiffies = jiffies;
2384
2385        while ((bio = bio_list_pop(&bio_completions)))
2386                bio_endio(bio);
2387
2388        while ((bio = bio_list_pop(&bios))) {
2389                /*
2390                 * The data device was flushed as part of metadata commit,
2391                 * so complete redundant flushes immediately.
2392                 */
2393                if (bio->bi_opf & REQ_PREFLUSH)
2394                        bio_endio(bio);
2395                else
2396                        submit_bio_noacct(bio);
2397        }
2398}
2399
2400static void do_worker(struct work_struct *ws)
2401{
2402        struct pool *pool = container_of(ws, struct pool, worker);
2403
2404        throttle_work_start(&pool->throttle);
2405        dm_pool_issue_prefetches(pool->pmd);
2406        throttle_work_update(&pool->throttle);
2407        process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
2408        throttle_work_update(&pool->throttle);
2409        process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
2410        throttle_work_update(&pool->throttle);
2411        process_prepared(pool, &pool->prepared_discards_pt2, &pool->process_prepared_discard_pt2);
2412        throttle_work_update(&pool->throttle);
2413        process_deferred_bios(pool);
2414        throttle_work_complete(&pool->throttle);
2415}
2416
2417/*
2418 * We want to commit periodically so that not too much
2419 * unwritten data builds up.
2420 */
2421static void do_waker(struct work_struct *ws)
2422{
2423        struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2424        wake_worker(pool);
2425        queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2426}
2427
2428/*
2429 * We're holding onto IO to allow userland time to react.  After the
2430 * timeout either the pool will have been resized (and thus back in
2431 * PM_WRITE mode), or we stay in PM_OUT_OF_DATA_SPACE with error_if_no_space set.
2432 */
2433static void do_no_space_timeout(struct work_struct *ws)
2434{
2435        struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2436                                         no_space_timeout);
2437
2438        if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) {
2439                pool->pf.error_if_no_space = true;
2440                notify_of_pool_mode_change(pool);
2441                error_retry_list_with_code(pool, BLK_STS_NOSPC);
2442        }
2443}
2444
2445/*----------------------------------------------------------------*/
2446
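/*
 * Run a function on the pool's workqueue and wait for it to complete.
 * Used below (noflush_work) to flip a thin device's requeue_mode from
 * the worker context.
 */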
2447struct pool_work {
2448        struct work_struct worker;
2449        struct completion complete;
2450};
2451
2452static struct pool_work *to_pool_work(struct work_struct *ws)
2453{
2454        return container_of(ws, struct pool_work, worker);
2455}
2456
2457static void pool_work_complete(struct pool_work *pw)
2458{
2459        complete(&pw->complete);
2460}
2461
2462static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2463                           void (*fn)(struct work_struct *))
2464{
2465        INIT_WORK_ONSTACK(&pw->worker, fn);
2466        init_completion(&pw->complete);
2467        queue_work(pool->wq, &pw->worker);
2468        wait_for_completion(&pw->complete);
2469}
2470
2471/*----------------------------------------------------------------*/
2472
2473struct noflush_work {
2474        struct pool_work pw;
2475        struct thin_c *tc;
2476};
2477
2478static struct noflush_work *to_noflush(struct work_struct *ws)
2479{
2480        return container_of(to_pool_work(ws), struct noflush_work, pw);
2481}
2482
2483static void do_noflush_start(struct work_struct *ws)
2484{
2485        struct noflush_work *w = to_noflush(ws);
2486        w->tc->requeue_mode = true;
2487        requeue_io(w->tc);
2488        pool_work_complete(&w->pw);
2489}
2490
2491static void do_noflush_stop(struct work_struct *ws)
2492{
2493        struct noflush_work *w = to_noflush(ws);
2494        w->tc->requeue_mode = false;
2495        pool_work_complete(&w->pw);
2496}
2497
2498static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2499{
2500        struct noflush_work w;
2501
2502        w.tc = tc;
2503        pool_work_wait(&w.pw, tc->pool, fn);
2504}
2505
2506/*----------------------------------------------------------------*/
2507
2508static bool passdown_enabled(struct pool_c *pt)
2509{
2510        return pt->adjusted_pf.discard_passdown;
2511}
2512
2513static void set_discard_callbacks(struct pool *pool)
2514{
2515        struct pool_c *pt = pool->ti->private;
2516
2517        if (passdown_enabled(pt)) {
2518                pool->process_discard_cell = process_discard_cell_passdown;
2519                pool->process_prepared_discard = process_prepared_discard_passdown_pt1;
2520                pool->process_prepared_discard_pt2 = process_prepared_discard_passdown_pt2;
2521        } else {
2522                pool->process_discard_cell = process_discard_cell_no_passdown;
2523                pool->process_prepared_discard = process_prepared_discard_no_passdown;
2524        }
2525}
2526
2527static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2528{
2529        struct pool_c *pt = pool->ti->private;
2530        bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2531        enum pool_mode old_mode = get_pool_mode(pool);
2532        unsigned long no_space_timeout = READ_ONCE(no_space_timeout_secs) * HZ;
2533
2534        /*
2535         * Never allow the pool to transition to PM_WRITE mode if user
2536         * intervention is required to verify metadata and data consistency.
2537         */
2538        if (new_mode == PM_WRITE && needs_check) {
2539                DMERR("%s: unable to switch pool to write mode until repaired.",
2540                      dm_device_name(pool->pool_md));
2541                if (old_mode != new_mode)
2542                        new_mode = old_mode;
2543                else
2544                        new_mode = PM_READ_ONLY;
2545        }
2546        /*
2547         * If we were in PM_FAIL mode, rollback of metadata failed.  We're
2548         * not going to recover without a thin_repair.  So we never let the
2549         * pool move out of the old mode.
2550         */
2551        if (old_mode == PM_FAIL)
2552                new_mode = old_mode;
2553
2554        switch (new_mode) {
2555        case PM_FAIL:
2556                dm_pool_metadata_read_only(pool->pmd);
2557                pool->process_bio = process_bio_fail;
2558                pool->process_discard = process_bio_fail;
2559                pool->process_cell = process_cell_fail;
2560                pool->process_discard_cell = process_cell_fail;
2561                pool->process_prepared_mapping = process_prepared_mapping_fail;
2562                pool->process_prepared_discard = process_prepared_discard_fail;
2563
2564                error_retry_list(pool);
2565                break;
2566
2567        case PM_OUT_OF_METADATA_SPACE:
2568        case PM_READ_ONLY:
2569                dm_pool_metadata_read_only(pool->pmd);
2570                pool->process_bio = process_bio_read_only;
2571                pool->process_discard = process_bio_success;
2572                pool->process_cell = process_cell_read_only;
2573                pool->process_discard_cell = process_cell_success;
2574                pool->process_prepared_mapping = process_prepared_mapping_fail;
2575                pool->process_prepared_discard = process_prepared_discard_success;
2576
2577                error_retry_list(pool);
2578                break;
2579
2580        case PM_OUT_OF_DATA_SPACE:
2581                /*
2582                 * Ideally we'd never hit this state; the low water mark
2583                 * would trigger userland to extend the pool before we
2584                 * completely run out of data space.  However, many small
2585                 * IOs to unprovisioned space can consume data space at an
2586                 * alarming rate.  Adjust your low water mark if you're
2587                 * frequently seeing this mode.
2588                 */
2589                pool->out_of_data_space = true;
2590                pool->process_bio = process_bio_read_only;
2591                pool->process_discard = process_discard_bio;
2592                pool->process_cell = process_cell_read_only;
2593                pool->process_prepared_mapping = process_prepared_mapping;
2594                set_discard_callbacks(pool);
2595
2596                if (!pool->pf.error_if_no_space && no_space_timeout)
2597                        queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
2598                break;
2599
2600        case PM_WRITE:
2601                if (old_mode == PM_OUT_OF_DATA_SPACE)
2602                        cancel_delayed_work_sync(&pool->no_space_timeout);
2603                pool->out_of_data_space = false;
2604                pool->pf.error_if_no_space = pt->requested_pf.error_if_no_space;
2605                dm_pool_metadata_read_write(pool->pmd);
2606                pool->process_bio = process_bio;
2607                pool->process_discard = process_discard_bio;
2608                pool->process_cell = process_cell;
2609                pool->process_prepared_mapping = process_prepared_mapping;
2610                set_discard_callbacks(pool);
2611                break;
2612        }
2613
2614        pool->pf.mode = new_mode;
2615        /*
2616         * The pool mode may have changed; sync it so bind_control_target()
2617         * doesn't cause an unexpected mode transition on resume.
2618         */
2619        pt->adjusted_pf.mode = new_mode;
2620
2621        if (old_mode != new_mode)
2622                notify_of_pool_mode_change(pool);
2623}
2624
2625static void abort_transaction(struct pool *pool)
2626{
2627        const char *dev_name = dm_device_name(pool->pool_md);
2628
2629        DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2630        if (dm_pool_abort_metadata(pool->pmd)) {
2631                DMERR("%s: failed to abort metadata transaction", dev_name);
2632                set_pool_mode(pool, PM_FAIL);
2633        }
2634
2635        if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2636                DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2637                set_pool_mode(pool, PM_FAIL);
2638        }
2639}
2640
2641static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2642{
2643        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2644                    dm_device_name(pool->pool_md), op, r);
2645
2646        abort_transaction(pool);
2647        set_pool_mode(pool, PM_READ_ONLY);
2648}
2649
2650/*----------------------------------------------------------------*/
2651
2652/*
2653 * Mapping functions.
2654 */
2655
2656/*
2657 * Called only while mapping a thin bio to hand it over to the workqueue.
2658 */
2659static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2660{
2661        struct pool *pool = tc->pool;
2662
2663        spin_lock_irq(&tc->lock);
2664        bio_list_add(&tc->deferred_bio_list, bio);
2665        spin_unlock_irq(&tc->lock);
2666
2667        wake_worker(pool);
2668}
2669
2670static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2671{
2672        struct pool *pool = tc->pool;
2673
2674        throttle_lock(&pool->throttle);
2675        thin_defer_bio(tc, bio);
2676        throttle_unlock(&pool->throttle);
2677}
2678
2679static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2680{
2681        struct pool *pool = tc->pool;
2682
2683        throttle_lock(&pool->throttle);
2684        spin_lock_irq(&tc->lock);
2685        list_add_tail(&cell->user_list, &tc->deferred_cells);
2686        spin_unlock_irq(&tc->lock);
2687        throttle_unlock(&pool->throttle);
2688
2689        wake_worker(pool);
2690}
2691
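/*
 * Initialise the per-bio hook state; the deferred-set entries and cell
 * are filled in later as the bio is processed.
 */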
2692static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2693{
2694        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2695
2696        h->tc = tc;
2697        h->shared_read_entry = NULL;
2698        h->all_io_entry = NULL;
2699        h->overwrite_mapping = NULL;
2700        h->cell = NULL;
2701}
2702
2703/*
2704 * Non-blocking function called from the thin target's map function.
2705 */
2706static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2707{
2708        int r;
2709        struct thin_c *tc = ti->private;
2710        dm_block_t block = get_bio_block(tc, bio);
2711        struct dm_thin_device *td = tc->td;
2712        struct dm_thin_lookup_result result;
2713        struct dm_bio_prison_cell *virt_cell, *data_cell;
2714        struct dm_cell_key key;
2715
2716        thin_hook_bio(tc, bio);
2717
2718        if (tc->requeue_mode) {
2719                bio->bi_status = BLK_STS_DM_REQUEUE;
2720                bio_endio(bio);
2721                return DM_MAPIO_SUBMITTED;
2722        }
2723
2724        if (get_pool_mode(tc->pool) == PM_FAIL) {
2725                bio_io_error(bio);
2726                return DM_MAPIO_SUBMITTED;
2727        }
2728
2729        if (op_is_flush(bio->bi_opf) || bio_op(bio) == REQ_OP_DISCARD) {
2730                thin_defer_bio_with_throttle(tc, bio);
2731                return DM_MAPIO_SUBMITTED;
2732        }
2733
2734        /*
2735         * We must hold the virtual cell before doing the lookup, otherwise
2736         * there's a race with discard.
2737         */
2738        build_virtual_key(tc->td, block, &key);
2739        if (bio_detain(tc->pool, &key, bio, &virt_cell))
2740                return DM_MAPIO_SUBMITTED;
2741
2742        r = dm_thin_find_block(td, block, 0, &result);
2743
2744        /*
2745         * Note that we defer readahead too.
2746         */
2747        switch (r) {
2748        case 0:
2749                if (unlikely(result.shared)) {
2750                        /*
2751                         * We have a race condition here between the
2752                         * result.shared value returned by the lookup and
2753                         * snapshot creation, which may cause new
2754                         * sharing.
2755                         *
2756                         * To avoid this, always quiesce the origin before
2757                         * taking the snap.  You want to do this anyway to
2758                         * ensure a consistent application view
2759                         * (i.e. lockfs).
2760                         *
2761                         * More distant ancestors are irrelevant. The
2762                         * shared flag will be set in their case.
2763                         */
2764                        thin_defer_cell(tc, virt_cell);
2765                        return DM_MAPIO_SUBMITTED;
2766                }
2767
2768                build_data_key(tc->td, result.block, &key);
2769                if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2770                        cell_defer_no_holder(tc, virt_cell);
2771                        return DM_MAPIO_SUBMITTED;
2772                }
2773
2774                inc_all_io_entry(tc->pool, bio);
2775                cell_defer_no_holder(tc, data_cell);
2776                cell_defer_no_holder(tc, virt_cell);
2777
2778                remap(tc, bio, result.block);
2779                return DM_MAPIO_REMAPPED;
2780
2781        case -ENODATA:
2782        case -EWOULDBLOCK:
2783                thin_defer_cell(tc, virt_cell);
2784                return DM_MAPIO_SUBMITTED;
2785
2786        default:
2787                /*
2788                 * Must always call bio_io_error on failure.
2789                 * dm_thin_find_block can fail with -EINVAL if the
2790                 * pool is switched to fail-io mode.
2791                 */
2792                bio_io_error(bio);
2793                cell_defer_no_holder(tc, virt_cell);
2794                return DM_MAPIO_SUBMITTED;
2795        }
2796}
2797
2798static void requeue_bios(struct pool *pool)
2799{
2800        struct thin_c *tc;
2801
2802        rcu_read_lock();
2803        list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2804                spin_lock_irq(&tc->lock);
2805                bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2806                bio_list_init(&tc->retry_on_resume_list);
2807                spin_unlock_irq(&tc->lock);
2808        }
2809        rcu_read_unlock();
2810}
2811
2812/*----------------------------------------------------------------
2813 * Binding of control targets to a pool object
2814 *--------------------------------------------------------------*/
2815static bool data_dev_supports_discard(struct pool_c *pt)
2816{
2817        struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2818
2819        return blk_queue_discard(q);
2820}
2821
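/*
 * Returns true when block_size is an exact multiple of n, e.g.
 * is_factor(128, 8) is true and is_factor(128, 6) is false.
 */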
2822static bool is_factor(sector_t block_size, uint32_t n)
2823{
2824        return !sector_div(block_size, n);
2825}
2826
2827/*
2828 * If discard_passdown was enabled, verify that the data device
2829 * supports discards.  Disable discard_passdown if not.
2830 */
2831static void disable_passdown_if_not_supported(struct pool_c *pt)
2832{
2833        struct pool *pool = pt->pool;
2834        struct block_device *data_bdev = pt->data_dev->bdev;
2835        struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2836        const char *reason = NULL;
2837        char buf[BDEVNAME_SIZE];
2838
2839        if (!pt->adjusted_pf.discard_passdown)
2840                return;
2841
2842        if (!data_dev_supports_discard(pt))
2843                reason = "discard unsupported";
2844
2845        else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2846                reason = "max discard sectors smaller than a block";
2847
2848        if (reason) {
2849                DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2850                pt->adjusted_pf.discard_passdown = false;
2851        }
2852}
2853
2854static int bind_control_target(struct pool *pool, struct dm_target *ti)
2855{
2856        struct pool_c *pt = ti->private;
2857
2858        /*
2859         * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2860         */
2861        enum pool_mode old_mode = get_pool_mode(pool);
2862        enum pool_mode new_mode = pt->adjusted_pf.mode;
2863
2864        /*
2865         * Don't change the pool's mode until set_pool_mode() below.
2866         * Otherwise the pool's process_* function pointers may
2867         * not match the desired pool mode.
2868         */
2869        pt->adjusted_pf.mode = old_mode;
2870
2871        pool->ti = ti;
2872        pool->pf = pt->adjusted_pf;
2873        pool->low_water_blocks = pt->low_water_blocks;
2874
2875        set_pool_mode(pool, new_mode);
2876
2877        return 0;
2878}
2879
2880static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2881{
2882        if (pool->ti == ti)
2883                pool->ti = NULL;
2884}
2885
2886/*----------------------------------------------------------------
2887 * Pool creation
2888 *--------------------------------------------------------------*/
2889/* Initialize pool features. */
2890static void pool_features_init(struct pool_features *pf)
2891{
2892        pf->mode = PM_WRITE;
2893        pf->zero_new_blocks = true;
2894        pf->discard_enabled = true;
2895        pf->discard_passdown = true;
2896        pf->error_if_no_space = false;
2897}
2898
2899static void __pool_destroy(struct pool *pool)
2900{
2901        __pool_table_remove(pool);
2902
2903        vfree(pool->cell_sort_array);
2904        if (dm_pool_metadata_close(pool->pmd) < 0)
2905                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2906
2907        dm_bio_prison_destroy(pool->prison);
2908        dm_kcopyd_client_destroy(pool->copier);
2909
2910        if (pool->wq)
2911                destroy_workqueue(pool->wq);
2912
2913        if (pool->next_mapping)
2914                mempool_free(pool->next_mapping, &pool->mapping_pool);
2915        mempool_exit(&pool->mapping_pool);
2916        bio_uninit(&pool->flush_bio);
2917        dm_deferred_set_destroy(pool->shared_read_ds);
2918        dm_deferred_set_destroy(pool->all_io_ds);
2919        kfree(pool);
2920}
2921
2922static struct kmem_cache *_new_mapping_cache;
2923
2924static struct pool *pool_create(struct mapped_device *pool_md,
2925                                struct block_device *metadata_dev,
2926                                struct block_device *data_dev,
2927                                unsigned long block_size,
2928                                int read_only, char **error)
2929{
2930        int r;
2931        void *err_p;
2932        struct pool *pool;
2933        struct dm_pool_metadata *pmd;
2934        bool format_device = read_only ? false : true;
2935
2936        pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2937        if (IS_ERR(pmd)) {
2938                *error = "Error creating metadata object";
2939                return (struct pool *)pmd;
2940        }
2941
2942        pool = kzalloc(sizeof(*pool), GFP_KERNEL);
2943        if (!pool) {
2944                *error = "Error allocating memory for pool";
2945                err_p = ERR_PTR(-ENOMEM);
2946                goto bad_pool;
2947        }
2948
2949        pool->pmd = pmd;
2950        pool->sectors_per_block = block_size;
2951        if (block_size & (block_size - 1))
2952                pool->sectors_per_block_shift = -1;
2953        else
2954                pool->sectors_per_block_shift = __ffs(block_size);
2955        pool->low_water_blocks = 0;
2956        pool_features_init(&pool->pf);
2957        pool->prison = dm_bio_prison_create();
2958        if (!pool->prison) {
2959                *error = "Error creating pool's bio prison";
2960                err_p = ERR_PTR(-ENOMEM);
2961                goto bad_prison;
2962        }
2963
2964        pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2965        if (IS_ERR(pool->copier)) {
2966                r = PTR_ERR(pool->copier);
2967                *error = "Error creating pool's kcopyd client";
2968                err_p = ERR_PTR(r);
2969                goto bad_kcopyd_client;
2970        }
2971
2972        /*
2973         * Create a single-threaded workqueue that will service all devices
2974         * that use this metadata device.
2975         */
2976        pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2977        if (!pool->wq) {
2978                *error = "Error creating pool's workqueue";
2979                err_p = ERR_PTR(-ENOMEM);
2980                goto bad_wq;
2981        }
2982
2983        throttle_init(&pool->throttle);
2984        INIT_WORK(&pool->worker, do_worker);
2985        INIT_DELAYED_WORK(&pool->waker, do_waker);
2986        INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2987        spin_lock_init(&pool->lock);
2988        bio_list_init(&pool->deferred_flush_bios);
2989        bio_list_init(&pool->deferred_flush_completions);
2990        INIT_LIST_HEAD(&pool->prepared_mappings);
2991        INIT_LIST_HEAD(&pool->prepared_discards);
2992        INIT_LIST_HEAD(&pool->prepared_discards_pt2);
2993        INIT_LIST_HEAD(&pool->active_thins);
2994        pool->low_water_triggered = false;
2995        pool->suspended = true;
2996        pool->out_of_data_space = false;
2997        bio_init(&pool->flush_bio, NULL, 0);
2998
2999        pool->shared_read_ds = dm_deferred_set_create();
3000        if (!pool->shared_read_ds) {
3001                *error = "Error creating pool's shared read deferred set";
3002                err_p = ERR_PTR(-ENOMEM);
3003                goto bad_shared_read_ds;
3004        }
3005
3006        pool->all_io_ds = dm_deferred_set_create();
3007        if (!pool->all_io_ds) {
3008                *error = "Error creating pool's all io deferred set";
3009                err_p = ERR_PTR(-ENOMEM);
3010                goto bad_all_io_ds;
3011        }
3012
3013        pool->next_mapping = NULL;
3014        r = mempool_init_slab_pool(&pool->mapping_pool, MAPPING_POOL_SIZE,
3015                                   _new_mapping_cache);
3016        if (r) {
3017                *error = "Error creating pool's mapping mempool";
3018                err_p = ERR_PTR(r);
3019                goto bad_mapping_pool;
3020        }
3021
3022        pool->cell_sort_array =
3023                vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
3024                                   sizeof(*pool->cell_sort_array)));
3025        if (!pool->cell_sort_array) {
3026                *error = "Error allocating cell sort array";
3027                err_p = ERR_PTR(-ENOMEM);
3028                goto bad_sort_array;
3029        }
3030
3031        pool->ref_count = 1;
3032        pool->last_commit_jiffies = jiffies;
3033        pool->pool_md = pool_md;
3034        pool->md_dev = metadata_dev;
3035        pool->data_dev = data_dev;
3036        __pool_table_insert(pool);
3037
3038        return pool;
3039
3040bad_sort_array:
3041        mempool_exit(&pool->mapping_pool);
3042bad_mapping_pool:
3043        dm_deferred_set_destroy(pool->all_io_ds);
3044bad_all_io_ds:
3045        dm_deferred_set_destroy(pool->shared_read_ds);
3046bad_shared_read_ds:
3047        destroy_workqueue(pool->wq);
3048bad_wq:
3049        dm_kcopyd_client_destroy(pool->copier);
3050bad_kcopyd_client:
3051        dm_bio_prison_destroy(pool->prison);
3052bad_prison:
3053        kfree(pool);
3054bad_pool:
3055        if (dm_pool_metadata_close(pmd))
3056                DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
3057
3058        return err_p;
3059}
3060
3061static void __pool_inc(struct pool *pool)
3062{
3063        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3064        pool->ref_count++;
3065}
3066
3067static void __pool_dec(struct pool *pool)
3068{
3069        BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
3070        BUG_ON(!pool->ref_count);
3071        if (!--pool->ref_count)
3072                __pool_destroy(pool);
3073}
3074
3075static struct pool *__pool_find(struct mapped_device *pool_md,
3076                                struct block_device *metadata_dev,
3077                                struct block_device *data_dev,
3078                                unsigned long block_size, int read_only,
3079                                char **error, int *created)
3080{
3081        struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
3082
3083        if (pool) {
3084                if (pool->pool_md != pool_md) {
3085                        *error = "metadata device already in use by a pool";
3086                        return ERR_PTR(-EBUSY);
3087                }
3088                if (pool->data_dev != data_dev) {
3089                        *error = "data device already in use by a pool";
3090                        return ERR_PTR(-EBUSY);
3091                }
3092                __pool_inc(pool);
3093
3094        } else {
3095                pool = __pool_table_lookup(pool_md);
3096                if (pool) {
3097                        if (pool->md_dev != metadata_dev || pool->data_dev != data_dev) {
3098                                *error = "different pool cannot replace a pool";
3099                                return ERR_PTR(-EINVAL);
3100                        }
3101                        __pool_inc(pool);
3102
3103                } else {
3104                        pool = pool_create(pool_md, metadata_dev, data_dev, block_size, read_only, error);
3105                        *created = 1;
3106                }
3107        }
3108
3109        return pool;
3110}
3111
3112/*----------------------------------------------------------------
3113 * Pool target methods
3114 *--------------------------------------------------------------*/
3115static void pool_dtr(struct dm_target *ti)
3116{
3117        struct pool_c *pt = ti->private;
3118
3119        mutex_lock(&dm_thin_pool_table.mutex);
3120
3121        unbind_control_target(pt->pool, ti);
3122        __pool_dec(pt->pool);
3123        dm_put_device(ti, pt->metadata_dev);
3124        dm_put_device(ti, pt->data_dev);
3125        kfree(pt);
3126
3127        mutex_unlock(&dm_thin_pool_table.mutex);
3128}
3129
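/*
 * Parse the optional pool feature arguments.  For example (hypothetical
 * devices), a pool table line with one feature argument might look like:
 *
 *   0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768 1 skip_block_zeroing
 *
 * i.e. <metadata dev> <data dev> <block size> <low water mark> <#features> <feature>*.
 */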
3130static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
3131                               struct dm_target *ti)
3132{
3133        int r;
3134        unsigned argc;
3135        const char *arg_name;
3136
3137        static const struct dm_arg _args[] = {
3138                {0, 4, "Invalid number of pool feature arguments"},
3139        };
3140
3141        /*
3142         * No feature arguments supplied.
3143         */
3144        if (!as->argc)
3145                return 0;
3146
3147        r = dm_read_arg_group(_args, as, &argc, &ti->error);
3148        if (r)
3149                return -EINVAL;
3150
3151        while (argc && !r) {
3152                arg_name = dm_shift_arg(as);
3153                argc--;
3154
3155                if (!strcasecmp(arg_name, "skip_block_zeroing"))
3156                        pf->zero_new_blocks = false;
3157
3158                else if (!strcasecmp(arg_name, "ignore_discard"))
3159                        pf->discard_enabled = false;
3160
3161                else if (!strcasecmp(arg_name, "no_discard_passdown"))
3162                        pf->discard_passdown = false;
3163
3164                else if (!strcasecmp(arg_name, "read_only"))
3165                        pf->mode = PM_READ_ONLY;
3166
3167                else if (!strcasecmp(arg_name, "error_if_no_space"))
3168                        pf->error_if_no_space = true;
3169
3170                else {
3171                        ti->error = "Unrecognised pool feature requested";
3172                        r = -EINVAL;
3173                        break;
3174                }
3175        }
3176
3177        return r;
3178}
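/*
 * For illustration only (hypothetical input): a feature group of
 * "2 ignore_discard read_only" clears pf->discard_enabled and sets
 * pf->mode to PM_READ_ONLY, while zero_new_blocks and error_if_no_space
 * keep the defaults set up by pool_features_init().
 */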
3179
3180static void metadata_low_callback(void *context)
3181{
3182        struct pool *pool = context;
3183
3184        DMWARN("%s: reached low water mark for metadata device: sending event.",
3185               dm_device_name(pool->pool_md));
3186
3187        dm_table_event(pool->ti->table);
3188}
3189
3190/*
3191 * We need to flush the data device **before** committing the metadata.
3192 *
3193 * This ensures that the data blocks of any newly inserted mappings are
3194 * properly written to non-volatile storage and won't be lost in case of a
3195 * crash.
3196 *
3197 * Failure to do so can result in data corruption in the case of internal or
3198 * external snapshots and in the case of newly provisioned blocks, when block
3199 * zeroing is enabled.
3200 */
3201static int metadata_pre_commit_callback(void *context)
3202{
3203        struct pool *pool = context;
3204        struct bio *flush_bio = &pool->flush_bio;
3205
3206        bio_reset(flush_bio);
3207        bio_set_dev(flush_bio, pool->data_dev);
3208        flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
3209
3210        return submit_bio_wait(flush_bio);
3211}
3212
3213static sector_t get_dev_size(struct block_device *bdev)
3214{
3215        return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
3216}
3217
3218static void warn_if_metadata_device_too_big(struct block_device *bdev)
3219{
3220        sector_t metadata_dev_size = get_dev_size(bdev);
3221        char buffer[BDEVNAME_SIZE];
3222
3223        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
3224                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
3225                       bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
3226}
3227
3228static sector_t get_metadata_dev_size(struct block_device *bdev)
3229{
3230        sector_t metadata_dev_size = get_dev_size(bdev);
3231
3232        if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
3233                metadata_dev_size = THIN_METADATA_MAX_SECTORS;
3234
3235        return metadata_dev_size;
3236}
3237
3238static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
3239{
3240        sector_t metadata_dev_size = get_metadata_dev_size(bdev);
3241
3242        sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
3243
3244        return metadata_dev_size;
3245}
3246
3247/*
3248 * When a metadata threshold is crossed a dm event is triggered, and
3249 * userland should respond by growing the metadata device.  We could let
3250 * userland set the threshold, like we do with the data threshold, but I'm
3251 * not sure they know enough to do this well.
3252 */
3253static dm_block_t calc_metadata_threshold(struct pool_c *pt)
3254{
3255        /*
3256         * 4M is ample for all ops with the possible exception of thin
3257         * device deletion which is harmless if it fails (just retry the
3258         * delete after you've grown the device).
3259         */
3260        dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
3261        return min((dm_block_t)1024ULL /* 4M */, quarter);
3262}
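/*
 * Worked example (illustrative sizes, assuming 4KiB metadata blocks):
 * a 1GiB metadata device is ~262144 blocks, a quarter of which is 65536,
 * so the threshold is capped at 1024 blocks (4M).  A 2MiB metadata device
 * is only 512 blocks, giving a threshold of 128 blocks instead.
 */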
3263
3264/*
3265 * thin-pool <metadata dev> <data dev>
3266 *           <data block size (sectors)>
3267 *           <low water mark (blocks)>
3268 *           [<#feature args> [<arg>]*]
3269 *
3270 * Optional feature arguments are:
3271 *           skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
3272 *           ignore_discard: disable discard
3273 *           no_discard_passdown: don't pass discards down to the data device
3274 *           read_only: Don't allow any changes to be made to the pool metadata.
3275 *           error_if_no_space: error IOs, instead of queueing, if no space.
3276 */
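/*
 * Illustrative table line (hypothetical devices and sizes): a 10GiB data
 * device with 64KiB (128 sector) blocks and a low water mark of 32768
 * blocks, with no feature arguments:
 *
 *   dmsetup create pool --table \
 *     "0 20971520 thin-pool /dev/mapper/meta /dev/mapper/data 128 32768"
 */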
3277static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
3278{
3279        int r, pool_created = 0;
3280        struct pool_c *pt;
3281        struct pool *pool;
3282        struct pool_features pf;
3283        struct dm_arg_set as;
3284        struct dm_dev *data_dev;
3285        unsigned long block_size;
3286        dm_block_t low_water_blocks;
3287        struct dm_dev *metadata_dev;
3288        fmode_t metadata_mode;
3289
3290        /*
3291         * FIXME Remove validation from scope of lock.
3292         */
3293        mutex_lock(&dm_thin_pool_table.mutex);
3294
3295        if (argc < 4) {
3296                ti->error = "Invalid argument count";
3297                r = -EINVAL;
3298                goto out_unlock;
3299        }
3300
3301        as.argc = argc;
3302        as.argv = argv;
3303
3304        /* make sure metadata and data are different devices */
3305        if (!strcmp(argv[0], argv[1])) {
3306                ti->error = "Error setting metadata or data device";
3307                r = -EINVAL;
3308                goto out_unlock;
3309        }
3310
3311        /*
3312         * Set default pool features.
3313         */
3314        pool_features_init(&pf);
3315
3316        dm_consume_args(&as, 4);
3317        r = parse_pool_features(&as, &pf, ti);
3318        if (r)
3319                goto out_unlock;
3320
3321        metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
3322        r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
3323        if (r) {
3324                ti->error = "Error opening metadata block device";
3325                goto out_unlock;
3326        }
3327        warn_if_metadata_device_too_big(metadata_dev->bdev);
3328
3329        r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
3330        if (r) {
3331                ti->error = "Error getting data device";
3332                goto out_metadata;
3333        }
3334
3335        if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
3336            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
3337            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
3338            block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
3339                ti->error = "Invalid block size";
3340                r = -EINVAL;
3341                goto out;
3342        }
3343
3344        if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
3345                ti->error = "Invalid low water mark";
3346                r = -EINVAL;
3347                goto out;
3348        }
3349
3350        pt = kzalloc(sizeof(*pt), GFP_KERNEL);
3351        if (!pt) {
3352                r = -ENOMEM;
3353                goto out;
3354        }
3355
3356        pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, data_dev->bdev,
3357                           block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
3358        if (IS_ERR(pool)) {
3359                r = PTR_ERR(pool);
3360                goto out_free_pt;
3361        }
3362
3363        /*
3364         * 'pool_created' reflects whether this is the first table load.
3365         * Top level discard support is not allowed to be changed after
3366         * initial load.  This would require a pool reload to trigger thin
3367         * device changes.
3368         */
3369        if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
3370                ti->error = "Discard support cannot be disabled once enabled";
3371                r = -EINVAL;
3372                goto out_flags_changed;
3373        }
3374
3375        pt->pool = pool;
3376        pt->ti = ti;
3377        pt->metadata_dev = metadata_dev;
3378        pt->data_dev = data_dev;
3379        pt->low_water_blocks = low_water_blocks;
3380        pt->adjusted_pf = pt->requested_pf = pf;
3381        ti->num_flush_bios = 1;
3382
3383        /*
3384         * Only need to enable discards if the pool should pass
3385         * them down to the data device.  The thin device's discard
3386         * processing will cause mappings to be removed from the btree.
3387         */
3388        if (pf.discard_enabled && pf.discard_passdown) {
3389                ti->num_discard_bios = 1;
3390
3391                /*
3392                 * Setting 'discards_supported' circumvents the normal
3393                 * stacking of discard limits (this keeps the pool and
3394                 * thin devices' discard limits consistent).
3395                 */
3396                ti->discards_supported = true;
3397        }
3398        ti->private = pt;
3399
3400        r = dm_pool_register_metadata_threshold(pt->pool->pmd,
3401                                                calc_metadata_threshold(pt),
3402                                                metadata_low_callback,
3403                                                pool);
3404        if (r)
3405                goto out_flags_changed;
3406
3407        dm_pool_register_pre_commit_callback(pool->pmd,
3408                                             metadata_pre_commit_callback, pool);
3409
3410        mutex_unlock(&dm_thin_pool_table.mutex);
3411
3412        return 0;
3413
3414out_flags_changed:
3415        __pool_dec(pool);
3416out_free_pt:
3417        kfree(pt);
3418out:
3419        dm_put_device(ti, data_dev);
3420out_metadata:
3421        dm_put_device(ti, metadata_dev);
3422out_unlock:
3423        mutex_unlock(&dm_thin_pool_table.mutex);
3424
3425        return r;
3426}
3427
3428static int pool_map(struct dm_target *ti, struct bio *bio)
3429{
3430        int r;
3431        struct pool_c *pt = ti->private;
3432        struct pool *pool = pt->pool;
3433
3434        /*
3435         * As this is a singleton target, ti->begin is always zero.
3436         */
3437        spin_lock_irq(&pool->lock);
3438        bio_set_dev(bio, pt->data_dev->bdev);
3439        r = DM_MAPIO_REMAPPED;
3440        spin_unlock_irq(&pool->lock);
3441
3442        return r;
3443}
3444
3445static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3446{
3447        int r;
3448        struct pool_c *pt = ti->private;
3449        struct pool *pool = pt->pool;
3450        sector_t data_size = ti->len;
3451        dm_block_t sb_data_size;
3452
3453        *need_commit = false;
3454
3455        (void) sector_div(data_size, pool->sectors_per_block);
3456
3457        r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3458        if (r) {
3459                DMERR("%s: failed to retrieve data device size",
3460                      dm_device_name(pool->pool_md));
3461                return r;
3462        }
3463
3464        if (data_size < sb_data_size) {
3465                DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3466                      dm_device_name(pool->pool_md),
3467                      (unsigned long long)data_size, sb_data_size);
3468                return -EINVAL;
3469
3470        } else if (data_size > sb_data_size) {
3471                if (dm_pool_metadata_needs_check(pool->pmd)) {
3472                        DMERR("%s: unable to grow the data device until repaired.",
3473                              dm_device_name(pool->pool_md));
3474                        return 0;
3475                }
3476
3477                if (sb_data_size)
3478                        DMINFO("%s: growing the data device from %llu to %llu blocks",
3479                               dm_device_name(pool->pool_md),
3480                               sb_data_size, (unsigned long long)data_size);
3481                r = dm_pool_resize_data_dev(pool->pmd, data_size);
3482                if (r) {
3483                        metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
3484                        return r;
3485                }
3486
3487                *need_commit = true;
3488        }
3489
3490        return 0;
3491}
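/*
 * Illustrative example (hypothetical sizes): with 128-sector (64KiB)
 * blocks, a pool target of 20971520 sectors spans 163840 blocks.  If the
 * superblock still records 81920 blocks the data device is grown to
 * 163840 blocks and a commit is requested; if the superblock records
 * more than 163840 blocks the preresume fails with -EINVAL.
 */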
3492
3493static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3494{
3495        int r;
3496        struct pool_c *pt = ti->private;
3497        struct pool *pool = pt->pool;
3498        dm_block_t metadata_dev_size, sb_metadata_dev_size;
3499
3500        *need_commit = false;
3501
3502        metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
3503
3504        r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3505        if (r) {
3506                DMERR("%s: failed to retrieve metadata device size",
3507                      dm_device_name(pool->pool_md));
3508                return r;
3509        }
3510
3511        if (metadata_dev_size < sb_metadata_dev_size) {
3512                DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3513                      dm_device_name(pool->pool_md),
3514                      metadata_dev_size, sb_metadata_dev_size);
3515                return -EINVAL;
3516
3517        } else if (metadata_dev_size > sb_metadata_dev_size) {
3518                if (dm_pool_metadata_needs_check(pool->pmd)) {
3519                        DMERR("%s: unable to grow the metadata device until repaired.",
3520                              dm_device_name(pool->pool_md));
3521                        return 0;
3522                }
3523
3524                warn_if_metadata_device_too_big(pool->md_dev);
3525                DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3526                       dm_device_name(pool->pool_md),
3527                       sb_metadata_dev_size, metadata_dev_size);
3528
3529                if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
3530                        set_pool_mode(pool, PM_WRITE);
3531
3532                r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3533                if (r) {
3534                        metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
3535                        return r;
3536                }
3537
3538                *need_commit = true;
3539        }
3540
3541        return 0;
3542}
3543
3544/*
3545 * Retrieves the number of blocks of the data device from
3546 * the superblock and compares it to the actual device size,
3547 * thus resizing the data device in case it has grown.
3548 *
 * This copes both with opening preallocated data devices in the ctr,
 * followed by a resume,
 * -and-
 * with calling the resume method individually after userspace has
 * grown the data device in reaction to a table event.
3554 */
3555static int pool_preresume(struct dm_target *ti)
3556{
3557        int r;
3558        bool need_commit1, need_commit2;
3559        struct pool_c *pt = ti->private;
3560        struct pool *pool = pt->pool;
3561
3562        /*
3563         * Take control of the pool object.
3564         */
3565        r = bind_control_target(pool, ti);
3566        if (r)
3567                return r;
3568
3569        r = maybe_resize_data_dev(ti, &need_commit1);
3570        if (r)
3571                return r;
3572
3573        r = maybe_resize_metadata_dev(ti, &need_commit2);
3574        if (r)
3575                return r;
3576
3577        if (need_commit1 || need_commit2)
3578                (void) commit(pool);
3579
3580        return 0;
3581}
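/*
 * Typical userspace flow (illustrative, hypothetical names): after
 * enlarging the underlying data device, reload the pool with the new
 * length and resume it so that the resize checks above take effect:
 *
 *   dmsetup suspend pool
 *   dmsetup reload pool --table "0 <new length> thin-pool ..."
 *   dmsetup resume pool
 */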
3582
3583static void pool_suspend_active_thins(struct pool *pool)
3584{
3585        struct thin_c *tc;
3586
3587        /* Suspend all active thin devices */
3588        tc = get_first_thin(pool);
3589        while (tc) {
3590                dm_internal_suspend_noflush(tc->thin_md);
3591                tc = get_next_thin(pool, tc);
3592        }
3593}
3594
3595static void pool_resume_active_thins(struct pool *pool)
3596{
3597        struct thin_c *tc;
3598
3599        /* Resume all active thin devices */
3600        tc = get_first_thin(pool);
3601        while (tc) {
3602                dm_internal_resume(tc->thin_md);
3603                tc = get_next_thin(pool, tc);
3604        }
3605}
3606
3607static void pool_resume(struct dm_target *ti)
3608{
3609        struct pool_c *pt = ti->private;
3610        struct pool *pool = pt->pool;
3611
3612        /*
3613         * Must requeue active_thins' bios and then resume
3614         * active_thins _before_ clearing 'suspend' flag.
3615         */
3616        requeue_bios(pool);
3617        pool_resume_active_thins(pool);
3618
3619        spin_lock_irq(&pool->lock);
3620        pool->low_water_triggered = false;
3621        pool->suspended = false;
3622        spin_unlock_irq(&pool->lock);
3623
3624        do_waker(&pool->waker.work);
3625}
3626
3627static void pool_presuspend(struct dm_target *ti)
3628{
3629        struct pool_c *pt = ti->private;
3630        struct pool *pool = pt->pool;
3631
3632        spin_lock_irq(&pool->lock);
3633        pool->suspended = true;
3634        spin_unlock_irq(&pool->lock);
3635
3636        pool_suspend_active_thins(pool);
3637}
3638
3639static void pool_presuspend_undo(struct dm_target *ti)
3640{
3641        struct pool_c *pt = ti->private;
3642        struct pool *pool = pt->pool;
3643
3644        pool_resume_active_thins(pool);
3645
3646        spin_lock_irq(&pool->lock);
3647        pool->suspended = false;
3648        spin_unlock_irq(&pool->lock);
3649}
3650
3651static void pool_postsuspend(struct dm_target *ti)
3652{
3653        struct pool_c *pt = ti->private;
3654        struct pool *pool = pt->pool;
3655
3656        cancel_delayed_work_sync(&pool->waker);
3657        cancel_delayed_work_sync(&pool->no_space_timeout);
3658        flush_workqueue(pool->wq);
3659        (void) commit(pool);
3660}
3661
3662static int check_arg_count(unsigned argc, unsigned args_required)
3663{
3664        if (argc != args_required) {
3665                DMWARN("Message received with %u arguments instead of %u.",
3666                       argc, args_required);
3667                return -EINVAL;
3668        }
3669
3670        return 0;
3671}
3672
3673static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3674{
3675        if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3676            *dev_id <= MAX_DEV_ID)
3677                return 0;
3678
3679        if (warning)
3680                DMWARN("Message received with invalid device id: %s", arg);
3681
3682        return -EINVAL;
3683}
3684
3685static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
3686{
3687        dm_thin_id dev_id;
3688        int r;
3689
3690        r = check_arg_count(argc, 2);
3691        if (r)
3692                return r;
3693
3694        r = read_dev_id(argv[1], &dev_id, 1);
3695        if (r)
3696                return r;
3697
3698        r = dm_pool_create_thin(pool->pmd, dev_id);
3699        if (r) {
3700                DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3701                       argv[1]);
3702                return r;
3703        }
3704
3705        return 0;
3706}
3707
3708static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3709{
3710        dm_thin_id dev_id;
3711        dm_thin_id origin_dev_id;
3712        int r;
3713
3714        r = check_arg_count(argc, 3);
3715        if (r)
3716                return r;
3717
3718        r = read_dev_id(argv[1], &dev_id, 1);
3719        if (r)
3720                return r;
3721
3722        r = read_dev_id(argv[2], &origin_dev_id, 1);
3723        if (r)
3724                return r;
3725
3726        r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3727        if (r) {
3728                DMWARN("Creation of new snapshot %s of device %s failed.",
3729                       argv[1], argv[2]);
3730                return r;
3731        }
3732
3733        return 0;
3734}
3735
3736static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
3737{
3738        dm_thin_id dev_id;
3739        int r;
3740
3741        r = check_arg_count(argc, 2);
3742        if (r)
3743                return r;
3744
3745        r = read_dev_id(argv[1], &dev_id, 1);
3746        if (r)
3747                return r;
3748
3749        r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3750        if (r)
3751                DMWARN("Deletion of thin device %s failed.", argv[1]);
3752
3753        return r;
3754}
3755
3756static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
3757{
3758        dm_thin_id old_id, new_id;
3759        int r;
3760
3761        r = check_arg_count(argc, 3);
3762        if (r)
3763                return r;
3764
3765        if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3766                DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3767                return -EINVAL;
3768        }
3769
3770        if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3771                DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3772                return -EINVAL;
3773        }
3774
3775        r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3776        if (r) {
3777                DMWARN("Failed to change transaction id from %s to %s.",
3778                       argv[1], argv[2]);
3779                return r;
3780        }
3781
3782        return 0;
3783}
3784
3785static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3786{
3787        int r;
3788
3789        r = check_arg_count(argc, 1);
3790        if (r)
3791                return r;
3792
3793        (void) commit(pool);
3794
3795        r = dm_pool_reserve_metadata_snap(pool->pmd);
3796        if (r)
3797                DMWARN("reserve_metadata_snap message failed.");
3798
3799        return r;
3800}
3801
3802static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3803{
3804        int r;
3805
3806        r = check_arg_count(argc, 1);
3807        if (r)
3808                return r;
3809
3810        r = dm_pool_release_metadata_snap(pool->pmd);
3811        if (r)
3812                DMWARN("release_metadata_snap message failed.");
3813
3814        return r;
3815}
3816
3817/*
3818 * Messages supported:
3819 *   create_thin        <dev_id>
3820 *   create_snap        <dev_id> <origin_id>
3821 *   delete             <dev_id>
3822 *   set_transaction_id <current_trans_id> <new_trans_id>
3823 *   reserve_metadata_snap
3824 *   release_metadata_snap
3825 */
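/*
 * Illustrative use of these messages via dmsetup (hypothetical pool name
 * and device ids):
 *
 *   dmsetup message /dev/mapper/pool 0 "create_thin 0"
 *   dmsetup message /dev/mapper/pool 0 "create_snap 1 0"
 *   dmsetup message /dev/mapper/pool 0 "delete 1"
 */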
3826static int pool_message(struct dm_target *ti, unsigned argc, char **argv,
3827                        char *result, unsigned maxlen)
3828{
3829        int r = -EINVAL;
3830        struct pool_c *pt = ti->private;
3831        struct pool *pool = pt->pool;
3832
3833        if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
3834                DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3835                      dm_device_name(pool->pool_md));
3836                return -EOPNOTSUPP;
3837        }
3838
3839        if (!strcasecmp(argv[0], "create_thin"))
3840                r = process_create_thin_mesg(argc, argv, pool);
3841
3842        else if (!strcasecmp(argv[0], "create_snap"))
3843                r = process_create_snap_mesg(argc, argv, pool);
3844
3845        else if (!strcasecmp(argv[0], "delete"))
3846                r = process_delete_mesg(argc, argv, pool);
3847
3848        else if (!strcasecmp(argv[0], "set_transaction_id"))
3849                r = process_set_transaction_id_mesg(argc, argv, pool);
3850
3851        else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3852                r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3853
3854        else if (!strcasecmp(argv[0], "release_metadata_snap"))
3855                r = process_release_metadata_snap_mesg(argc, argv, pool);
3856
3857        else
3858                DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3859
3860        if (!r)
3861                (void) commit(pool);
3862
3863        return r;
3864}
3865
3866static void emit_flags(struct pool_features *pf, char *result,
3867                       unsigned sz, unsigned maxlen)
3868{
3869        unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
3870                !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3871                pf->error_if_no_space;
3872        DMEMIT("%u ", count);
3873
3874        if (!pf->zero_new_blocks)
3875                DMEMIT("skip_block_zeroing ");
3876
3877        if (!pf->discard_enabled)
3878                DMEMIT("ignore_discard ");
3879
3880        if (!pf->discard_passdown)
3881                DMEMIT("no_discard_passdown ");
3882
3883        if (pf->mode == PM_READ_ONLY)
3884                DMEMIT("read_only ");
3885
3886        if (pf->error_if_no_space)
3887                DMEMIT("error_if_no_space ");
3888}
3889
/*
 * Status line is:
 *    <transaction id> <used metadata blocks>/<total metadata blocks>
 *    <used data blocks>/<total data blocks> <held metadata root>
 *    <pool mode> <discard config> <no space config> <needs_check>
 *    <metadata low watermark>
 */
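/*
 * Illustrative STATUSTYPE_INFO output (made-up numbers): a writable pool
 * with discard passdown, queueing on no-space, no held metadata root and
 * no needs_check flag might report:
 *
 *   0 841/24576 10/1048576 - rw discard_passdown queue_if_no_space - 1024
 */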
3896static void pool_status(struct dm_target *ti, status_type_t type,
3897                        unsigned status_flags, char *result, unsigned maxlen)
3898{
3899        int r;
3900        unsigned sz = 0;
3901        uint64_t transaction_id;
3902        dm_block_t nr_free_blocks_data;
3903        dm_block_t nr_free_blocks_metadata;
3904        dm_block_t nr_blocks_data;
3905        dm_block_t nr_blocks_metadata;
3906        dm_block_t held_root;
3907        enum pool_mode mode;
3908        char buf[BDEVNAME_SIZE];
3909        char buf2[BDEVNAME_SIZE];
3910        struct pool_c *pt = ti->private;
3911        struct pool *pool = pt->pool;
3912
3913        switch (type) {
3914        case STATUSTYPE_INFO:
3915                if (get_pool_mode(pool) == PM_FAIL) {
3916                        DMEMIT("Fail");
3917                        break;
3918                }
3919
3920                /* Commit to ensure statistics aren't out-of-date */
3921                if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3922                        (void) commit(pool);
3923
3924                r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3925                if (r) {
3926                        DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3927                              dm_device_name(pool->pool_md), r);
3928                        goto err;
3929                }
3930
3931                r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3932                if (r) {
3933                        DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3934                              dm_device_name(pool->pool_md), r);
3935                        goto err;
3936                }
3937
3938                r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3939                if (r) {
3940                        DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3941                              dm_device_name(pool->pool_md), r);
3942                        goto err;
3943                }
3944
3945                r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3946                if (r) {
3947                        DMERR("%s: dm_pool_get_free_block_count returned %d",
3948                              dm_device_name(pool->pool_md), r);
3949                        goto err;
3950                }
3951
3952                r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3953                if (r) {
3954                        DMERR("%s: dm_pool_get_data_dev_size returned %d",
3955                              dm_device_name(pool->pool_md), r);
3956                        goto err;
3957                }
3958
3959                r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3960                if (r) {
3961                        DMERR("%s: dm_pool_get_metadata_snap returned %d",
3962                              dm_device_name(pool->pool_md), r);
3963                        goto err;
3964                }
3965
3966                DMEMIT("%llu %llu/%llu %llu/%llu ",
3967                       (unsigned long long)transaction_id,
3968                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3969                       (unsigned long long)nr_blocks_metadata,
3970                       (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3971                       (unsigned long long)nr_blocks_data);
3972
3973                if (held_root)
3974                        DMEMIT("%llu ", held_root);
3975                else
3976                        DMEMIT("- ");
3977
3978                mode = get_pool_mode(pool);
3979                if (mode == PM_OUT_OF_DATA_SPACE)
3980                        DMEMIT("out_of_data_space ");
3981                else if (is_read_only_pool_mode(mode))
3982                        DMEMIT("ro ");
3983                else
3984                        DMEMIT("rw ");
3985
3986                if (!pool->pf.discard_enabled)
3987                        DMEMIT("ignore_discard ");
3988                else if (pool->pf.discard_passdown)
3989                        DMEMIT("discard_passdown ");
3990                else
3991                        DMEMIT("no_discard_passdown ");
3992
3993                if (pool->pf.error_if_no_space)
3994                        DMEMIT("error_if_no_space ");
3995                else
3996                        DMEMIT("queue_if_no_space ");
3997
3998                if (dm_pool_metadata_needs_check(pool->pmd))
3999                        DMEMIT("needs_check ");
4000                else
4001                        DMEMIT("- ");
4002
4003                DMEMIT("%llu ", (unsigned long long)calc_metadata_threshold(pt));
4004
4005                break;
4006
4007        case STATUSTYPE_TABLE:
4008                DMEMIT("%s %s %lu %llu ",
4009                       format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
4010                       format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
4011                       (unsigned long)pool->sectors_per_block,
4012                       (unsigned long long)pt->low_water_blocks);
4013                emit_flags(&pt->requested_pf, result, sz, maxlen);
4014                break;
4015        }
4016        return;
4017
4018err:
4019        DMEMIT("Error");
4020}
4021
4022static int pool_iterate_devices(struct dm_target *ti,
4023                                iterate_devices_callout_fn fn, void *data)
4024{
4025        struct pool_c *pt = ti->private;
4026
4027        return fn(ti, pt->data_dev, 0, ti->len, data);
4028}
4029
4030static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
4031{
4032        struct pool_c *pt = ti->private;
4033        struct pool *pool = pt->pool;
4034        sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
4035
        /*
         * If max_sectors is smaller than pool->sectors_per_block adjust it
         * to the highest possible power-of-2 factor of pool->sectors_per_block.
         * This is especially beneficial when the pool's data device is a RAID
         * device that has a full stripe width matching pool->sectors_per_block,
         * because even though partial RAID stripe-sized IOs will be issued to a
         * single RAID stripe, when aggregated they will end on a full RAID
         * stripe boundary, which avoids cascading additional partial RAID
         * stripe writes.
         */
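        /*
         * Worked example (hypothetical numbers): with sectors_per_block =
         * 384 and a stacked max_sectors of 256, 256 does not divide 384;
         * 256 is a power of two so it is decremented to 255 and then
         * rounded down to 128, which does divide 384, so the loop settles
         * on max_sectors = 128.
         */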
4045        if (limits->max_sectors < pool->sectors_per_block) {
4046                while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
4047                        if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
4048                                limits->max_sectors--;
4049                        limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
4050                }
4051        }
4052
4053        /*
4054         * If the system-determined stacked limits are compatible with the
4055         * pool's blocksize (io_opt is a factor) do not override them.
4056         */
4057        if (io_opt_sectors < pool->sectors_per_block ||
4058            !is_factor(io_opt_sectors, pool->sectors_per_block)) {
4059                if (is_factor(pool->sectors_per_block, limits->max_sectors))
4060                        blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
4061                else
4062                        blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
4063                blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
4064        }
4065
4066        /*
4067         * pt->adjusted_pf is a staging area for the actual features to use.
4068         * They get transferred to the live pool in bind_control_target()
4069         * called from pool_preresume().
4070         */
4071        if (!pt->adjusted_pf.discard_enabled) {
4072                /*
4073                 * Must explicitly disallow stacking discard limits otherwise the
4074                 * block layer will stack them if pool's data device has support.
4075                 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
4076                 * user to see that, so make sure to set all discard limits to 0.
4077                 */
4078                limits->discard_granularity = 0;
4079                return;
4080        }
4081
4082        disable_passdown_if_not_supported(pt);
4083
4084        /*
4085         * The pool uses the same discard limits as the underlying data
4086         * device.  DM core has already set this up.
4087         */
4088}
4089
4090static struct target_type pool_target = {
4091        .name = "thin-pool",
4092        .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
4093                    DM_TARGET_IMMUTABLE,
4094        .version = {1, 22, 0},
4095        .module = THIS_MODULE,
4096        .ctr = pool_ctr,
4097        .dtr = pool_dtr,
4098        .map = pool_map,
4099        .presuspend = pool_presuspend,
4100        .presuspend_undo = pool_presuspend_undo,
4101        .postsuspend = pool_postsuspend,
4102        .preresume = pool_preresume,
4103        .resume = pool_resume,
4104        .message = pool_message,
4105        .status = pool_status,
4106        .iterate_devices = pool_iterate_devices,
4107        .io_hints = pool_io_hints,
4108};
4109
4110/*----------------------------------------------------------------
4111 * Thin target methods
4112 *--------------------------------------------------------------*/
4113static void thin_get(struct thin_c *tc)
4114{
4115        refcount_inc(&tc->refcount);
4116}
4117
4118static void thin_put(struct thin_c *tc)
4119{
4120        if (refcount_dec_and_test(&tc->refcount))
4121                complete(&tc->can_destroy);
4122}
4123
4124static void thin_dtr(struct dm_target *ti)
4125{
4126        struct thin_c *tc = ti->private;
4127
4128        spin_lock_irq(&tc->pool->lock);
4129        list_del_rcu(&tc->list);
4130        spin_unlock_irq(&tc->pool->lock);
4131        synchronize_rcu();
4132
4133        thin_put(tc);
4134        wait_for_completion(&tc->can_destroy);
4135
4136        mutex_lock(&dm_thin_pool_table.mutex);
4137
4138        __pool_dec(tc->pool);
4139        dm_pool_close_thin_device(tc->td);
4140        dm_put_device(ti, tc->pool_dev);
4141        if (tc->origin_dev)
4142                dm_put_device(ti, tc->origin_dev);
4143        kfree(tc);
4144
4145        mutex_unlock(&dm_thin_pool_table.mutex);
4146}
4147
4148/*
4149 * Thin target parameters:
4150 *
4151 * <pool_dev> <dev_id> [origin_dev]
4152 *
4153 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
4154 * dev_id: the internal device identifier
4155 * origin_dev: a device external to the pool that should act as the origin
4156 *
4157 * If the pool device has discards disabled, they get disabled for the thin
4158 * device as well.
4159 */
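/*
 * Illustrative table lines (hypothetical devices): a 1GiB thin volume
 * with internal device id 0, and the same volume backed by an external
 * read-only origin:
 *
 *   0 2097152 thin /dev/mapper/pool 0
 *   0 2097152 thin /dev/mapper/pool 0 /dev/image
 */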
4160static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
4161{
4162        int r;
4163        struct thin_c *tc;
4164        struct dm_dev *pool_dev, *origin_dev;
4165        struct mapped_device *pool_md;
4166
4167        mutex_lock(&dm_thin_pool_table.mutex);
4168
4169        if (argc != 2 && argc != 3) {
4170                ti->error = "Invalid argument count";
4171                r = -EINVAL;
4172                goto out_unlock;
4173        }
4174
4175        tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
4176        if (!tc) {
4177                ti->error = "Out of memory";
4178                r = -ENOMEM;
4179                goto out_unlock;
4180        }
4181        tc->thin_md = dm_table_get_md(ti->table);
4182        spin_lock_init(&tc->lock);
4183        INIT_LIST_HEAD(&tc->deferred_cells);
4184        bio_list_init(&tc->deferred_bio_list);
4185        bio_list_init(&tc->retry_on_resume_list);
4186        tc->sort_bio_list = RB_ROOT;
4187
4188        if (argc == 3) {
4189                if (!strcmp(argv[0], argv[2])) {
4190                        ti->error = "Error setting origin device";
4191                        r = -EINVAL;
4192                        goto bad_origin_dev;
4193                }
4194
4195                r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
4196                if (r) {
4197                        ti->error = "Error opening origin device";
4198                        goto bad_origin_dev;
4199                }
4200                tc->origin_dev = origin_dev;
4201        }
4202
4203        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
4204        if (r) {
4205                ti->error = "Error opening pool device";
4206                goto bad_pool_dev;
4207        }
4208        tc->pool_dev = pool_dev;
4209
4210        if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
4211                ti->error = "Invalid device id";
4212                r = -EINVAL;
4213                goto bad_common;
4214        }
4215
4216        pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
4217        if (!pool_md) {
4218                ti->error = "Couldn't get pool mapped device";
4219                r = -EINVAL;
4220                goto bad_common;
4221        }
4222
4223        tc->pool = __pool_table_lookup(pool_md);
4224        if (!tc->pool) {
4225                ti->error = "Couldn't find pool object";
4226                r = -EINVAL;
4227                goto bad_pool_lookup;
4228        }
4229        __pool_inc(tc->pool);
4230
4231        if (get_pool_mode(tc->pool) == PM_FAIL) {
4232                ti->error = "Couldn't open thin device, Pool is in fail mode";
4233                r = -EINVAL;
4234                goto bad_pool;
4235        }
4236
4237        r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
4238        if (r) {
4239                ti->error = "Couldn't open thin internal device";
4240                goto bad_pool;
4241        }
4242
4243        r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
4244        if (r)
4245                goto bad;
4246
4247        ti->num_flush_bios = 1;
4248        ti->flush_supported = true;
4249        ti->per_io_data_size = sizeof(struct dm_thin_endio_hook);
4250
4251        /* In case the pool supports discards, pass them on. */
4252        if (tc->pool->pf.discard_enabled) {
4253                ti->discards_supported = true;
4254                ti->num_discard_bios = 1;
4255        }
4256
4257        mutex_unlock(&dm_thin_pool_table.mutex);
4258
4259        spin_lock_irq(&tc->pool->lock);
4260        if (tc->pool->suspended) {
4261                spin_unlock_irq(&tc->pool->lock);
4262                mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
4263                ti->error = "Unable to activate thin device while pool is suspended";
4264                r = -EINVAL;
4265                goto bad;
4266        }
4267        refcount_set(&tc->refcount, 1);
4268        init_completion(&tc->can_destroy);
4269        list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
4270        spin_unlock_irq(&tc->pool->lock);
4271        /*
4272         * This synchronize_rcu() call is needed here otherwise we risk a
4273         * wake_worker() call finding no bios to process (because the newly
4274         * added tc isn't yet visible).  So this reduces latency since we
4275         * aren't then dependent on the periodic commit to wake_worker().
4276         */
4277        synchronize_rcu();
4278
4279        dm_put(pool_md);
4280
4281        return 0;
4282
4283bad:
4284        dm_pool_close_thin_device(tc->td);
4285bad_pool:
4286        __pool_dec(tc->pool);
4287bad_pool_lookup:
4288        dm_put(pool_md);
4289bad_common:
4290        dm_put_device(ti, tc->pool_dev);
4291bad_pool_dev:
4292        if (tc->origin_dev)
4293                dm_put_device(ti, tc->origin_dev);
4294bad_origin_dev:
4295        kfree(tc);
4296out_unlock:
4297        mutex_unlock(&dm_thin_pool_table.mutex);
4298
4299        return r;
4300}
4301
4302static int thin_map(struct dm_target *ti, struct bio *bio)
4303{
4304        bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
4305
4306        return thin_bio_map(ti, bio);
4307}
4308
4309static int thin_endio(struct dm_target *ti, struct bio *bio,
4310                blk_status_t *err)
4311{
4312        unsigned long flags;
4313        struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
4314        struct list_head work;
4315        struct dm_thin_new_mapping *m, *tmp;
4316        struct pool *pool = h->tc->pool;
4317
4318        if (h->shared_read_entry) {
4319                INIT_LIST_HEAD(&work);
4320                dm_deferred_entry_dec(h->shared_read_entry, &work);
4321
4322                spin_lock_irqsave(&pool->lock, flags);
4323                list_for_each_entry_safe(m, tmp, &work, list) {
4324                        list_del(&m->list);
4325                        __complete_mapping_preparation(m);
4326                }
4327                spin_unlock_irqrestore(&pool->lock, flags);
4328        }
4329
4330        if (h->all_io_entry) {
4331                INIT_LIST_HEAD(&work);
4332                dm_deferred_entry_dec(h->all_io_entry, &work);
4333                if (!list_empty(&work)) {
4334                        spin_lock_irqsave(&pool->lock, flags);
4335                        list_for_each_entry_safe(m, tmp, &work, list)
4336                                list_add_tail(&m->list, &pool->prepared_discards);
4337                        spin_unlock_irqrestore(&pool->lock, flags);
4338                        wake_worker(pool);
4339                }
4340        }
4341
4342        if (h->cell)
4343                cell_defer_no_holder(h->tc, h->cell);
4344
4345        return DM_ENDIO_DONE;
4346}
4347
4348static void thin_presuspend(struct dm_target *ti)
4349{
4350        struct thin_c *tc = ti->private;
4351
4352        if (dm_noflush_suspending(ti))
4353                noflush_work(tc, do_noflush_start);
4354}
4355
4356static void thin_postsuspend(struct dm_target *ti)
4357{
4358        struct thin_c *tc = ti->private;
4359
4360        /*
4361         * The dm_noflush_suspending flag has been cleared by now, so
4362         * unfortunately we must always run this.
4363         */
4364        noflush_work(tc, do_noflush_stop);
4365}
4366
4367static int thin_preresume(struct dm_target *ti)
4368{
4369        struct thin_c *tc = ti->private;
4370
4371        if (tc->origin_dev)
4372                tc->origin_size = get_dev_size(tc->origin_dev->bdev);
4373
4374        return 0;
4375}
4376
4377/*
4378 * <nr mapped sectors> <highest mapped sector>
4379 */
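/*
 * Illustrative example (made-up numbers): a thin device with 16 mapped
 * 128-sector blocks, the highest of them at block index 15, reports
 * "2048 2047"; a device with nothing mapped reports "0 -".
 */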
4380static void thin_status(struct dm_target *ti, status_type_t type,
4381                        unsigned status_flags, char *result, unsigned maxlen)
4382{
4383        int r;
4384        ssize_t sz = 0;
4385        dm_block_t mapped, highest;
4386        char buf[BDEVNAME_SIZE];
4387        struct thin_c *tc = ti->private;
4388
4389        if (get_pool_mode(tc->pool) == PM_FAIL) {
4390                DMEMIT("Fail");
4391                return;
4392        }
4393
4394        if (!tc->td)
4395                DMEMIT("-");
4396        else {
4397                switch (type) {
4398                case STATUSTYPE_INFO:
4399                        r = dm_thin_get_mapped_count(tc->td, &mapped);
4400                        if (r) {
4401                                DMERR("dm_thin_get_mapped_count returned %d", r);
4402                                goto err;
4403                        }
4404
4405                        r = dm_thin_get_highest_mapped_block(tc->td, &highest);
4406                        if (r < 0) {
4407                                DMERR("dm_thin_get_highest_mapped_block returned %d", r);
4408                                goto err;
4409                        }
4410
4411                        DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
4412                        if (r)
4413                                DMEMIT("%llu", ((highest + 1) *
4414                                                tc->pool->sectors_per_block) - 1);
4415                        else
4416                                DMEMIT("-");
4417                        break;
4418
4419                case STATUSTYPE_TABLE:
4420                        DMEMIT("%s %lu",
4421                               format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
4422                               (unsigned long) tc->dev_id);
4423                        if (tc->origin_dev)
4424                                DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
4425                        break;
4426                }
4427        }
4428
4429        return;
4430
4431err:
4432        DMEMIT("Error");
4433}
4434
4435static int thin_iterate_devices(struct dm_target *ti,
4436                                iterate_devices_callout_fn fn, void *data)
4437{
4438        sector_t blocks;
4439        struct thin_c *tc = ti->private;
4440        struct pool *pool = tc->pool;
4441
4442        /*
4443         * We can't call dm_pool_get_data_dev_size() since that blocks.  So
4444         * we follow a more convoluted path through to the pool's target.
4445         */
4446        if (!pool->ti)
4447                return 0;       /* nothing is bound */
4448
4449        blocks = pool->ti->len;
4450        (void) sector_div(blocks, pool->sectors_per_block);
4451        if (blocks)
4452                return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
4453
4454        return 0;
4455}
4456
4457static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
4458{
4459        struct thin_c *tc = ti->private;
4460        struct pool *pool = tc->pool;
4461
4462        if (!pool->pf.discard_enabled)
4463                return;
4464
4465        limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
4466        limits->max_discard_sectors = 2048 * 1024 * 16; /* 16G */
4467}
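/*
 * For example (illustrative): with 128-sector (64KiB) blocks the discard
 * granularity advertised above is 65536 bytes, and any discard larger
 * than 16GiB is split up before it reaches this target.
 */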
4468
4469static struct target_type thin_target = {
4470        .name = "thin",
4471        .version = {1, 22, 0},
4472        .module = THIS_MODULE,
4473        .ctr = thin_ctr,
4474        .dtr = thin_dtr,
4475        .map = thin_map,
4476        .end_io = thin_endio,
4477        .preresume = thin_preresume,
4478        .presuspend = thin_presuspend,
4479        .postsuspend = thin_postsuspend,
4480        .status = thin_status,
4481        .iterate_devices = thin_iterate_devices,
4482        .io_hints = thin_io_hints,
4483};
4484
4485/*----------------------------------------------------------------*/
4486
4487static int __init dm_thin_init(void)
4488{
4489        int r = -ENOMEM;
4490
4491        pool_table_init();
4492
4493        _new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
4494        if (!_new_mapping_cache)
4495                return r;
4496
4497        r = dm_register_target(&thin_target);
4498        if (r)
4499                goto bad_new_mapping_cache;
4500
4501        r = dm_register_target(&pool_target);
4502        if (r)
4503                goto bad_thin_target;
4504
4505        return 0;
4506
4507bad_thin_target:
4508        dm_unregister_target(&thin_target);
4509bad_new_mapping_cache:
4510        kmem_cache_destroy(_new_mapping_cache);
4511
4512        return r;
4513}
4514
4515static void dm_thin_exit(void)
4516{
4517        dm_unregister_target(&thin_target);
4518        dm_unregister_target(&pool_target);
4519
4520        kmem_cache_destroy(_new_mapping_cache);
4521
4522        pool_table_exit();
4523}
4524
4525module_init(dm_thin_init);
4526module_exit(dm_thin_exit);
4527
4528module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
4529MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
4530
4531MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
4532MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
4533MODULE_LICENSE("GPL");
4534