linux/drivers/md/dm-cache-target.c
   1/*
   2 * Copyright (C) 2012 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm.h"
   8#include "dm-bio-prison-v2.h"
   9#include "dm-bio-record.h"
  10#include "dm-cache-metadata.h"
  11
  12#include <linux/dm-io.h>
  13#include <linux/dm-kcopyd.h>
  14#include <linux/jiffies.h>
  15#include <linux/init.h>
  16#include <linux/mempool.h>
  17#include <linux/module.h>
  18#include <linux/rwsem.h>
  19#include <linux/slab.h>
  20#include <linux/vmalloc.h>
  21
  22#define DM_MSG_PREFIX "cache"
  23
  24DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
  25        "A percentage of time allocated for copying to and/or from cache");
  26
  27/*----------------------------------------------------------------*/
  28
  29/*
  30 * Glossary:
  31 *
  32 * oblock: index of an origin block
  33 * cblock: index of a cache block
  34 * promotion: movement of a block from origin to cache
  35 * demotion: movement of a block from cache to origin
  36 * migration: movement of a block between the origin and cache device,
  37 *            either direction
  38 */
  39
  40/*----------------------------------------------------------------*/
  41
  42struct io_tracker {
  43        spinlock_t lock;
  44
  45        /*
  46         * Sectors of in-flight IO.
  47         */
  48        sector_t in_flight;
  49
  50        /*
  51         * The time, in jiffies, when this device became idle (if it is
  52         * indeed idle).
  53         */
  54        unsigned long idle_time;
  55        unsigned long last_update_time;
  56};
  57
  58static void iot_init(struct io_tracker *iot)
  59{
  60        spin_lock_init(&iot->lock);
  61        iot->in_flight = 0ul;
  62        iot->idle_time = 0ul;
  63        iot->last_update_time = jiffies;
  64}
  65
  66static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  67{
  68        if (iot->in_flight)
  69                return false;
  70
  71        return time_after(jiffies, iot->idle_time + jifs);
  72}
  73
  74static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  75{
  76        bool r;
  77        unsigned long flags;
  78
  79        spin_lock_irqsave(&iot->lock, flags);
  80        r = __iot_idle_for(iot, jifs);
  81        spin_unlock_irqrestore(&iot->lock, flags);
  82
  83        return r;
  84}
  85
  86static void iot_io_begin(struct io_tracker *iot, sector_t len)
  87{
  88        unsigned long flags;
  89
  90        spin_lock_irqsave(&iot->lock, flags);
  91        iot->in_flight += len;
  92        spin_unlock_irqrestore(&iot->lock, flags);
  93}
  94
  95static void __iot_io_end(struct io_tracker *iot, sector_t len)
  96{
  97        if (!len)
  98                return;
  99
 100        iot->in_flight -= len;
 101        if (!iot->in_flight)
 102                iot->idle_time = jiffies;
 103}
 104
 105static void iot_io_end(struct io_tracker *iot, sector_t len)
 106{
 107        unsigned long flags;
 108
 109        spin_lock_irqsave(&iot->lock, flags);
 110        __iot_io_end(iot, len);
 111        spin_unlock_irqrestore(&iot->lock, flags);
 112}
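/*
 * How the tracker is used further down: accounted_begin() calls
 * iot_io_begin() with bio_sectors(bio), accounted_complete() balances it
 * with iot_io_end(), and spare_migration_bandwidth() asks iot_idle_for()
 * whether the device has seen no foreground IO for a while.  A minimal,
 * purely illustrative sketch of that pairing (not part of the driver):
 */
#if 0
static void io_tracker_example(void)
{
        struct io_tracker iot;

        iot_init(&iot);

        iot_io_begin(&iot, 8);          /* 8 sectors go in flight */
        /* ... the IO completes ... */
        iot_io_end(&iot, 8);            /* in_flight hits 0, idle_time stamped */

        /* True once nothing has been in flight for at least one second. */
        if (iot_idle_for(&iot, HZ))
                pr_info("device looks idle\n");
}
#endif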
 113
 114/*----------------------------------------------------------------*/
 115
 116/*
 117 * Represents a chunk of future work.  'input' allows continuations to pass
 118 * values between themselves, typically error values.
 119 */
 120struct continuation {
 121        struct work_struct ws;
 122        blk_status_t input;
 123};
 124
 125static inline void init_continuation(struct continuation *k,
 126                                     void (*fn)(struct work_struct *))
 127{
 128        INIT_WORK(&k->ws, fn);
 129        k->input = 0;
 130}
 131
 132static inline void queue_continuation(struct workqueue_struct *wq,
 133                                      struct continuation *k)
 134{
 135        queue_work(wq, &k->ws);
 136}
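/*
 * A continuation is normally embedded in a larger object, and the work
 * function recovers that object with container_of(), exactly as ws_to_mg()
 * does for struct dm_cache_migration later in this file.  Illustrative
 * sketch with a made-up container type (not part of the driver):
 */
#if 0
struct example_job {
        struct continuation k;
        int payload;
};

static void example_job_fn(struct work_struct *ws)
{
        struct continuation *k = container_of(ws, struct continuation, ws);
        struct example_job *job = container_of(k, struct example_job, k);

        if (k->input) {
                /* the previous stage reported an error */
                return;
        }

        pr_info("payload = %d\n", job->payload);
}

static void example_job_start(struct workqueue_struct *wq, struct example_job *job)
{
        init_continuation(&job->k, example_job_fn);
        queue_continuation(wq, &job->k);
}
#endif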
 137
 138/*----------------------------------------------------------------*/
 139
 140/*
 141 * The batcher collects together pieces of work that need a particular
 142 * operation to occur before they can proceed (typically a commit).
 143 */
 144struct batcher {
 145        /*
 146         * The operation that everyone is waiting for.
 147         */
 148        blk_status_t (*commit_op)(void *context);
 149        void *commit_context;
 150
 151        /*
 152         * This is how bios should be issued once the commit op is complete
 153         * (accounted_request).
 154         */
 155        void (*issue_op)(struct bio *bio, void *context);
 156        void *issue_context;
 157
 158        /*
 159         * Queued work gets put on here after commit.
 160         */
 161        struct workqueue_struct *wq;
 162
 163        spinlock_t lock;
 164        struct list_head work_items;
 165        struct bio_list bios;
 166        struct work_struct commit_work;
 167
 168        bool commit_scheduled;
 169};
 170
 171static void __commit(struct work_struct *_ws)
 172{
 173        struct batcher *b = container_of(_ws, struct batcher, commit_work);
 174        blk_status_t r;
 175        unsigned long flags;
 176        struct list_head work_items;
 177        struct work_struct *ws, *tmp;
 178        struct continuation *k;
 179        struct bio *bio;
 180        struct bio_list bios;
 181
 182        INIT_LIST_HEAD(&work_items);
 183        bio_list_init(&bios);
 184
 185        /*
 186         * We have to grab these before the commit_op to avoid a race
 187         * condition.
 188         */
 189        spin_lock_irqsave(&b->lock, flags);
 190        list_splice_init(&b->work_items, &work_items);
 191        bio_list_merge(&bios, &b->bios);
 192        bio_list_init(&b->bios);
 193        b->commit_scheduled = false;
 194        spin_unlock_irqrestore(&b->lock, flags);
 195
 196        r = b->commit_op(b->commit_context);
 197
 198        list_for_each_entry_safe(ws, tmp, &work_items, entry) {
 199                k = container_of(ws, struct continuation, ws);
 200                k->input = r;
 201                INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
 202                queue_work(b->wq, ws);
 203        }
 204
 205        while ((bio = bio_list_pop(&bios))) {
 206                if (r) {
 207                        bio->bi_status = r;
 208                        bio_endio(bio);
 209                } else
 210                        b->issue_op(bio, b->issue_context);
 211        }
 212}
 213
 214static void batcher_init(struct batcher *b,
 215                         blk_status_t (*commit_op)(void *),
 216                         void *commit_context,
 217                         void (*issue_op)(struct bio *bio, void *),
 218                         void *issue_context,
 219                         struct workqueue_struct *wq)
 220{
 221        b->commit_op = commit_op;
 222        b->commit_context = commit_context;
 223        b->issue_op = issue_op;
 224        b->issue_context = issue_context;
 225        b->wq = wq;
 226
 227        spin_lock_init(&b->lock);
 228        INIT_LIST_HEAD(&b->work_items);
 229        bio_list_init(&b->bios);
 230        INIT_WORK(&b->commit_work, __commit);
 231        b->commit_scheduled = false;
 232}
 233
 234static void async_commit(struct batcher *b)
 235{
 236        queue_work(b->wq, &b->commit_work);
 237}
 238
 239static void continue_after_commit(struct batcher *b, struct continuation *k)
 240{
 241        unsigned long flags;
 242        bool commit_scheduled;
 243
 244        spin_lock_irqsave(&b->lock, flags);
 245        commit_scheduled = b->commit_scheduled;
 246        list_add_tail(&k->ws.entry, &b->work_items);
 247        spin_unlock_irqrestore(&b->lock, flags);
 248
 249        if (commit_scheduled)
 250                async_commit(b);
 251}
 252
 253/*
 254 * Bios are errored if commit failed.
 255 */
 256static void issue_after_commit(struct batcher *b, struct bio *bio)
 257{
  258        unsigned long flags;
  259        bool commit_scheduled;
  260
  261        spin_lock_irqsave(&b->lock, flags);
  262        commit_scheduled = b->commit_scheduled;
  263        bio_list_add(&b->bios, bio);
  264        spin_unlock_irqrestore(&b->lock, flags);
  265
  266        if (commit_scheduled)
  267                async_commit(b);
 268}
 269
 270/*
 271 * Call this if some urgent work is waiting for the commit to complete.
 272 */
 273static void schedule_commit(struct batcher *b)
 274{
 275        bool immediate;
 276        unsigned long flags;
 277
 278        spin_lock_irqsave(&b->lock, flags);
 279        immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
 280        b->commit_scheduled = true;
 281        spin_unlock_irqrestore(&b->lock, flags);
 282
 283        if (immediate)
 284                async_commit(b);
 285}
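/*
 * Putting the batcher pieces together.  Hedged sketch of the calling
 * pattern the cache target uses further down (the example_* names are made
 * up; the driver's real commit op ends up calling dm_cache_commit()):
 */
#if 0
static blk_status_t example_commit_op(void *context)
{
        /* persist the metadata transaction here */
        return 0;
}

static void example_issue_op(struct bio *bio, void *context)
{
        submit_bio(bio);
}

static void example_batcher_use(struct workqueue_struct *wq, struct bio *bio)
{
        struct batcher b;

        batcher_init(&b, example_commit_op, NULL, example_issue_op, NULL, wq);

        /*
         * Park the bio until the next commit, then request that commit.
         * __commit() runs on wq, calls example_commit_op() once, and either
         * issues or errors everything queued before it grabbed the lists.
         */
        issue_after_commit(&b, bio);
        schedule_commit(&b);
}
#endif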
 286
 287/*
 288 * There are a couple of places where we let a bio run, but want to do some
 289 * work before calling its endio function.  We do this by temporarily
 290 * changing the endio fn.
 291 */
 292struct dm_hook_info {
 293        bio_end_io_t *bi_end_io;
 294};
 295
 296static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
 297                        bio_end_io_t *bi_end_io, void *bi_private)
 298{
 299        h->bi_end_io = bio->bi_end_io;
 300
 301        bio->bi_end_io = bi_end_io;
 302        bio->bi_private = bi_private;
 303}
 304
 305static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 306{
 307        bio->bi_end_io = h->bi_end_io;
 308}
 309
 310/*----------------------------------------------------------------*/
 311
 312#define MIGRATION_POOL_SIZE 128
 313#define COMMIT_PERIOD HZ
 314#define MIGRATION_COUNT_WINDOW 10
 315
 316/*
 317 * The block size of the device holding cache data must be
 318 * between 32KB and 1GB.
 319 */
 320#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 321#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 322
 323enum cache_metadata_mode {
 324        CM_WRITE,               /* metadata may be changed */
 325        CM_READ_ONLY,           /* metadata may not be changed */
 326        CM_FAIL
 327};
 328
 329enum cache_io_mode {
 330        /*
 331         * Data is written to cached blocks only.  These blocks are marked
 332         * dirty.  If you lose the cache device you will lose data.
 333         * Potential performance increase for both reads and writes.
 334         */
 335        CM_IO_WRITEBACK,
 336
 337        /*
 338         * Data is written to both cache and origin.  Blocks are never
 339         * dirty.  Potential performance benfit for reads only.
  340         * dirty.  Potential performance benefit for reads only.
 341        CM_IO_WRITETHROUGH,
 342
 343        /*
 344         * A degraded mode useful for various cache coherency situations
 345         * (eg, rolling back snapshots).  Reads and writes always go to the
 346         * origin.  If a write goes to a cached oblock, then the cache
 347         * block is invalidated.
 348         */
 349        CM_IO_PASSTHROUGH
 350};
 351
 352struct cache_features {
 353        enum cache_metadata_mode mode;
 354        enum cache_io_mode io_mode;
 355        unsigned metadata_version;
 356        bool discard_passdown:1;
 357};
 358
 359struct cache_stats {
 360        atomic_t read_hit;
 361        atomic_t read_miss;
 362        atomic_t write_hit;
 363        atomic_t write_miss;
 364        atomic_t demotion;
 365        atomic_t promotion;
 366        atomic_t writeback;
 367        atomic_t copies_avoided;
 368        atomic_t cache_cell_clash;
 369        atomic_t commit_count;
 370        atomic_t discard_count;
 371};
 372
 373struct cache {
 374        struct dm_target *ti;
 375        spinlock_t lock;
 376
 377        /*
 378         * Fields for converting from sectors to blocks.
 379         */
 380        int sectors_per_block_shift;
 381        sector_t sectors_per_block;
 382
 383        struct dm_cache_metadata *cmd;
 384
 385        /*
 386         * Metadata is written to this device.
 387         */
 388        struct dm_dev *metadata_dev;
 389
 390        /*
 391         * The slower of the two data devices.  Typically a spindle.
 392         */
 393        struct dm_dev *origin_dev;
 394
 395        /*
 396         * The faster of the two data devices.  Typically an SSD.
 397         */
 398        struct dm_dev *cache_dev;
 399
 400        /*
 401         * Size of the origin device in _complete_ blocks and native sectors.
 402         */
 403        dm_oblock_t origin_blocks;
 404        sector_t origin_sectors;
 405
 406        /*
 407         * Size of the cache device in blocks.
 408         */
 409        dm_cblock_t cache_size;
 410
 411        /*
 412         * Invalidation fields.
 413         */
 414        spinlock_t invalidation_lock;
 415        struct list_head invalidation_requests;
 416
 417        sector_t migration_threshold;
 418        wait_queue_head_t migration_wait;
 419        atomic_t nr_allocated_migrations;
 420
 421        /*
  422         * The number of in-flight migrations that are performing
  423         * background IO, e.g. promotion, writeback.
 424         */
 425        atomic_t nr_io_migrations;
 426
 427        struct bio_list deferred_bios;
 428
 429        struct rw_semaphore quiesce_lock;
 430
 431        struct dm_target_callbacks callbacks;
 432
 433        /*
 434         * origin_blocks entries, discarded if set.
 435         */
 436        dm_dblock_t discard_nr_blocks;
 437        unsigned long *discard_bitset;
 438        uint32_t discard_block_size; /* a power of 2 times sectors per block */
 439
 440        /*
 441         * Rather than reconstructing the table line for the status we just
 442         * save it and regurgitate.
 443         */
 444        unsigned nr_ctr_args;
 445        const char **ctr_args;
 446
 447        struct dm_kcopyd_client *copier;
 448        struct work_struct deferred_bio_worker;
 449        struct work_struct migration_worker;
 450        struct workqueue_struct *wq;
 451        struct delayed_work waker;
 452        struct dm_bio_prison_v2 *prison;
 453
 454        /*
 455         * cache_size entries, dirty if set
 456         */
 457        unsigned long *dirty_bitset;
 458        atomic_t nr_dirty;
 459
 460        unsigned policy_nr_args;
 461        struct dm_cache_policy *policy;
 462
 463        /*
 464         * Cache features such as write-through.
 465         */
 466        struct cache_features features;
 467
 468        struct cache_stats stats;
 469
 470        bool need_tick_bio:1;
 471        bool sized:1;
 472        bool invalidate:1;
 473        bool commit_requested:1;
 474        bool loaded_mappings:1;
 475        bool loaded_discards:1;
 476
 477        struct rw_semaphore background_work_lock;
 478
 479        struct batcher committer;
 480        struct work_struct commit_ws;
 481
 482        struct io_tracker tracker;
 483
 484        mempool_t migration_pool;
 485
 486        struct bio_set bs;
 487};
 488
 489struct per_bio_data {
 490        bool tick:1;
 491        unsigned req_nr:2;
 492        struct dm_bio_prison_cell_v2 *cell;
 493        struct dm_hook_info hook_info;
 494        sector_t len;
 495};
 496
 497struct dm_cache_migration {
 498        struct continuation k;
 499        struct cache *cache;
 500
 501        struct policy_work *op;
 502        struct bio *overwrite_bio;
 503        struct dm_bio_prison_cell_v2 *cell;
 504
 505        dm_cblock_t invalidate_cblock;
 506        dm_oblock_t invalidate_oblock;
 507};
 508
 509/*----------------------------------------------------------------*/
 510
 511static bool writethrough_mode(struct cache *cache)
 512{
 513        return cache->features.io_mode == CM_IO_WRITETHROUGH;
 514}
 515
 516static bool writeback_mode(struct cache *cache)
 517{
 518        return cache->features.io_mode == CM_IO_WRITEBACK;
 519}
 520
 521static inline bool passthrough_mode(struct cache *cache)
 522{
 523        return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
 524}
 525
 526/*----------------------------------------------------------------*/
 527
 528static void wake_deferred_bio_worker(struct cache *cache)
 529{
 530        queue_work(cache->wq, &cache->deferred_bio_worker);
 531}
 532
 533static void wake_migration_worker(struct cache *cache)
 534{
 535        if (passthrough_mode(cache))
 536                return;
 537
 538        queue_work(cache->wq, &cache->migration_worker);
 539}
 540
 541/*----------------------------------------------------------------*/
 542
 543static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
 544{
 545        return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOWAIT);
 546}
 547
 548static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
 549{
 550        dm_bio_prison_free_cell_v2(cache->prison, cell);
 551}
 552
 553static struct dm_cache_migration *alloc_migration(struct cache *cache)
 554{
 555        struct dm_cache_migration *mg;
 556
 557        mg = mempool_alloc(&cache->migration_pool, GFP_NOWAIT);
 558        if (!mg)
 559                return NULL;
 560
 561        memset(mg, 0, sizeof(*mg));
 562
 563        mg->cache = cache;
 564        atomic_inc(&cache->nr_allocated_migrations);
 565
 566        return mg;
 567}
 568
 569static void free_migration(struct dm_cache_migration *mg)
 570{
 571        struct cache *cache = mg->cache;
 572
 573        if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 574                wake_up(&cache->migration_wait);
 575
 576        mempool_free(mg, &cache->migration_pool);
 577}
 578
 579/*----------------------------------------------------------------*/
 580
 581static inline dm_oblock_t oblock_succ(dm_oblock_t b)
 582{
 583        return to_oblock(from_oblock(b) + 1ull);
 584}
 585
 586static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
 587{
 588        key->virtual = 0;
 589        key->dev = 0;
 590        key->block_begin = from_oblock(begin);
 591        key->block_end = from_oblock(end);
 592}
 593
 594/*
  595 * We have two lock levels: level 0, which is used to prevent WRITEs, and
  596 * level 1, which prevents *both* READs and WRITEs.
 597 */
 598#define WRITE_LOCK_LEVEL 0
 599#define READ_WRITE_LOCK_LEVEL 1
 600
 601static unsigned lock_level(struct bio *bio)
 602{
 603        return bio_data_dir(bio) == WRITE ?
 604                WRITE_LOCK_LEVEL :
 605                READ_WRITE_LOCK_LEVEL;
 606}
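/*
 * The mapping above follows from the prison semantics: a shared holder is
 * only blocked by an exclusive lock at the same or a higher level.  So a
 * WRITE bio, taking its shared lock at level 0, yields to either kind of
 * exclusive lock, while a READ bio at level 1 is only held back once a
 * migration escalates to READ_WRITE_LOCK_LEVEL.
 */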
 607
 608/*----------------------------------------------------------------
 609 * Per bio data
 610 *--------------------------------------------------------------*/
 611
 612static struct per_bio_data *get_per_bio_data(struct bio *bio)
 613{
 614        struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 615        BUG_ON(!pb);
 616        return pb;
 617}
 618
 619static struct per_bio_data *init_per_bio_data(struct bio *bio)
 620{
 621        struct per_bio_data *pb = get_per_bio_data(bio);
 622
 623        pb->tick = false;
 624        pb->req_nr = dm_bio_get_target_bio_nr(bio);
 625        pb->cell = NULL;
 626        pb->len = 0;
 627
 628        return pb;
 629}
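/*
 * dm_per_bio_data() returns the target-private area that device-mapper
 * reserves alongside each bio; its size comes from ti->per_io_data_size,
 * which the constructor is expected to set to sizeof(struct per_bio_data).
 */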
 630
 631/*----------------------------------------------------------------*/
 632
 633static void defer_bio(struct cache *cache, struct bio *bio)
 634{
 635        unsigned long flags;
 636
 637        spin_lock_irqsave(&cache->lock, flags);
 638        bio_list_add(&cache->deferred_bios, bio);
 639        spin_unlock_irqrestore(&cache->lock, flags);
 640
 641        wake_deferred_bio_worker(cache);
 642}
 643
 644static void defer_bios(struct cache *cache, struct bio_list *bios)
 645{
 646        unsigned long flags;
 647
 648        spin_lock_irqsave(&cache->lock, flags);
 649        bio_list_merge(&cache->deferred_bios, bios);
 650        bio_list_init(bios);
 651        spin_unlock_irqrestore(&cache->lock, flags);
 652
 653        wake_deferred_bio_worker(cache);
 654}
 655
 656/*----------------------------------------------------------------*/
 657
 658static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
 659{
 660        bool r;
 661        struct per_bio_data *pb;
 662        struct dm_cell_key_v2 key;
 663        dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
 664        struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
 665
 666        cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
 667        if (!cell_prealloc) {
 668                defer_bio(cache, bio);
 669                return false;
 670        }
 671
 672        build_key(oblock, end, &key);
 673        r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
 674        if (!r) {
 675                /*
 676                 * Failed to get the lock.
 677                 */
 678                free_prison_cell(cache, cell_prealloc);
 679                return r;
 680        }
 681
 682        if (cell != cell_prealloc)
 683                free_prison_cell(cache, cell_prealloc);
 684
 685        pb = get_per_bio_data(bio);
 686        pb->cell = cell;
 687
 688        return r;
 689}
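/*
 * A false return above means the bio has been parked inside the cell by
 * dm_cell_get_v2(); it is handed back and deferred again once the exclusive
 * holder drops its lock (see the dm_cell_unlock_v2()/defer_bios() pairing
 * in mg_complete() below).
 */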
 690
 691/*----------------------------------------------------------------*/
 692
 693static bool is_dirty(struct cache *cache, dm_cblock_t b)
 694{
 695        return test_bit(from_cblock(b), cache->dirty_bitset);
 696}
 697
 698static void set_dirty(struct cache *cache, dm_cblock_t cblock)
 699{
 700        if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 701                atomic_inc(&cache->nr_dirty);
 702                policy_set_dirty(cache->policy, cblock);
 703        }
 704}
 705
 706/*
  707 * These two are called after migrations to force the policy and the
  708 * dirty bitset back into sync.
 709 */
 710static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
 711{
 712        if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
 713                atomic_inc(&cache->nr_dirty);
 714        policy_set_dirty(cache->policy, cblock);
 715}
 716
 717static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
 718{
 719        if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 720                if (atomic_dec_return(&cache->nr_dirty) == 0)
 721                        dm_table_event(cache->ti->table);
 722        }
 723
 724        policy_clear_dirty(cache->policy, cblock);
 725}
 726
 727/*----------------------------------------------------------------*/
 728
 729static bool block_size_is_power_of_two(struct cache *cache)
 730{
 731        return cache->sectors_per_block_shift >= 0;
 732}
 733
 734/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
 735#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
 736__always_inline
 737#endif
 738static dm_block_t block_div(dm_block_t b, uint32_t n)
 739{
 740        do_div(b, n);
 741
 742        return b;
 743}
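/*
 * do_div() divides the 64-bit value in place and returns the remainder,
 * so block_div() hands back the quotient and drops the remainder.  For
 * example, block_div(1000, 48) yields 20 (the 40-sector remainder is
 * discarded).
 */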
 744
 745static dm_block_t oblocks_per_dblock(struct cache *cache)
 746{
 747        dm_block_t oblocks = cache->discard_block_size;
 748
 749        if (block_size_is_power_of_two(cache))
 750                oblocks >>= cache->sectors_per_block_shift;
 751        else
 752                oblocks = block_div(oblocks, cache->sectors_per_block);
 753
 754        return oblocks;
 755}
 756
 757static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 758{
 759        return to_dblock(block_div(from_oblock(oblock),
 760                                   oblocks_per_dblock(cache)));
 761}
 762
 763static void set_discard(struct cache *cache, dm_dblock_t b)
 764{
 765        unsigned long flags;
 766
 767        BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 768        atomic_inc(&cache->stats.discard_count);
 769
 770        spin_lock_irqsave(&cache->lock, flags);
 771        set_bit(from_dblock(b), cache->discard_bitset);
 772        spin_unlock_irqrestore(&cache->lock, flags);
 773}
 774
 775static void clear_discard(struct cache *cache, dm_dblock_t b)
 776{
 777        unsigned long flags;
 778
 779        spin_lock_irqsave(&cache->lock, flags);
 780        clear_bit(from_dblock(b), cache->discard_bitset);
 781        spin_unlock_irqrestore(&cache->lock, flags);
 782}
 783
 784static bool is_discarded(struct cache *cache, dm_dblock_t b)
 785{
 786        int r;
 787        unsigned long flags;
 788
 789        spin_lock_irqsave(&cache->lock, flags);
 790        r = test_bit(from_dblock(b), cache->discard_bitset);
 791        spin_unlock_irqrestore(&cache->lock, flags);
 792
 793        return r;
 794}
 795
 796static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 797{
 798        int r;
 799        unsigned long flags;
 800
 801        spin_lock_irqsave(&cache->lock, flags);
 802        r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 803                     cache->discard_bitset);
 804        spin_unlock_irqrestore(&cache->lock, flags);
 805
 806        return r;
 807}
 808
 809/*----------------------------------------------------------------
 810 * Remapping
 811 *--------------------------------------------------------------*/
 812static void remap_to_origin(struct cache *cache, struct bio *bio)
 813{
 814        bio_set_dev(bio, cache->origin_dev->bdev);
 815}
 816
 817static void remap_to_cache(struct cache *cache, struct bio *bio,
 818                           dm_cblock_t cblock)
 819{
 820        sector_t bi_sector = bio->bi_iter.bi_sector;
 821        sector_t block = from_cblock(cblock);
 822
 823        bio_set_dev(bio, cache->cache_dev->bdev);
 824        if (!block_size_is_power_of_two(cache))
 825                bio->bi_iter.bi_sector =
 826                        (block * cache->sectors_per_block) +
 827                        sector_div(bi_sector, cache->sectors_per_block);
 828        else
 829                bio->bi_iter.bi_sector =
 830                        (block << cache->sectors_per_block_shift) |
 831                        (bi_sector & (cache->sectors_per_block - 1));
 832}
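/*
 * Worked example with hypothetical numbers: if sectors_per_block is 64
 * (sectors_per_block_shift == 6), a bio at sector 1000 remapped to
 * cblock 3 lands at cache sector (3 << 6) | (1000 & 63) = 192 + 40 = 232,
 * i.e. the block base plus the offset within the block.  The
 * non-power-of-two branch computes the same thing with sector_div().
 */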
 833
 834static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 835{
 836        unsigned long flags;
 837        struct per_bio_data *pb;
 838
 839        spin_lock_irqsave(&cache->lock, flags);
 840        if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 841            bio_op(bio) != REQ_OP_DISCARD) {
 842                pb = get_per_bio_data(bio);
 843                pb->tick = true;
 844                cache->need_tick_bio = false;
 845        }
 846        spin_unlock_irqrestore(&cache->lock, flags);
 847}
 848
 849static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 850                                            dm_oblock_t oblock, bool bio_has_pbd)
 851{
 852        if (bio_has_pbd)
 853                check_if_tick_bio_needed(cache, bio);
 854        remap_to_origin(cache, bio);
 855        if (bio_data_dir(bio) == WRITE)
 856                clear_discard(cache, oblock_to_dblock(cache, oblock));
 857}
 858
 859static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 860                                          dm_oblock_t oblock)
 861{
 862        // FIXME: check_if_tick_bio_needed() is called way too much through this interface
 863        __remap_to_origin_clear_discard(cache, bio, oblock, true);
 864}
 865
 866static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 867                                 dm_oblock_t oblock, dm_cblock_t cblock)
 868{
 869        check_if_tick_bio_needed(cache, bio);
 870        remap_to_cache(cache, bio, cblock);
 871        if (bio_data_dir(bio) == WRITE) {
 872                set_dirty(cache, cblock);
 873                clear_discard(cache, oblock_to_dblock(cache, oblock));
 874        }
 875}
 876
 877static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 878{
 879        sector_t block_nr = bio->bi_iter.bi_sector;
 880
 881        if (!block_size_is_power_of_two(cache))
 882                (void) sector_div(block_nr, cache->sectors_per_block);
 883        else
 884                block_nr >>= cache->sectors_per_block_shift;
 885
 886        return to_oblock(block_nr);
 887}
 888
 889static bool accountable_bio(struct cache *cache, struct bio *bio)
 890{
 891        return bio_op(bio) != REQ_OP_DISCARD;
 892}
 893
 894static void accounted_begin(struct cache *cache, struct bio *bio)
 895{
 896        struct per_bio_data *pb;
 897
 898        if (accountable_bio(cache, bio)) {
 899                pb = get_per_bio_data(bio);
 900                pb->len = bio_sectors(bio);
 901                iot_io_begin(&cache->tracker, pb->len);
 902        }
 903}
 904
 905static void accounted_complete(struct cache *cache, struct bio *bio)
 906{
 907        struct per_bio_data *pb = get_per_bio_data(bio);
 908
 909        iot_io_end(&cache->tracker, pb->len);
 910}
 911
 912static void accounted_request(struct cache *cache, struct bio *bio)
 913{
 914        accounted_begin(cache, bio);
 915        generic_make_request(bio);
 916}
 917
 918static void issue_op(struct bio *bio, void *context)
 919{
 920        struct cache *cache = context;
 921        accounted_request(cache, bio);
 922}
 923
 924/*
 925 * When running in writethrough mode we need to send writes to clean blocks
  926 * to both the cache and origin devices.  Clone the bio and issue the two bios in parallel.
 927 */
 928static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
 929                                      dm_oblock_t oblock, dm_cblock_t cblock)
 930{
 931        struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
 932
 933        BUG_ON(!origin_bio);
 934
 935        bio_chain(origin_bio, bio);
 936        /*
 937         * Passing false to __remap_to_origin_clear_discard() skips
 938         * all code that might use per_bio_data (since clone doesn't have it)
 939         */
 940        __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
 941        submit_bio(origin_bio);
 942
 943        remap_to_cache(cache, bio, cblock);
 944}
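/*
 * bio_chain() above ties the clone to the original bio: the original (and
 * hence the caller's endio) does not complete until the origin write has
 * also finished, and an error on origin_bio is propagated to bio.
 */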
 945
 946/*----------------------------------------------------------------
 947 * Failure modes
 948 *--------------------------------------------------------------*/
 949static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 950{
 951        return cache->features.mode;
 952}
 953
 954static const char *cache_device_name(struct cache *cache)
 955{
 956        return dm_device_name(dm_table_get_md(cache->ti->table));
 957}
 958
 959static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 960{
 961        const char *descs[] = {
 962                "write",
 963                "read-only",
 964                "fail"
 965        };
 966
 967        dm_table_event(cache->ti->table);
 968        DMINFO("%s: switching cache to %s mode",
 969               cache_device_name(cache), descs[(int)mode]);
 970}
 971
 972static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 973{
 974        bool needs_check;
 975        enum cache_metadata_mode old_mode = get_cache_mode(cache);
 976
 977        if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
 978                DMERR("%s: unable to read needs_check flag, setting failure mode.",
 979                      cache_device_name(cache));
 980                new_mode = CM_FAIL;
 981        }
 982
 983        if (new_mode == CM_WRITE && needs_check) {
 984                DMERR("%s: unable to switch cache to write mode until repaired.",
 985                      cache_device_name(cache));
 986                if (old_mode != new_mode)
 987                        new_mode = old_mode;
 988                else
 989                        new_mode = CM_READ_ONLY;
 990        }
 991
 992        /* Never move out of fail mode */
 993        if (old_mode == CM_FAIL)
 994                new_mode = CM_FAIL;
 995
 996        switch (new_mode) {
 997        case CM_FAIL:
 998        case CM_READ_ONLY:
 999                dm_cache_metadata_set_read_only(cache->cmd);
1000                break;
1001
1002        case CM_WRITE:
1003                dm_cache_metadata_set_read_write(cache->cmd);
1004                break;
1005        }
1006
1007        cache->features.mode = new_mode;
1008
1009        if (new_mode != old_mode)
1010                notify_mode_switch(cache, new_mode);
1011}
1012
1013static void abort_transaction(struct cache *cache)
1014{
1015        const char *dev_name = cache_device_name(cache);
1016
1017        if (get_cache_mode(cache) >= CM_READ_ONLY)
1018                return;
1019
1020        if (dm_cache_metadata_set_needs_check(cache->cmd)) {
1021                DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
1022                set_cache_mode(cache, CM_FAIL);
1023        }
1024
1025        DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1026        if (dm_cache_metadata_abort(cache->cmd)) {
1027                DMERR("%s: failed to abort metadata transaction", dev_name);
1028                set_cache_mode(cache, CM_FAIL);
1029        }
1030}
1031
1032static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1033{
1034        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1035                    cache_device_name(cache), op, r);
1036        abort_transaction(cache);
1037        set_cache_mode(cache, CM_READ_ONLY);
1038}
1039
1040/*----------------------------------------------------------------*/
1041
1042static void load_stats(struct cache *cache)
1043{
1044        struct dm_cache_statistics stats;
1045
1046        dm_cache_metadata_get_stats(cache->cmd, &stats);
1047        atomic_set(&cache->stats.read_hit, stats.read_hits);
1048        atomic_set(&cache->stats.read_miss, stats.read_misses);
1049        atomic_set(&cache->stats.write_hit, stats.write_hits);
1050        atomic_set(&cache->stats.write_miss, stats.write_misses);
1051}
1052
1053static void save_stats(struct cache *cache)
1054{
1055        struct dm_cache_statistics stats;
1056
1057        if (get_cache_mode(cache) >= CM_READ_ONLY)
1058                return;
1059
1060        stats.read_hits = atomic_read(&cache->stats.read_hit);
1061        stats.read_misses = atomic_read(&cache->stats.read_miss);
1062        stats.write_hits = atomic_read(&cache->stats.write_hit);
1063        stats.write_misses = atomic_read(&cache->stats.write_miss);
1064
1065        dm_cache_metadata_set_stats(cache->cmd, &stats);
1066}
1067
1068static void update_stats(struct cache_stats *stats, enum policy_operation op)
1069{
1070        switch (op) {
1071        case POLICY_PROMOTE:
1072                atomic_inc(&stats->promotion);
1073                break;
1074
1075        case POLICY_DEMOTE:
1076                atomic_inc(&stats->demotion);
1077                break;
1078
1079        case POLICY_WRITEBACK:
1080                atomic_inc(&stats->writeback);
1081                break;
1082        }
1083}
1084
1085/*----------------------------------------------------------------
1086 * Migration processing
1087 *
1088 * Migration covers moving data from the origin device to the cache, or
1089 * vice versa.
1090 *--------------------------------------------------------------*/
1091
1092static void inc_io_migrations(struct cache *cache)
1093{
1094        atomic_inc(&cache->nr_io_migrations);
1095}
1096
1097static void dec_io_migrations(struct cache *cache)
1098{
1099        atomic_dec(&cache->nr_io_migrations);
1100}
1101
1102static bool discard_or_flush(struct bio *bio)
1103{
1104        return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1105}
1106
1107static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1108                                     dm_dblock_t *b, dm_dblock_t *e)
1109{
1110        sector_t sb = bio->bi_iter.bi_sector;
1111        sector_t se = bio_end_sector(bio);
1112
1113        *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1114
1115        if (se - sb < cache->discard_block_size)
1116                *e = *b;
1117        else
1118                *e = to_dblock(block_div(se, cache->discard_block_size));
1119}
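/*
 * The range is rounded inwards so only whole discard blocks are covered.
 * Hypothetical example with discard_block_size == 128: a discard starting
 * at sector 100 with bio_end_sector() == 900 gives *b == 1 and *e == 7,
 * i.e. discard blocks 1..6 (*e is exclusive); the partial blocks at either
 * end are left unmarked.
 */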
1120
1121/*----------------------------------------------------------------*/
1122
1123static void prevent_background_work(struct cache *cache)
1124{
1125        lockdep_off();
1126        down_write(&cache->background_work_lock);
1127        lockdep_on();
1128}
1129
1130static void allow_background_work(struct cache *cache)
1131{
1132        lockdep_off();
1133        up_write(&cache->background_work_lock);
1134        lockdep_on();
1135}
1136
1137static bool background_work_begin(struct cache *cache)
1138{
1139        bool r;
1140
1141        lockdep_off();
1142        r = down_read_trylock(&cache->background_work_lock);
1143        lockdep_on();
1144
1145        return r;
1146}
1147
1148static void background_work_end(struct cache *cache)
1149{
1150        lockdep_off();
1151        up_read(&cache->background_work_lock);
1152        lockdep_on();
1153}
1154
1155/*----------------------------------------------------------------*/
1156
1157static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1158{
1159        return (bio_data_dir(bio) == WRITE) &&
1160                (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1161}
1162
1163static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1164{
1165        return writeback_mode(cache) &&
1166                (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1167}
1168
1169static void quiesce(struct dm_cache_migration *mg,
1170                    void (*continuation)(struct work_struct *))
1171{
1172        init_continuation(&mg->k, continuation);
1173        dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1174}
1175
1176static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1177{
1178        struct continuation *k = container_of(ws, struct continuation, ws);
1179        return container_of(k, struct dm_cache_migration, k);
1180}
1181
1182static void copy_complete(int read_err, unsigned long write_err, void *context)
1183{
1184        struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1185
1186        if (read_err || write_err)
1187                mg->k.input = BLK_STS_IOERR;
1188
1189        queue_continuation(mg->cache->wq, &mg->k);
1190}
1191
1192static void copy(struct dm_cache_migration *mg, bool promote)
1193{
1194        struct dm_io_region o_region, c_region;
1195        struct cache *cache = mg->cache;
1196
1197        o_region.bdev = cache->origin_dev->bdev;
1198        o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1199        o_region.count = cache->sectors_per_block;
1200
1201        c_region.bdev = cache->cache_dev->bdev;
1202        c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1203        c_region.count = cache->sectors_per_block;
1204
1205        if (promote)
1206                dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1207        else
1208                dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1209}
1210
1211static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1212{
1213        struct per_bio_data *pb = get_per_bio_data(bio);
1214
1215        if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1216                free_prison_cell(cache, pb->cell);
1217        pb->cell = NULL;
1218}
1219
1220static void overwrite_endio(struct bio *bio)
1221{
1222        struct dm_cache_migration *mg = bio->bi_private;
1223        struct cache *cache = mg->cache;
1224        struct per_bio_data *pb = get_per_bio_data(bio);
1225
1226        dm_unhook_bio(&pb->hook_info, bio);
1227
1228        if (bio->bi_status)
1229                mg->k.input = bio->bi_status;
1230
1231        queue_continuation(cache->wq, &mg->k);
1232}
1233
1234static void overwrite(struct dm_cache_migration *mg,
1235                      void (*continuation)(struct work_struct *))
1236{
1237        struct bio *bio = mg->overwrite_bio;
1238        struct per_bio_data *pb = get_per_bio_data(bio);
1239
1240        dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1241
1242        /*
1243         * The overwrite bio is part of the copy operation, as such it does
1244         * not set/clear discard or dirty flags.
1245         */
1246        if (mg->op->op == POLICY_PROMOTE)
1247                remap_to_cache(mg->cache, bio, mg->op->cblock);
1248        else
1249                remap_to_origin(mg->cache, bio);
1250
1251        init_continuation(&mg->k, continuation);
1252        accounted_request(mg->cache, bio);
1253}
1254
1255/*
1256 * Migration steps:
1257 *
1258 * 1) exclusive lock preventing WRITEs
1259 * 2) quiesce
1260 * 3) copy or issue overwrite bio
1261 * 4) upgrade to exclusive lock preventing READs and WRITEs
1262 * 5) quiesce
1263 * 6) update metadata and commit
1264 * 7) unlock
1265 */
1266static void mg_complete(struct dm_cache_migration *mg, bool success)
1267{
1268        struct bio_list bios;
1269        struct cache *cache = mg->cache;
1270        struct policy_work *op = mg->op;
1271        dm_cblock_t cblock = op->cblock;
1272
1273        if (success)
1274                update_stats(&cache->stats, op->op);
1275
1276        switch (op->op) {
1277        case POLICY_PROMOTE:
1278                clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1279                policy_complete_background_work(cache->policy, op, success);
1280
1281                if (mg->overwrite_bio) {
1282                        if (success)
1283                                force_set_dirty(cache, cblock);
1284                        else if (mg->k.input)
1285                                mg->overwrite_bio->bi_status = mg->k.input;
1286                        else
1287                                mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1288                        bio_endio(mg->overwrite_bio);
1289                } else {
1290                        if (success)
1291                                force_clear_dirty(cache, cblock);
1292                        dec_io_migrations(cache);
1293                }
1294                break;
1295
1296        case POLICY_DEMOTE:
1297                /*
1298                 * We clear dirty here to update the nr_dirty counter.
1299                 */
1300                if (success)
1301                        force_clear_dirty(cache, cblock);
1302                policy_complete_background_work(cache->policy, op, success);
1303                dec_io_migrations(cache);
1304                break;
1305
1306        case POLICY_WRITEBACK:
1307                if (success)
1308                        force_clear_dirty(cache, cblock);
1309                policy_complete_background_work(cache->policy, op, success);
1310                dec_io_migrations(cache);
1311                break;
1312        }
1313
1314        bio_list_init(&bios);
1315        if (mg->cell) {
1316                if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1317                        free_prison_cell(cache, mg->cell);
1318        }
1319
1320        free_migration(mg);
1321        defer_bios(cache, &bios);
1322        wake_migration_worker(cache);
1323
1324        background_work_end(cache);
1325}
1326
1327static void mg_success(struct work_struct *ws)
1328{
1329        struct dm_cache_migration *mg = ws_to_mg(ws);
1330        mg_complete(mg, mg->k.input == 0);
1331}
1332
1333static void mg_update_metadata(struct work_struct *ws)
1334{
1335        int r;
1336        struct dm_cache_migration *mg = ws_to_mg(ws);
1337        struct cache *cache = mg->cache;
1338        struct policy_work *op = mg->op;
1339
1340        switch (op->op) {
1341        case POLICY_PROMOTE:
1342                r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1343                if (r) {
1344                        DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1345                                    cache_device_name(cache));
1346                        metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1347
1348                        mg_complete(mg, false);
1349                        return;
1350                }
1351                mg_complete(mg, true);
1352                break;
1353
1354        case POLICY_DEMOTE:
1355                r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1356                if (r) {
1357                        DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1358                                    cache_device_name(cache));
1359                        metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1360
1361                        mg_complete(mg, false);
1362                        return;
1363                }
1364
1365                /*
1366                 * It would be nice if we only had to commit when a REQ_FLUSH
1367                 * comes through.  But there's one scenario that we have to
1368                 * look out for:
1369                 *
 1370                 * - oblock x is held in a cache block
 1371                 * - demotion occurs
 1372                 * - the cache block gets reallocated and overwritten
1373                 * - crash
1374                 *
1375                 * When we recover, because there was no commit the cache will
 1376                 * roll back to having the data for oblock x in the cache block.
1377                 * But the cache block has since been overwritten, so it'll end
1378                 * up pointing to data that was never in 'x' during the history
1379                 * of the device.
1380                 *
1381                 * To avoid this issue we require a commit as part of the
1382                 * demotion operation.
1383                 */
1384                init_continuation(&mg->k, mg_success);
1385                continue_after_commit(&cache->committer, &mg->k);
1386                schedule_commit(&cache->committer);
1387                break;
1388
1389        case POLICY_WRITEBACK:
1390                mg_complete(mg, true);
1391                break;
1392        }
1393}
1394
1395static void mg_update_metadata_after_copy(struct work_struct *ws)
1396{
1397        struct dm_cache_migration *mg = ws_to_mg(ws);
1398
1399        /*
1400         * Did the copy succeed?
1401         */
1402        if (mg->k.input)
1403                mg_complete(mg, false);
1404        else
1405                mg_update_metadata(ws);
1406}
1407
1408static void mg_upgrade_lock(struct work_struct *ws)
1409{
1410        int r;
1411        struct dm_cache_migration *mg = ws_to_mg(ws);
1412
1413        /*
1414         * Did the copy succeed?
1415         */
1416        if (mg->k.input)
1417                mg_complete(mg, false);
1418
1419        else {
1420                /*
1421                 * Now we want the lock to prevent both reads and writes.
1422                 */
1423                r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1424                                            READ_WRITE_LOCK_LEVEL);
1425                if (r < 0)
1426                        mg_complete(mg, false);
1427
1428                else if (r)
1429                        quiesce(mg, mg_update_metadata);
1430
1431                else
1432                        mg_update_metadata(ws);
1433        }
1434}
1435
1436static void mg_full_copy(struct work_struct *ws)
1437{
1438        struct dm_cache_migration *mg = ws_to_mg(ws);
1439        struct cache *cache = mg->cache;
1440        struct policy_work *op = mg->op;
1441        bool is_policy_promote = (op->op == POLICY_PROMOTE);
1442
1443        if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1444            is_discarded_oblock(cache, op->oblock)) {
1445                mg_upgrade_lock(ws);
1446                return;
1447        }
1448
1449        init_continuation(&mg->k, mg_upgrade_lock);
1450        copy(mg, is_policy_promote);
1451}
1452
1453static void mg_copy(struct work_struct *ws)
1454{
1455        struct dm_cache_migration *mg = ws_to_mg(ws);
1456
1457        if (mg->overwrite_bio) {
1458                /*
1459                 * No exclusive lock was held when we last checked if the bio
1460                 * was optimisable.  So we have to check again in case things
1461                 * have changed (eg, the block may no longer be discarded).
1462                 */
1463                if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1464                        /*
1465                         * Fallback to a real full copy after doing some tidying up.
1466                         */
1467                        bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
 1468                        BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
1469                        mg->overwrite_bio = NULL;
1470                        inc_io_migrations(mg->cache);
1471                        mg_full_copy(ws);
1472                        return;
1473                }
1474
1475                /*
1476                 * It's safe to do this here, even though it's new data
1477                 * because all IO has been locked out of the block.
1478                 *
1479                 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
 1480                 * so we're _not_ using mg_upgrade_lock() as the continuation.
1481                 */
1482                overwrite(mg, mg_update_metadata_after_copy);
1483
1484        } else
1485                mg_full_copy(ws);
1486}
1487
1488static int mg_lock_writes(struct dm_cache_migration *mg)
1489{
1490        int r;
1491        struct dm_cell_key_v2 key;
1492        struct cache *cache = mg->cache;
1493        struct dm_bio_prison_cell_v2 *prealloc;
1494
1495        prealloc = alloc_prison_cell(cache);
1496        if (!prealloc) {
1497                DMERR_LIMIT("%s: alloc_prison_cell failed", cache_device_name(cache));
1498                mg_complete(mg, false);
1499                return -ENOMEM;
1500        }
1501
1502        /*
1503         * Prevent writes to the block, but allow reads to continue.
1504         * Unless we're using an overwrite bio, in which case we lock
1505         * everything.
1506         */
1507        build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1508        r = dm_cell_lock_v2(cache->prison, &key,
1509                            mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1510                            prealloc, &mg->cell);
1511        if (r < 0) {
1512                free_prison_cell(cache, prealloc);
1513                mg_complete(mg, false);
1514                return r;
1515        }
1516
1517        if (mg->cell != prealloc)
1518                free_prison_cell(cache, prealloc);
1519
1520        if (r == 0)
1521                mg_copy(&mg->k.ws);
1522        else
1523                quiesce(mg, mg_copy);
1524
1525        return 0;
1526}
1527
1528static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1529{
1530        struct dm_cache_migration *mg;
1531
1532        if (!background_work_begin(cache)) {
1533                policy_complete_background_work(cache->policy, op, false);
1534                return -EPERM;
1535        }
1536
1537        mg = alloc_migration(cache);
1538        if (!mg) {
1539                policy_complete_background_work(cache->policy, op, false);
1540                background_work_end(cache);
1541                return -ENOMEM;
1542        }
1543
1544        mg->op = op;
1545        mg->overwrite_bio = bio;
1546
1547        if (!bio)
1548                inc_io_migrations(cache);
1549
1550        return mg_lock_writes(mg);
1551}
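/*
 * Summary of the promotion/demotion pipeline rooted at mg_start():
 * mg_lock_writes() takes the exclusive lock (level 0, or level 1 when an
 * overwrite bio is available), mg_copy() either services the bio directly
 * via overwrite() or falls back to mg_full_copy(), mg_upgrade_lock()
 * escalates to READ_WRITE_LOCK_LEVEL once the copy is done, and
 * mg_update_metadata() records the mapping change (forcing a commit for
 * demotions) before mg_complete() unlocks the cell, defers any waiting
 * bios and releases the migration.
 */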
1552
1553/*----------------------------------------------------------------
1554 * invalidation processing
1555 *--------------------------------------------------------------*/
1556
1557static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1558{
1559        struct bio_list bios;
1560        struct cache *cache = mg->cache;
1561
1562        bio_list_init(&bios);
1563        if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1564                free_prison_cell(cache, mg->cell);
1565
1566        if (!success && mg->overwrite_bio)
1567                bio_io_error(mg->overwrite_bio);
1568
1569        free_migration(mg);
1570        defer_bios(cache, &bios);
1571
1572        background_work_end(cache);
1573}
1574
1575static void invalidate_completed(struct work_struct *ws)
1576{
1577        struct dm_cache_migration *mg = ws_to_mg(ws);
1578        invalidate_complete(mg, !mg->k.input);
1579}
1580
1581static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1582{
1583        int r = policy_invalidate_mapping(cache->policy, cblock);
1584        if (!r) {
1585                r = dm_cache_remove_mapping(cache->cmd, cblock);
1586                if (r) {
1587                        DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1588                                    cache_device_name(cache));
1589                        metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1590                }
1591
1592        } else if (r == -ENODATA) {
1593                /*
1594                 * Harmless, already unmapped.
1595                 */
1596                r = 0;
1597
1598        } else
1599                DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1600
1601        return r;
1602}
1603
1604static void invalidate_remove(struct work_struct *ws)
1605{
1606        int r;
1607        struct dm_cache_migration *mg = ws_to_mg(ws);
1608        struct cache *cache = mg->cache;
1609
1610        r = invalidate_cblock(cache, mg->invalidate_cblock);
1611        if (r) {
1612                invalidate_complete(mg, false);
1613                return;
1614        }
1615
1616        init_continuation(&mg->k, invalidate_completed);
1617        continue_after_commit(&cache->committer, &mg->k);
1618        remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1619        mg->overwrite_bio = NULL;
1620        schedule_commit(&cache->committer);
1621}
1622
1623static int invalidate_lock(struct dm_cache_migration *mg)
1624{
1625        int r;
1626        struct dm_cell_key_v2 key;
1627        struct cache *cache = mg->cache;
1628        struct dm_bio_prison_cell_v2 *prealloc;
1629
1630        prealloc = alloc_prison_cell(cache);
1631        if (!prealloc) {
1632                invalidate_complete(mg, false);
1633                return -ENOMEM;
1634        }
1635
1636        build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1637        r = dm_cell_lock_v2(cache->prison, &key,
1638                            READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1639        if (r < 0) {
1640                free_prison_cell(cache, prealloc);
1641                invalidate_complete(mg, false);
1642                return r;
1643        }
1644
1645        if (mg->cell != prealloc)
1646                free_prison_cell(cache, prealloc);
1647
1648        if (r)
1649                quiesce(mg, invalidate_remove);
1650
1651        else {
1652                /*
1653                 * We can't call invalidate_remove() directly here because we
1654                 * might still be in request context.
1655                 */
1656                init_continuation(&mg->k, invalidate_remove);
1657                queue_work(cache->wq, &mg->k.ws);
1658        }
1659
1660        return 0;
1661}
1662
1663static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1664                            dm_oblock_t oblock, struct bio *bio)
1665{
1666        struct dm_cache_migration *mg;
1667
1668        if (!background_work_begin(cache))
1669                return -EPERM;
1670
1671        mg = alloc_migration(cache);
1672        if (!mg) {
1673                background_work_end(cache);
1674                return -ENOMEM;
1675        }
1676
1677        mg->overwrite_bio = bio;
1678        mg->invalidate_cblock = cblock;
1679        mg->invalidate_oblock = oblock;
1680
1681        return invalidate_lock(mg);
1682}
1683
1684/*----------------------------------------------------------------
1685 * bio processing
1686 *--------------------------------------------------------------*/
1687
1688enum busy {
1689        IDLE,
1690        BUSY
1691};
1692
1693static enum busy spare_migration_bandwidth(struct cache *cache)
1694{
1695        bool idle = iot_idle_for(&cache->tracker, HZ);
1696        sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1697                cache->sectors_per_block;
1698
1699        if (idle && current_volume <= cache->migration_threshold)
1700                return IDLE;
1701        else
1702                return BUSY;
1703}
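
/*
 * For example, assuming a 512-sector cache block (the block size is
 * configurable) and the default migration_threshold of 2048 sectors:
 * (nr_io_migrations + 1) * 512 <= 2048 holds while at most three
 * migrations are already in flight, so a fourth may be started, but only
 * once the device has also been idle for at least HZ jiffies.
 */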
1704
1705static void inc_hit_counter(struct cache *cache, struct bio *bio)
1706{
1707        atomic_inc(bio_data_dir(bio) == READ ?
1708                   &cache->stats.read_hit : &cache->stats.write_hit);
1709}
1710
1711static void inc_miss_counter(struct cache *cache, struct bio *bio)
1712{
1713        atomic_inc(bio_data_dir(bio) == READ ?
1714                   &cache->stats.read_miss : &cache->stats.write_miss);
1715}
1716
1717/*----------------------------------------------------------------*/
1718
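/*
 * A summary of map_bio()'s contract, as implemented below: it returns
 * DM_MAPIO_REMAPPED when the bio has been remapped (to the cache or the
 * origin) and should be issued by the caller, or DM_MAPIO_SUBMITTED when
 * the bio has been queued, completed or errored here.  *commit_needed is
 * set when the caller ought to schedule a metadata commit, e.g. because a
 * shared lock could not be taken or a FUA bio was deferred until after
 * the commit.
 */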
1719static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1720                   bool *commit_needed)
1721{
1722        int r, data_dir;
1723        bool rb, background_queued;
1724        dm_cblock_t cblock;
1725
1726        *commit_needed = false;
1727
1728        rb = bio_detain_shared(cache, block, bio);
1729        if (!rb) {
1730                /*
1731                 * An exclusive lock is held for this block, so we have to
1732                 * wait.  We set the commit_needed flag so the current
1733                 * transaction will be committed asap, allowing this lock
1734                 * to be dropped.
1735                 */
1736                *commit_needed = true;
1737                return DM_MAPIO_SUBMITTED;
1738        }
1739
1740        data_dir = bio_data_dir(bio);
1741
1742        if (optimisable_bio(cache, bio, block)) {
1743                struct policy_work *op = NULL;
1744
1745                r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1746                if (unlikely(r && r != -ENOENT)) {
1747                        DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1748                                    cache_device_name(cache), r);
1749                        bio_io_error(bio);
1750                        return DM_MAPIO_SUBMITTED;
1751                }
1752
1753                if (r == -ENOENT && op) {
1754                        bio_drop_shared_lock(cache, bio);
1755                        BUG_ON(op->op != POLICY_PROMOTE);
1756                        mg_start(cache, op, bio);
1757                        return DM_MAPIO_SUBMITTED;
1758                }
1759        } else {
1760                r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1761                if (unlikely(r && r != -ENOENT)) {
1762                        DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1763                                    cache_device_name(cache), r);
1764                        bio_io_error(bio);
1765                        return DM_MAPIO_SUBMITTED;
1766                }
1767
1768                if (background_queued)
1769                        wake_migration_worker(cache);
1770        }
1771
1772        if (r == -ENOENT) {
1773                struct per_bio_data *pb = get_per_bio_data(bio);
1774
1775                /*
1776                 * Miss.
1777                 */
1778                inc_miss_counter(cache, bio);
1779                if (pb->req_nr == 0) {
1780                        accounted_begin(cache, bio);
1781                        remap_to_origin_clear_discard(cache, bio, block);
1782                } else {
1783                        /*
1784                         * This is a duplicate writethrough io that is no
1785                         * longer needed because the block has been demoted.
1786                         */
1787                        bio_endio(bio);
1788                        return DM_MAPIO_SUBMITTED;
1789                }
1790        } else {
1791                /*
1792                 * Hit.
1793                 */
1794                inc_hit_counter(cache, bio);
1795
1796                /*
1797                 * Passthrough always maps to the origin, invalidating any
1798                 * cache blocks that are written to.
1799                 */
1800                if (passthrough_mode(cache)) {
1801                        if (bio_data_dir(bio) == WRITE) {
1802                                bio_drop_shared_lock(cache, bio);
1803                                atomic_inc(&cache->stats.demotion);
1804                                invalidate_start(cache, cblock, block, bio);
1805                        } else
1806                                remap_to_origin_clear_discard(cache, bio, block);
1807                } else {
1808                        if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1809                            !is_dirty(cache, cblock)) {
1810                                remap_to_origin_and_cache(cache, bio, block, cblock);
1811                                accounted_begin(cache, bio);
1812                        } else
1813                                remap_to_cache_dirty(cache, bio, block, cblock);
1814                }
1815        }
1816
1817        /*
1818         * dm core turns FUA requests into a separate payload and FLUSH req.
1819         */
1820        if (bio->bi_opf & REQ_FUA) {
1821                /*
1822                 * issue_after_commit will call accounted_begin a second time.  So
1823                 * we call accounted_complete() to avoid double accounting.
1824                 */
1825                accounted_complete(cache, bio);
1826                issue_after_commit(&cache->committer, bio);
1827                *commit_needed = true;
1828                return DM_MAPIO_SUBMITTED;
1829        }
1830
1831        return DM_MAPIO_REMAPPED;
1832}
1833
1834static bool process_bio(struct cache *cache, struct bio *bio)
1835{
1836        bool commit_needed;
1837
1838        if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1839                generic_make_request(bio);
1840
1841        return commit_needed;
1842}
1843
1844/*
1845 * A non-zero return indicates read_only or fail_io mode.
1846 */
1847static int commit(struct cache *cache, bool clean_shutdown)
1848{
1849        int r;
1850
1851        if (get_cache_mode(cache) >= CM_READ_ONLY)
1852                return -EINVAL;
1853
1854        atomic_inc(&cache->stats.commit_count);
1855        r = dm_cache_commit(cache->cmd, clean_shutdown);
1856        if (r)
1857                metadata_operation_failed(cache, "dm_cache_commit", r);
1858
1859        return r;
1860}
1861
1862/*
1863 * Used by the batcher.
1864 */
1865static blk_status_t commit_op(void *context)
1866{
1867        struct cache *cache = context;
1868
1869        if (dm_cache_changed_this_transaction(cache->cmd))
1870                return errno_to_blk_status(commit(cache, false));
1871
1872        return 0;
1873}
1874
1875/*----------------------------------------------------------------*/
1876
1877static bool process_flush_bio(struct cache *cache, struct bio *bio)
1878{
1879        struct per_bio_data *pb = get_per_bio_data(bio);
1880
1881        if (!pb->req_nr)
1882                remap_to_origin(cache, bio);
1883        else
1884                remap_to_cache(cache, bio, 0);
1885
1886        issue_after_commit(&cache->committer, bio);
1887        return true;
1888}
1889
1890static bool process_discard_bio(struct cache *cache, struct bio *bio)
1891{
1892        dm_dblock_t b, e;
1893
1894        // FIXME: do we need to lock the region?  Or can we just assume the
1895        // user won't be so foolish as to issue discard concurrently with
1896        // other IO?
1897        calc_discard_block_range(cache, bio, &b, &e);
1898        while (b != e) {
1899                set_discard(cache, b);
1900                b = to_dblock(from_dblock(b) + 1);
1901        }
1902
1903        if (cache->features.discard_passdown) {
1904                remap_to_origin(cache, bio);
1905                generic_make_request(bio);
1906        } else
1907                bio_endio(bio);
1908
1909        return false;
1910}
1911
1912static void process_deferred_bios(struct work_struct *ws)
1913{
1914        struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1915
1916        unsigned long flags;
1917        bool commit_needed = false;
1918        struct bio_list bios;
1919        struct bio *bio;
1920
1921        bio_list_init(&bios);
1922
1923        spin_lock_irqsave(&cache->lock, flags);
1924        bio_list_merge(&bios, &cache->deferred_bios);
1925        bio_list_init(&cache->deferred_bios);
1926        spin_unlock_irqrestore(&cache->lock, flags);
1927
1928        while ((bio = bio_list_pop(&bios))) {
1929                if (bio->bi_opf & REQ_PREFLUSH)
1930                        commit_needed = process_flush_bio(cache, bio) || commit_needed;
1931
1932                else if (bio_op(bio) == REQ_OP_DISCARD)
1933                        commit_needed = process_discard_bio(cache, bio) || commit_needed;
1934
1935                else
1936                        commit_needed = process_bio(cache, bio) || commit_needed;
1937        }
1938
1939        if (commit_needed)
1940                schedule_commit(&cache->committer);
1941}
1942
1943/*----------------------------------------------------------------
1944 * Main worker loop
1945 *--------------------------------------------------------------*/
1946
1947static void requeue_deferred_bios(struct cache *cache)
1948{
1949        struct bio *bio;
1950        struct bio_list bios;
1951
1952        bio_list_init(&bios);
1953        bio_list_merge(&bios, &cache->deferred_bios);
1954        bio_list_init(&cache->deferred_bios);
1955
1956        while ((bio = bio_list_pop(&bios))) {
1957                bio->bi_status = BLK_STS_DM_REQUEUE;
1958                bio_endio(bio);
1959        }
1960}
1961
1962/*
1963 * We want to commit periodically so that not too much
1964 * unwritten metadata builds up.
1965 */
1966static void do_waker(struct work_struct *ws)
1967{
1968        struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1969
1970        policy_tick(cache->policy, true);
1971        wake_migration_worker(cache);
1972        schedule_commit(&cache->committer);
1973        queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1974}
1975
1976static void check_migrations(struct work_struct *ws)
1977{
1978        int r;
1979        struct policy_work *op;
1980        struct cache *cache = container_of(ws, struct cache, migration_worker);
1981        enum busy b;
1982
1983        for (;;) {
1984                b = spare_migration_bandwidth(cache);
1985
1986                r = policy_get_background_work(cache->policy, b == IDLE, &op);
1987                if (r == -ENODATA)
1988                        break;
1989
1990                if (r) {
1991                        DMERR_LIMIT("%s: policy_background_work failed",
1992                                    cache_device_name(cache));
1993                        break;
1994                }
1995
1996                r = mg_start(cache, op, NULL);
1997                if (r)
1998                        break;
1999        }
2000}
2001
2002/*----------------------------------------------------------------
2003 * Target methods
2004 *--------------------------------------------------------------*/
2005
2006/*
2007 * This function gets called on the error paths of the constructor, so we
2008 * have to cope with a partially initialised struct.
2009 */
2010static void destroy(struct cache *cache)
2011{
2012        unsigned i;
2013
2014        mempool_exit(&cache->migration_pool);
2015
2016        if (cache->prison)
2017                dm_bio_prison_destroy_v2(cache->prison);
2018
2019        if (cache->wq)
2020                destroy_workqueue(cache->wq);
2021
2022        if (cache->dirty_bitset)
2023                free_bitset(cache->dirty_bitset);
2024
2025        if (cache->discard_bitset)
2026                free_bitset(cache->discard_bitset);
2027
2028        if (cache->copier)
2029                dm_kcopyd_client_destroy(cache->copier);
2030
2031        if (cache->cmd)
2032                dm_cache_metadata_close(cache->cmd);
2033
2034        if (cache->metadata_dev)
2035                dm_put_device(cache->ti, cache->metadata_dev);
2036
2037        if (cache->origin_dev)
2038                dm_put_device(cache->ti, cache->origin_dev);
2039
2040        if (cache->cache_dev)
2041                dm_put_device(cache->ti, cache->cache_dev);
2042
2043        if (cache->policy)
2044                dm_cache_policy_destroy(cache->policy);
2045
2046        for (i = 0; i < cache->nr_ctr_args ; i++)
2047                kfree(cache->ctr_args[i]);
2048        kfree(cache->ctr_args);
2049
2050        bioset_exit(&cache->bs);
2051
2052        kfree(cache);
2053}
2054
2055static void cache_dtr(struct dm_target *ti)
2056{
2057        struct cache *cache = ti->private;
2058
2059        destroy(cache);
2060}
2061
2062static sector_t get_dev_size(struct dm_dev *dev)
2063{
2064        return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2065}
2066
2067/*----------------------------------------------------------------*/
2068
2069/*
2070 * Construct a cache device mapping.
2071 *
2072 * cache <metadata dev> <cache dev> <origin dev> <block size>
2073 *       <#feature args> [<feature arg>]*
2074 *       <policy> <#policy args> [<policy arg>]*
2075 *
2076 * metadata dev    : fast device holding the persistent metadata
2077 * cache dev       : fast device holding cached data blocks
2078 * origin dev      : slow device holding original data blocks
2079 * block size      : cache unit size in sectors
2080 *
2081 * #feature args   : number of feature arguments passed
2082 * feature args    : writethrough.  (The default is writeback.)
2083 *
2084 * policy          : the replacement policy to use
2085 * #policy args    : an even number of policy arguments corresponding
2086 *                   to key/value pairs passed to the policy
2087 * policy args     : key/value pairs passed to the policy
2088 *                   E.g. 'sequential_threshold 1024'
2089 *                   See cache-policies.txt for details.
2090 *
2091 * Optional feature arguments are:
2092 *   writethrough  : write through caching that prohibits cache block
2093 *                   content from being different from origin block content.
2094 *                   Without this argument, the default behaviour is to write
2095 *                   back cache block contents later for performance reasons,
2096 *                   so they may differ from the corresponding origin blocks.
2097 */
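
/*
 * An illustrative table line (device names and sizes are hypothetical):
 *
 *   0 4194304 cache /dev/mapper/fast-meta /dev/mapper/fast /dev/slow \
 *       512 1 writethrough default 0
 *
 * i.e. a 2GiB origin mapped with 256KiB (512-sector) cache blocks, the
 * writethrough feature, and the default policy with no policy arguments.
 */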
2098struct cache_args {
2099        struct dm_target *ti;
2100
2101        struct dm_dev *metadata_dev;
2102
2103        struct dm_dev *cache_dev;
2104        sector_t cache_sectors;
2105
2106        struct dm_dev *origin_dev;
2107        sector_t origin_sectors;
2108
2109        uint32_t block_size;
2110
2111        const char *policy_name;
2112        int policy_argc;
2113        const char **policy_argv;
2114
2115        struct cache_features features;
2116};
2117
2118static void destroy_cache_args(struct cache_args *ca)
2119{
2120        if (ca->metadata_dev)
2121                dm_put_device(ca->ti, ca->metadata_dev);
2122
2123        if (ca->cache_dev)
2124                dm_put_device(ca->ti, ca->cache_dev);
2125
2126        if (ca->origin_dev)
2127                dm_put_device(ca->ti, ca->origin_dev);
2128
2129        kfree(ca);
2130}
2131
2132static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2133{
2134        if (!as->argc) {
2135                *error = "Insufficient args";
2136                return false;
2137        }
2138
2139        return true;
2140}
2141
2142static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2143                              char **error)
2144{
2145        int r;
2146        sector_t metadata_dev_size;
2147        char b[BDEVNAME_SIZE];
2148
2149        if (!at_least_one_arg(as, error))
2150                return -EINVAL;
2151
2152        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2153                          &ca->metadata_dev);
2154        if (r) {
2155                *error = "Error opening metadata device";
2156                return r;
2157        }
2158
2159        metadata_dev_size = get_dev_size(ca->metadata_dev);
2160        if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2161                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2162                       bdevname(ca->metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
2163
2164        return 0;
2165}
2166
2167static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2168                           char **error)
2169{
2170        int r;
2171
2172        if (!at_least_one_arg(as, error))
2173                return -EINVAL;
2174
2175        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2176                          &ca->cache_dev);
2177        if (r) {
2178                *error = "Error opening cache device";
2179                return r;
2180        }
2181        ca->cache_sectors = get_dev_size(ca->cache_dev);
2182
2183        return 0;
2184}
2185
2186static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2187                            char **error)
2188{
2189        int r;
2190
2191        if (!at_least_one_arg(as, error))
2192                return -EINVAL;
2193
2194        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2195                          &ca->origin_dev);
2196        if (r) {
2197                *error = "Error opening origin device";
2198                return r;
2199        }
2200
2201        ca->origin_sectors = get_dev_size(ca->origin_dev);
2202        if (ca->ti->len > ca->origin_sectors) {
2203                *error = "Device size larger than cached device";
2204                return -EINVAL;
2205        }
2206
2207        return 0;
2208}
2209
2210static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2211                            char **error)
2212{
2213        unsigned long block_size;
2214
2215        if (!at_least_one_arg(as, error))
2216                return -EINVAL;
2217
2218        if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2219            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2220            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2221            block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2222                *error = "Invalid data block size";
2223                return -EINVAL;
2224        }
2225
2226        if (block_size > ca->cache_sectors) {
2227                *error = "Data block size is larger than the cache device";
2228                return -EINVAL;
2229        }
2230
2231        ca->block_size = block_size;
2232
2233        return 0;
2234}
2235
2236static void init_features(struct cache_features *cf)
2237{
2238        cf->mode = CM_WRITE;
2239        cf->io_mode = CM_IO_WRITEBACK;
2240        cf->metadata_version = 1;
2241        cf->discard_passdown = true;
2242}
2243
2244static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2245                          char **error)
2246{
2247        static const struct dm_arg _args[] = {
2248                {0, 3, "Invalid number of cache feature arguments"},
2249        };
2250
2251        int r, mode_ctr = 0;
2252        unsigned argc;
2253        const char *arg;
2254        struct cache_features *cf = &ca->features;
2255
2256        init_features(cf);
2257
2258        r = dm_read_arg_group(_args, as, &argc, error);
2259        if (r)
2260                return -EINVAL;
2261
2262        while (argc--) {
2263                arg = dm_shift_arg(as);
2264
2265                if (!strcasecmp(arg, "writeback")) {
2266                        cf->io_mode = CM_IO_WRITEBACK;
2267                        mode_ctr++;
2268                }
2269
2270                else if (!strcasecmp(arg, "writethrough")) {
2271                        cf->io_mode = CM_IO_WRITETHROUGH;
2272                        mode_ctr++;
2273                }
2274
2275                else if (!strcasecmp(arg, "passthrough")) {
2276                        cf->io_mode = CM_IO_PASSTHROUGH;
2277                        mode_ctr++;
2278                }
2279
2280                else if (!strcasecmp(arg, "metadata2"))
2281                        cf->metadata_version = 2;
2282
2283                else if (!strcasecmp(arg, "no_discard_passdown"))
2284                        cf->discard_passdown = false;
2285
2286                else {
2287                        *error = "Unrecognised cache feature requested";
2288                        return -EINVAL;
2289                }
2290        }
2291
2292        if (mode_ctr > 1) {
2293                *error = "Duplicate cache io_mode features requested";
2294                return -EINVAL;
2295        }
2296
2297        return 0;
2298}
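
/*
 * For example (a hypothetical combination), the feature argument list
 * "3 writethrough metadata2 no_discard_passdown" selects writethrough
 * mode with version 2 metadata and disables discard passdown, whereas
 * "0" keeps every default set up by init_features() above.
 */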
2299
2300static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2301                        char **error)
2302{
2303        static const struct dm_arg _args[] = {
2304                {0, 1024, "Invalid number of policy arguments"},
2305        };
2306
2307        int r;
2308
2309        if (!at_least_one_arg(as, error))
2310                return -EINVAL;
2311
2312        ca->policy_name = dm_shift_arg(as);
2313
2314        r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2315        if (r)
2316                return -EINVAL;
2317
2318        ca->policy_argv = (const char **)as->argv;
2319        dm_consume_args(as, ca->policy_argc);
2320
2321        return 0;
2322}
2323
2324static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2325                            char **error)
2326{
2327        int r;
2328        struct dm_arg_set as;
2329
2330        as.argc = argc;
2331        as.argv = argv;
2332
2333        r = parse_metadata_dev(ca, &as, error);
2334        if (r)
2335                return r;
2336
2337        r = parse_cache_dev(ca, &as, error);
2338        if (r)
2339                return r;
2340
2341        r = parse_origin_dev(ca, &as, error);
2342        if (r)
2343                return r;
2344
2345        r = parse_block_size(ca, &as, error);
2346        if (r)
2347                return r;
2348
2349        r = parse_features(ca, &as, error);
2350        if (r)
2351                return r;
2352
2353        r = parse_policy(ca, &as, error);
2354        if (r)
2355                return r;
2356
2357        return 0;
2358}
2359
2360/*----------------------------------------------------------------*/
2361
2362static struct kmem_cache *migration_cache;
2363
2364#define NOT_CORE_OPTION 1
2365
2366static int process_config_option(struct cache *cache, const char *key, const char *value)
2367{
2368        unsigned long tmp;
2369
2370        if (!strcasecmp(key, "migration_threshold")) {
2371                if (kstrtoul(value, 10, &tmp))
2372                        return -EINVAL;
2373
2374                cache->migration_threshold = tmp;
2375                return 0;
2376        }
2377
2378        return NOT_CORE_OPTION;
2379}
2380
2381static int set_config_value(struct cache *cache, const char *key, const char *value)
2382{
2383        int r = process_config_option(cache, key, value);
2384
2385        if (r == NOT_CORE_OPTION)
2386                r = policy_set_config_value(cache->policy, key, value);
2387
2388        if (r)
2389                DMWARN("bad config value for %s: %s", key, value);
2390
2391        return r;
2392}
2393
2394static int set_config_values(struct cache *cache, int argc, const char **argv)
2395{
2396        int r = 0;
2397
2398        if (argc & 1) {
2399                DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2400                return -EINVAL;
2401        }
2402
2403        while (argc) {
2404                r = set_config_value(cache, argv[0], argv[1]);
2405                if (r)
2406                        break;
2407
2408                argc -= 2;
2409                argv += 2;
2410        }
2411
2412        return r;
2413}
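
/*
 * For instance, the pair "migration_threshold 4096" (value chosen purely
 * for illustration) is handled by the core via process_config_option(),
 * while a key the core does not recognise, such as the
 * 'sequential_threshold' mentioned in the constructor documentation, is
 * forwarded to the policy through policy_set_config_value().
 */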
2414
2415static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2416                               char **error)
2417{
2418        struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2419                                                           cache->cache_size,
2420                                                           cache->origin_sectors,
2421                                                           cache->sectors_per_block);
2422        if (IS_ERR(p)) {
2423                *error = "Error creating cache's policy";
2424                return PTR_ERR(p);
2425        }
2426        cache->policy = p;
2427        BUG_ON(!cache->policy);
2428
2429        return 0;
2430}
2431
2432/*
2433 * We want the discard block size to be at least the cache block size
2434 * and to have no more than 2^14 discard blocks across the origin.
2435 */
2436#define MAX_DISCARD_BLOCKS (1 << 14)
2437
2438static bool too_many_discard_blocks(sector_t discard_block_size,
2439                                    sector_t origin_size)
2440{
2441        (void) sector_div(origin_size, discard_block_size);
2442
2443        return origin_size > MAX_DISCARD_BLOCKS;
2444}
2445
2446static sector_t calculate_discard_block_size(sector_t cache_block_size,
2447                                             sector_t origin_size)
2448{
2449        sector_t discard_block_size = cache_block_size;
2450
2451        if (origin_size)
2452                while (too_many_discard_blocks(discard_block_size, origin_size))
2453                        discard_block_size *= 2;
2454
2455        return discard_block_size;
2456}
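
/*
 * Worked example with illustrative figures: for a 512-sector cache block
 * and a 2^30-sector (512GiB) origin, the loop above doubles the discard
 * block size from 512 up to 65536 sectors, the first value for which
 * 2^30 / discard_block_size no longer exceeds MAX_DISCARD_BLOCKS (2^14).
 */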
2457
2458static void set_cache_size(struct cache *cache, dm_cblock_t size)
2459{
2460        dm_block_t nr_blocks = from_cblock(size);
2461
2462        if (nr_blocks > (1 << 20) && cache->cache_size != size)
2463                DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2464                             "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2465                             "Please consider increasing the cache block size to reduce the overall cache block count.",
2466                             (unsigned long long) nr_blocks);
2467
2468        cache->cache_size = size;
2469}
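
/*
 * As an illustration (figures are not from this code): a 1TiB cache
 * device divided into 64KiB (128-sector) blocks yields 2^24 cache blocks,
 * comfortably over the 2^20 threshold, so the warning above would fire
 * and suggest a larger block size.
 */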
2470
2471static int is_congested(struct dm_dev *dev, int bdi_bits)
2472{
2473        struct request_queue *q = bdev_get_queue(dev->bdev);
2474        return bdi_congested(q->backing_dev_info, bdi_bits);
2475}
2476
2477static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2478{
2479        struct cache *cache = container_of(cb, struct cache, callbacks);
2480
2481        return is_congested(cache->origin_dev, bdi_bits) ||
2482                is_congested(cache->cache_dev, bdi_bits);
2483}
2484
2485#define DEFAULT_MIGRATION_THRESHOLD 2048
2486
2487static int cache_create(struct cache_args *ca, struct cache **result)
2488{
2489        int r = 0;
2490        char **error = &ca->ti->error;
2491        struct cache *cache;
2492        struct dm_target *ti = ca->ti;
2493        dm_block_t origin_blocks;
2494        struct dm_cache_metadata *cmd;
2495        bool may_format = ca->features.mode == CM_WRITE;
2496
2497        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2498        if (!cache)
2499                return -ENOMEM;
2500
2501        cache->ti = ca->ti;
2502        ti->private = cache;
2503        ti->num_flush_bios = 2;
2504        ti->flush_supported = true;
2505
2506        ti->num_discard_bios = 1;
2507        ti->discards_supported = true;
2508
2509        ti->per_io_data_size = sizeof(struct per_bio_data);
2510
2511        cache->features = ca->features;
2512        if (writethrough_mode(cache)) {
2513                /* Create bioset for writethrough bios issued to origin */
2514                r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2515                if (r)
2516                        goto bad;
2517        }
2518
2519        cache->callbacks.congested_fn = cache_is_congested;
2520        dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2521
2522        cache->metadata_dev = ca->metadata_dev;
2523        cache->origin_dev = ca->origin_dev;
2524        cache->cache_dev = ca->cache_dev;
2525
2526        ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2527
2528        origin_blocks = cache->origin_sectors = ca->origin_sectors;
2529        origin_blocks = block_div(origin_blocks, ca->block_size);
2530        cache->origin_blocks = to_oblock(origin_blocks);
2531
2532        cache->sectors_per_block = ca->block_size;
2533        if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2534                r = -EINVAL;
2535                goto bad;
2536        }
2537
2538        if (ca->block_size & (ca->block_size - 1)) {
2539                dm_block_t cache_size = ca->cache_sectors;
2540
2541                cache->sectors_per_block_shift = -1;
2542                cache_size = block_div(cache_size, ca->block_size);
2543                set_cache_size(cache, to_cblock(cache_size));
2544        } else {
2545                cache->sectors_per_block_shift = __ffs(ca->block_size);
2546                set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2547        }
2548
2549        r = create_cache_policy(cache, ca, error);
2550        if (r)
2551                goto bad;
2552
2553        cache->policy_nr_args = ca->policy_argc;
2554        cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2555
2556        r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2557        if (r) {
2558                *error = "Error setting cache policy's config values";
2559                goto bad;
2560        }
2561
2562        cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2563                                     ca->block_size, may_format,
2564                                     dm_cache_policy_get_hint_size(cache->policy),
2565                                     ca->features.metadata_version);
2566        if (IS_ERR(cmd)) {
2567                *error = "Error creating metadata object";
2568                r = PTR_ERR(cmd);
2569                goto bad;
2570        }
2571        cache->cmd = cmd;
2572        set_cache_mode(cache, CM_WRITE);
2573        if (get_cache_mode(cache) != CM_WRITE) {
2574                *error = "Unable to get write access to metadata, please check/repair metadata.";
2575                r = -EINVAL;
2576                goto bad;
2577        }
2578
2579        if (passthrough_mode(cache)) {
2580                bool all_clean;
2581
2582                r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2583                if (r) {
2584                        *error = "dm_cache_metadata_all_clean() failed";
2585                        goto bad;
2586                }
2587
2588                if (!all_clean) {
2589                        *error = "Cannot enter passthrough mode unless all blocks are clean";
2590                        r = -EINVAL;
2591                        goto bad;
2592                }
2593
2594                policy_allow_migrations(cache->policy, false);
2595        }
2596
2597        spin_lock_init(&cache->lock);
2598        bio_list_init(&cache->deferred_bios);
2599        atomic_set(&cache->nr_allocated_migrations, 0);
2600        atomic_set(&cache->nr_io_migrations, 0);
2601        init_waitqueue_head(&cache->migration_wait);
2602
2603        r = -ENOMEM;
2604        atomic_set(&cache->nr_dirty, 0);
2605        cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2606        if (!cache->dirty_bitset) {
2607                *error = "could not allocate dirty bitset";
2608                goto bad;
2609        }
2610        clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2611
2612        cache->discard_block_size =
2613                calculate_discard_block_size(cache->sectors_per_block,
2614                                             cache->origin_sectors);
2615        cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2616                                                              cache->discard_block_size));
2617        cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2618        if (!cache->discard_bitset) {
2619                *error = "could not allocate discard bitset";
2620                goto bad;
2621        }
2622        clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2623
2624        cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2625        if (IS_ERR(cache->copier)) {
2626                *error = "could not create kcopyd client";
2627                r = PTR_ERR(cache->copier);
2628                goto bad;
2629        }
2630
2631        cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2632        if (!cache->wq) {
2633                *error = "could not create workqueue for metadata object";
2634                goto bad;
2635        }
2636        INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2637        INIT_WORK(&cache->migration_worker, check_migrations);
2638        INIT_DELAYED_WORK(&cache->waker, do_waker);
2639
2640        cache->prison = dm_bio_prison_create_v2(cache->wq);
2641        if (!cache->prison) {
2642                *error = "could not create bio prison";
2643                goto bad;
2644        }
2645
2646        r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2647                                   migration_cache);
2648        if (r) {
2649                *error = "Error creating cache's migration mempool";
2650                goto bad;
2651        }
2652
2653        cache->need_tick_bio = true;
2654        cache->sized = false;
2655        cache->invalidate = false;
2656        cache->commit_requested = false;
2657        cache->loaded_mappings = false;
2658        cache->loaded_discards = false;
2659
2660        load_stats(cache);
2661
2662        atomic_set(&cache->stats.demotion, 0);
2663        atomic_set(&cache->stats.promotion, 0);
2664        atomic_set(&cache->stats.copies_avoided, 0);
2665        atomic_set(&cache->stats.cache_cell_clash, 0);
2666        atomic_set(&cache->stats.commit_count, 0);
2667        atomic_set(&cache->stats.discard_count, 0);
2668
2669        spin_lock_init(&cache->invalidation_lock);
2670        INIT_LIST_HEAD(&cache->invalidation_requests);
2671
2672        batcher_init(&cache->committer, commit_op, cache,
2673                     issue_op, cache, cache->wq);
2674        iot_init(&cache->tracker);
2675
2676        init_rwsem(&cache->background_work_lock);
2677        prevent_background_work(cache);
2678
2679        *result = cache;
2680        return 0;
2681bad:
2682        destroy(cache);
2683        return r;
2684}
2685
2686static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2687{
2688        unsigned i;
2689        const char **copy;
2690
2691        copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2692        if (!copy)
2693                return -ENOMEM;
2694        for (i = 0; i < argc; i++) {
2695                copy[i] = kstrdup(argv[i], GFP_KERNEL);
2696                if (!copy[i]) {
2697                        while (i--)
2698                                kfree(copy[i]);
2699                        kfree(copy);
2700                        return -ENOMEM;
2701                }
2702        }
2703
2704        cache->nr_ctr_args = argc;
2705        cache->ctr_args = copy;
2706
2707        return 0;
2708}
2709
2710static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2711{
2712        int r = -EINVAL;
2713        struct cache_args *ca;
2714        struct cache *cache = NULL;
2715
2716        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2717        if (!ca) {
2718                ti->error = "Error allocating memory for cache";
2719                return -ENOMEM;
2720        }
2721        ca->ti = ti;
2722
2723        r = parse_cache_args(ca, argc, argv, &ti->error);
2724        if (r)
2725                goto out;
2726
2727        r = cache_create(ca, &cache);
2728        if (r)
2729                goto out;
2730
2731        r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2732        if (r) {
2733                destroy(cache);
2734                goto out;
2735        }
2736
2737        ti->private = cache;
2738out:
2739        destroy_cache_args(ca);
2740        return r;
2741}
2742
2743/*----------------------------------------------------------------*/
2744
2745static int cache_map(struct dm_target *ti, struct bio *bio)
2746{
2747        struct cache *cache = ti->private;
2748
2749        int r;
2750        bool commit_needed;
2751        dm_oblock_t block = get_bio_block(cache, bio);
2752
2753        init_per_bio_data(bio);
2754        if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2755                /*
2756                 * This can only occur if the io goes to a partial block at
2757                 * the end of the origin device.  We don't cache these.
2758                 * Just remap to the origin and carry on.
2759                 */
2760                remap_to_origin(cache, bio);
2761                accounted_begin(cache, bio);
2762                return DM_MAPIO_REMAPPED;
2763        }
2764
2765        if (discard_or_flush(bio)) {
2766                defer_bio(cache, bio);
2767                return DM_MAPIO_SUBMITTED;
2768        }
2769
2770        r = map_bio(cache, bio, block, &commit_needed);
2771        if (commit_needed)
2772                schedule_commit(&cache->committer);
2773
2774        return r;
2775}
2776
2777static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2778{
2779        struct cache *cache = ti->private;
2780        unsigned long flags;
2781        struct per_bio_data *pb = get_per_bio_data(bio);
2782
2783        if (pb->tick) {
2784                policy_tick(cache->policy, false);
2785
2786                spin_lock_irqsave(&cache->lock, flags);
2787                cache->need_tick_bio = true;
2788                spin_unlock_irqrestore(&cache->lock, flags);
2789        }
2790
2791        bio_drop_shared_lock(cache, bio);
2792        accounted_complete(cache, bio);
2793
2794        return DM_ENDIO_DONE;
2795}
2796
2797static int write_dirty_bitset(struct cache *cache)
2798{
2799        int r;
2800
2801        if (get_cache_mode(cache) >= CM_READ_ONLY)
2802                return -EINVAL;
2803
2804        r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2805        if (r)
2806                metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2807
2808        return r;
2809}
2810
2811static int write_discard_bitset(struct cache *cache)
2812{
2813        unsigned i, r;
2814
2815        if (get_cache_mode(cache) >= CM_READ_ONLY)
2816                return -EINVAL;
2817
2818        r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2819                                           cache->discard_nr_blocks);
2820        if (r) {
2821                DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2822                metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2823                return r;
2824        }
2825
2826        for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2827                r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2828                                         is_discarded(cache, to_dblock(i)));
2829                if (r) {
2830                        metadata_operation_failed(cache, "dm_cache_set_discard", r);
2831                        return r;
2832                }
2833        }
2834
2835        return 0;
2836}
2837
2838static int write_hints(struct cache *cache)
2839{
2840        int r;
2841
2842        if (get_cache_mode(cache) >= CM_READ_ONLY)
2843                return -EINVAL;
2844
2845        r = dm_cache_write_hints(cache->cmd, cache->policy);
2846        if (r) {
2847                metadata_operation_failed(cache, "dm_cache_write_hints", r);
2848                return r;
2849        }
2850
2851        return 0;
2852}
2853
2854/*
2855 * returns true on success
2856 */
2857static bool sync_metadata(struct cache *cache)
2858{
2859        int r1, r2, r3, r4;
2860
2861        r1 = write_dirty_bitset(cache);
2862        if (r1)
2863                DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2864
2865        r2 = write_discard_bitset(cache);
2866        if (r2)
2867                DMERR("%s: could not write discard bitset", cache_device_name(cache));
2868
2869        save_stats(cache);
2870
2871        r3 = write_hints(cache);
2872        if (r3)
2873                DMERR("%s: could not write hints", cache_device_name(cache));
2874
2875        /*
2876         * If writing the above metadata failed, we still commit, but don't
2877         * set the clean shutdown flag.  This will effectively force every
2878         * dirty bit to be set on reload.
2879         */
2880        r4 = commit(cache, !r1 && !r2 && !r3);
2881        if (r4)
2882                DMERR("%s: could not write cache metadata", cache_device_name(cache));
2883
2884        return !r1 && !r2 && !r3 && !r4;
2885}
2886
2887static void cache_postsuspend(struct dm_target *ti)
2888{
2889        struct cache *cache = ti->private;
2890
2891        prevent_background_work(cache);
2892        BUG_ON(atomic_read(&cache->nr_io_migrations));
2893
2894        cancel_delayed_work(&cache->waker);
2895        flush_workqueue(cache->wq);
2896        WARN_ON(cache->tracker.in_flight);
2897
2898        /*
2899         * If it's a flush suspend there won't be any deferred bios, so this
2900         * call is harmless.
2901         */
2902        requeue_deferred_bios(cache);
2903
2904        if (get_cache_mode(cache) == CM_WRITE)
2905                (void) sync_metadata(cache);
2906}
2907
2908static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2909                        bool dirty, uint32_t hint, bool hint_valid)
2910{
2911        int r;
2912        struct cache *cache = context;
2913
2914        if (dirty) {
2915                set_bit(from_cblock(cblock), cache->dirty_bitset);
2916                atomic_inc(&cache->nr_dirty);
2917        } else
2918                clear_bit(from_cblock(cblock), cache->dirty_bitset);
2919
2920        r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2921        if (r)
2922                return r;
2923
2924        return 0;
2925}
2926
2927/*
2928 * The discard block size in the on-disk metadata is not
2929 * necessarily the same as the one we're currently using.  So we have to
2930 * be careful to only set the discarded attribute if we know it
2931 * covers a complete block of the new size.
2932 */
2933struct discard_load_info {
2934        struct cache *cache;
2935
2936        /*
2937         * These blocks are sized using the on disk dblock size, rather
2938         * than the current one.
2939         */
2940        dm_block_t block_size;
2941        dm_block_t discard_begin, discard_end;
2942};
2943
2944static void discard_load_info_init(struct cache *cache,
2945                                   struct discard_load_info *li)
2946{
2947        li->cache = cache;
2948        li->discard_begin = li->discard_end = 0;
2949}
2950
2951static void set_discard_range(struct discard_load_info *li)
2952{
2953        sector_t b, e;
2954
2955        if (li->discard_begin == li->discard_end)
2956                return;
2957
2958        /*
2959         * Convert to sectors.
2960         */
2961        b = li->discard_begin * li->block_size;
2962        e = li->discard_end * li->block_size;
2963
2964        /*
2965         * Then convert back to the current dblock size.
2966         */
2967        b = dm_sector_div_up(b, li->cache->discard_block_size);
2968        sector_div(e, li->cache->discard_block_size);
2969
2970        /*
2971         * The origin may have shrunk, so we need to check we're still in
2972         * bounds.
2973         */
2974        if (e > from_dblock(li->cache->discard_nr_blocks))
2975                e = from_dblock(li->cache->discard_nr_blocks);
2976
2977        for (; b < e; b++)
2978                set_discard(li->cache, to_dblock(b));
2979}
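
/*
 * Illustrative example (all figures are assumptions): if the on-disk
 * dblock size was 1024 sectors and the current discard_block_size is
 * 2048, an on-disk discarded range [4, 6) covers sectors [4096, 6144).
 * Rounding the start up and the end down gives current dblocks [2, 3),
 * so only dblock 2 is marked; a partially covered current dblock is
 * never set.
 */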
2980
2981static int load_discard(void *context, sector_t discard_block_size,
2982                        dm_dblock_t dblock, bool discard)
2983{
2984        struct discard_load_info *li = context;
2985
2986        li->block_size = discard_block_size;
2987
2988        if (discard) {
2989                if (from_dblock(dblock) == li->discard_end)
2990                        /*
2991                         * We're already in a discard range, just extend it.
2992                         */
2993                        li->discard_end = li->discard_end + 1ULL;
2994
2995                else {
2996                        /*
2997                         * Emit the old range and start a new one.
2998                         */
2999                        set_discard_range(li);
3000                        li->discard_begin = from_dblock(dblock);
3001                        li->discard_end = li->discard_begin + 1ULL;
3002                }
3003        } else {
3004                set_discard_range(li);
3005                li->discard_begin = li->discard_end = 0;
3006        }
3007
3008        return 0;
3009}
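
/*
 * A small illustration of the run-length accumulation above: consecutive
 * discarded dblocks 10, 11 and 12 are merged into the range [10, 13) and
 * only emitted via set_discard_range() once a clean or non-adjacent
 * dblock is seen; the final pending range is emitted by the caller after
 * the load completes.
 */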
3010
3011static dm_cblock_t get_cache_dev_size(struct cache *cache)
3012{
3013        sector_t size = get_dev_size(cache->cache_dev);
3014        (void) sector_div(size, cache->sectors_per_block);
3015        return to_cblock(size);
3016}
3017
3018static bool can_resize(struct cache *cache, dm_cblock_t new_size)
3019{
3020        if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
3021                if (cache->sized) {
3022                        DMERR("%s: unable to extend cache due to missing cache table reload",
3023                              cache_device_name(cache));
3024                        return false;
3025                }
3026        }
3027
3028        /*
3029         * We can't drop a dirty block when shrinking the cache.
3030         */
3031        while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
3032                new_size = to_cblock(from_cblock(new_size) + 1);
3033                if (is_dirty(cache, new_size)) {
3034                        DMERR("%s: unable to shrink cache; cache block %llu is dirty",
3035                              cache_device_name(cache),
3036                              (unsigned long long) from_cblock(new_size));
3037                        return false;
3038                }
3039        }
3040
3041        return true;
3042}
3043
3044static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3045{
3046        int r;
3047
3048        r = dm_cache_resize(cache->cmd, new_size);
3049        if (r) {
3050                DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3051                metadata_operation_failed(cache, "dm_cache_resize", r);
3052                return r;
3053        }
3054
3055        set_cache_size(cache, new_size);
3056
3057        return 0;
3058}
3059
3060static int cache_preresume(struct dm_target *ti)
3061{
3062        int r = 0;
3063        struct cache *cache = ti->private;
3064        dm_cblock_t csize = get_cache_dev_size(cache);
3065
3066        /*
3067         * Check to see if the cache device has been resized.
3068         */
3069        if (!cache->sized) {
3070                r = resize_cache_dev(cache, csize);
3071                if (r)
3072                        return r;
3073
3074                cache->sized = true;
3075
3076        } else if (csize != cache->cache_size) {
3077                if (!can_resize(cache, csize))
3078                        return -EINVAL;
3079
3080                r = resize_cache_dev(cache, csize);
3081                if (r)
3082                        return r;
3083        }
3084
3085        if (!cache->loaded_mappings) {
3086                r = dm_cache_load_mappings(cache->cmd, cache->policy,
3087                                           load_mapping, cache);
3088                if (r) {
3089                        DMERR("%s: could not load cache mappings", cache_device_name(cache));
3090                        metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3091                        return r;
3092                }
3093
3094                cache->loaded_mappings = true;
3095        }
3096
3097        if (!cache->loaded_discards) {
3098                struct discard_load_info li;
3099
3100                /*
3101                 * The discard bitset could have been resized, or the
3102                 * discard block size changed.  To be safe we start by
3103                 * setting every dblock to not discarded.
3104                 */
3105                clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3106
3107                discard_load_info_init(cache, &li);
3108                r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3109                if (r) {
3110                        DMERR("%s: could not load origin discards", cache_device_name(cache));
3111                        metadata_operation_failed(cache, "dm_cache_load_discards", r);
3112                        return r;
3113                }
3114                set_discard_range(&li);
3115
3116                cache->loaded_discards = true;
3117        }
3118
3119        return r;
3120}
3121
3122static void cache_resume(struct dm_target *ti)
3123{
3124        struct cache *cache = ti->private;
3125
3126        cache->need_tick_bio = true;
3127        allow_background_work(cache);
3128        do_waker(&cache->waker.work);
3129}
3130
3131static void emit_flags(struct cache *cache, char *result,
3132                       unsigned maxlen, ssize_t *sz_ptr)
3133{
3134        ssize_t sz = *sz_ptr;
3135        struct cache_features *cf = &cache->features;
3136        unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3137
3138        DMEMIT("%u ", count);
3139
3140        if (cf->metadata_version == 2)
3141                DMEMIT("metadata2 ");
3142
3143        if (writethrough_mode(cache))
3144                DMEMIT("writethrough ");
3145
3146        else if (passthrough_mode(cache))
3147                DMEMIT("passthrough ");
3148
3149        else if (writeback_mode(cache))
3150                DMEMIT("writeback ");
3151
3152        else {
3153                DMEMIT("unknown ");
3154                DMERR("%s: internal error: unknown io mode: %d",
3155                      cache_device_name(cache), (int) cf->io_mode);
3156        }
3157
3158        if (!cf->discard_passdown)
3159                DMEMIT("no_discard_passdown ");
3160
3161        *sz_ptr = sz;
3162}
3163
3164/*
3165 * Status format:
3166 *
3167 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3168 * <cache block size> <#used cache blocks>/<#total cache blocks>
3169 * <#read hits> <#read misses> <#write hits> <#write misses>
3170 * <#demotions> <#promotions> <#dirty>
3171 * <#features> <features>*
3172 * <#core args> <core args>
3173 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3174 */
3175static void cache_status(struct dm_target *ti, status_type_t type,
3176                         unsigned status_flags, char *result, unsigned maxlen)
3177{
3178        int r = 0;
3179        unsigned i;
3180        ssize_t sz = 0;
3181        dm_block_t nr_free_blocks_metadata = 0;
3182        dm_block_t nr_blocks_metadata = 0;
3183        char buf[BDEVNAME_SIZE];
3184        struct cache *cache = ti->private;
3185        dm_cblock_t residency;
3186        bool needs_check;
3187
3188        switch (type) {
3189        case STATUSTYPE_INFO:
3190                if (get_cache_mode(cache) == CM_FAIL) {
3191                        DMEMIT("Fail");
3192                        break;
3193                }
3194
3195                /* Commit to ensure statistics aren't out-of-date */
3196                if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3197                        (void) commit(cache, false);
3198
3199                r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3200                if (r) {
3201                        DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3202                              cache_device_name(cache), r);
3203                        goto err;
3204                }
3205
3206                r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3207                if (r) {
3208                        DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3209                              cache_device_name(cache), r);
3210                        goto err;
3211                }
3212
3213                residency = policy_residency(cache->policy);
3214
3215                DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3216                       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3217                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3218                       (unsigned long long)nr_blocks_metadata,
3219                       (unsigned long long)cache->sectors_per_block,
3220                       (unsigned long long) from_cblock(residency),
3221                       (unsigned long long) from_cblock(cache->cache_size),
3222                       (unsigned) atomic_read(&cache->stats.read_hit),
3223                       (unsigned) atomic_read(&cache->stats.read_miss),
3224                       (unsigned) atomic_read(&cache->stats.write_hit),
3225                       (unsigned) atomic_read(&cache->stats.write_miss),
3226                       (unsigned) atomic_read(&cache->stats.demotion),
3227                       (unsigned) atomic_read(&cache->stats.promotion),
3228                       (unsigned long) atomic_read(&cache->nr_dirty));
3229
3230                emit_flags(cache, result, maxlen, &sz);
3231
3232                DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3233
3234                DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3235                if (sz < maxlen) {
3236                        r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3237                        if (r)
3238                                DMERR("%s: policy_emit_config_values returned %d",
3239                                      cache_device_name(cache), r);
3240                }
3241
3242                if (get_cache_mode(cache) == CM_READ_ONLY)
3243                        DMEMIT("ro ");
3244                else
3245                        DMEMIT("rw ");
3246
3247                r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3248
3249                if (r || needs_check)
3250                        DMEMIT("needs_check ");
3251                else
3252                        DMEMIT("- ");
3253
3254                break;
3255
3256        case STATUSTYPE_TABLE:
3257                format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3258                DMEMIT("%s ", buf);
3259                format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3260                DMEMIT("%s ", buf);
3261                format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3262                DMEMIT("%s", buf);
3263
3264                for (i = 0; i < cache->nr_ctr_args - 1; i++)
3265                        DMEMIT(" %s", cache->ctr_args[i]);
3266                if (cache->nr_ctr_args)
3267                        DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3268        }
3269
3270        return;
3271
3272err:
3273        DMEMIT("Error");
3274}
3275
3276/*
3277 * Defines a range of cblocks: begin to (end - 1) are in the range; end is
3278 * the one-past-the-end value.
3279 */
3280struct cblock_range {
3281        dm_cblock_t begin;
3282        dm_cblock_t end;
3283};
3284
3285/*
3286 * A cache block range can take two forms:
3287 *
3288 * i) A single cblock, e.g. '3456'
3289 * ii) A begin and end cblock with a dash between, e.g. 123-234
3290 */
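/*
 * Userspace might request invalidation of such a range with something
 * like (hypothetical device name):
 *
 *   dmsetup message my-cache 0 invalidate_cblocks 123-234
 *
 * which is handled by process_invalidate_cblocks_message() below.
 */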
3291static int parse_cblock_range(struct cache *cache, const char *str,
3292                              struct cblock_range *result)
3293{
3294        char dummy;
3295        uint64_t b, e;
3296        int r;
3297
3298        /*
3299         * Try and parse form (ii) first.
3300         */
3301        r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3302        if (r < 0)
3303                return r;
3304
3305        if (r == 2) {
3306                result->begin = to_cblock(b);
3307                result->end = to_cblock(e);
3308                return 0;
3309        }
3310
3311        /*
3312         * That didn't work, try form (i).
3313         */
3314        r = sscanf(str, "%llu%c", &b, &dummy);
3315        if (r < 0)
3316                return r;
3317
3318        if (r == 1) {
3319                result->begin = to_cblock(b);
3320                result->end = to_cblock(from_cblock(result->begin) + 1u);
3321                return 0;
3322        }
3323
3324        DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3325        return -EINVAL;
3326}
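
/*
 * Illustrative sketch, not part of the driver: the two accepted input forms
 * and the half-open ranges they produce.  The function name is hypothetical
 * and the helper exists purely as documentation.
 */
static void __maybe_unused example_cblock_range_forms(struct cache *cache)
{
        struct cblock_range range;

        /* Form (i): "3456" parses to [3456, 3457), i.e. just cblock 3456. */
        parse_cblock_range(cache, "3456", &range);

        /* Form (ii): "123-234" parses to [123, 234), i.e. cblocks 123..233. */
        parse_cblock_range(cache, "123-234", &range);
}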
3327
3328static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3329{
3330        uint64_t b = from_cblock(range->begin);
3331        uint64_t e = from_cblock(range->end);
3332        uint64_t n = from_cblock(cache->cache_size);
3333
3334        if (b >= n) {
3335                DMERR("%s: begin cblock out of range: %llu >= %llu",
3336                      cache_device_name(cache), b, n);
3337                return -EINVAL;
3338        }
3339
3340        if (e > n) {
3341                DMERR("%s: end cblock out of range: %llu > %llu",
3342                      cache_device_name(cache), e, n);
3343                return -EINVAL;
3344        }
3345
3346        if (b >= e) {
3347                DMERR("%s: invalid cblock range: %llu >= %llu",
3348                      cache_device_name(cache), b, e);
3349                return -EINVAL;
3350        }
3351
3352        return 0;
3353}
3354
3355static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3356{
3357        return to_cblock(from_cblock(b) + 1);
3358}
3359
3360static int request_invalidation(struct cache *cache, struct cblock_range *range)
3361{
3362        int r = 0;
3363
3364        /*
3365         * We don't need to do any locking here because we know we're in
3366         * passthrough mode.  There is the potential for a race between an
3367         * invalidation triggered by an io and an invalidation message.  This
3368         * is harmless; we need not worry if the policy call fails.
3369         */
3370        while (range->begin != range->end) {
3371                r = invalidate_cblock(cache, range->begin);
3372                if (r)
3373                        return r;
3374
3375                range->begin = cblock_succ(range->begin);
3376        }
3377
3378        cache->commit_requested = true;
3379        return r;
3380}
3381
3382static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3383                                              const char **cblock_ranges)
3384{
3385        int r = 0;
3386        unsigned i;
3387        struct cblock_range range;
3388
3389        if (!passthrough_mode(cache)) {
3390                DMERR("%s: cache has to be in passthrough mode for invalidation",
3391                      cache_device_name(cache));
3392                return -EPERM;
3393        }
3394
3395        for (i = 0; i < count; i++) {
3396                r = parse_cblock_range(cache, cblock_ranges[i], &range);
3397                if (r)
3398                        break;
3399
3400                r = validate_cblock_range(cache, &range);
3401                if (r)
3402                        break;
3403
3404                /*
3405                 * Invalidate each cblock in the validated, half-open range.
3406                 */
3407                r = request_invalidation(cache, &range);
3408                if (r)
3409                        break;
3410        }
3411
3412        return r;
3413}
3414
3415/*
3416 * Supports
3417 *      "<key> <value>"
3418 * and
3419 *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3420 *
3421 * The key migration_threshold is supported by the cache target core.
3422 */
3423static int cache_message(struct dm_target *ti, unsigned argc, char **argv,
3424                         char *result, unsigned maxlen)
3425{
3426        struct cache *cache = ti->private;
3427
3428        if (!argc)
3429                return -EINVAL;
3430
3431        if (get_cache_mode(cache) >= CM_READ_ONLY) {
3432                DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3433                      cache_device_name(cache));
3434                return -EOPNOTSUPP;
3435        }
3436
3437        if (!strcasecmp(argv[0], "invalidate_cblocks"))
3438                return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3439
3440        if (argc != 2)
3441                return -EINVAL;
3442
3443        return set_config_value(cache, argv[0], argv[1]);
3444}
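
/*
 * Usage sketch (the device name and values below are placeholders): these
 * messages are normally sent from userspace with dmsetup, e.g.
 *
 *   dmsetup message <cache-dev> 0 migration_threshold 204800
 *   dmsetup message <cache-dev> 0 invalidate_cblocks 2345 3456-4567
 *
 * invalidate_cblocks is only honoured while the cache is in passthrough
 * mode (see process_invalidate_cblocks_message() above).
 */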
3445
3446static int cache_iterate_devices(struct dm_target *ti,
3447                                 iterate_devices_callout_fn fn, void *data)
3448{
3449        int r = 0;
3450        struct cache *cache = ti->private;
3451
3452        r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3453        if (!r)
3454                r = fn(ti, cache->origin_dev, 0, ti->len, data);
3455
3456        return r;
3457}
3458
3459static bool origin_dev_supports_discard(struct block_device *origin_bdev)
3460{
3461        struct request_queue *q = bdev_get_queue(origin_bdev);
3462
3463        return q && blk_queue_discard(q);
3464}
3465
3466/*
3467 * If discard_passdown was enabled verify that the origin device
3468 * supports discards.  Disable discard_passdown if not.
3469 */
3470static void disable_passdown_if_not_supported(struct cache *cache)
3471{
3472        struct block_device *origin_bdev = cache->origin_dev->bdev;
3473        struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3474        const char *reason = NULL;
3475        char buf[BDEVNAME_SIZE];
3476
3477        if (!cache->features.discard_passdown)
3478                return;
3479
3480        if (!origin_dev_supports_discard(origin_bdev))
3481                reason = "discard unsupported";
3482
3483        else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3484                reason = "max discard sectors smaller than a block";
3485
3486        if (reason) {
3487                DMWARN("Origin device (%s) %s: Disabling discard passdown.",
3488                       bdevname(origin_bdev, buf), reason);
3489                cache->features.discard_passdown = false;
3490        }
3491}
3492
3493static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3494{
3495        struct block_device *origin_bdev = cache->origin_dev->bdev;
3496        struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3497
3498        if (!cache->features.discard_passdown) {
3499                /* No passdown is done, so set our own virtual limits. */
3500                limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3501                                                    cache->origin_sectors);
3502                limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3503                return;
3504        }
3505
3506        /*
3507         * cache_iterate_devices() stacks both the origin and fast device limits,
3508         * but discards aren't passed to the fast device, so inherit the origin's.
3509         */
3510        limits->max_discard_sectors = origin_limits->max_discard_sectors;
3511        limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3512        limits->discard_granularity = origin_limits->discard_granularity;
3513        limits->discard_alignment = origin_limits->discard_alignment;
3514        limits->discard_misaligned = origin_limits->discard_misaligned;
3515}
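
/*
 * Worked example (numbers are illustrative): with a discard_block_size of
 * 128 sectors, the non-passdown branch above advertises
 * max_discard_sectors = min(128 * 1024, origin_sectors) and
 * discard_granularity = 128 << SECTOR_SHIFT = 64KiB.
 */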
3516
3517static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3518{
3519        struct cache *cache = ti->private;
3520        uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3521
3522        /*
3523         * If the system-determined stacked limits are compatible with the
3524         * cache's blocksize (io_opt is a multiple of it), do not override them.
3525         */
3526        if (io_opt_sectors < cache->sectors_per_block ||
3527            do_div(io_opt_sectors, cache->sectors_per_block)) {
3528                blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3529                blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3530        }
3531
3532        disable_passdown_if_not_supported(cache);
3533        set_discard_limits(cache, limits);
3534}
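
/*
 * Worked example (numbers are illustrative): with sectors_per_block = 128
 * (64KiB cache blocks), a stacked io_opt of 1024 sectors (512KiB) divides
 * evenly and is left alone, whereas an io_opt of 192 sectors (96KiB) does
 * not, so both io_min and io_opt are forced to 64KiB above.
 */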
3535
3536/*----------------------------------------------------------------*/
3537
3538static struct target_type cache_target = {
3539        .name = "cache",
3540        .version = {2, 1, 0},
3541        .module = THIS_MODULE,
3542        .ctr = cache_ctr,
3543        .dtr = cache_dtr,
3544        .map = cache_map,
3545        .end_io = cache_end_io,
3546        .postsuspend = cache_postsuspend,
3547        .preresume = cache_preresume,
3548        .resume = cache_resume,
3549        .status = cache_status,
3550        .message = cache_message,
3551        .iterate_devices = cache_iterate_devices,
3552        .io_hints = cache_io_hints,
3553};
3554
3555static int __init dm_cache_init(void)
3556{
3557        int r;
3558
3559        migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3560        if (!migration_cache)
3561                return -ENOMEM;
3562
3563        r = dm_register_target(&cache_target);
3564        if (r) {
3565                DMERR("cache target registration failed: %d", r);
3566                kmem_cache_destroy(migration_cache);
3567                return r;
3568        }
3569
3570        return 0;
3571}
3572
3573static void __exit dm_cache_exit(void)
3574{
3575        dm_unregister_target(&cache_target);
3576        kmem_cache_destroy(migration_cache);
3577}
3578
3579module_init(dm_cache_init);
3580module_exit(dm_cache_exit);
3581
3582MODULE_DESCRIPTION(DM_NAME " cache target");
3583MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3584MODULE_LICENSE("GPL");
3585