linux/drivers/md/dm-cache-target.c
   1/*
   2 * Copyright (C) 2012 Red Hat. All rights reserved.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm.h"
   8#include "dm-bio-prison-v2.h"
   9#include "dm-bio-record.h"
  10#include "dm-cache-metadata.h"
  11
  12#include <linux/dm-io.h>
  13#include <linux/dm-kcopyd.h>
  14#include <linux/jiffies.h>
  15#include <linux/init.h>
  16#include <linux/mempool.h>
  17#include <linux/module.h>
  18#include <linux/rwsem.h>
  19#include <linux/slab.h>
  20#include <linux/vmalloc.h>
  21
  22#define DM_MSG_PREFIX "cache"
  23
  24DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(cache_copy_throttle,
  25        "A percentage of time allocated for copying to and/or from cache");
  26
  27/*----------------------------------------------------------------*/
  28
  29/*
  30 * Glossary:
  31 *
  32 * oblock: index of an origin block
  33 * cblock: index of a cache block
  34 * promotion: movement of a block from origin to cache
  35 * demotion: movement of a block from cache to origin
  36 * migration: movement of a block between the origin and cache device,
  37 *            either direction
  38 */
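
/*
 * A hedged worked example (values are illustrative assumptions, not from
 * the original source): with 64KiB blocks (sectors_per_block = 128),
 * origin sector 1000000 falls in oblock 7812 (1000000 / 128 = 7812,
 * remainder 64).  If the policy maps that oblock to cblock 3, a promotion
 * copies origin block 7812 into cache block 3; a demotion copies it back.
 */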
  39
  40/*----------------------------------------------------------------*/
  41
  42struct io_tracker {
  43        spinlock_t lock;
  44
  45        /*
  46         * Sectors of in-flight IO.
  47         */
  48        sector_t in_flight;
  49
  50        /*
  51         * The time, in jiffies, when this device became idle (if it is
  52         * indeed idle).
  53         */
  54        unsigned long idle_time;
  55        unsigned long last_update_time;
  56};
  57
  58static void iot_init(struct io_tracker *iot)
  59{
  60        spin_lock_init(&iot->lock);
  61        iot->in_flight = 0ul;
  62        iot->idle_time = 0ul;
  63        iot->last_update_time = jiffies;
  64}
  65
  66static bool __iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  67{
  68        if (iot->in_flight)
  69                return false;
  70
  71        return time_after(jiffies, iot->idle_time + jifs);
  72}
  73
  74static bool iot_idle_for(struct io_tracker *iot, unsigned long jifs)
  75{
  76        bool r;
  77
  78        spin_lock_irq(&iot->lock);
  79        r = __iot_idle_for(iot, jifs);
  80        spin_unlock_irq(&iot->lock);
  81
  82        return r;
  83}
  84
  85static void iot_io_begin(struct io_tracker *iot, sector_t len)
  86{
  87        spin_lock_irq(&iot->lock);
  88        iot->in_flight += len;
  89        spin_unlock_irq(&iot->lock);
  90}
  91
  92static void __iot_io_end(struct io_tracker *iot, sector_t len)
  93{
  94        if (!len)
  95                return;
  96
  97        iot->in_flight -= len;
  98        if (!iot->in_flight)
  99                iot->idle_time = jiffies;
 100}
 101
 102static void iot_io_end(struct io_tracker *iot, sector_t len)
 103{
 104        unsigned long flags;
 105
 106        spin_lock_irqsave(&iot->lock, flags);
 107        __iot_io_end(iot, len);
 108        spin_unlock_irqrestore(&iot->lock, flags);
 109}
 110
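/*
 * A minimal usage sketch (an illustration, not part of the original
 * source): callers account the sectors of each bio against the tracker
 * with iot_io_begin()/iot_io_end() and later ask whether the device has
 * been quiet.  example_device_quiet() is a hypothetical helper; the real
 * users are accounted_begin(), accounted_complete() and
 * spare_migration_bandwidth() further down.
 */
static inline bool example_device_quiet(struct io_tracker *iot)
{
        /* no in-flight IO and idle for at least a second's worth of jiffies */
        return iot_idle_for(iot, HZ);
}
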
 111/*----------------------------------------------------------------*/
 112
 113/*
 114 * Represents a chunk of future work.  'input' allows continuations to pass
 115 * values between themselves, typically error values.
 116 */
 117struct continuation {
 118        struct work_struct ws;
 119        blk_status_t input;
 120};
 121
 122static inline void init_continuation(struct continuation *k,
 123                                     void (*fn)(struct work_struct *))
 124{
 125        INIT_WORK(&k->ws, fn);
 126        k->input = 0;
 127}
 128
 129static inline void queue_continuation(struct workqueue_struct *wq,
 130                                      struct continuation *k)
 131{
 132        queue_work(wq, &k->ws);
 133}
 134
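/*
 * Hedged sketch (not in the original source) of the intended pattern:
 * embed a continuation in a larger object, point it at a callback with
 * init_continuation(&job->k, example_job_fn), queue it with
 * queue_continuation(wq, &job->k), and recover the owner via
 * container_of() when it runs.  'example_job' and 'example_job_fn' are
 * hypothetical; dm_cache_migration and ws_to_mg() below are the real
 * equivalents.
 */
struct example_job {
        struct continuation k;
        int payload;
};

static inline void example_job_fn(struct work_struct *ws)
{
        struct continuation *k = container_of(ws, struct continuation, ws);
        struct example_job *job = container_of(k, struct example_job, k);

        if (k->input)
                return;         /* an earlier step reported an error */

        (void) job->payload;    /* ...do the deferred work... */
}
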
 135/*----------------------------------------------------------------*/
 136
 137/*
 138 * The batcher collects together pieces of work that need a particular
 139 * operation to occur before they can proceed (typically a commit).
 140 */
 141struct batcher {
 142        /*
 143         * The operation that everyone is waiting for.
 144         */
 145        blk_status_t (*commit_op)(void *context);
 146        void *commit_context;
 147
 148        /*
 149         * This is how bios should be issued once the commit op is complete
 150         * (accounted_request).
 151         */
 152        void (*issue_op)(struct bio *bio, void *context);
 153        void *issue_context;
 154
 155        /*
 156         * Queued work gets put on here after commit.
 157         */
 158        struct workqueue_struct *wq;
 159
 160        spinlock_t lock;
 161        struct list_head work_items;
 162        struct bio_list bios;
 163        struct work_struct commit_work;
 164
 165        bool commit_scheduled;
 166};
 167
 168static void __commit(struct work_struct *_ws)
 169{
 170        struct batcher *b = container_of(_ws, struct batcher, commit_work);
 171        blk_status_t r;
 172        struct list_head work_items;
 173        struct work_struct *ws, *tmp;
 174        struct continuation *k;
 175        struct bio *bio;
 176        struct bio_list bios;
 177
 178        INIT_LIST_HEAD(&work_items);
 179        bio_list_init(&bios);
 180
 181        /*
 182         * We have to grab these before the commit_op to avoid a race
 183         * condition.
 184         */
 185        spin_lock_irq(&b->lock);
 186        list_splice_init(&b->work_items, &work_items);
 187        bio_list_merge(&bios, &b->bios);
 188        bio_list_init(&b->bios);
 189        b->commit_scheduled = false;
 190        spin_unlock_irq(&b->lock);
 191
 192        r = b->commit_op(b->commit_context);
 193
 194        list_for_each_entry_safe(ws, tmp, &work_items, entry) {
 195                k = container_of(ws, struct continuation, ws);
 196                k->input = r;
 197                INIT_LIST_HEAD(&ws->entry); /* to avoid a WARN_ON */
 198                queue_work(b->wq, ws);
 199        }
 200
 201        while ((bio = bio_list_pop(&bios))) {
 202                if (r) {
 203                        bio->bi_status = r;
 204                        bio_endio(bio);
 205                } else
 206                        b->issue_op(bio, b->issue_context);
 207        }
 208}
 209
 210static void batcher_init(struct batcher *b,
 211                         blk_status_t (*commit_op)(void *),
 212                         void *commit_context,
 213                         void (*issue_op)(struct bio *bio, void *),
 214                         void *issue_context,
 215                         struct workqueue_struct *wq)
 216{
 217        b->commit_op = commit_op;
 218        b->commit_context = commit_context;
 219        b->issue_op = issue_op;
 220        b->issue_context = issue_context;
 221        b->wq = wq;
 222
 223        spin_lock_init(&b->lock);
 224        INIT_LIST_HEAD(&b->work_items);
 225        bio_list_init(&b->bios);
 226        INIT_WORK(&b->commit_work, __commit);
 227        b->commit_scheduled = false;
 228}
 229
 230static void async_commit(struct batcher *b)
 231{
 232        queue_work(b->wq, &b->commit_work);
 233}
 234
 235static void continue_after_commit(struct batcher *b, struct continuation *k)
 236{
 237        bool commit_scheduled;
 238
 239        spin_lock_irq(&b->lock);
 240        commit_scheduled = b->commit_scheduled;
 241        list_add_tail(&k->ws.entry, &b->work_items);
 242        spin_unlock_irq(&b->lock);
 243
 244        if (commit_scheduled)
 245                async_commit(b);
 246}
 247
 248/*
 249 * Bios are errored if commit failed.
 250 */
 251static void issue_after_commit(struct batcher *b, struct bio *bio)
 252{
  253        bool commit_scheduled;
  254
  255        spin_lock_irq(&b->lock);
  256        commit_scheduled = b->commit_scheduled;
  257        bio_list_add(&b->bios, bio);
  258        spin_unlock_irq(&b->lock);
  259
  260        if (commit_scheduled)
  261                async_commit(b);
 262}
 263
 264/*
 265 * Call this if some urgent work is waiting for the commit to complete.
 266 */
 267static void schedule_commit(struct batcher *b)
 268{
 269        bool immediate;
 270
 271        spin_lock_irq(&b->lock);
 272        immediate = !list_empty(&b->work_items) || !bio_list_empty(&b->bios);
 273        b->commit_scheduled = true;
 274        spin_unlock_irq(&b->lock);
 275
 276        if (immediate)
 277                async_commit(b);
 278}
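
/*
 * Hedged usage sketch (not from the original source): to make a bio wait
 * for the next metadata commit, queue it on the batcher and then ensure a
 * commit is actually scheduled.  example_write_after_commit() is
 * hypothetical; the real code pairs issue_after_commit() /
 * continue_after_commit() with schedule_commit() in the bio and
 * migration paths below.
 */
static inline void example_write_after_commit(struct batcher *b, struct bio *bio)
{
        issue_after_commit(b, bio);     /* held back until __commit() runs */
        schedule_commit(b);             /* guarantee a commit is kicked off */
}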
 279
 280/*
 281 * There are a couple of places where we let a bio run, but want to do some
 282 * work before calling its endio function.  We do this by temporarily
 283 * changing the endio fn.
 284 */
 285struct dm_hook_info {
 286        bio_end_io_t *bi_end_io;
 287};
 288
 289static void dm_hook_bio(struct dm_hook_info *h, struct bio *bio,
 290                        bio_end_io_t *bi_end_io, void *bi_private)
 291{
 292        h->bi_end_io = bio->bi_end_io;
 293
 294        bio->bi_end_io = bi_end_io;
 295        bio->bi_private = bi_private;
 296}
 297
 298static void dm_unhook_bio(struct dm_hook_info *h, struct bio *bio)
 299{
 300        bio->bi_end_io = h->bi_end_io;
 301}
 302
 303/*----------------------------------------------------------------*/
 304
 305#define MIGRATION_POOL_SIZE 128
 306#define COMMIT_PERIOD HZ
 307#define MIGRATION_COUNT_WINDOW 10
 308
 309/*
 310 * The block size of the device holding cache data must be
 311 * between 32KB and 1GB.
 312 */
 313#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (32 * 1024 >> SECTOR_SHIFT)
 314#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
 315
 316enum cache_metadata_mode {
 317        CM_WRITE,               /* metadata may be changed */
 318        CM_READ_ONLY,           /* metadata may not be changed */
 319        CM_FAIL
 320};
 321
 322enum cache_io_mode {
 323        /*
 324         * Data is written to cached blocks only.  These blocks are marked
 325         * dirty.  If you lose the cache device you will lose data.
 326         * Potential performance increase for both reads and writes.
 327         */
 328        CM_IO_WRITEBACK,
 329
 330        /*
 331         * Data is written to both cache and origin.  Blocks are never
  332         * dirty.  Potential performance benefit for reads only.
 333         */
 334        CM_IO_WRITETHROUGH,
 335
 336        /*
 337         * A degraded mode useful for various cache coherency situations
 338         * (eg, rolling back snapshots).  Reads and writes always go to the
 339         * origin.  If a write goes to a cached oblock, then the cache
 340         * block is invalidated.
 341         */
 342        CM_IO_PASSTHROUGH
 343};
 344
 345struct cache_features {
 346        enum cache_metadata_mode mode;
 347        enum cache_io_mode io_mode;
 348        unsigned metadata_version;
 349        bool discard_passdown:1;
 350};
 351
 352struct cache_stats {
 353        atomic_t read_hit;
 354        atomic_t read_miss;
 355        atomic_t write_hit;
 356        atomic_t write_miss;
 357        atomic_t demotion;
 358        atomic_t promotion;
 359        atomic_t writeback;
 360        atomic_t copies_avoided;
 361        atomic_t cache_cell_clash;
 362        atomic_t commit_count;
 363        atomic_t discard_count;
 364};
 365
 366struct cache {
 367        struct dm_target *ti;
 368        spinlock_t lock;
 369
 370        /*
 371         * Fields for converting from sectors to blocks.
 372         */
 373        int sectors_per_block_shift;
 374        sector_t sectors_per_block;
 375
 376        struct dm_cache_metadata *cmd;
 377
 378        /*
 379         * Metadata is written to this device.
 380         */
 381        struct dm_dev *metadata_dev;
 382
 383        /*
 384         * The slower of the two data devices.  Typically a spindle.
 385         */
 386        struct dm_dev *origin_dev;
 387
 388        /*
 389         * The faster of the two data devices.  Typically an SSD.
 390         */
 391        struct dm_dev *cache_dev;
 392
 393        /*
 394         * Size of the origin device in _complete_ blocks and native sectors.
 395         */
 396        dm_oblock_t origin_blocks;
 397        sector_t origin_sectors;
 398
 399        /*
 400         * Size of the cache device in blocks.
 401         */
 402        dm_cblock_t cache_size;
 403
 404        /*
 405         * Invalidation fields.
 406         */
 407        spinlock_t invalidation_lock;
 408        struct list_head invalidation_requests;
 409
 410        sector_t migration_threshold;
 411        wait_queue_head_t migration_wait;
 412        atomic_t nr_allocated_migrations;
 413
 414        /*
  415         * The number of in-flight migrations that are performing
  416         * background IO, e.g. promotion or writeback.
 417         */
 418        atomic_t nr_io_migrations;
 419
 420        struct bio_list deferred_bios;
 421
 422        struct rw_semaphore quiesce_lock;
 423
 424        struct dm_target_callbacks callbacks;
 425
 426        /*
  427         * discard_nr_blocks entries, discarded if set.
 428         */
 429        dm_dblock_t discard_nr_blocks;
 430        unsigned long *discard_bitset;
 431        uint32_t discard_block_size; /* a power of 2 times sectors per block */
 432
 433        /*
 434         * Rather than reconstructing the table line for the status we just
 435         * save it and regurgitate.
 436         */
 437        unsigned nr_ctr_args;
 438        const char **ctr_args;
 439
 440        struct dm_kcopyd_client *copier;
 441        struct work_struct deferred_bio_worker;
 442        struct work_struct migration_worker;
 443        struct workqueue_struct *wq;
 444        struct delayed_work waker;
 445        struct dm_bio_prison_v2 *prison;
 446
 447        /*
 448         * cache_size entries, dirty if set
 449         */
 450        unsigned long *dirty_bitset;
 451        atomic_t nr_dirty;
 452
 453        unsigned policy_nr_args;
 454        struct dm_cache_policy *policy;
 455
 456        /*
 457         * Cache features such as write-through.
 458         */
 459        struct cache_features features;
 460
 461        struct cache_stats stats;
 462
 463        bool need_tick_bio:1;
 464        bool sized:1;
 465        bool invalidate:1;
 466        bool commit_requested:1;
 467        bool loaded_mappings:1;
 468        bool loaded_discards:1;
 469
 470        struct rw_semaphore background_work_lock;
 471
 472        struct batcher committer;
 473        struct work_struct commit_ws;
 474
 475        struct io_tracker tracker;
 476
 477        mempool_t migration_pool;
 478
 479        struct bio_set bs;
 480};
 481
 482struct per_bio_data {
 483        bool tick:1;
 484        unsigned req_nr:2;
 485        struct dm_bio_prison_cell_v2 *cell;
 486        struct dm_hook_info hook_info;
 487        sector_t len;
 488};
 489
 490struct dm_cache_migration {
 491        struct continuation k;
 492        struct cache *cache;
 493
 494        struct policy_work *op;
 495        struct bio *overwrite_bio;
 496        struct dm_bio_prison_cell_v2 *cell;
 497
 498        dm_cblock_t invalidate_cblock;
 499        dm_oblock_t invalidate_oblock;
 500};
 501
 502/*----------------------------------------------------------------*/
 503
 504static bool writethrough_mode(struct cache *cache)
 505{
 506        return cache->features.io_mode == CM_IO_WRITETHROUGH;
 507}
 508
 509static bool writeback_mode(struct cache *cache)
 510{
 511        return cache->features.io_mode == CM_IO_WRITEBACK;
 512}
 513
 514static inline bool passthrough_mode(struct cache *cache)
 515{
 516        return unlikely(cache->features.io_mode == CM_IO_PASSTHROUGH);
 517}
 518
 519/*----------------------------------------------------------------*/
 520
 521static void wake_deferred_bio_worker(struct cache *cache)
 522{
 523        queue_work(cache->wq, &cache->deferred_bio_worker);
 524}
 525
 526static void wake_migration_worker(struct cache *cache)
 527{
 528        if (passthrough_mode(cache))
 529                return;
 530
 531        queue_work(cache->wq, &cache->migration_worker);
 532}
 533
 534/*----------------------------------------------------------------*/
 535
 536static struct dm_bio_prison_cell_v2 *alloc_prison_cell(struct cache *cache)
 537{
 538        return dm_bio_prison_alloc_cell_v2(cache->prison, GFP_NOIO);
 539}
 540
 541static void free_prison_cell(struct cache *cache, struct dm_bio_prison_cell_v2 *cell)
 542{
 543        dm_bio_prison_free_cell_v2(cache->prison, cell);
 544}
 545
 546static struct dm_cache_migration *alloc_migration(struct cache *cache)
 547{
 548        struct dm_cache_migration *mg;
 549
 550        mg = mempool_alloc(&cache->migration_pool, GFP_NOIO);
 551
 552        memset(mg, 0, sizeof(*mg));
 553
 554        mg->cache = cache;
 555        atomic_inc(&cache->nr_allocated_migrations);
 556
 557        return mg;
 558}
 559
 560static void free_migration(struct dm_cache_migration *mg)
 561{
 562        struct cache *cache = mg->cache;
 563
 564        if (atomic_dec_and_test(&cache->nr_allocated_migrations))
 565                wake_up(&cache->migration_wait);
 566
 567        mempool_free(mg, &cache->migration_pool);
 568}
 569
 570/*----------------------------------------------------------------*/
 571
 572static inline dm_oblock_t oblock_succ(dm_oblock_t b)
 573{
 574        return to_oblock(from_oblock(b) + 1ull);
 575}
 576
 577static void build_key(dm_oblock_t begin, dm_oblock_t end, struct dm_cell_key_v2 *key)
 578{
 579        key->virtual = 0;
 580        key->dev = 0;
 581        key->block_begin = from_oblock(begin);
 582        key->block_end = from_oblock(end);
 583}
 584
 585/*
  586 * We have two lock levels: level 0, which prevents WRITEs, and level 1,
  587 * which prevents *both* READs and WRITEs.
 588 */
 589#define WRITE_LOCK_LEVEL 0
 590#define READ_WRITE_LOCK_LEVEL 1
 591
 592static unsigned lock_level(struct bio *bio)
 593{
 594        return bio_data_dir(bio) == WRITE ?
 595                WRITE_LOCK_LEVEL :
 596                READ_WRITE_LOCK_LEVEL;
 597}
 598
 599/*----------------------------------------------------------------
 600 * Per bio data
 601 *--------------------------------------------------------------*/
 602
 603static struct per_bio_data *get_per_bio_data(struct bio *bio)
 604{
 605        struct per_bio_data *pb = dm_per_bio_data(bio, sizeof(struct per_bio_data));
 606        BUG_ON(!pb);
 607        return pb;
 608}
 609
 610static struct per_bio_data *init_per_bio_data(struct bio *bio)
 611{
 612        struct per_bio_data *pb = get_per_bio_data(bio);
 613
 614        pb->tick = false;
 615        pb->req_nr = dm_bio_get_target_bio_nr(bio);
 616        pb->cell = NULL;
 617        pb->len = 0;
 618
 619        return pb;
 620}
 621
 622/*----------------------------------------------------------------*/
 623
 624static void defer_bio(struct cache *cache, struct bio *bio)
 625{
 626        spin_lock_irq(&cache->lock);
 627        bio_list_add(&cache->deferred_bios, bio);
 628        spin_unlock_irq(&cache->lock);
 629
 630        wake_deferred_bio_worker(cache);
 631}
 632
 633static void defer_bios(struct cache *cache, struct bio_list *bios)
 634{
 635        spin_lock_irq(&cache->lock);
 636        bio_list_merge(&cache->deferred_bios, bios);
 637        bio_list_init(bios);
 638        spin_unlock_irq(&cache->lock);
 639
 640        wake_deferred_bio_worker(cache);
 641}
 642
 643/*----------------------------------------------------------------*/
 644
 645static bool bio_detain_shared(struct cache *cache, dm_oblock_t oblock, struct bio *bio)
 646{
 647        bool r;
 648        struct per_bio_data *pb;
 649        struct dm_cell_key_v2 key;
 650        dm_oblock_t end = to_oblock(from_oblock(oblock) + 1ULL);
 651        struct dm_bio_prison_cell_v2 *cell_prealloc, *cell;
 652
 653        cell_prealloc = alloc_prison_cell(cache); /* FIXME: allow wait if calling from worker */
 654
 655        build_key(oblock, end, &key);
 656        r = dm_cell_get_v2(cache->prison, &key, lock_level(bio), bio, cell_prealloc, &cell);
 657        if (!r) {
 658                /*
 659                 * Failed to get the lock.
 660                 */
 661                free_prison_cell(cache, cell_prealloc);
 662                return r;
 663        }
 664
 665        if (cell != cell_prealloc)
 666                free_prison_cell(cache, cell_prealloc);
 667
 668        pb = get_per_bio_data(bio);
 669        pb->cell = cell;
 670
 671        return r;
 672}
 673
 674/*----------------------------------------------------------------*/
 675
 676static bool is_dirty(struct cache *cache, dm_cblock_t b)
 677{
 678        return test_bit(from_cblock(b), cache->dirty_bitset);
 679}
 680
 681static void set_dirty(struct cache *cache, dm_cblock_t cblock)
 682{
 683        if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset)) {
 684                atomic_inc(&cache->nr_dirty);
 685                policy_set_dirty(cache->policy, cblock);
 686        }
 687}
 688
 689/*
  690 * These two are called after a migration to force the policy and the
  691 * dirty bitset back into sync.
 692 */
 693static void force_set_dirty(struct cache *cache, dm_cblock_t cblock)
 694{
 695        if (!test_and_set_bit(from_cblock(cblock), cache->dirty_bitset))
 696                atomic_inc(&cache->nr_dirty);
 697        policy_set_dirty(cache->policy, cblock);
 698}
 699
 700static void force_clear_dirty(struct cache *cache, dm_cblock_t cblock)
 701{
 702        if (test_and_clear_bit(from_cblock(cblock), cache->dirty_bitset)) {
 703                if (atomic_dec_return(&cache->nr_dirty) == 0)
 704                        dm_table_event(cache->ti->table);
 705        }
 706
 707        policy_clear_dirty(cache->policy, cblock);
 708}
 709
 710/*----------------------------------------------------------------*/
 711
 712static bool block_size_is_power_of_two(struct cache *cache)
 713{
 714        return cache->sectors_per_block_shift >= 0;
 715}
 716
 717/* gcc on ARM generates spurious references to __udivdi3 and __umoddi3 */
 718#if defined(CONFIG_ARM) && __GNUC__ == 4 && __GNUC_MINOR__ <= 6
 719__always_inline
 720#endif
 721static dm_block_t block_div(dm_block_t b, uint32_t n)
 722{
 723        do_div(b, n);
 724
 725        return b;
 726}
 727
 728static dm_block_t oblocks_per_dblock(struct cache *cache)
 729{
 730        dm_block_t oblocks = cache->discard_block_size;
 731
 732        if (block_size_is_power_of_two(cache))
 733                oblocks >>= cache->sectors_per_block_shift;
 734        else
 735                oblocks = block_div(oblocks, cache->sectors_per_block);
 736
 737        return oblocks;
 738}
 739
 740static dm_dblock_t oblock_to_dblock(struct cache *cache, dm_oblock_t oblock)
 741{
 742        return to_dblock(block_div(from_oblock(oblock),
 743                                   oblocks_per_dblock(cache)));
 744}
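
/*
 * Hedged worked example (numbers are assumptions): with sectors_per_block
 * = 128 and discard_block_size = 2048 sectors, oblocks_per_dblock() is 16,
 * so oblock 40 lands in dblock 2 (40 / 16 = 2) and each bit in
 * discard_bitset covers 16 origin blocks.
 */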
 745
 746static void set_discard(struct cache *cache, dm_dblock_t b)
 747{
 748        BUG_ON(from_dblock(b) >= from_dblock(cache->discard_nr_blocks));
 749        atomic_inc(&cache->stats.discard_count);
 750
 751        spin_lock_irq(&cache->lock);
 752        set_bit(from_dblock(b), cache->discard_bitset);
 753        spin_unlock_irq(&cache->lock);
 754}
 755
 756static void clear_discard(struct cache *cache, dm_dblock_t b)
 757{
 758        spin_lock_irq(&cache->lock);
 759        clear_bit(from_dblock(b), cache->discard_bitset);
 760        spin_unlock_irq(&cache->lock);
 761}
 762
 763static bool is_discarded(struct cache *cache, dm_dblock_t b)
 764{
 765        int r;
 766        spin_lock_irq(&cache->lock);
 767        r = test_bit(from_dblock(b), cache->discard_bitset);
 768        spin_unlock_irq(&cache->lock);
 769
 770        return r;
 771}
 772
 773static bool is_discarded_oblock(struct cache *cache, dm_oblock_t b)
 774{
 775        int r;
 776        spin_lock_irq(&cache->lock);
 777        r = test_bit(from_dblock(oblock_to_dblock(cache, b)),
 778                     cache->discard_bitset);
 779        spin_unlock_irq(&cache->lock);
 780
 781        return r;
 782}
 783
 784/*----------------------------------------------------------------
 785 * Remapping
 786 *--------------------------------------------------------------*/
 787static void remap_to_origin(struct cache *cache, struct bio *bio)
 788{
 789        bio_set_dev(bio, cache->origin_dev->bdev);
 790}
 791
 792static void remap_to_cache(struct cache *cache, struct bio *bio,
 793                           dm_cblock_t cblock)
 794{
 795        sector_t bi_sector = bio->bi_iter.bi_sector;
 796        sector_t block = from_cblock(cblock);
 797
 798        bio_set_dev(bio, cache->cache_dev->bdev);
 799        if (!block_size_is_power_of_two(cache))
 800                bio->bi_iter.bi_sector =
 801                        (block * cache->sectors_per_block) +
 802                        sector_div(bi_sector, cache->sectors_per_block);
 803        else
 804                bio->bi_iter.bi_sector =
 805                        (block << cache->sectors_per_block_shift) |
 806                        (bi_sector & (cache->sectors_per_block - 1));
 807}
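
/*
 * Hedged worked example (values are assumptions): with 64KiB cache blocks
 * (sectors_per_block = 128, sectors_per_block_shift = 7), a bio whose
 * bi_sector is 300, remapped to cblock 3, ends up at sector
 * (3 << 7) | (300 & 127) = 384 + 44 = 428 on the cache device.
 */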
 808
 809static void check_if_tick_bio_needed(struct cache *cache, struct bio *bio)
 810{
 811        struct per_bio_data *pb;
 812
 813        spin_lock_irq(&cache->lock);
 814        if (cache->need_tick_bio && !op_is_flush(bio->bi_opf) &&
 815            bio_op(bio) != REQ_OP_DISCARD) {
 816                pb = get_per_bio_data(bio);
 817                pb->tick = true;
 818                cache->need_tick_bio = false;
 819        }
 820        spin_unlock_irq(&cache->lock);
 821}
 822
 823static void __remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 824                                            dm_oblock_t oblock, bool bio_has_pbd)
 825{
 826        if (bio_has_pbd)
 827                check_if_tick_bio_needed(cache, bio);
 828        remap_to_origin(cache, bio);
 829        if (bio_data_dir(bio) == WRITE)
 830                clear_discard(cache, oblock_to_dblock(cache, oblock));
 831}
 832
 833static void remap_to_origin_clear_discard(struct cache *cache, struct bio *bio,
 834                                          dm_oblock_t oblock)
 835{
 836        // FIXME: check_if_tick_bio_needed() is called way too much through this interface
 837        __remap_to_origin_clear_discard(cache, bio, oblock, true);
 838}
 839
 840static void remap_to_cache_dirty(struct cache *cache, struct bio *bio,
 841                                 dm_oblock_t oblock, dm_cblock_t cblock)
 842{
 843        check_if_tick_bio_needed(cache, bio);
 844        remap_to_cache(cache, bio, cblock);
 845        if (bio_data_dir(bio) == WRITE) {
 846                set_dirty(cache, cblock);
 847                clear_discard(cache, oblock_to_dblock(cache, oblock));
 848        }
 849}
 850
 851static dm_oblock_t get_bio_block(struct cache *cache, struct bio *bio)
 852{
 853        sector_t block_nr = bio->bi_iter.bi_sector;
 854
 855        if (!block_size_is_power_of_two(cache))
 856                (void) sector_div(block_nr, cache->sectors_per_block);
 857        else
 858                block_nr >>= cache->sectors_per_block_shift;
 859
 860        return to_oblock(block_nr);
 861}
 862
 863static bool accountable_bio(struct cache *cache, struct bio *bio)
 864{
 865        return bio_op(bio) != REQ_OP_DISCARD;
 866}
 867
 868static void accounted_begin(struct cache *cache, struct bio *bio)
 869{
 870        struct per_bio_data *pb;
 871
 872        if (accountable_bio(cache, bio)) {
 873                pb = get_per_bio_data(bio);
 874                pb->len = bio_sectors(bio);
 875                iot_io_begin(&cache->tracker, pb->len);
 876        }
 877}
 878
 879static void accounted_complete(struct cache *cache, struct bio *bio)
 880{
 881        struct per_bio_data *pb = get_per_bio_data(bio);
 882
 883        iot_io_end(&cache->tracker, pb->len);
 884}
 885
 886static void accounted_request(struct cache *cache, struct bio *bio)
 887{
 888        accounted_begin(cache, bio);
 889        generic_make_request(bio);
 890}
 891
 892static void issue_op(struct bio *bio, void *context)
 893{
 894        struct cache *cache = context;
 895        accounted_request(cache, bio);
 896}
 897
 898/*
 899 * When running in writethrough mode we need to send writes to clean blocks
 900 * to both the cache and origin devices.  Clone the bio and send them in parallel.
 901 */
 902static void remap_to_origin_and_cache(struct cache *cache, struct bio *bio,
 903                                      dm_oblock_t oblock, dm_cblock_t cblock)
 904{
 905        struct bio *origin_bio = bio_clone_fast(bio, GFP_NOIO, &cache->bs);
 906
 907        BUG_ON(!origin_bio);
 908
 909        bio_chain(origin_bio, bio);
 910        /*
 911         * Passing false to __remap_to_origin_clear_discard() skips
 912         * all code that might use per_bio_data (since clone doesn't have it)
 913         */
 914        __remap_to_origin_clear_discard(cache, origin_bio, oblock, false);
 915        submit_bio(origin_bio);
 916
 917        remap_to_cache(cache, bio, cblock);
 918}
 919
 920/*----------------------------------------------------------------
 921 * Failure modes
 922 *--------------------------------------------------------------*/
 923static enum cache_metadata_mode get_cache_mode(struct cache *cache)
 924{
 925        return cache->features.mode;
 926}
 927
 928static const char *cache_device_name(struct cache *cache)
 929{
 930        return dm_device_name(dm_table_get_md(cache->ti->table));
 931}
 932
 933static void notify_mode_switch(struct cache *cache, enum cache_metadata_mode mode)
 934{
 935        const char *descs[] = {
 936                "write",
 937                "read-only",
 938                "fail"
 939        };
 940
 941        dm_table_event(cache->ti->table);
 942        DMINFO("%s: switching cache to %s mode",
 943               cache_device_name(cache), descs[(int)mode]);
 944}
 945
 946static void set_cache_mode(struct cache *cache, enum cache_metadata_mode new_mode)
 947{
 948        bool needs_check;
 949        enum cache_metadata_mode old_mode = get_cache_mode(cache);
 950
 951        if (dm_cache_metadata_needs_check(cache->cmd, &needs_check)) {
 952                DMERR("%s: unable to read needs_check flag, setting failure mode.",
 953                      cache_device_name(cache));
 954                new_mode = CM_FAIL;
 955        }
 956
 957        if (new_mode == CM_WRITE && needs_check) {
 958                DMERR("%s: unable to switch cache to write mode until repaired.",
 959                      cache_device_name(cache));
 960                if (old_mode != new_mode)
 961                        new_mode = old_mode;
 962                else
 963                        new_mode = CM_READ_ONLY;
 964        }
 965
 966        /* Never move out of fail mode */
 967        if (old_mode == CM_FAIL)
 968                new_mode = CM_FAIL;
 969
 970        switch (new_mode) {
 971        case CM_FAIL:
 972        case CM_READ_ONLY:
 973                dm_cache_metadata_set_read_only(cache->cmd);
 974                break;
 975
 976        case CM_WRITE:
 977                dm_cache_metadata_set_read_write(cache->cmd);
 978                break;
 979        }
 980
 981        cache->features.mode = new_mode;
 982
 983        if (new_mode != old_mode)
 984                notify_mode_switch(cache, new_mode);
 985}
 986
 987static void abort_transaction(struct cache *cache)
 988{
 989        const char *dev_name = cache_device_name(cache);
 990
 991        if (get_cache_mode(cache) >= CM_READ_ONLY)
 992                return;
 993
 994        if (dm_cache_metadata_set_needs_check(cache->cmd)) {
 995                DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
 996                set_cache_mode(cache, CM_FAIL);
 997        }
 998
 999        DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
1000        if (dm_cache_metadata_abort(cache->cmd)) {
1001                DMERR("%s: failed to abort metadata transaction", dev_name);
1002                set_cache_mode(cache, CM_FAIL);
1003        }
1004}
1005
1006static void metadata_operation_failed(struct cache *cache, const char *op, int r)
1007{
1008        DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
1009                    cache_device_name(cache), op, r);
1010        abort_transaction(cache);
1011        set_cache_mode(cache, CM_READ_ONLY);
1012}
1013
1014/*----------------------------------------------------------------*/
1015
1016static void load_stats(struct cache *cache)
1017{
1018        struct dm_cache_statistics stats;
1019
1020        dm_cache_metadata_get_stats(cache->cmd, &stats);
1021        atomic_set(&cache->stats.read_hit, stats.read_hits);
1022        atomic_set(&cache->stats.read_miss, stats.read_misses);
1023        atomic_set(&cache->stats.write_hit, stats.write_hits);
1024        atomic_set(&cache->stats.write_miss, stats.write_misses);
1025}
1026
1027static void save_stats(struct cache *cache)
1028{
1029        struct dm_cache_statistics stats;
1030
1031        if (get_cache_mode(cache) >= CM_READ_ONLY)
1032                return;
1033
1034        stats.read_hits = atomic_read(&cache->stats.read_hit);
1035        stats.read_misses = atomic_read(&cache->stats.read_miss);
1036        stats.write_hits = atomic_read(&cache->stats.write_hit);
1037        stats.write_misses = atomic_read(&cache->stats.write_miss);
1038
1039        dm_cache_metadata_set_stats(cache->cmd, &stats);
1040}
1041
1042static void update_stats(struct cache_stats *stats, enum policy_operation op)
1043{
1044        switch (op) {
1045        case POLICY_PROMOTE:
1046                atomic_inc(&stats->promotion);
1047                break;
1048
1049        case POLICY_DEMOTE:
1050                atomic_inc(&stats->demotion);
1051                break;
1052
1053        case POLICY_WRITEBACK:
1054                atomic_inc(&stats->writeback);
1055                break;
1056        }
1057}
1058
1059/*----------------------------------------------------------------
1060 * Migration processing
1061 *
1062 * Migration covers moving data from the origin device to the cache, or
1063 * vice versa.
1064 *--------------------------------------------------------------*/
1065
1066static void inc_io_migrations(struct cache *cache)
1067{
1068        atomic_inc(&cache->nr_io_migrations);
1069}
1070
1071static void dec_io_migrations(struct cache *cache)
1072{
1073        atomic_dec(&cache->nr_io_migrations);
1074}
1075
1076static bool discard_or_flush(struct bio *bio)
1077{
1078        return bio_op(bio) == REQ_OP_DISCARD || op_is_flush(bio->bi_opf);
1079}
1080
1081static void calc_discard_block_range(struct cache *cache, struct bio *bio,
1082                                     dm_dblock_t *b, dm_dblock_t *e)
1083{
1084        sector_t sb = bio->bi_iter.bi_sector;
1085        sector_t se = bio_end_sector(bio);
1086
1087        *b = to_dblock(dm_sector_div_up(sb, cache->discard_block_size));
1088
1089        if (se - sb < cache->discard_block_size)
1090                *e = *b;
1091        else
1092                *e = to_dblock(block_div(se, cache->discard_block_size));
1093}
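
/*
 * Hedged worked example (numbers are assumptions): with a discard block of
 * 2048 sectors, a discard bio covering sectors [1000, 5000) gives
 * b = 1 (1000 rounded up) and e = 2 (5000 rounded down), so only dblock 1,
 * i.e. sectors [2048, 4096), is treated as wholly discarded.
 */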
1094
1095/*----------------------------------------------------------------*/
1096
1097static void prevent_background_work(struct cache *cache)
1098{
1099        lockdep_off();
1100        down_write(&cache->background_work_lock);
1101        lockdep_on();
1102}
1103
1104static void allow_background_work(struct cache *cache)
1105{
1106        lockdep_off();
1107        up_write(&cache->background_work_lock);
1108        lockdep_on();
1109}
1110
1111static bool background_work_begin(struct cache *cache)
1112{
1113        bool r;
1114
1115        lockdep_off();
1116        r = down_read_trylock(&cache->background_work_lock);
1117        lockdep_on();
1118
1119        return r;
1120}
1121
1122static void background_work_end(struct cache *cache)
1123{
1124        lockdep_off();
1125        up_read(&cache->background_work_lock);
1126        lockdep_on();
1127}
1128
1129/*----------------------------------------------------------------*/
1130
1131static bool bio_writes_complete_block(struct cache *cache, struct bio *bio)
1132{
1133        return (bio_data_dir(bio) == WRITE) &&
1134                (bio->bi_iter.bi_size == (cache->sectors_per_block << SECTOR_SHIFT));
1135}
1136
1137static bool optimisable_bio(struct cache *cache, struct bio *bio, dm_oblock_t block)
1138{
1139        return writeback_mode(cache) &&
1140                (is_discarded_oblock(cache, block) || bio_writes_complete_block(cache, bio));
1141}
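
/*
 * Hedged example (an illustration, not from the original source): in
 * writeback mode a block-sized, block-aligned write (e.g. 64KiB to a
 * 64KiB-block cache) rewrites the whole block, so its current contents
 * are irrelevant and the promotion can be served by the overwrite path
 * below rather than a kcopyd copy from the origin.
 */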
1142
1143static void quiesce(struct dm_cache_migration *mg,
1144                    void (*continuation)(struct work_struct *))
1145{
1146        init_continuation(&mg->k, continuation);
1147        dm_cell_quiesce_v2(mg->cache->prison, mg->cell, &mg->k.ws);
1148}
1149
1150static struct dm_cache_migration *ws_to_mg(struct work_struct *ws)
1151{
1152        struct continuation *k = container_of(ws, struct continuation, ws);
1153        return container_of(k, struct dm_cache_migration, k);
1154}
1155
1156static void copy_complete(int read_err, unsigned long write_err, void *context)
1157{
1158        struct dm_cache_migration *mg = container_of(context, struct dm_cache_migration, k);
1159
1160        if (read_err || write_err)
1161                mg->k.input = BLK_STS_IOERR;
1162
1163        queue_continuation(mg->cache->wq, &mg->k);
1164}
1165
1166static void copy(struct dm_cache_migration *mg, bool promote)
1167{
1168        struct dm_io_region o_region, c_region;
1169        struct cache *cache = mg->cache;
1170
1171        o_region.bdev = cache->origin_dev->bdev;
1172        o_region.sector = from_oblock(mg->op->oblock) * cache->sectors_per_block;
1173        o_region.count = cache->sectors_per_block;
1174
1175        c_region.bdev = cache->cache_dev->bdev;
1176        c_region.sector = from_cblock(mg->op->cblock) * cache->sectors_per_block;
1177        c_region.count = cache->sectors_per_block;
1178
1179        if (promote)
1180                dm_kcopyd_copy(cache->copier, &o_region, 1, &c_region, 0, copy_complete, &mg->k);
1181        else
1182                dm_kcopyd_copy(cache->copier, &c_region, 1, &o_region, 0, copy_complete, &mg->k);
1183}
1184
1185static void bio_drop_shared_lock(struct cache *cache, struct bio *bio)
1186{
1187        struct per_bio_data *pb = get_per_bio_data(bio);
1188
1189        if (pb->cell && dm_cell_put_v2(cache->prison, pb->cell))
1190                free_prison_cell(cache, pb->cell);
1191        pb->cell = NULL;
1192}
1193
1194static void overwrite_endio(struct bio *bio)
1195{
1196        struct dm_cache_migration *mg = bio->bi_private;
1197        struct cache *cache = mg->cache;
1198        struct per_bio_data *pb = get_per_bio_data(bio);
1199
1200        dm_unhook_bio(&pb->hook_info, bio);
1201
1202        if (bio->bi_status)
1203                mg->k.input = bio->bi_status;
1204
1205        queue_continuation(cache->wq, &mg->k);
1206}
1207
1208static void overwrite(struct dm_cache_migration *mg,
1209                      void (*continuation)(struct work_struct *))
1210{
1211        struct bio *bio = mg->overwrite_bio;
1212        struct per_bio_data *pb = get_per_bio_data(bio);
1213
1214        dm_hook_bio(&pb->hook_info, bio, overwrite_endio, mg);
1215
1216        /*
1217         * The overwrite bio is part of the copy operation, as such it does
1218         * not set/clear discard or dirty flags.
1219         */
1220        if (mg->op->op == POLICY_PROMOTE)
1221                remap_to_cache(mg->cache, bio, mg->op->cblock);
1222        else
1223                remap_to_origin(mg->cache, bio);
1224
1225        init_continuation(&mg->k, continuation);
1226        accounted_request(mg->cache, bio);
1227}
1228
1229/*
1230 * Migration steps:
1231 *
1232 * 1) exclusive lock preventing WRITEs
1233 * 2) quiesce
1234 * 3) copy or issue overwrite bio
1235 * 4) upgrade to exclusive lock preventing READs and WRITEs
1236 * 5) quiesce
1237 * 6) update metadata and commit
1238 * 7) unlock
1239 */
1240static void mg_complete(struct dm_cache_migration *mg, bool success)
1241{
1242        struct bio_list bios;
1243        struct cache *cache = mg->cache;
1244        struct policy_work *op = mg->op;
1245        dm_cblock_t cblock = op->cblock;
1246
1247        if (success)
1248                update_stats(&cache->stats, op->op);
1249
1250        switch (op->op) {
1251        case POLICY_PROMOTE:
1252                clear_discard(cache, oblock_to_dblock(cache, op->oblock));
1253                policy_complete_background_work(cache->policy, op, success);
1254
1255                if (mg->overwrite_bio) {
1256                        if (success)
1257                                force_set_dirty(cache, cblock);
1258                        else if (mg->k.input)
1259                                mg->overwrite_bio->bi_status = mg->k.input;
1260                        else
1261                                mg->overwrite_bio->bi_status = BLK_STS_IOERR;
1262                        bio_endio(mg->overwrite_bio);
1263                } else {
1264                        if (success)
1265                                force_clear_dirty(cache, cblock);
1266                        dec_io_migrations(cache);
1267                }
1268                break;
1269
1270        case POLICY_DEMOTE:
1271                /*
1272                 * We clear dirty here to update the nr_dirty counter.
1273                 */
1274                if (success)
1275                        force_clear_dirty(cache, cblock);
1276                policy_complete_background_work(cache->policy, op, success);
1277                dec_io_migrations(cache);
1278                break;
1279
1280        case POLICY_WRITEBACK:
1281                if (success)
1282                        force_clear_dirty(cache, cblock);
1283                policy_complete_background_work(cache->policy, op, success);
1284                dec_io_migrations(cache);
1285                break;
1286        }
1287
1288        bio_list_init(&bios);
1289        if (mg->cell) {
1290                if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1291                        free_prison_cell(cache, mg->cell);
1292        }
1293
1294        free_migration(mg);
1295        defer_bios(cache, &bios);
1296        wake_migration_worker(cache);
1297
1298        background_work_end(cache);
1299}
1300
1301static void mg_success(struct work_struct *ws)
1302{
1303        struct dm_cache_migration *mg = ws_to_mg(ws);
1304        mg_complete(mg, mg->k.input == 0);
1305}
1306
1307static void mg_update_metadata(struct work_struct *ws)
1308{
1309        int r;
1310        struct dm_cache_migration *mg = ws_to_mg(ws);
1311        struct cache *cache = mg->cache;
1312        struct policy_work *op = mg->op;
1313
1314        switch (op->op) {
1315        case POLICY_PROMOTE:
1316                r = dm_cache_insert_mapping(cache->cmd, op->cblock, op->oblock);
1317                if (r) {
1318                        DMERR_LIMIT("%s: migration failed; couldn't insert mapping",
1319                                    cache_device_name(cache));
1320                        metadata_operation_failed(cache, "dm_cache_insert_mapping", r);
1321
1322                        mg_complete(mg, false);
1323                        return;
1324                }
1325                mg_complete(mg, true);
1326                break;
1327
1328        case POLICY_DEMOTE:
1329                r = dm_cache_remove_mapping(cache->cmd, op->cblock);
1330                if (r) {
1331                        DMERR_LIMIT("%s: migration failed; couldn't update on disk metadata",
1332                                    cache_device_name(cache));
1333                        metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1334
1335                        mg_complete(mg, false);
1336                        return;
1337                }
1338
1339                /*
1340                 * It would be nice if we only had to commit when a REQ_FLUSH
1341                 * comes through.  But there's one scenario that we have to
1342                 * look out for:
1343                 *
1344                 * - vblock x in a cache block
 1345                 * - demotion occurs
 1346                 * - cache block gets reallocated and overwritten
1347                 * - crash
1348                 *
1349                 * When we recover, because there was no commit the cache will
 1350                 * roll back to having the data for vblock x in the cache block.
1351                 * But the cache block has since been overwritten, so it'll end
1352                 * up pointing to data that was never in 'x' during the history
1353                 * of the device.
1354                 *
1355                 * To avoid this issue we require a commit as part of the
1356                 * demotion operation.
1357                 */
1358                init_continuation(&mg->k, mg_success);
1359                continue_after_commit(&cache->committer, &mg->k);
1360                schedule_commit(&cache->committer);
1361                break;
1362
1363        case POLICY_WRITEBACK:
1364                mg_complete(mg, true);
1365                break;
1366        }
1367}
1368
1369static void mg_update_metadata_after_copy(struct work_struct *ws)
1370{
1371        struct dm_cache_migration *mg = ws_to_mg(ws);
1372
1373        /*
1374         * Did the copy succeed?
1375         */
1376        if (mg->k.input)
1377                mg_complete(mg, false);
1378        else
1379                mg_update_metadata(ws);
1380}
1381
1382static void mg_upgrade_lock(struct work_struct *ws)
1383{
1384        int r;
1385        struct dm_cache_migration *mg = ws_to_mg(ws);
1386
1387        /*
1388         * Did the copy succeed?
1389         */
1390        if (mg->k.input)
1391                mg_complete(mg, false);
1392
1393        else {
1394                /*
1395                 * Now we want the lock to prevent both reads and writes.
1396                 */
1397                r = dm_cell_lock_promote_v2(mg->cache->prison, mg->cell,
1398                                            READ_WRITE_LOCK_LEVEL);
1399                if (r < 0)
1400                        mg_complete(mg, false);
1401
1402                else if (r)
1403                        quiesce(mg, mg_update_metadata);
1404
1405                else
1406                        mg_update_metadata(ws);
1407        }
1408}
1409
1410static void mg_full_copy(struct work_struct *ws)
1411{
1412        struct dm_cache_migration *mg = ws_to_mg(ws);
1413        struct cache *cache = mg->cache;
1414        struct policy_work *op = mg->op;
1415        bool is_policy_promote = (op->op == POLICY_PROMOTE);
1416
1417        if ((!is_policy_promote && !is_dirty(cache, op->cblock)) ||
1418            is_discarded_oblock(cache, op->oblock)) {
1419                mg_upgrade_lock(ws);
1420                return;
1421        }
1422
1423        init_continuation(&mg->k, mg_upgrade_lock);
1424        copy(mg, is_policy_promote);
1425}
1426
1427static void mg_copy(struct work_struct *ws)
1428{
1429        struct dm_cache_migration *mg = ws_to_mg(ws);
1430
1431        if (mg->overwrite_bio) {
1432                /*
1433                 * No exclusive lock was held when we last checked if the bio
1434                 * was optimisable.  So we have to check again in case things
1435                 * have changed (eg, the block may no longer be discarded).
1436                 */
1437                if (!optimisable_bio(mg->cache, mg->overwrite_bio, mg->op->oblock)) {
1438                        /*
 1439                         * Fall back to a real full copy after doing some tidying up.
1440                         */
1441                        bool rb = bio_detain_shared(mg->cache, mg->op->oblock, mg->overwrite_bio);
 1442                        BUG_ON(rb); /* An exclusive lock must _not_ be held for this block */
1443                        mg->overwrite_bio = NULL;
1444                        inc_io_migrations(mg->cache);
1445                        mg_full_copy(ws);
1446                        return;
1447                }
1448
1449                /*
1450                 * It's safe to do this here, even though it's new data
1451                 * because all IO has been locked out of the block.
1452                 *
1453                 * mg_lock_writes() already took READ_WRITE_LOCK_LEVEL
 1454                 * so we are _not_ using mg_upgrade_lock() as the continuation.
1455                 */
1456                overwrite(mg, mg_update_metadata_after_copy);
1457
1458        } else
1459                mg_full_copy(ws);
1460}
1461
1462static int mg_lock_writes(struct dm_cache_migration *mg)
1463{
1464        int r;
1465        struct dm_cell_key_v2 key;
1466        struct cache *cache = mg->cache;
1467        struct dm_bio_prison_cell_v2 *prealloc;
1468
1469        prealloc = alloc_prison_cell(cache);
1470
1471        /*
1472         * Prevent writes to the block, but allow reads to continue.
1473         * Unless we're using an overwrite bio, in which case we lock
1474         * everything.
1475         */
1476        build_key(mg->op->oblock, oblock_succ(mg->op->oblock), &key);
1477        r = dm_cell_lock_v2(cache->prison, &key,
1478                            mg->overwrite_bio ?  READ_WRITE_LOCK_LEVEL : WRITE_LOCK_LEVEL,
1479                            prealloc, &mg->cell);
1480        if (r < 0) {
1481                free_prison_cell(cache, prealloc);
1482                mg_complete(mg, false);
1483                return r;
1484        }
1485
1486        if (mg->cell != prealloc)
1487                free_prison_cell(cache, prealloc);
1488
1489        if (r == 0)
1490                mg_copy(&mg->k.ws);
1491        else
1492                quiesce(mg, mg_copy);
1493
1494        return 0;
1495}
1496
1497static int mg_start(struct cache *cache, struct policy_work *op, struct bio *bio)
1498{
1499        struct dm_cache_migration *mg;
1500
1501        if (!background_work_begin(cache)) {
1502                policy_complete_background_work(cache->policy, op, false);
1503                return -EPERM;
1504        }
1505
1506        mg = alloc_migration(cache);
1507
1508        mg->op = op;
1509        mg->overwrite_bio = bio;
1510
1511        if (!bio)
1512                inc_io_migrations(cache);
1513
1514        return mg_lock_writes(mg);
1515}
1516
1517/*----------------------------------------------------------------
1518 * invalidation processing
1519 *--------------------------------------------------------------*/
1520
1521static void invalidate_complete(struct dm_cache_migration *mg, bool success)
1522{
1523        struct bio_list bios;
1524        struct cache *cache = mg->cache;
1525
1526        bio_list_init(&bios);
1527        if (dm_cell_unlock_v2(cache->prison, mg->cell, &bios))
1528                free_prison_cell(cache, mg->cell);
1529
1530        if (!success && mg->overwrite_bio)
1531                bio_io_error(mg->overwrite_bio);
1532
1533        free_migration(mg);
1534        defer_bios(cache, &bios);
1535
1536        background_work_end(cache);
1537}
1538
1539static void invalidate_completed(struct work_struct *ws)
1540{
1541        struct dm_cache_migration *mg = ws_to_mg(ws);
1542        invalidate_complete(mg, !mg->k.input);
1543}
1544
1545static int invalidate_cblock(struct cache *cache, dm_cblock_t cblock)
1546{
1547        int r = policy_invalidate_mapping(cache->policy, cblock);
1548        if (!r) {
1549                r = dm_cache_remove_mapping(cache->cmd, cblock);
1550                if (r) {
1551                        DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata",
1552                                    cache_device_name(cache));
1553                        metadata_operation_failed(cache, "dm_cache_remove_mapping", r);
1554                }
1555
1556        } else if (r == -ENODATA) {
1557                /*
1558                 * Harmless, already unmapped.
1559                 */
1560                r = 0;
1561
1562        } else
1563                DMERR("%s: policy_invalidate_mapping failed", cache_device_name(cache));
1564
1565        return r;
1566}
1567
1568static void invalidate_remove(struct work_struct *ws)
1569{
1570        int r;
1571        struct dm_cache_migration *mg = ws_to_mg(ws);
1572        struct cache *cache = mg->cache;
1573
1574        r = invalidate_cblock(cache, mg->invalidate_cblock);
1575        if (r) {
1576                invalidate_complete(mg, false);
1577                return;
1578        }
1579
1580        init_continuation(&mg->k, invalidate_completed);
1581        continue_after_commit(&cache->committer, &mg->k);
1582        remap_to_origin_clear_discard(cache, mg->overwrite_bio, mg->invalidate_oblock);
1583        mg->overwrite_bio = NULL;
1584        schedule_commit(&cache->committer);
1585}
1586
1587static int invalidate_lock(struct dm_cache_migration *mg)
1588{
1589        int r;
1590        struct dm_cell_key_v2 key;
1591        struct cache *cache = mg->cache;
1592        struct dm_bio_prison_cell_v2 *prealloc;
1593
1594        prealloc = alloc_prison_cell(cache);
1595
1596        build_key(mg->invalidate_oblock, oblock_succ(mg->invalidate_oblock), &key);
1597        r = dm_cell_lock_v2(cache->prison, &key,
1598                            READ_WRITE_LOCK_LEVEL, prealloc, &mg->cell);
1599        if (r < 0) {
1600                free_prison_cell(cache, prealloc);
1601                invalidate_complete(mg, false);
1602                return r;
1603        }
1604
1605        if (mg->cell != prealloc)
1606                free_prison_cell(cache, prealloc);
1607
1608        if (r)
1609                quiesce(mg, invalidate_remove);
1610
1611        else {
1612                /*
1613                 * We can't call invalidate_remove() directly here because we
1614                 * might still be in request context.
1615                 */
1616                init_continuation(&mg->k, invalidate_remove);
1617                queue_work(cache->wq, &mg->k.ws);
1618        }
1619
1620        return 0;
1621}
1622
1623static int invalidate_start(struct cache *cache, dm_cblock_t cblock,
1624                            dm_oblock_t oblock, struct bio *bio)
1625{
1626        struct dm_cache_migration *mg;
1627
1628        if (!background_work_begin(cache))
1629                return -EPERM;
1630
1631        mg = alloc_migration(cache);
1632
1633        mg->overwrite_bio = bio;
1634        mg->invalidate_cblock = cblock;
1635        mg->invalidate_oblock = oblock;
1636
1637        return invalidate_lock(mg);
1638}
1639
1640/*----------------------------------------------------------------
1641 * bio processing
1642 *--------------------------------------------------------------*/
1643
1644enum busy {
1645        IDLE,
1646        BUSY
1647};
1648
1649static enum busy spare_migration_bandwidth(struct cache *cache)
1650{
1651        bool idle = iot_idle_for(&cache->tracker, HZ);
1652        sector_t current_volume = (atomic_read(&cache->nr_io_migrations) + 1) *
1653                cache->sectors_per_block;
1654
1655        if (idle && current_volume <= cache->migration_threshold)
1656                return IDLE;
1657        else
1658                return BUSY;
1659}
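
/*
 * Hedged worked example (numbers are assumptions): with 128-sector blocks
 * and migration_threshold = 2048 sectors, up to 16 background migrations
 * fit (16 * 128 = 2048); once 16 are in flight, (16 + 1) * 128 = 2176
 * exceeds the threshold and the cache reports BUSY even when idle.
 */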
1660
1661static void inc_hit_counter(struct cache *cache, struct bio *bio)
1662{
1663        atomic_inc(bio_data_dir(bio) == READ ?
1664                   &cache->stats.read_hit : &cache->stats.write_hit);
1665}
1666
1667static void inc_miss_counter(struct cache *cache, struct bio *bio)
1668{
1669        atomic_inc(bio_data_dir(bio) == READ ?
1670                   &cache->stats.read_miss : &cache->stats.write_miss);
1671}
1672
1673/*----------------------------------------------------------------*/
1674
1675static int map_bio(struct cache *cache, struct bio *bio, dm_oblock_t block,
1676                   bool *commit_needed)
1677{
1678        int r, data_dir;
1679        bool rb, background_queued;
1680        dm_cblock_t cblock;
1681
1682        *commit_needed = false;
1683
1684        rb = bio_detain_shared(cache, block, bio);
1685        if (!rb) {
1686                /*
1687                 * An exclusive lock is held for this block, so we have to
1688                 * wait.  We set the commit_needed flag so the current
1689                 * transaction will be committed asap, allowing this lock
1690                 * to be dropped.
1691                 */
1692                *commit_needed = true;
1693                return DM_MAPIO_SUBMITTED;
1694        }
1695
1696        data_dir = bio_data_dir(bio);
1697
1698        if (optimisable_bio(cache, bio, block)) {
1699                struct policy_work *op = NULL;
1700
1701                r = policy_lookup_with_work(cache->policy, block, &cblock, data_dir, true, &op);
1702                if (unlikely(r && r != -ENOENT)) {
1703                        DMERR_LIMIT("%s: policy_lookup_with_work() failed with r = %d",
1704                                    cache_device_name(cache), r);
1705                        bio_io_error(bio);
1706                        return DM_MAPIO_SUBMITTED;
1707                }
1708
1709                if (r == -ENOENT && op) {
1710                        bio_drop_shared_lock(cache, bio);
1711                        BUG_ON(op->op != POLICY_PROMOTE);
1712                        mg_start(cache, op, bio);
1713                        return DM_MAPIO_SUBMITTED;
1714                }
1715        } else {
1716                r = policy_lookup(cache->policy, block, &cblock, data_dir, false, &background_queued);
1717                if (unlikely(r && r != -ENOENT)) {
1718                        DMERR_LIMIT("%s: policy_lookup() failed with r = %d",
1719                                    cache_device_name(cache), r);
1720                        bio_io_error(bio);
1721                        return DM_MAPIO_SUBMITTED;
1722                }
1723
1724                if (background_queued)
1725                        wake_migration_worker(cache);
1726        }
1727
1728        if (r == -ENOENT) {
1729                struct per_bio_data *pb = get_per_bio_data(bio);
1730
1731                /*
1732                 * Miss.
1733                 */
1734                inc_miss_counter(cache, bio);
1735                if (pb->req_nr == 0) {
1736                        accounted_begin(cache, bio);
1737                        remap_to_origin_clear_discard(cache, bio, block);
1738                } else {
1739                        /*
1740                         * This is a duplicate writethrough io that is no
1741                         * longer needed because the block has been demoted.
1742                         */
1743                        bio_endio(bio);
1744                        return DM_MAPIO_SUBMITTED;
1745                }
1746        } else {
1747                /*
1748                 * Hit.
1749                 */
1750                inc_hit_counter(cache, bio);
1751
1752                /*
1753                 * Passthrough always maps to the origin, invalidating any
1754                 * cache blocks that are written to.
1755                 */
1756                if (passthrough_mode(cache)) {
1757                        if (bio_data_dir(bio) == WRITE) {
1758                                bio_drop_shared_lock(cache, bio);
1759                                atomic_inc(&cache->stats.demotion);
1760                                invalidate_start(cache, cblock, block, bio);
1761                        } else
1762                                remap_to_origin_clear_discard(cache, bio, block);
1763                } else {
1764                        if (bio_data_dir(bio) == WRITE && writethrough_mode(cache) &&
1765                            !is_dirty(cache, cblock)) {
1766                                remap_to_origin_and_cache(cache, bio, block, cblock);
1767                                accounted_begin(cache, bio);
1768                        } else
1769                                remap_to_cache_dirty(cache, bio, block, cblock);
1770                }
1771        }
1772
1773        /*
1774         * dm core turns FUA requests into a separate payload and FLUSH req.
1775         */
1776        if (bio->bi_opf & REQ_FUA) {
1777                /*
1778                 * issue_after_commit will call accounted_begin a second time.  So
1779                 * we call accounted_complete() to avoid double accounting.
1780                 */
1781                accounted_complete(cache, bio);
1782                issue_after_commit(&cache->committer, bio);
1783                *commit_needed = true;
1784                return DM_MAPIO_SUBMITTED;
1785        }
1786
1787        return DM_MAPIO_REMAPPED;
1788}
1789
1790static bool process_bio(struct cache *cache, struct bio *bio)
1791{
1792        bool commit_needed;
1793
1794        if (map_bio(cache, bio, get_bio_block(cache, bio), &commit_needed) == DM_MAPIO_REMAPPED)
1795                generic_make_request(bio);
1796
1797        return commit_needed;
1798}
1799
1800/*
1801 * A non-zero return indicates read_only or fail_io mode.
1802 */
1803static int commit(struct cache *cache, bool clean_shutdown)
1804{
1805        int r;
1806
1807        if (get_cache_mode(cache) >= CM_READ_ONLY)
1808                return -EINVAL;
1809
1810        atomic_inc(&cache->stats.commit_count);
1811        r = dm_cache_commit(cache->cmd, clean_shutdown);
1812        if (r)
1813                metadata_operation_failed(cache, "dm_cache_commit", r);
1814
1815        return r;
1816}
1817
1818/*
1819 * Used by the batcher.
1820 */
1821static blk_status_t commit_op(void *context)
1822{
1823        struct cache *cache = context;
1824
1825        if (dm_cache_changed_this_transaction(cache->cmd))
1826                return errno_to_blk_status(commit(cache, false));
1827
1828        return 0;
1829}
1830
1831/*----------------------------------------------------------------*/
1832
1833static bool process_flush_bio(struct cache *cache, struct bio *bio)
1834{
1835        struct per_bio_data *pb = get_per_bio_data(bio);
1836
1837        if (!pb->req_nr)
1838                remap_to_origin(cache, bio);
1839        else
1840                remap_to_cache(cache, bio, 0);
1841
1842        issue_after_commit(&cache->committer, bio);
1843        return true;
1844}
1845
1846static bool process_discard_bio(struct cache *cache, struct bio *bio)
1847{
1848        dm_dblock_t b, e;
1849
1850        // FIXME: do we need to lock the region?  Or can we just assume the
1851        // user won't be so foolish as to issue discard concurrently with
1852        // other IO?
1853        calc_discard_block_range(cache, bio, &b, &e);
1854        while (b != e) {
1855                set_discard(cache, b);
1856                b = to_dblock(from_dblock(b) + 1);
1857        }
1858
1859        if (cache->features.discard_passdown) {
1860                remap_to_origin(cache, bio);
1861                generic_make_request(bio);
1862        } else
1863                bio_endio(bio);
1864
1865        return false;
1866}
1867
1868static void process_deferred_bios(struct work_struct *ws)
1869{
1870        struct cache *cache = container_of(ws, struct cache, deferred_bio_worker);
1871
1872        bool commit_needed = false;
1873        struct bio_list bios;
1874        struct bio *bio;
1875
1876        bio_list_init(&bios);
1877
1878        spin_lock_irq(&cache->lock);
1879        bio_list_merge(&bios, &cache->deferred_bios);
1880        bio_list_init(&cache->deferred_bios);
1881        spin_unlock_irq(&cache->lock);
1882
1883        while ((bio = bio_list_pop(&bios))) {
1884                if (bio->bi_opf & REQ_PREFLUSH)
1885                        commit_needed = process_flush_bio(cache, bio) || commit_needed;
1886
1887                else if (bio_op(bio) == REQ_OP_DISCARD)
1888                        commit_needed = process_discard_bio(cache, bio) || commit_needed;
1889
1890                else
1891                        commit_needed = process_bio(cache, bio) || commit_needed;
1892        }
1893
1894        if (commit_needed)
1895                schedule_commit(&cache->committer);
1896}
1897
1898/*----------------------------------------------------------------
1899 * Main worker loop
1900 *--------------------------------------------------------------*/
1901
1902static void requeue_deferred_bios(struct cache *cache)
1903{
1904        struct bio *bio;
1905        struct bio_list bios;
1906
1907        bio_list_init(&bios);
1908        bio_list_merge(&bios, &cache->deferred_bios);
1909        bio_list_init(&cache->deferred_bios);
1910
1911        while ((bio = bio_list_pop(&bios))) {
1912                bio->bi_status = BLK_STS_DM_REQUEUE;
1913                bio_endio(bio);
1914        }
1915}
1916
1917/*
1918 * We want to commit periodically so that not too much
1919 * unwritten metadata builds up.
1920 */
1921static void do_waker(struct work_struct *ws)
1922{
1923        struct cache *cache = container_of(to_delayed_work(ws), struct cache, waker);
1924
1925        policy_tick(cache->policy, true);
1926        wake_migration_worker(cache);
1927        schedule_commit(&cache->committer);
1928        queue_delayed_work(cache->wq, &cache->waker, COMMIT_PERIOD);
1929}
1930
1931static void check_migrations(struct work_struct *ws)
1932{
1933        int r;
1934        struct policy_work *op;
1935        struct cache *cache = container_of(ws, struct cache, migration_worker);
1936        enum busy b;
1937
1938        for (;;) {
1939                b = spare_migration_bandwidth(cache);
1940
1941                r = policy_get_background_work(cache->policy, b == IDLE, &op);
1942                if (r == -ENODATA)
1943                        break;
1944
1945                if (r) {
1946                        DMERR_LIMIT("%s: policy_background_work failed",
1947                                    cache_device_name(cache));
1948                        break;
1949                }
1950
1951                r = mg_start(cache, op, NULL);
1952                if (r)
1953                        break;
1954        }
1955}
1956
1957/*----------------------------------------------------------------
1958 * Target methods
1959 *--------------------------------------------------------------*/
1960
1961/*
1962 * This function gets called on the error paths of the constructor, so we
1963 * have to cope with a partially initialised struct.
1964 */
1965static void destroy(struct cache *cache)
1966{
1967        unsigned i;
1968
1969        mempool_exit(&cache->migration_pool);
1970
1971        if (cache->prison)
1972                dm_bio_prison_destroy_v2(cache->prison);
1973
1974        if (cache->wq)
1975                destroy_workqueue(cache->wq);
1976
1977        if (cache->dirty_bitset)
1978                free_bitset(cache->dirty_bitset);
1979
1980        if (cache->discard_bitset)
1981                free_bitset(cache->discard_bitset);
1982
1983        if (cache->copier)
1984                dm_kcopyd_client_destroy(cache->copier);
1985
1986        if (cache->cmd)
1987                dm_cache_metadata_close(cache->cmd);
1988
1989        if (cache->metadata_dev)
1990                dm_put_device(cache->ti, cache->metadata_dev);
1991
1992        if (cache->origin_dev)
1993                dm_put_device(cache->ti, cache->origin_dev);
1994
1995        if (cache->cache_dev)
1996                dm_put_device(cache->ti, cache->cache_dev);
1997
1998        if (cache->policy)
1999                dm_cache_policy_destroy(cache->policy);
2000
2001        for (i = 0; i < cache->nr_ctr_args; i++)
2002                kfree(cache->ctr_args[i]);
2003        kfree(cache->ctr_args);
2004
2005        bioset_exit(&cache->bs);
2006
2007        kfree(cache);
2008}
2009
2010static void cache_dtr(struct dm_target *ti)
2011{
2012        struct cache *cache = ti->private;
2013
2014        destroy(cache);
2015}
2016
2017static sector_t get_dev_size(struct dm_dev *dev)
2018{
2019        return i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
2020}
2021
2022/*----------------------------------------------------------------*/
2023
2024/*
2025 * Construct a cache device mapping.
2026 *
2027 * cache <metadata dev> <cache dev> <origin dev> <block size>
2028 *       <#feature args> [<feature arg>]*
2029 *       <policy> <#policy args> [<policy arg>]*
2030 *
2031 * metadata dev    : fast device holding the persistent metadata
2032 * cache dev       : fast device holding cached data blocks
2033 * origin dev      : slow device holding original data blocks
2034 * block size      : cache unit size in sectors
2035 *
2036 * #feature args   : number of feature arguments passed
2037 * feature args    : writethrough, passthrough, metadata2,
     *                   no_discard_passdown.  (The default io mode is writeback.)
2038 *
2039 * policy          : the replacement policy to use
2040 * #policy args    : an even number of policy arguments corresponding
2041 *                   to key/value pairs passed to the policy
2042 * policy args     : key/value pairs passed to the policy
2043 *                   E.g. 'sequential_threshold 1024'
2044 *                   See cache-policies.txt for details.
2045 *
2046 * Optional feature arguments are:
2047 *   writethrough  : write through caching that prohibits cache block
2048 *                   content from being different from origin block content.
2049 *                   Without this argument, the default behaviour is to write
2050 *                   back cache block contents later for performance reasons,
2051 *                   so they may differ from the corresponding origin blocks.
     *   passthrough   : io is passed straight to the origin; a write also
     *                   invalidates the corresponding cache block.  Requires
     *                   the cache to be completely clean.
     *   metadata2     : use version 2 of the on-disk metadata format.
     *   no_discard_passdown : record discards in the cache metadata but do
     *                   not pass them down to the origin device.
2052 */
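
/*
 * Illustrative constructor line (editor's example; the device names and
 * sizes below are hypothetical):
 *
 *   dmsetup create cached --table \
 *     '0 41943040 cache /dev/mapper/fast-meta /dev/mapper/fast-data \
 *      /dev/mapper/slow 512 1 writeback default 0'
 *
 * i.e. a 20GiB cached device using 512-sector (256KiB) cache blocks,
 * writeback mode, and the default policy with no policy arguments.
 */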
2053struct cache_args {
2054        struct dm_target *ti;
2055
2056        struct dm_dev *metadata_dev;
2057
2058        struct dm_dev *cache_dev;
2059        sector_t cache_sectors;
2060
2061        struct dm_dev *origin_dev;
2062        sector_t origin_sectors;
2063
2064        uint32_t block_size;
2065
2066        const char *policy_name;
2067        int policy_argc;
2068        const char **policy_argv;
2069
2070        struct cache_features features;
2071};
2072
2073static void destroy_cache_args(struct cache_args *ca)
2074{
2075        if (ca->metadata_dev)
2076                dm_put_device(ca->ti, ca->metadata_dev);
2077
2078        if (ca->cache_dev)
2079                dm_put_device(ca->ti, ca->cache_dev);
2080
2081        if (ca->origin_dev)
2082                dm_put_device(ca->ti, ca->origin_dev);
2083
2084        kfree(ca);
2085}
2086
2087static bool at_least_one_arg(struct dm_arg_set *as, char **error)
2088{
2089        if (!as->argc) {
2090                *error = "Insufficient args";
2091                return false;
2092        }
2093
2094        return true;
2095}
2096
2097static int parse_metadata_dev(struct cache_args *ca, struct dm_arg_set *as,
2098                              char **error)
2099{
2100        int r;
2101        sector_t metadata_dev_size;
2102        char b[BDEVNAME_SIZE];
2103
2104        if (!at_least_one_arg(as, error))
2105                return -EINVAL;
2106
2107        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2108                          &ca->metadata_dev);
2109        if (r) {
2110                *error = "Error opening metadata device";
2111                return r;
2112        }
2113
2114        metadata_dev_size = get_dev_size(ca->metadata_dev);
2115        if (metadata_dev_size > DM_CACHE_METADATA_MAX_SECTORS_WARNING)
2116                DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2117                       bdevname(ca->metadata_dev->bdev, b), DM_CACHE_METADATA_MAX_SECTORS_WARNING);
2118
2119        return 0;
2120}
2121
2122static int parse_cache_dev(struct cache_args *ca, struct dm_arg_set *as,
2123                           char **error)
2124{
2125        int r;
2126
2127        if (!at_least_one_arg(as, error))
2128                return -EINVAL;
2129
2130        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2131                          &ca->cache_dev);
2132        if (r) {
2133                *error = "Error opening cache device";
2134                return r;
2135        }
2136        ca->cache_sectors = get_dev_size(ca->cache_dev);
2137
2138        return 0;
2139}
2140
2141static int parse_origin_dev(struct cache_args *ca, struct dm_arg_set *as,
2142                            char **error)
2143{
2144        int r;
2145
2146        if (!at_least_one_arg(as, error))
2147                return -EINVAL;
2148
2149        r = dm_get_device(ca->ti, dm_shift_arg(as), FMODE_READ | FMODE_WRITE,
2150                          &ca->origin_dev);
2151        if (r) {
2152                *error = "Error opening origin device";
2153                return r;
2154        }
2155
2156        ca->origin_sectors = get_dev_size(ca->origin_dev);
2157        if (ca->ti->len > ca->origin_sectors) {
2158                *error = "Device size larger than cached device";
2159                return -EINVAL;
2160        }
2161
2162        return 0;
2163}
2164
2165static int parse_block_size(struct cache_args *ca, struct dm_arg_set *as,
2166                            char **error)
2167{
2168        unsigned long block_size;
2169
2170        if (!at_least_one_arg(as, error))
2171                return -EINVAL;
2172
2173        if (kstrtoul(dm_shift_arg(as), 10, &block_size) || !block_size ||
2174            block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2175            block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2176            block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2177                *error = "Invalid data block size";
2178                return -EINVAL;
2179        }
2180
2181        if (block_size > ca->cache_sectors) {
2182                *error = "Data block size is larger than the cache device";
2183                return -EINVAL;
2184        }
2185
2186        ca->block_size = block_size;
2187
2188        return 0;
2189}
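
/*
 * Editor's note (illustrative; the exact limits come from the
 * DATA_DEV_BLOCK_SIZE_* constants checked above): a block size of 512
 * sectors (256KiB) passes these checks, being a multiple of the minimum
 * block size and no larger than any realistic cache device.
 */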
2190
2191static void init_features(struct cache_features *cf)
2192{
2193        cf->mode = CM_WRITE;
2194        cf->io_mode = CM_IO_WRITEBACK;
2195        cf->metadata_version = 1;
2196        cf->discard_passdown = true;
2197}
2198
2199static int parse_features(struct cache_args *ca, struct dm_arg_set *as,
2200                          char **error)
2201{
2202        static const struct dm_arg _args[] = {
2203                {0, 3, "Invalid number of cache feature arguments"},
2204        };
2205
2206        int r, mode_ctr = 0;
2207        unsigned argc;
2208        const char *arg;
2209        struct cache_features *cf = &ca->features;
2210
2211        init_features(cf);
2212
2213        r = dm_read_arg_group(_args, as, &argc, error);
2214        if (r)
2215                return -EINVAL;
2216
2217        while (argc--) {
2218                arg = dm_shift_arg(as);
2219
2220                if (!strcasecmp(arg, "writeback")) {
2221                        cf->io_mode = CM_IO_WRITEBACK;
2222                        mode_ctr++;
2223                }
2224
2225                else if (!strcasecmp(arg, "writethrough")) {
2226                        cf->io_mode = CM_IO_WRITETHROUGH;
2227                        mode_ctr++;
2228                }
2229
2230                else if (!strcasecmp(arg, "passthrough")) {
2231                        cf->io_mode = CM_IO_PASSTHROUGH;
2232                        mode_ctr++;
2233                }
2234
2235                else if (!strcasecmp(arg, "metadata2"))
2236                        cf->metadata_version = 2;
2237
2238                else if (!strcasecmp(arg, "no_discard_passdown"))
2239                        cf->discard_passdown = false;
2240
2241                else {
2242                        *error = "Unrecognised cache feature requested";
2243                        return -EINVAL;
2244                }
2245        }
2246
2247        if (mode_ctr > 1) {
2248                *error = "Duplicate cache io_mode features requested";
2249                return -EINVAL;
2250        }
2251
2252        return 0;
2253}
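
/*
 * Illustrative feature argument strings accepted by parse_features()
 * (editor's examples):
 *
 *   "0"                                            no features, defaults apply
 *   "1 writethrough"                               writethrough io mode
 *   "3 metadata2 passthrough no_discard_passdown"
 *
 * Requesting more than one of writeback/writethrough/passthrough fails
 * with "Duplicate cache io_mode features requested".
 */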
2254
2255static int parse_policy(struct cache_args *ca, struct dm_arg_set *as,
2256                        char **error)
2257{
2258        static const struct dm_arg _args[] = {
2259                {0, 1024, "Invalid number of policy arguments"},
2260        };
2261
2262        int r;
2263
2264        if (!at_least_one_arg(as, error))
2265                return -EINVAL;
2266
2267        ca->policy_name = dm_shift_arg(as);
2268
2269        r = dm_read_arg_group(_args, as, &ca->policy_argc, error);
2270        if (r)
2271                return -EINVAL;
2272
2273        ca->policy_argv = (const char **)as->argv;
2274        dm_consume_args(as, ca->policy_argc);
2275
2276        return 0;
2277}
2278
2279static int parse_cache_args(struct cache_args *ca, int argc, char **argv,
2280                            char **error)
2281{
2282        int r;
2283        struct dm_arg_set as;
2284
2285        as.argc = argc;
2286        as.argv = argv;
2287
2288        r = parse_metadata_dev(ca, &as, error);
2289        if (r)
2290                return r;
2291
2292        r = parse_cache_dev(ca, &as, error);
2293        if (r)
2294                return r;
2295
2296        r = parse_origin_dev(ca, &as, error);
2297        if (r)
2298                return r;
2299
2300        r = parse_block_size(ca, &as, error);
2301        if (r)
2302                return r;
2303
2304        r = parse_features(ca, &as, error);
2305        if (r)
2306                return r;
2307
2308        r = parse_policy(ca, &as, error);
2309        if (r)
2310                return r;
2311
2312        return 0;
2313}
2314
2315/*----------------------------------------------------------------*/
2316
2317static struct kmem_cache *migration_cache;
2318
2319#define NOT_CORE_OPTION 1
2320
2321static int process_config_option(struct cache *cache, const char *key, const char *value)
2322{
2323        unsigned long tmp;
2324
2325        if (!strcasecmp(key, "migration_threshold")) {
2326                if (kstrtoul(value, 10, &tmp))
2327                        return -EINVAL;
2328
2329                cache->migration_threshold = tmp;
2330                return 0;
2331        }
2332
2333        return NOT_CORE_OPTION;
2334}
2335
2336static int set_config_value(struct cache *cache, const char *key, const char *value)
2337{
2338        int r = process_config_option(cache, key, value);
2339
2340        if (r == NOT_CORE_OPTION)
2341                r = policy_set_config_value(cache->policy, key, value);
2342
2343        if (r)
2344                DMWARN("bad config value for %s: %s", key, value);
2345
2346        return r;
2347}
2348
2349static int set_config_values(struct cache *cache, int argc, const char **argv)
2350{
2351        int r = 0;
2352
2353        if (argc & 1) {
2354                DMWARN("Odd number of policy arguments given but they should be <key> <value> pairs.");
2355                return -EINVAL;
2356        }
2357
2358        while (argc) {
2359                r = set_config_value(cache, argv[0], argv[1]);
2360                if (r)
2361                        break;
2362
2363                argc -= 2;
2364                argv += 2;
2365        }
2366
2367        return r;
2368}
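
/*
 * Editor's example of how the policy argument vector is consumed: for the
 * constructor policy section "4 migration_threshold 4096 key value"
 * (values hypothetical), set_config_values() sees argc = 4 and
 * argv = {"migration_threshold", "4096", "key", "value"}.  The first pair
 * is handled by the core via process_config_option(); the second is handed
 * to the policy with policy_set_config_value() and will fail the
 * constructor if the chosen policy does not recognise the key.
 */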
2369
2370static int create_cache_policy(struct cache *cache, struct cache_args *ca,
2371                               char **error)
2372{
2373        struct dm_cache_policy *p = dm_cache_policy_create(ca->policy_name,
2374                                                           cache->cache_size,
2375                                                           cache->origin_sectors,
2376                                                           cache->sectors_per_block);
2377        if (IS_ERR(p)) {
2378                *error = "Error creating cache's policy";
2379                return PTR_ERR(p);
2380        }
2381        cache->policy = p;
2382        BUG_ON(!cache->policy);
2383
2384        return 0;
2385}
2386
2387/*
2388 * We want the discard block size to be at least the cache block size
2389 * and to give no more than 2^14 discard blocks across the origin.
2390 */
2391#define MAX_DISCARD_BLOCKS (1 << 14)
2392
2393static bool too_many_discard_blocks(sector_t discard_block_size,
2394                                    sector_t origin_size)
2395{
2396        (void) sector_div(origin_size, discard_block_size);
2397
2398        return origin_size > MAX_DISCARD_BLOCKS;
2399}
2400
2401static sector_t calculate_discard_block_size(sector_t cache_block_size,
2402                                             sector_t origin_size)
2403{
2404        sector_t discard_block_size = cache_block_size;
2405
2406        if (origin_size)
2407                while (too_many_discard_blocks(discard_block_size, origin_size))
2408                        discard_block_size *= 2;
2409
2410        return discard_block_size;
2411}
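
/*
 * Worked example (editor's illustration): with 512-sector cache blocks and
 * a 1TiB origin (2^31 sectors), starting from a 512-sector discard block
 * would give 2^22 discard blocks, well over MAX_DISCARD_BLOCKS (2^14).
 * The loop doubles the size until 2^17 sectors (64MiB), at which point
 * 2^31 / 2^17 = 2^14 blocks no longer exceeds the limit.
 */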
2412
2413static void set_cache_size(struct cache *cache, dm_cblock_t size)
2414{
2415        dm_block_t nr_blocks = from_cblock(size);
2416
2417        if (nr_blocks > (1 << 20) && cache->cache_size != size)
2418                DMWARN_LIMIT("You have created a cache device with a lot of individual cache blocks (%llu)\n"
2419                             "All these mappings can consume a lot of kernel memory, and take some time to read/write.\n"
2420                             "Please consider increasing the cache block size to reduce the overall cache block count.",
2421                             (unsigned long long) nr_blocks);
2422
2423        cache->cache_size = size;
2424}
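
/*
 * Editor's example of when the warning above fires: a 1TiB cache device
 * with 256KiB (512-sector) blocks holds 2^22 mappings, which is above the
 * 2^20 threshold; raising the block size to 1MiB brings the count down to
 * 2^20 and silences the warning.
 */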
2425
2426static int is_congested(struct dm_dev *dev, int bdi_bits)
2427{
2428        struct request_queue *q = bdev_get_queue(dev->bdev);
2429        return bdi_congested(q->backing_dev_info, bdi_bits);
2430}
2431
2432static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2433{
2434        struct cache *cache = container_of(cb, struct cache, callbacks);
2435
2436        return is_congested(cache->origin_dev, bdi_bits) ||
2437                is_congested(cache->cache_dev, bdi_bits);
2438}
2439
2440#define DEFAULT_MIGRATION_THRESHOLD 2048
2441
2442static int cache_create(struct cache_args *ca, struct cache **result)
2443{
2444        int r = 0;
2445        char **error = &ca->ti->error;
2446        struct cache *cache;
2447        struct dm_target *ti = ca->ti;
2448        dm_block_t origin_blocks;
2449        struct dm_cache_metadata *cmd;
2450        bool may_format = ca->features.mode == CM_WRITE;
2451
2452        cache = kzalloc(sizeof(*cache), GFP_KERNEL);
2453        if (!cache)
2454                return -ENOMEM;
2455
2456        cache->ti = ca->ti;
2457        ti->private = cache;
2458        ti->num_flush_bios = 2;
2459        ti->flush_supported = true;
2460
2461        ti->num_discard_bios = 1;
2462        ti->discards_supported = true;
2463
2464        ti->per_io_data_size = sizeof(struct per_bio_data);
2465
2466        cache->features = ca->features;
2467        if (writethrough_mode(cache)) {
2468                /* Create bioset for writethrough bios issued to origin */
2469                r = bioset_init(&cache->bs, BIO_POOL_SIZE, 0, 0);
2470                if (r)
2471                        goto bad;
2472        }
2473
2474        cache->callbacks.congested_fn = cache_is_congested;
2475        dm_table_add_target_callbacks(ti->table, &cache->callbacks);
2476
2477        cache->metadata_dev = ca->metadata_dev;
2478        cache->origin_dev = ca->origin_dev;
2479        cache->cache_dev = ca->cache_dev;
2480
2481        ca->metadata_dev = ca->origin_dev = ca->cache_dev = NULL;
2482
2483        origin_blocks = cache->origin_sectors = ca->origin_sectors;
2484        origin_blocks = block_div(origin_blocks, ca->block_size);
2485        cache->origin_blocks = to_oblock(origin_blocks);
2486
2487        cache->sectors_per_block = ca->block_size;
2488        if (dm_set_target_max_io_len(ti, cache->sectors_per_block)) {
2489                r = -EINVAL;
2490                goto bad;
2491        }
2492
2493        if (ca->block_size & (ca->block_size - 1)) {
2494                dm_block_t cache_size = ca->cache_sectors;
2495
2496                cache->sectors_per_block_shift = -1;
2497                cache_size = block_div(cache_size, ca->block_size);
2498                set_cache_size(cache, to_cblock(cache_size));
2499        } else {
2500                cache->sectors_per_block_shift = __ffs(ca->block_size);
2501                set_cache_size(cache, to_cblock(ca->cache_sectors >> cache->sectors_per_block_shift));
2502        }
2503
2504        r = create_cache_policy(cache, ca, error);
2505        if (r)
2506                goto bad;
2507
2508        cache->policy_nr_args = ca->policy_argc;
2509        cache->migration_threshold = DEFAULT_MIGRATION_THRESHOLD;
2510
2511        r = set_config_values(cache, ca->policy_argc, ca->policy_argv);
2512        if (r) {
2513                *error = "Error setting cache policy's config values";
2514                goto bad;
2515        }
2516
2517        cmd = dm_cache_metadata_open(cache->metadata_dev->bdev,
2518                                     ca->block_size, may_format,
2519                                     dm_cache_policy_get_hint_size(cache->policy),
2520                                     ca->features.metadata_version);
2521        if (IS_ERR(cmd)) {
2522                *error = "Error creating metadata object";
2523                r = PTR_ERR(cmd);
2524                goto bad;
2525        }
2526        cache->cmd = cmd;
2527        set_cache_mode(cache, CM_WRITE);
2528        if (get_cache_mode(cache) != CM_WRITE) {
2529                *error = "Unable to get write access to metadata, please check/repair metadata.";
2530                r = -EINVAL;
2531                goto bad;
2532        }
2533
2534        if (passthrough_mode(cache)) {
2535                bool all_clean;
2536
2537                r = dm_cache_metadata_all_clean(cache->cmd, &all_clean);
2538                if (r) {
2539                        *error = "dm_cache_metadata_all_clean() failed";
2540                        goto bad;
2541                }
2542
2543                if (!all_clean) {
2544                        *error = "Cannot enter passthrough mode unless all blocks are clean";
2545                        r = -EINVAL;
2546                        goto bad;
2547                }
2548
2549                policy_allow_migrations(cache->policy, false);
2550        }
2551
2552        spin_lock_init(&cache->lock);
2553        bio_list_init(&cache->deferred_bios);
2554        atomic_set(&cache->nr_allocated_migrations, 0);
2555        atomic_set(&cache->nr_io_migrations, 0);
2556        init_waitqueue_head(&cache->migration_wait);
2557
2558        r = -ENOMEM;
2559        atomic_set(&cache->nr_dirty, 0);
2560        cache->dirty_bitset = alloc_bitset(from_cblock(cache->cache_size));
2561        if (!cache->dirty_bitset) {
2562                *error = "could not allocate dirty bitset";
2563                goto bad;
2564        }
2565        clear_bitset(cache->dirty_bitset, from_cblock(cache->cache_size));
2566
2567        cache->discard_block_size =
2568                calculate_discard_block_size(cache->sectors_per_block,
2569                                             cache->origin_sectors);
2570        cache->discard_nr_blocks = to_dblock(dm_sector_div_up(cache->origin_sectors,
2571                                                              cache->discard_block_size));
2572        cache->discard_bitset = alloc_bitset(from_dblock(cache->discard_nr_blocks));
2573        if (!cache->discard_bitset) {
2574                *error = "could not allocate discard bitset";
2575                goto bad;
2576        }
2577        clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
2578
2579        cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2580        if (IS_ERR(cache->copier)) {
2581                *error = "could not create kcopyd client";
2582                r = PTR_ERR(cache->copier);
2583                goto bad;
2584        }
2585
2586        cache->wq = alloc_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM, 0);
2587        if (!cache->wq) {
2588                *error = "could not create workqueue for metadata object";
2589                goto bad;
2590        }
2591        INIT_WORK(&cache->deferred_bio_worker, process_deferred_bios);
2592        INIT_WORK(&cache->migration_worker, check_migrations);
2593        INIT_DELAYED_WORK(&cache->waker, do_waker);
2594
2595        cache->prison = dm_bio_prison_create_v2(cache->wq);
2596        if (!cache->prison) {
2597                *error = "could not create bio prison";
2598                goto bad;
2599        }
2600
2601        r = mempool_init_slab_pool(&cache->migration_pool, MIGRATION_POOL_SIZE,
2602                                   migration_cache);
2603        if (r) {
2604                *error = "Error creating cache's migration mempool";
2605                goto bad;
2606        }
2607
2608        cache->need_tick_bio = true;
2609        cache->sized = false;
2610        cache->invalidate = false;
2611        cache->commit_requested = false;
2612        cache->loaded_mappings = false;
2613        cache->loaded_discards = false;
2614
2615        load_stats(cache);
2616
2617        atomic_set(&cache->stats.demotion, 0);
2618        atomic_set(&cache->stats.promotion, 0);
2619        atomic_set(&cache->stats.copies_avoided, 0);
2620        atomic_set(&cache->stats.cache_cell_clash, 0);
2621        atomic_set(&cache->stats.commit_count, 0);
2622        atomic_set(&cache->stats.discard_count, 0);
2623
2624        spin_lock_init(&cache->invalidation_lock);
2625        INIT_LIST_HEAD(&cache->invalidation_requests);
2626
2627        batcher_init(&cache->committer, commit_op, cache,
2628                     issue_op, cache, cache->wq);
2629        iot_init(&cache->tracker);
2630
2631        init_rwsem(&cache->background_work_lock);
2632        prevent_background_work(cache);
2633
2634        *result = cache;
2635        return 0;
2636bad:
2637        destroy(cache);
2638        return r;
2639}
2640
2641static int copy_ctr_args(struct cache *cache, int argc, const char **argv)
2642{
2643        unsigned i;
2644        const char **copy;
2645
2646        copy = kcalloc(argc, sizeof(*copy), GFP_KERNEL);
2647        if (!copy)
2648                return -ENOMEM;
2649        for (i = 0; i < argc; i++) {
2650                copy[i] = kstrdup(argv[i], GFP_KERNEL);
2651                if (!copy[i]) {
2652                        while (i--)
2653                                kfree(copy[i]);
2654                        kfree(copy);
2655                        return -ENOMEM;
2656                }
2657        }
2658
2659        cache->nr_ctr_args = argc;
2660        cache->ctr_args = copy;
2661
2662        return 0;
2663}
2664
2665static int cache_ctr(struct dm_target *ti, unsigned argc, char **argv)
2666{
2667        int r = -EINVAL;
2668        struct cache_args *ca;
2669        struct cache *cache = NULL;
2670
2671        ca = kzalloc(sizeof(*ca), GFP_KERNEL);
2672        if (!ca) {
2673                ti->error = "Error allocating memory for cache";
2674                return -ENOMEM;
2675        }
2676        ca->ti = ti;
2677
2678        r = parse_cache_args(ca, argc, argv, &ti->error);
2679        if (r)
2680                goto out;
2681
2682        r = cache_create(ca, &cache);
2683        if (r)
2684                goto out;
2685
2686        r = copy_ctr_args(cache, argc - 3, (const char **)argv + 3);
2687        if (r) {
2688                destroy(cache);
2689                goto out;
2690        }
2691
2692        ti->private = cache;
2693out:
2694        destroy_cache_args(ca);
2695        return r;
2696}
2697
2698/*----------------------------------------------------------------*/
2699
2700static int cache_map(struct dm_target *ti, struct bio *bio)
2701{
2702        struct cache *cache = ti->private;
2703
2704        int r;
2705        bool commit_needed;
2706        dm_oblock_t block = get_bio_block(cache, bio);
2707
2708        init_per_bio_data(bio);
2709        if (unlikely(from_oblock(block) >= from_oblock(cache->origin_blocks))) {
2710                /*
2711                 * This can only occur if the io goes to a partial block at
2712                 * the end of the origin device.  We don't cache these.
2713                 * Just remap to the origin and carry on.
2714                 */
2715                remap_to_origin(cache, bio);
2716                accounted_begin(cache, bio);
2717                return DM_MAPIO_REMAPPED;
2718        }
2719
2720        if (discard_or_flush(bio)) {
2721                defer_bio(cache, bio);
2722                return DM_MAPIO_SUBMITTED;
2723        }
2724
2725        r = map_bio(cache, bio, block, &commit_needed);
2726        if (commit_needed)
2727                schedule_commit(&cache->committer);
2728
2729        return r;
2730}
2731
2732static int cache_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *error)
2733{
2734        struct cache *cache = ti->private;
2735        unsigned long flags;
2736        struct per_bio_data *pb = get_per_bio_data(bio);
2737
2738        if (pb->tick) {
2739                policy_tick(cache->policy, false);
2740
2741                spin_lock_irqsave(&cache->lock, flags);
2742                cache->need_tick_bio = true;
2743                spin_unlock_irqrestore(&cache->lock, flags);
2744        }
2745
2746        bio_drop_shared_lock(cache, bio);
2747        accounted_complete(cache, bio);
2748
2749        return DM_ENDIO_DONE;
2750}
2751
2752static int write_dirty_bitset(struct cache *cache)
2753{
2754        int r;
2755
2756        if (get_cache_mode(cache) >= CM_READ_ONLY)
2757                return -EINVAL;
2758
2759        r = dm_cache_set_dirty_bits(cache->cmd, from_cblock(cache->cache_size), cache->dirty_bitset);
2760        if (r)
2761                metadata_operation_failed(cache, "dm_cache_set_dirty_bits", r);
2762
2763        return r;
2764}
2765
2766static int write_discard_bitset(struct cache *cache)
2767{
2768        int r;
            unsigned i;
2769
2770        if (get_cache_mode(cache) >= CM_READ_ONLY)
2771                return -EINVAL;
2772
2773        r = dm_cache_discard_bitset_resize(cache->cmd, cache->discard_block_size,
2774                                           cache->discard_nr_blocks);
2775        if (r) {
2776                DMERR("%s: could not resize on-disk discard bitset", cache_device_name(cache));
2777                metadata_operation_failed(cache, "dm_cache_discard_bitset_resize", r);
2778                return r;
2779        }
2780
2781        for (i = 0; i < from_dblock(cache->discard_nr_blocks); i++) {
2782                r = dm_cache_set_discard(cache->cmd, to_dblock(i),
2783                                         is_discarded(cache, to_dblock(i)));
2784                if (r) {
2785                        metadata_operation_failed(cache, "dm_cache_set_discard", r);
2786                        return r;
2787                }
2788        }
2789
2790        return 0;
2791}
2792
2793static int write_hints(struct cache *cache)
2794{
2795        int r;
2796
2797        if (get_cache_mode(cache) >= CM_READ_ONLY)
2798                return -EINVAL;
2799
2800        r = dm_cache_write_hints(cache->cmd, cache->policy);
2801        if (r) {
2802                metadata_operation_failed(cache, "dm_cache_write_hints", r);
2803                return r;
2804        }
2805
2806        return 0;
2807}
2808
2809/*
2810 * returns true on success
2811 */
2812static bool sync_metadata(struct cache *cache)
2813{
2814        int r1, r2, r3, r4;
2815
2816        r1 = write_dirty_bitset(cache);
2817        if (r1)
2818                DMERR("%s: could not write dirty bitset", cache_device_name(cache));
2819
2820        r2 = write_discard_bitset(cache);
2821        if (r2)
2822                DMERR("%s: could not write discard bitset", cache_device_name(cache));
2823
2824        save_stats(cache);
2825
2826        r3 = write_hints(cache);
2827        if (r3)
2828                DMERR("%s: could not write hints", cache_device_name(cache));
2829
2830        /*
2831         * If writing the above metadata failed, we still commit, but don't
2832         * set the clean shutdown flag.  This will effectively force every
2833         * dirty bit to be set on reload.
2834         */
2835        r4 = commit(cache, !r1 && !r2 && !r3);
2836        if (r4)
2837                DMERR("%s: could not write cache metadata", cache_device_name(cache));
2838
2839        return !r1 && !r2 && !r3 && !r4;
2840}
2841
2842static void cache_postsuspend(struct dm_target *ti)
2843{
2844        struct cache *cache = ti->private;
2845
2846        prevent_background_work(cache);
2847        BUG_ON(atomic_read(&cache->nr_io_migrations));
2848
2849        cancel_delayed_work_sync(&cache->waker);
2850        drain_workqueue(cache->wq);
2851        WARN_ON(cache->tracker.in_flight);
2852
2853        /*
2854         * If it's a flush suspend there won't be any deferred bios, so this
2855         * call is harmless.
2856         */
2857        requeue_deferred_bios(cache);
2858
2859        if (get_cache_mode(cache) == CM_WRITE)
2860                (void) sync_metadata(cache);
2861}
2862
2863static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock,
2864                        bool dirty, uint32_t hint, bool hint_valid)
2865{
2866        int r;
2867        struct cache *cache = context;
2868
2869        if (dirty) {
2870                set_bit(from_cblock(cblock), cache->dirty_bitset);
2871                atomic_inc(&cache->nr_dirty);
2872        } else
2873                clear_bit(from_cblock(cblock), cache->dirty_bitset);
2874
2875        r = policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid);
2876        if (r)
2877                return r;
2878
2879        return 0;
2880}
2881
2882/*
2883 * The discard block size in the on disk metadata is not
2884 * necessarily the same as the one we're currently using.  So we have to
2885 * be careful to only set the discarded attribute if we know it
2886 * covers a complete block of the new size.
2887 */
2888struct discard_load_info {
2889        struct cache *cache;
2890
2891        /*
2892         * These blocks are sized using the on disk dblock size, rather
2893         * than the current one.
2894         */
2895        dm_block_t block_size;
2896        dm_block_t discard_begin, discard_end;
2897};
2898
2899static void discard_load_info_init(struct cache *cache,
2900                                   struct discard_load_info *li)
2901{
2902        li->cache = cache;
2903        li->discard_begin = li->discard_end = 0;
2904}
2905
2906static void set_discard_range(struct discard_load_info *li)
2907{
2908        sector_t b, e;
2909
2910        if (li->discard_begin == li->discard_end)
2911                return;
2912
2913        /*
2914         * Convert to sectors.
2915         */
2916        b = li->discard_begin * li->block_size;
2917        e = li->discard_end * li->block_size;
2918
2919        /*
2920         * Then convert back to the current dblock size.
2921         */
2922        b = dm_sector_div_up(b, li->cache->discard_block_size);
2923        sector_div(e, li->cache->discard_block_size);
2924
2925        /*
2926         * The origin may have shrunk, so we need to check we're still in
2927         * bounds.
2928         */
2929        if (e > from_dblock(li->cache->discard_nr_blocks))
2930                e = from_dblock(li->cache->discard_nr_blocks);
2931
2932        for (; b < e; b++)
2933                set_discard(li->cache, to_dblock(b));
2934}
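
/*
 * Worked example (editor's illustration): if discards were recorded with
 * 1024-sector dblocks and the on-disk range [4, 8) is loaded, that covers
 * sectors [4096, 8192).  With a current discard_block_size of 2048 sectors
 * the range becomes dblocks [2, 4): begin is rounded up and end rounded
 * down, so a partially covered block of the new size is never marked
 * discarded.
 */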
2935
2936static int load_discard(void *context, sector_t discard_block_size,
2937                        dm_dblock_t dblock, bool discard)
2938{
2939        struct discard_load_info *li = context;
2940
2941        li->block_size = discard_block_size;
2942
2943        if (discard) {
2944                if (from_dblock(dblock) == li->discard_end)
2945                        /*
2946                         * We're already in a discard range, just extend it.
2947                         */
2948                        li->discard_end = li->discard_end + 1ULL;
2949
2950                else {
2951                        /*
2952                         * Emit the old range and start a new one.
2953                         */
2954                        set_discard_range(li);
2955                        li->discard_begin = from_dblock(dblock);
2956                        li->discard_end = li->discard_begin + 1ULL;
2957                }
2958        } else {
2959                set_discard_range(li);
2960                li->discard_begin = li->discard_end = 0;
2961        }
2962
2963        return 0;
2964}
2965
2966static dm_cblock_t get_cache_dev_size(struct cache *cache)
2967{
2968        sector_t size = get_dev_size(cache->cache_dev);
2969        (void) sector_div(size, cache->sectors_per_block);
2970        return to_cblock(size);
2971}
2972
2973static bool can_resize(struct cache *cache, dm_cblock_t new_size)
2974{
2975        if (from_cblock(new_size) > from_cblock(cache->cache_size)) {
2976                if (cache->sized) {
2977                        DMERR("%s: unable to extend cache due to missing cache table reload",
2978                              cache_device_name(cache));
2979                        return false;
2980                }
2981        }
2982
2983        /*
2984         * We can't drop a dirty block when shrinking the cache.
2985         */
2986        while (from_cblock(new_size) < from_cblock(cache->cache_size)) {
2987                if (is_dirty(cache, new_size)) {
2988                        DMERR("%s: unable to shrink cache; cache block %llu is dirty",
2989                              cache_device_name(cache),
2990                              (unsigned long long) from_cblock(new_size));
2991                        return false;
2992                }
2993                new_size = to_cblock(from_cblock(new_size) + 1);
2994        }
2995
2996        return true;
2997}
2998
2999static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size)
3000{
3001        int r;
3002
3003        r = dm_cache_resize(cache->cmd, new_size);
3004        if (r) {
3005                DMERR("%s: could not resize cache metadata", cache_device_name(cache));
3006                metadata_operation_failed(cache, "dm_cache_resize", r);
3007                return r;
3008        }
3009
3010        set_cache_size(cache, new_size);
3011
3012        return 0;
3013}
3014
3015static int cache_preresume(struct dm_target *ti)
3016{
3017        int r = 0;
3018        struct cache *cache = ti->private;
3019        dm_cblock_t csize = get_cache_dev_size(cache);
3020
3021        /*
3022         * Check to see if the cache has resized.
3023         */
3024        if (!cache->sized) {
3025                r = resize_cache_dev(cache, csize);
3026                if (r)
3027                        return r;
3028
3029                cache->sized = true;
3030
3031        } else if (csize != cache->cache_size) {
3032                if (!can_resize(cache, csize))
3033                        return -EINVAL;
3034
3035                r = resize_cache_dev(cache, csize);
3036                if (r)
3037                        return r;
3038        }
3039
3040        if (!cache->loaded_mappings) {
3041                r = dm_cache_load_mappings(cache->cmd, cache->policy,
3042                                           load_mapping, cache);
3043                if (r) {
3044                        DMERR("%s: could not load cache mappings", cache_device_name(cache));
3045                        metadata_operation_failed(cache, "dm_cache_load_mappings", r);
3046                        return r;
3047                }
3048
3049                cache->loaded_mappings = true;
3050        }
3051
3052        if (!cache->loaded_discards) {
3053                struct discard_load_info li;
3054
3055                /*
3056                 * The discard bitset could have been resized, or the
3057                 * discard block size changed.  To be safe we start by
3058                 * setting every dblock to not discarded.
3059                 */
3060                clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks));
3061
3062                discard_load_info_init(cache, &li);
3063                r = dm_cache_load_discards(cache->cmd, load_discard, &li);
3064                if (r) {
3065                        DMERR("%s: could not load origin discards", cache_device_name(cache));
3066                        metadata_operation_failed(cache, "dm_cache_load_discards", r);
3067                        return r;
3068                }
3069                set_discard_range(&li);
3070
3071                cache->loaded_discards = true;
3072        }
3073
3074        return r;
3075}
3076
3077static void cache_resume(struct dm_target *ti)
3078{
3079        struct cache *cache = ti->private;
3080
3081        cache->need_tick_bio = true;
3082        allow_background_work(cache);
3083        do_waker(&cache->waker.work);
3084}
3085
3086static void emit_flags(struct cache *cache, char *result,
3087                       unsigned maxlen, ssize_t *sz_ptr)
3088{
3089        ssize_t sz = *sz_ptr;
3090        struct cache_features *cf = &cache->features;
3091        unsigned count = (cf->metadata_version == 2) + !cf->discard_passdown + 1;
3092
3093        DMEMIT("%u ", count);
3094
3095        if (cf->metadata_version == 2)
3096                DMEMIT("metadata2 ");
3097
3098        if (writethrough_mode(cache))
3099                DMEMIT("writethrough ");
3100
3101        else if (passthrough_mode(cache))
3102                DMEMIT("passthrough ");
3103
3104        else if (writeback_mode(cache))
3105                DMEMIT("writeback ");
3106
3107        else {
3108                DMEMIT("unknown ");
3109                DMERR("%s: internal error: unknown io mode: %d",
3110                      cache_device_name(cache), (int) cf->io_mode);
3111        }
3112
3113        if (!cf->discard_passdown)
3114                DMEMIT("no_discard_passdown ");
3115
3116        *sz_ptr = sz;
3117}
3118
3119/*
3120 * Status format:
3121 *
3122 * <metadata block size> <#used metadata blocks>/<#total metadata blocks>
3123 * <cache block size> <#used cache blocks>/<#total cache blocks>
3124 * <#read hits> <#read misses> <#write hits> <#write misses>
3125 * <#demotions> <#promotions> <#dirty>
3126 * <#features> <features>*
3127 * <#core args> <core args>
3128 * <policy name> <#policy args> <policy args>* <cache metadata mode> <needs_check>
3129 */
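
/*
 * Illustrative STATUSTYPE_INFO output (editor's example; every number is
 * made up and the policy here is assumed to emit no config values; the
 * output is a single line, wrapped below for readability):
 *
 *   8 72/65536 512 4096/524288 139 35 82 11 5 9 3 1 writeback
 *   2 migration_threshold 2048 smq 0 rw -
 */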
3130static void cache_status(struct dm_target *ti, status_type_t type,
3131                         unsigned status_flags, char *result, unsigned maxlen)
3132{
3133        int r = 0;
3134        unsigned i;
3135        ssize_t sz = 0;
3136        dm_block_t nr_free_blocks_metadata = 0;
3137        dm_block_t nr_blocks_metadata = 0;
3138        char buf[BDEVNAME_SIZE];
3139        struct cache *cache = ti->private;
3140        dm_cblock_t residency;
3141        bool needs_check;
3142
3143        switch (type) {
3144        case STATUSTYPE_INFO:
3145                if (get_cache_mode(cache) == CM_FAIL) {
3146                        DMEMIT("Fail");
3147                        break;
3148                }
3149
3150                /* Commit to ensure statistics aren't out-of-date */
3151                if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3152                        (void) commit(cache, false);
3153
3154                r = dm_cache_get_free_metadata_block_count(cache->cmd, &nr_free_blocks_metadata);
3155                if (r) {
3156                        DMERR("%s: dm_cache_get_free_metadata_block_count returned %d",
3157                              cache_device_name(cache), r);
3158                        goto err;
3159                }
3160
3161                r = dm_cache_get_metadata_dev_size(cache->cmd, &nr_blocks_metadata);
3162                if (r) {
3163                        DMERR("%s: dm_cache_get_metadata_dev_size returned %d",
3164                              cache_device_name(cache), r);
3165                        goto err;
3166                }
3167
3168                residency = policy_residency(cache->policy);
3169
3170                DMEMIT("%u %llu/%llu %llu %llu/%llu %u %u %u %u %u %u %lu ",
3171                       (unsigned)DM_CACHE_METADATA_BLOCK_SIZE,
3172                       (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3173                       (unsigned long long)nr_blocks_metadata,
3174                       (unsigned long long)cache->sectors_per_block,
3175                       (unsigned long long) from_cblock(residency),
3176                       (unsigned long long) from_cblock(cache->cache_size),
3177                       (unsigned) atomic_read(&cache->stats.read_hit),
3178                       (unsigned) atomic_read(&cache->stats.read_miss),
3179                       (unsigned) atomic_read(&cache->stats.write_hit),
3180                       (unsigned) atomic_read(&cache->stats.write_miss),
3181                       (unsigned) atomic_read(&cache->stats.demotion),
3182                       (unsigned) atomic_read(&cache->stats.promotion),
3183                       (unsigned long) atomic_read(&cache->nr_dirty));
3184
3185                emit_flags(cache, result, maxlen, &sz);
3186
3187                DMEMIT("2 migration_threshold %llu ", (unsigned long long) cache->migration_threshold);
3188
3189                DMEMIT("%s ", dm_cache_policy_get_name(cache->policy));
3190                if (sz < maxlen) {
3191                        r = policy_emit_config_values(cache->policy, result, maxlen, &sz);
3192                        if (r)
3193                                DMERR("%s: policy_emit_config_values returned %d",
3194                                      cache_device_name(cache), r);
3195                }
3196
3197                if (get_cache_mode(cache) == CM_READ_ONLY)
3198                        DMEMIT("ro ");
3199                else
3200                        DMEMIT("rw ");
3201
3202                r = dm_cache_metadata_needs_check(cache->cmd, &needs_check);
3203
3204                if (r || needs_check)
3205                        DMEMIT("needs_check ");
3206                else
3207                        DMEMIT("- ");
3208
3209                break;
3210
3211        case STATUSTYPE_TABLE:
3212                format_dev_t(buf, cache->metadata_dev->bdev->bd_dev);
3213                DMEMIT("%s ", buf);
3214                format_dev_t(buf, cache->cache_dev->bdev->bd_dev);
3215                DMEMIT("%s ", buf);
3216                format_dev_t(buf, cache->origin_dev->bdev->bd_dev);
3217                DMEMIT("%s", buf);
3218
3219                for (i = 0; i < cache->nr_ctr_args - 1; i++)
3220                        DMEMIT(" %s", cache->ctr_args[i]);
3221                if (cache->nr_ctr_args)
3222                        DMEMIT(" %s", cache->ctr_args[cache->nr_ctr_args - 1]);
3223        }
3224
3225        return;
3226
3227err:
3228        DMEMIT("Error");
3229}
3230
3231/*
3232 * Defines a range of cblocks, begin to (end - 1) are in the range.  end is
3233 * the one-past-the-end value.
3234 */
3235struct cblock_range {
3236        dm_cblock_t begin;
3237        dm_cblock_t end;
3238};
3239
3240/*
3241 * A cache block range can take two forms:
3242 *
3243 * i) A single cblock, eg. '3456'
3244 * ii) A begin and end cblock with a dash between, eg. 123-234
3245 */
3246static int parse_cblock_range(struct cache *cache, const char *str,
3247                              struct cblock_range *result)
3248{
3249        char dummy;
3250        uint64_t b, e;
3251        int r;
3252
3253        /*
3254         * Try and parse form (ii) first.
3255         */
3256        r = sscanf(str, "%llu-%llu%c", &b, &e, &dummy);
3257        if (r < 0)
3258                return r;
3259
3260        if (r == 2) {
3261                result->begin = to_cblock(b);
3262                result->end = to_cblock(e);
3263                return 0;
3264        }
3265
3266        /*
3267         * That didn't work, try form (i).
3268         */
3269        r = sscanf(str, "%llu%c", &b, &dummy);
3270        if (r < 0)
3271                return r;
3272
3273        if (r == 1) {
3274                result->begin = to_cblock(b);
3275                result->end = to_cblock(from_cblock(result->begin) + 1u);
3276                return 0;
3277        }
3278
3279        DMERR("%s: invalid cblock range '%s'", cache_device_name(cache), str);
3280        return -EINVAL;
3281}
3282
3283static int validate_cblock_range(struct cache *cache, struct cblock_range *range)
3284{
3285        uint64_t b = from_cblock(range->begin);
3286        uint64_t e = from_cblock(range->end);
3287        uint64_t n = from_cblock(cache->cache_size);
3288
3289        if (b >= n) {
3290                DMERR("%s: begin cblock out of range: %llu >= %llu",
3291                      cache_device_name(cache), b, n);
3292                return -EINVAL;
3293        }
3294
3295        if (e > n) {
3296                DMERR("%s: end cblock out of range: %llu > %llu",
3297                      cache_device_name(cache), e, n);
3298                return -EINVAL;
3299        }
3300
3301        if (b >= e) {
3302                DMERR("%s: invalid cblock range: %llu >= %llu",
3303                      cache_device_name(cache), b, e);
3304                return -EINVAL;
3305        }
3306
3307        return 0;
3308}
3309
3310static inline dm_cblock_t cblock_succ(dm_cblock_t b)
3311{
3312        return to_cblock(from_cblock(b) + 1);
3313}
3314
3315static int request_invalidation(struct cache *cache, struct cblock_range *range)
3316{
3317        int r = 0;
3318
3319        /*
3320         * We don't need to do any locking here because we know we're in
3321         * passthrough mode.  There is potential for a race between an
3322         * invalidation triggered by an io and an invalidation message.  This
3323         * is harmless; we need not worry if the policy call fails.
3324         */
3325        while (range->begin != range->end) {
3326                r = invalidate_cblock(cache, range->begin);
3327                if (r)
3328                        return r;
3329
3330                range->begin = cblock_succ(range->begin);
3331        }
3332
3333        cache->commit_requested = true;
3334        return r;
3335}
3336
3337static int process_invalidate_cblocks_message(struct cache *cache, unsigned count,
3338                                              const char **cblock_ranges)
3339{
3340        int r = 0;
3341        unsigned i;
3342        struct cblock_range range;
3343
3344        if (!passthrough_mode(cache)) {
3345                DMERR("%s: cache has to be in passthrough mode for invalidation",
3346                      cache_device_name(cache));
3347                return -EPERM;
3348        }
3349
3350        for (i = 0; i < count; i++) {
3351                r = parse_cblock_range(cache, cblock_ranges[i], &range);
3352                if (r)
3353                        break;
3354
3355                r = validate_cblock_range(cache, &range);
3356                if (r)
3357                        break;
3358
3359                /*
3360                  * Invalidate the cache blocks in this validated range.
3361                 */
3362                r = request_invalidation(cache, &range);
3363                if (r)
3364                        break;
3365        }
3366
3367        return r;
3368}
3369
3370/*
3371 * Supports
3372 *      "<key> <value>"
3373 * and
3374 *     "invalidate_cblocks [(<begin>)|(<begin>-<end>)]*"
3375 *
3376 * The key migration_threshold is supported by the cache target core.
3377 */
3378static int cache_message(struct dm_target *ti, unsigned argc, char **argv,
3379                         char *result, unsigned maxlen)
3380{
3381        struct cache *cache = ti->private;
3382
3383        if (!argc)
3384                return -EINVAL;
3385
3386        if (get_cache_mode(cache) >= CM_READ_ONLY) {
3387                DMERR("%s: unable to service cache target messages in READ_ONLY or FAIL mode",
3388                      cache_device_name(cache));
3389                return -EOPNOTSUPP;
3390        }
3391
3392        if (!strcasecmp(argv[0], "invalidate_cblocks"))
3393                return process_invalidate_cblocks_message(cache, argc - 1, (const char **) argv + 1);
3394
3395        if (argc != 2)
3396                return -EINVAL;
3397
3398        return set_config_value(cache, argv[0], argv[1]);
3399}
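
/*
 * Editor's sketch (not part of the driver): one way the messages
 * documented above might be sent from userspace.  This assumes the
 * libdevmapper interface that dmsetup is built on; the equivalent
 * shell commands would be roughly
 *
 *     dmsetup message <cache-dev> 0 migration_threshold 2048
 *     dmsetup message <cache-dev> 0 invalidate_cblocks 3456 123-234
 *
 * (invalidate_cblocks additionally requires the cache to be in
 * passthrough mode, as enforced above.)
 */
#include <libdevmapper.h>

/* Returns 0 on success, -1 on failure; dm_name is the mapped device name. */
static int send_cache_message(const char *dm_name, const char *msg)
{
        struct dm_task *dmt;
        int r = -1;

        dmt = dm_task_create(DM_DEVICE_TARGET_MSG);
        if (!dmt)
                return -1;

        /* Sector 0 addresses the single cache target in the table. */
        if (dm_task_set_name(dmt, dm_name) &&
            dm_task_set_sector(dmt, 0) &&
            dm_task_set_message(dmt, msg) &&
            dm_task_run(dmt))
                r = 0;

        dm_task_destroy(dmt);
        return r;
}

/* e.g. send_cache_message("my-cache", "invalidate_cblocks 123-234"); */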
3400
3401static int cache_iterate_devices(struct dm_target *ti,
3402                                 iterate_devices_callout_fn fn, void *data)
3403{
3404        int r = 0;
3405        struct cache *cache = ti->private;
3406
3407        r = fn(ti, cache->cache_dev, 0, get_dev_size(cache->cache_dev), data);
3408        if (!r)
3409                r = fn(ti, cache->origin_dev, 0, ti->len, data);
3410
3411        return r;
3412}
3413
3414static bool origin_dev_supports_discard(struct block_device *origin_bdev)
3415{
3416        struct request_queue *q = bdev_get_queue(origin_bdev);
3417
3418        return q && blk_queue_discard(q);
3419}
3420
3421/*
3422 * If discard_passdown was enabled, verify that the origin device
3423 * supports discards.  Disable discard_passdown if not.
3424 */
3425static void disable_passdown_if_not_supported(struct cache *cache)
3426{
3427        struct block_device *origin_bdev = cache->origin_dev->bdev;
3428        struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3429        const char *reason = NULL;
3430        char buf[BDEVNAME_SIZE];
3431
3432        if (!cache->features.discard_passdown)
3433                return;
3434
3435        if (!origin_dev_supports_discard(origin_bdev))
3436                reason = "discard unsupported";
3437
3438        else if (origin_limits->max_discard_sectors < cache->sectors_per_block)
3439                reason = "max discard sectors smaller than a block";
3440
3441        if (reason) {
3442                DMWARN("Origin device (%s) %s: Disabling discard passdown.",
3443                       bdevname(origin_bdev, buf), reason);
3444                cache->features.discard_passdown = false;
3445        }
3446}
3447
3448static void set_discard_limits(struct cache *cache, struct queue_limits *limits)
3449{
3450        struct block_device *origin_bdev = cache->origin_dev->bdev;
3451        struct queue_limits *origin_limits = &bdev_get_queue(origin_bdev)->limits;
3452
3453        if (!cache->features.discard_passdown) {
3454                 /* No passdown is done, so set our own virtual limits. */
3455                limits->max_discard_sectors = min_t(sector_t, cache->discard_block_size * 1024,
3456                                                    cache->origin_sectors);
3457                limits->discard_granularity = cache->discard_block_size << SECTOR_SHIFT;
3458                return;
3459        }
3460
3461        /*
3462         * cache_iterate_devices() stacks both the origin and fast device limits,
3463         * but discards aren't passed to the fast device, so inherit the origin's.
3464         */
3465        limits->max_discard_sectors = origin_limits->max_discard_sectors;
3466        limits->max_hw_discard_sectors = origin_limits->max_hw_discard_sectors;
3467        limits->discard_granularity = origin_limits->discard_granularity;
3468        limits->discard_alignment = origin_limits->discard_alignment;
3469        limits->discard_misaligned = origin_limits->discard_misaligned;
3470}
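
/*
 * Editor's note (not part of the driver): struct queue_limits mixes
 * units -- max_discard_sectors is in 512-byte sectors while
 * discard_granularity is in bytes, which is why the non-passdown
 * branch above shifts by SECTOR_SHIFT.  A worked example with a
 * hypothetical 128-sector discard block and a 1 GiB origin:
 */
#include <stdio.h>

#define DEMO_SECTOR_SHIFT 9

int main(void)
{
        unsigned long long discard_block_size = 128;        /* sectors (example) */
        unsigned long long origin_sectors = 1ull << 21;     /* 1 GiB = 2^21 sectors */
        unsigned long long cap = discard_block_size * 1024; /* 1024 discard blocks */

        /* Mirrors the no-passdown branch: min(1024 blocks, whole origin). */
        unsigned long long max_discard_sectors =
                cap < origin_sectors ? cap : origin_sectors;
        unsigned long long discard_granularity =
                discard_block_size << DEMO_SECTOR_SHIFT;    /* -> bytes */

        printf("max_discard_sectors = %llu (sectors, i.e. %llu MiB)\n",
               max_discard_sectors, max_discard_sectors >> 11);
        printf("discard_granularity = %llu bytes\n", discard_granularity);
        return 0;
}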
3471
3472static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits)
3473{
3474        struct cache *cache = ti->private;
3475        uint64_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3476
3477        /*
3478         * If the system-determined stacked limits are compatible with the cache's
3479         * blocksize (io_opt is a multiple of the block size), do not override them.
3480         */
3481        if (io_opt_sectors < cache->sectors_per_block ||
3482            do_div(io_opt_sectors, cache->sectors_per_block)) {
3483                blk_limits_io_min(limits, cache->sectors_per_block << SECTOR_SHIFT);
3484                blk_limits_io_opt(limits, cache->sectors_per_block << SECTOR_SHIFT);
3485        }
3486
3487        disable_passdown_if_not_supported(cache);
3488        set_discard_limits(cache, limits);
3489}
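
/*
 * Editor's note (not part of the driver): do_div() divides its first
 * argument in place and returns the remainder, so the condition in
 * cache_io_hints() above is true when io_opt is either smaller than
 * the cache block size or not a whole multiple of it.  Outside the
 * kernel the same check is an ordinary modulo; the helper name below
 * is hypothetical.
 */
#include <stdbool.h>
#include <stdint.h>

static bool io_opt_compatible(uint64_t io_opt_sectors,
                              uint64_t sectors_per_block)
{
        /* Keep the stacked limits only if io_opt is a multiple of the block. */
        return io_opt_sectors >= sectors_per_block &&
               io_opt_sectors % sectors_per_block == 0;
}

/* e.g. io_opt_compatible(1024, 128) is true; io_opt_compatible(96, 128) is not. */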
3490
3491/*----------------------------------------------------------------*/
3492
3493static struct target_type cache_target = {
3494        .name = "cache",
3495        .version = {2, 2, 0},
3496        .module = THIS_MODULE,
3497        .ctr = cache_ctr,
3498        .dtr = cache_dtr,
3499        .map = cache_map,
3500        .end_io = cache_end_io,
3501        .postsuspend = cache_postsuspend,
3502        .preresume = cache_preresume,
3503        .resume = cache_resume,
3504        .status = cache_status,
3505        .message = cache_message,
3506        .iterate_devices = cache_iterate_devices,
3507        .io_hints = cache_io_hints,
3508};
3509
3510static int __init dm_cache_init(void)
3511{
3512        int r;
3513
3514        migration_cache = KMEM_CACHE(dm_cache_migration, 0);
3515        if (!migration_cache)
3516                return -ENOMEM;
3517
3518        r = dm_register_target(&cache_target);
3519        if (r) {
3520                DMERR("cache target registration failed: %d", r);
3521                kmem_cache_destroy(migration_cache);
3522                return r;
3523        }
3524
3525        return 0;
3526}
3527
3528static void __exit dm_cache_exit(void)
3529{
3530        dm_unregister_target(&cache_target);
3531        kmem_cache_destroy(migration_cache);
3532}
3533
3534module_init(dm_cache_init);
3535module_exit(dm_cache_exit);
3536
3537MODULE_DESCRIPTION(DM_NAME " cache target");
3538MODULE_AUTHOR("Joe Thornber <ejt@redhat.com>");
3539MODULE_LICENSE("GPL");
3540