linux/drivers/md/dm-snap.c
   1/*
   2 * Copyright (C) 2001-2002 Sistina Software (UK) Limited.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include <linux/blkdev.h>
   8#include <linux/device-mapper.h>
   9#include <linux/delay.h>
  10#include <linux/fs.h>
  11#include <linux/init.h>
  12#include <linux/kdev_t.h>
  13#include <linux/list.h>
  14#include <linux/list_bl.h>
  15#include <linux/mempool.h>
  16#include <linux/module.h>
  17#include <linux/slab.h>
  18#include <linux/vmalloc.h>
  19#include <linux/log2.h>
  20#include <linux/dm-kcopyd.h>
  21#include <linux/semaphore.h>
  22
  23#include "dm.h"
  24
  25#include "dm-exception-store.h"
  26
  27#define DM_MSG_PREFIX "snapshots"
  28
  29static const char dm_snapshot_merge_target_name[] = "snapshot-merge";
  30
  31#define dm_target_is_snapshot_merge(ti) \
  32        ((ti)->type->name == dm_snapshot_merge_target_name)
  33
  34/*
  35 * The size of the mempool used to track chunks in use.
  36 */
  37#define MIN_IOS 256
  38
  39#define DM_TRACKED_CHUNK_HASH_SIZE      16
  40#define DM_TRACKED_CHUNK_HASH(x)        ((unsigned long)(x) & \
  41                                         (DM_TRACKED_CHUNK_HASH_SIZE - 1))
  42
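     /*
      * An exception hash table: an array of bit-spinlocked hlist buckets,
      * indexed by exception_hash().  hash_mask is (number of buckets - 1)
      * and hash_shift controls how many consecutive chunks share a bucket.
      */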
  43struct dm_exception_table {
  44        uint32_t hash_mask;
  45        unsigned hash_shift;
  46        struct hlist_bl_head *table;
  47};
  48
  49struct dm_snapshot {
  50        struct rw_semaphore lock;
  51
  52        struct dm_dev *origin;
  53        struct dm_dev *cow;
  54
  55        struct dm_target *ti;
  56
  57        /* List of snapshots per Origin */
  58        struct list_head list;
  59
  60        /*
  61         * You can't use a snapshot if this is 0 (e.g. if full).
  62         * A snapshot-merge target never clears this.
  63         */
  64        int valid;
  65
  66        /*
  67         * The snapshot overflowed because of a write to the snapshot device.
  68         * We don't have to invalidate the snapshot in this case, but we need
  69         * to prevent further writes.
  70         */
  71        int snapshot_overflowed;
  72
  73        /* Origin writes don't trigger exceptions until this is set */
  74        int active;
  75
  76        atomic_t pending_exceptions_count;
  77
  78        spinlock_t pe_allocation_lock;
  79
  80        /* Protected by "pe_allocation_lock" */
  81        sector_t exception_start_sequence;
  82
  83        /* Protected by kcopyd single-threaded callback */
  84        sector_t exception_complete_sequence;
  85
  86        /*
  87         * A list of pending exceptions that completed out of order.
  88         * Protected by kcopyd single-threaded callback.
  89         */
  90        struct rb_root out_of_order_tree;
  91
  92        mempool_t pending_pool;
  93
  94        struct dm_exception_table pending;
  95        struct dm_exception_table complete;
  96
  97        /*
  98         * pe_lock protects all pending_exception operations and access
  99         * as well as the snapshot_bios list.
 100         */
 101        spinlock_t pe_lock;
 102
 103        /* Chunks with outstanding reads */
 104        spinlock_t tracked_chunk_lock;
 105        struct hlist_head tracked_chunk_hash[DM_TRACKED_CHUNK_HASH_SIZE];
 106
 107        /* The on disk metadata handler */
 108        struct dm_exception_store *store;
 109
 110        /* Maximum number of in-flight COW jobs. */
 111        struct semaphore cow_count;
 112
 113        struct dm_kcopyd_client *kcopyd_client;
 114
 115        /* Wait for events based on state_bits */
 116        unsigned long state_bits;
 117
 118        /* Range of chunks currently being merged. */
 119        chunk_t first_merging_chunk;
 120        int num_merging_chunks;
 121
 122        /*
 123         * The merge operation failed if this flag is set.
 124         * Failure modes are handled as follows:
 125         * - I/O error reading the header
 126         *      => don't load the target; abort.
 127         * - Header does not have "valid" flag set
 128         *      => use the origin; forget about the snapshot.
 129         * - I/O error when reading exceptions
 130         *      => don't load the target; abort.
 131         *         (We can't use the intermediate origin state.)
 132         * - I/O error while merging
 133         *      => stop merging; set merge_failed; process I/O normally.
 134         */
 135        bool merge_failed:1;
 136
 137        bool discard_zeroes_cow:1;
 138        bool discard_passdown_origin:1;
 139
 140        /*
 141         * Incoming bios that overlap with chunks being merged must wait
 142         * for them to be committed.
 143         */
 144        struct bio_list bios_queued_during_merge;
 145};
 146
 147/*
 148 * state_bits:
 149 *   RUNNING_MERGE  - Merge operation is in progress.
 150 *   SHUTDOWN_MERGE - Set to signal that merge needs to be stopped;
 151 *                    cleared afterwards.
 152 */
 153#define RUNNING_MERGE          0
 154#define SHUTDOWN_MERGE         1
 155
 156/*
 157 * Maximum number of chunks being copied on write.
 158 *
 159 * The value was decided experimentally as a trade-off between memory
 160 * consumption, stalling the kernel's workqueues and maintaining a high enough
 161 * throughput.
 162 */
 163#define DEFAULT_COW_THRESHOLD 2048
 164
 165static int cow_threshold = DEFAULT_COW_THRESHOLD;
 166module_param_named(snapshot_cow_threshold, cow_threshold, int, 0644);
 167MODULE_PARM_DESC(snapshot_cow_threshold, "Maximum number of chunks being copied on write");
 168
 169DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
 170                "A percentage of time allocated for copy on write");
 171
 172struct dm_dev *dm_snap_origin(struct dm_snapshot *s)
 173{
 174        return s->origin;
 175}
 176EXPORT_SYMBOL(dm_snap_origin);
 177
 178struct dm_dev *dm_snap_cow(struct dm_snapshot *s)
 179{
 180        return s->cow;
 181}
 182EXPORT_SYMBOL(dm_snap_cow);
 183
 184static sector_t chunk_to_sector(struct dm_exception_store *store,
 185                                chunk_t chunk)
 186{
 187        return chunk << store->chunk_shift;
 188}
 189
 190static int bdev_equal(struct block_device *lhs, struct block_device *rhs)
 191{
 192        /*
 193         * There is only ever one instance of a particular block
 194         * device so we can compare pointers safely.
 195         */
 196        return lhs == rhs;
 197}
 198
 199struct dm_snap_pending_exception {
 200        struct dm_exception e;
 201
 202        /*
 203         * Origin buffers waiting for this to complete are held
 204         * in a bio list
 205         */
 206        struct bio_list origin_bios;
 207        struct bio_list snapshot_bios;
 208
 209        /* Pointer back to snapshot context */
 210        struct dm_snapshot *snap;
 211
 212        /*
 213         * 1 indicates the exception has already been sent to
 214         * kcopyd.
 215         */
 216        int started;
 217
  218        /* There was a copying error. */
 219        int copy_error;
 220
  221        /* A sequence number, used for in-order completion. */
 222        sector_t exception_sequence;
 223
 224        struct rb_node out_of_order_node;
 225
 226        /*
 227         * For writing a complete chunk, bypassing the copy.
 228         */
 229        struct bio *full_bio;
 230        bio_end_io_t *full_bio_end_io;
 231};
 232
  233/*
  234 * Slab caches for completed and pending exceptions
  235 * (the origin hash table itself is declared further below).
  236 */
 237static struct kmem_cache *exception_cache;
 238static struct kmem_cache *pending_cache;
 239
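     /*
      * Per-bio tracking (kept in the bio's per-target data) of chunks with
      * outstanding reads, so that exception completion and merging can wait
      * for those reads to drain (see __check_for_conflicting_io()).
      */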
 240struct dm_snap_tracked_chunk {
 241        struct hlist_node node;
 242        chunk_t chunk;
 243};
 244
 245static void init_tracked_chunk(struct bio *bio)
 246{
 247        struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 248        INIT_HLIST_NODE(&c->node);
 249}
 250
 251static bool is_bio_tracked(struct bio *bio)
 252{
 253        struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 254        return !hlist_unhashed(&c->node);
 255}
 256
 257static void track_chunk(struct dm_snapshot *s, struct bio *bio, chunk_t chunk)
 258{
 259        struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 260
 261        c->chunk = chunk;
 262
 263        spin_lock_irq(&s->tracked_chunk_lock);
 264        hlist_add_head(&c->node,
 265                       &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)]);
 266        spin_unlock_irq(&s->tracked_chunk_lock);
 267}
 268
 269static void stop_tracking_chunk(struct dm_snapshot *s, struct bio *bio)
 270{
 271        struct dm_snap_tracked_chunk *c = dm_per_bio_data(bio, sizeof(struct dm_snap_tracked_chunk));
 272        unsigned long flags;
 273
 274        spin_lock_irqsave(&s->tracked_chunk_lock, flags);
 275        hlist_del(&c->node);
 276        spin_unlock_irqrestore(&s->tracked_chunk_lock, flags);
 277}
 278
 279static int __chunk_is_tracked(struct dm_snapshot *s, chunk_t chunk)
 280{
 281        struct dm_snap_tracked_chunk *c;
 282        int found = 0;
 283
 284        spin_lock_irq(&s->tracked_chunk_lock);
 285
 286        hlist_for_each_entry(c,
 287            &s->tracked_chunk_hash[DM_TRACKED_CHUNK_HASH(chunk)], node) {
 288                if (c->chunk == chunk) {
 289                        found = 1;
 290                        break;
 291                }
 292        }
 293
 294        spin_unlock_irq(&s->tracked_chunk_lock);
 295
 296        return found;
 297}
 298
 299/*
 300 * This conflicting I/O is extremely improbable in the caller,
 301 * so msleep(1) is sufficient and there is no need for a wait queue.
 302 */
 303static void __check_for_conflicting_io(struct dm_snapshot *s, chunk_t chunk)
 304{
 305        while (__chunk_is_tracked(s, chunk))
 306                msleep(1);
 307}
 308
 309/*
 310 * One of these per registered origin, held in the snapshot_origins hash
 311 */
 312struct origin {
 313        /* The origin device */
 314        struct block_device *bdev;
 315
 316        struct list_head hash_list;
 317
 318        /* List of snapshots for this origin */
 319        struct list_head snapshots;
 320};
 321
 322/*
 323 * This structure is allocated for each origin target
 324 */
 325struct dm_origin {
 326        struct dm_dev *dev;
 327        struct dm_target *ti;
 328        unsigned split_boundary;
 329        struct list_head hash_list;
 330};
 331
 332/*
 333 * Size of the hash table for origin volumes. If we make this
 334 * the size of the minors list then it should be nearly perfect
 335 */
 336#define ORIGIN_HASH_SIZE 256
 337#define ORIGIN_MASK      0xFF
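     /*
      * _origins holds one struct origin per origin block device that has
      * registered snapshots; _dm_origins holds one struct dm_origin per
      * origin target.  Both hash tables are protected by _origins_lock.
      */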
 338static struct list_head *_origins;
 339static struct list_head *_dm_origins;
 340static struct rw_semaphore _origins_lock;
 341
 342static DECLARE_WAIT_QUEUE_HEAD(_pending_exceptions_done);
 343static DEFINE_SPINLOCK(_pending_exceptions_done_spinlock);
 344static uint64_t _pending_exceptions_done_count;
 345
 346static int init_origin_hash(void)
 347{
 348        int i;
 349
 350        _origins = kmalloc_array(ORIGIN_HASH_SIZE, sizeof(struct list_head),
 351                                 GFP_KERNEL);
 352        if (!_origins) {
 353                DMERR("unable to allocate memory for _origins");
 354                return -ENOMEM;
 355        }
 356        for (i = 0; i < ORIGIN_HASH_SIZE; i++)
 357                INIT_LIST_HEAD(_origins + i);
 358
 359        _dm_origins = kmalloc_array(ORIGIN_HASH_SIZE,
 360                                    sizeof(struct list_head),
 361                                    GFP_KERNEL);
 362        if (!_dm_origins) {
 363                DMERR("unable to allocate memory for _dm_origins");
 364                kfree(_origins);
 365                return -ENOMEM;
 366        }
 367        for (i = 0; i < ORIGIN_HASH_SIZE; i++)
 368                INIT_LIST_HEAD(_dm_origins + i);
 369
 370        init_rwsem(&_origins_lock);
 371
 372        return 0;
 373}
 374
 375static void exit_origin_hash(void)
 376{
 377        kfree(_origins);
 378        kfree(_dm_origins);
 379}
 380
 381static unsigned origin_hash(struct block_device *bdev)
 382{
 383        return bdev->bd_dev & ORIGIN_MASK;
 384}
 385
 386static struct origin *__lookup_origin(struct block_device *origin)
 387{
 388        struct list_head *ol;
 389        struct origin *o;
 390
 391        ol = &_origins[origin_hash(origin)];
 392        list_for_each_entry (o, ol, hash_list)
 393                if (bdev_equal(o->bdev, origin))
 394                        return o;
 395
 396        return NULL;
 397}
 398
 399static void __insert_origin(struct origin *o)
 400{
 401        struct list_head *sl = &_origins[origin_hash(o->bdev)];
 402        list_add_tail(&o->hash_list, sl);
 403}
 404
 405static struct dm_origin *__lookup_dm_origin(struct block_device *origin)
 406{
 407        struct list_head *ol;
 408        struct dm_origin *o;
 409
 410        ol = &_dm_origins[origin_hash(origin)];
 411        list_for_each_entry (o, ol, hash_list)
 412                if (bdev_equal(o->dev->bdev, origin))
 413                        return o;
 414
 415        return NULL;
 416}
 417
 418static void __insert_dm_origin(struct dm_origin *o)
 419{
 420        struct list_head *sl = &_dm_origins[origin_hash(o->dev->bdev)];
 421        list_add_tail(&o->hash_list, sl);
 422}
 423
 424static void __remove_dm_origin(struct dm_origin *o)
 425{
 426        list_del(&o->hash_list);
 427}
 428
 429/*
 430 * _origins_lock must be held when calling this function.
 431 * Returns number of snapshots registered using the supplied cow device, plus:
 432 * snap_src - a snapshot suitable for use as a source of exception handover
 433 * snap_dest - a snapshot capable of receiving exception handover.
 434 * snap_merge - an existing snapshot-merge target linked to the same origin.
 435 *   There can be at most one snapshot-merge target. The parameter is optional.
 436 *
 437 * Possible return values and states of snap_src and snap_dest.
 438 *   0: NULL, NULL  - first new snapshot
 439 *   1: snap_src, NULL - normal snapshot
 440 *   2: snap_src, snap_dest  - waiting for handover
 441 *   2: snap_src, NULL - handed over, waiting for old to be deleted
 442 *   1: NULL, snap_dest - source got destroyed without handover
 443 */
 444static int __find_snapshots_sharing_cow(struct dm_snapshot *snap,
 445                                        struct dm_snapshot **snap_src,
 446                                        struct dm_snapshot **snap_dest,
 447                                        struct dm_snapshot **snap_merge)
 448{
 449        struct dm_snapshot *s;
 450        struct origin *o;
 451        int count = 0;
 452        int active;
 453
 454        o = __lookup_origin(snap->origin->bdev);
 455        if (!o)
 456                goto out;
 457
 458        list_for_each_entry(s, &o->snapshots, list) {
 459                if (dm_target_is_snapshot_merge(s->ti) && snap_merge)
 460                        *snap_merge = s;
 461                if (!bdev_equal(s->cow->bdev, snap->cow->bdev))
 462                        continue;
 463
 464                down_read(&s->lock);
 465                active = s->active;
 466                up_read(&s->lock);
 467
 468                if (active) {
 469                        if (snap_src)
 470                                *snap_src = s;
 471                } else if (snap_dest)
 472                        *snap_dest = s;
 473
 474                count++;
 475        }
 476
 477out:
 478        return count;
 479}
 480
 481/*
 482 * On success, returns 1 if this snapshot is a handover destination,
 483 * otherwise returns 0.
 484 */
 485static int __validate_exception_handover(struct dm_snapshot *snap)
 486{
 487        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
 488        struct dm_snapshot *snap_merge = NULL;
 489
 490        /* Does snapshot need exceptions handed over to it? */
 491        if ((__find_snapshots_sharing_cow(snap, &snap_src, &snap_dest,
 492                                          &snap_merge) == 2) ||
 493            snap_dest) {
 494                snap->ti->error = "Snapshot cow pairing for exception "
 495                                  "table handover failed";
 496                return -EINVAL;
 497        }
 498
 499        /*
 500         * If no snap_src was found, snap cannot become a handover
 501         * destination.
 502         */
 503        if (!snap_src)
 504                return 0;
 505
 506        /*
 507         * Non-snapshot-merge handover?
 508         */
 509        if (!dm_target_is_snapshot_merge(snap->ti))
 510                return 1;
 511
 512        /*
 513         * Do not allow more than one merging snapshot.
 514         */
 515        if (snap_merge) {
 516                snap->ti->error = "A snapshot is already merging.";
 517                return -EINVAL;
 518        }
 519
 520        if (!snap_src->store->type->prepare_merge ||
 521            !snap_src->store->type->commit_merge) {
 522                snap->ti->error = "Snapshot exception store does not "
 523                                  "support snapshot-merge.";
 524                return -EINVAL;
 525        }
 526
 527        return 1;
 528}
 529
 530static void __insert_snapshot(struct origin *o, struct dm_snapshot *s)
 531{
 532        struct dm_snapshot *l;
 533
  534        /* Sort the list by chunk size, largest first, smallest last */
 535        list_for_each_entry(l, &o->snapshots, list)
 536                if (l->store->chunk_size < s->store->chunk_size)
 537                        break;
 538        list_add_tail(&s->list, &l->list);
 539}
 540
 541/*
 542 * Make a note of the snapshot and its origin so we can look it
 543 * up when the origin has a write on it.
 544 *
 545 * Also validate snapshot exception store handovers.
 546 * On success, returns 1 if this registration is a handover destination,
 547 * otherwise returns 0.
 548 */
 549static int register_snapshot(struct dm_snapshot *snap)
 550{
 551        struct origin *o, *new_o = NULL;
 552        struct block_device *bdev = snap->origin->bdev;
 553        int r = 0;
 554
 555        new_o = kmalloc(sizeof(*new_o), GFP_KERNEL);
 556        if (!new_o)
 557                return -ENOMEM;
 558
 559        down_write(&_origins_lock);
 560
 561        r = __validate_exception_handover(snap);
 562        if (r < 0) {
 563                kfree(new_o);
 564                goto out;
 565        }
 566
 567        o = __lookup_origin(bdev);
 568        if (o)
 569                kfree(new_o);
 570        else {
 571                /* New origin */
 572                o = new_o;
 573
 574                /* Initialise the struct */
 575                INIT_LIST_HEAD(&o->snapshots);
 576                o->bdev = bdev;
 577
 578                __insert_origin(o);
 579        }
 580
 581        __insert_snapshot(o, snap);
 582
 583out:
 584        up_write(&_origins_lock);
 585
 586        return r;
 587}
 588
 589/*
 590 * Move snapshot to correct place in list according to chunk size.
 591 */
 592static void reregister_snapshot(struct dm_snapshot *s)
 593{
 594        struct block_device *bdev = s->origin->bdev;
 595
 596        down_write(&_origins_lock);
 597
 598        list_del(&s->list);
 599        __insert_snapshot(__lookup_origin(bdev), s);
 600
 601        up_write(&_origins_lock);
 602}
 603
 604static void unregister_snapshot(struct dm_snapshot *s)
 605{
 606        struct origin *o;
 607
 608        down_write(&_origins_lock);
 609        o = __lookup_origin(s->origin->bdev);
 610
 611        list_del(&s->list);
 612        if (o && list_empty(&o->snapshots)) {
 613                list_del(&o->hash_list);
 614                kfree(o);
 615        }
 616
 617        up_write(&_origins_lock);
 618}
 619
 620/*
 621 * Implementation of the exception hash tables.
 622 * The lowest hash_shift bits of the chunk number are ignored, allowing
 623 * some consecutive chunks to be grouped together.
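      *
      * For example, with hash_shift == 3 (an illustrative value), chunks
      * 8..15 all hash to the same bucket, so exceptions for consecutive
      * chunks stay in one slot and can be coalesced by dm_insert_exception().
      * The completed table uses DM_CHUNK_CONSECUTIVE_BITS for this; the
      * pending table uses a shift of 0.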
 624 */
 625static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk);
 626
 627/* Lock to protect access to the completed and pending exception hash tables. */
 628struct dm_exception_table_lock {
 629        struct hlist_bl_head *complete_slot;
 630        struct hlist_bl_head *pending_slot;
 631};
 632
 633static void dm_exception_table_lock_init(struct dm_snapshot *s, chunk_t chunk,
 634                                         struct dm_exception_table_lock *lock)
 635{
 636        struct dm_exception_table *complete = &s->complete;
 637        struct dm_exception_table *pending = &s->pending;
 638
 639        lock->complete_slot = &complete->table[exception_hash(complete, chunk)];
 640        lock->pending_slot = &pending->table[exception_hash(pending, chunk)];
 641}
 642
 643static void dm_exception_table_lock(struct dm_exception_table_lock *lock)
 644{
 645        hlist_bl_lock(lock->complete_slot);
 646        hlist_bl_lock(lock->pending_slot);
 647}
 648
 649static void dm_exception_table_unlock(struct dm_exception_table_lock *lock)
 650{
 651        hlist_bl_unlock(lock->pending_slot);
 652        hlist_bl_unlock(lock->complete_slot);
 653}
 654
 655static int dm_exception_table_init(struct dm_exception_table *et,
 656                                   uint32_t size, unsigned hash_shift)
 657{
 658        unsigned int i;
 659
 660        et->hash_shift = hash_shift;
 661        et->hash_mask = size - 1;
 662        et->table = dm_vcalloc(size, sizeof(struct hlist_bl_head));
 663        if (!et->table)
 664                return -ENOMEM;
 665
 666        for (i = 0; i < size; i++)
 667                INIT_HLIST_BL_HEAD(et->table + i);
 668
 669        return 0;
 670}
 671
 672static void dm_exception_table_exit(struct dm_exception_table *et,
 673                                    struct kmem_cache *mem)
 674{
 675        struct hlist_bl_head *slot;
 676        struct dm_exception *ex;
 677        struct hlist_bl_node *pos, *n;
 678        int i, size;
 679
 680        size = et->hash_mask + 1;
 681        for (i = 0; i < size; i++) {
 682                slot = et->table + i;
 683
 684                hlist_bl_for_each_entry_safe(ex, pos, n, slot, hash_list)
 685                        kmem_cache_free(mem, ex);
 686        }
 687
 688        vfree(et->table);
 689}
 690
 691static uint32_t exception_hash(struct dm_exception_table *et, chunk_t chunk)
 692{
 693        return (chunk >> et->hash_shift) & et->hash_mask;
 694}
 695
 696static void dm_remove_exception(struct dm_exception *e)
 697{
 698        hlist_bl_del(&e->hash_list);
 699}
 700
 701/*
 702 * Return the exception data for a sector, or NULL if not
 703 * remapped.
 704 */
 705static struct dm_exception *dm_lookup_exception(struct dm_exception_table *et,
 706                                                chunk_t chunk)
 707{
 708        struct hlist_bl_head *slot;
 709        struct hlist_bl_node *pos;
 710        struct dm_exception *e;
 711
 712        slot = &et->table[exception_hash(et, chunk)];
 713        hlist_bl_for_each_entry(e, pos, slot, hash_list)
 714                if (chunk >= e->old_chunk &&
 715                    chunk <= e->old_chunk + dm_consecutive_chunk_count(e))
 716                        return e;
 717
 718        return NULL;
 719}
 720
 721static struct dm_exception *alloc_completed_exception(gfp_t gfp)
 722{
 723        struct dm_exception *e;
 724
 725        e = kmem_cache_alloc(exception_cache, gfp);
 726        if (!e && gfp == GFP_NOIO)
 727                e = kmem_cache_alloc(exception_cache, GFP_ATOMIC);
 728
 729        return e;
 730}
 731
 732static void free_completed_exception(struct dm_exception *e)
 733{
 734        kmem_cache_free(exception_cache, e);
 735}
 736
 737static struct dm_snap_pending_exception *alloc_pending_exception(struct dm_snapshot *s)
 738{
 739        struct dm_snap_pending_exception *pe = mempool_alloc(&s->pending_pool,
 740                                                             GFP_NOIO);
 741
 742        atomic_inc(&s->pending_exceptions_count);
 743        pe->snap = s;
 744
 745        return pe;
 746}
 747
 748static void free_pending_exception(struct dm_snap_pending_exception *pe)
 749{
 750        struct dm_snapshot *s = pe->snap;
 751
 752        mempool_free(pe, &s->pending_pool);
 753        smp_mb__before_atomic();
 754        atomic_dec(&s->pending_exceptions_count);
 755}
 756
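     /*
      * Insert a completed exception, keeping each bucket ordered by
      * old_chunk.  If the new exception is adjacent (in both old and new
      * chunk numbers) to an existing one, it is folded into that entry's
      * consecutive-chunk run instead of being added separately.
      */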
 757static void dm_insert_exception(struct dm_exception_table *eh,
 758                                struct dm_exception *new_e)
 759{
 760        struct hlist_bl_head *l;
 761        struct hlist_bl_node *pos;
 762        struct dm_exception *e = NULL;
 763
 764        l = &eh->table[exception_hash(eh, new_e->old_chunk)];
 765
 766        /* Add immediately if this table doesn't support consecutive chunks */
 767        if (!eh->hash_shift)
 768                goto out;
 769
 770        /* List is ordered by old_chunk */
 771        hlist_bl_for_each_entry(e, pos, l, hash_list) {
 772                /* Insert after an existing chunk? */
 773                if (new_e->old_chunk == (e->old_chunk +
 774                                         dm_consecutive_chunk_count(e) + 1) &&
 775                    new_e->new_chunk == (dm_chunk_number(e->new_chunk) +
 776                                         dm_consecutive_chunk_count(e) + 1)) {
 777                        dm_consecutive_chunk_count_inc(e);
 778                        free_completed_exception(new_e);
 779                        return;
 780                }
 781
 782                /* Insert before an existing chunk? */
 783                if (new_e->old_chunk == (e->old_chunk - 1) &&
 784                    new_e->new_chunk == (dm_chunk_number(e->new_chunk) - 1)) {
 785                        dm_consecutive_chunk_count_inc(e);
 786                        e->old_chunk--;
 787                        e->new_chunk--;
 788                        free_completed_exception(new_e);
 789                        return;
 790                }
 791
 792                if (new_e->old_chunk < e->old_chunk)
 793                        break;
 794        }
 795
 796out:
 797        if (!e) {
 798                /*
 799                 * Either the table doesn't support consecutive chunks or slot
 800                 * l is empty.
 801                 */
 802                hlist_bl_add_head(&new_e->hash_list, l);
 803        } else if (new_e->old_chunk < e->old_chunk) {
 804                /* Add before an existing exception */
 805                hlist_bl_add_before(&new_e->hash_list, &e->hash_list);
 806        } else {
 807                /* Add to l's tail: e is the last exception in this slot */
 808                hlist_bl_add_behind(&new_e->hash_list, &e->hash_list);
 809        }
 810}
 811
 812/*
 813 * Callback used by the exception stores to load exceptions when
 814 * initialising.
 815 */
 816static int dm_add_exception(void *context, chunk_t old, chunk_t new)
 817{
 818        struct dm_exception_table_lock lock;
 819        struct dm_snapshot *s = context;
 820        struct dm_exception *e;
 821
 822        e = alloc_completed_exception(GFP_KERNEL);
 823        if (!e)
 824                return -ENOMEM;
 825
 826        e->old_chunk = old;
 827
 828        /* Consecutive_count is implicitly initialised to zero */
 829        e->new_chunk = new;
 830
 831        /*
 832         * Although there is no need to lock access to the exception tables
 833         * here, if we don't then hlist_bl_add_head(), called by
 834         * dm_insert_exception(), will complain about accessing the
 835         * corresponding list without locking it first.
 836         */
 837        dm_exception_table_lock_init(s, old, &lock);
 838
 839        dm_exception_table_lock(&lock);
 840        dm_insert_exception(&s->complete, e);
 841        dm_exception_table_unlock(&lock);
 842
 843        return 0;
 844}
 845
 846/*
 847 * Return a minimum chunk size of all snapshots that have the specified origin.
 848 * Return zero if the origin has no snapshots.
 849 */
 850static uint32_t __minimum_chunk_size(struct origin *o)
 851{
 852        struct dm_snapshot *snap;
 853        unsigned chunk_size = 0;
 854
 855        if (o)
 856                list_for_each_entry(snap, &o->snapshots, list)
 857                        chunk_size = min_not_zero(chunk_size,
 858                                                  snap->store->chunk_size);
 859
 860        return (uint32_t) chunk_size;
 861}
 862
 863/*
 864 * Hard coded magic.
 865 */
 866static int calc_max_buckets(void)
 867{
 868        /* use a fixed size of 2MB */
 869        unsigned long mem = 2 * 1024 * 1024;
 870        mem /= sizeof(struct hlist_bl_head);
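             /*
              * With an 8-byte struct hlist_bl_head (typical on 64-bit;
              * stated here as an assumption, not a guarantee) this is
              * 2MB / 8 = 262144 buckets; init_hash_tables() further
              * rounds the table size down to a power of two.
              */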
 871
 872        return mem;
 873}
 874
 875/*
 876 * Allocate room for a suitable hash table.
 877 */
 878static int init_hash_tables(struct dm_snapshot *s)
 879{
 880        sector_t hash_size, cow_dev_size, max_buckets;
 881
 882        /*
 883         * Calculate based on the size of the original volume or
 884         * the COW volume...
 885         */
 886        cow_dev_size = get_dev_size(s->cow->bdev);
 887        max_buckets = calc_max_buckets();
 888
 889        hash_size = cow_dev_size >> s->store->chunk_shift;
 890        hash_size = min(hash_size, max_buckets);
 891
 892        if (hash_size < 64)
 893                hash_size = 64;
 894        hash_size = rounddown_pow_of_two(hash_size);
 895        if (dm_exception_table_init(&s->complete, hash_size,
 896                                    DM_CHUNK_CONSECUTIVE_BITS))
 897                return -ENOMEM;
 898
 899        /*
 900         * Allocate hash table for in-flight exceptions
 901         * Make this smaller than the real hash table
 902         */
 903        hash_size >>= 3;
 904        if (hash_size < 64)
 905                hash_size = 64;
 906
 907        if (dm_exception_table_init(&s->pending, hash_size, 0)) {
 908                dm_exception_table_exit(&s->complete, exception_cache);
 909                return -ENOMEM;
 910        }
 911
 912        return 0;
 913}
 914
 915static void merge_shutdown(struct dm_snapshot *s)
 916{
 917        clear_bit_unlock(RUNNING_MERGE, &s->state_bits);
 918        smp_mb__after_atomic();
 919        wake_up_bit(&s->state_bits, RUNNING_MERGE);
 920}
 921
 922static struct bio *__release_queued_bios_after_merge(struct dm_snapshot *s)
 923{
 924        s->first_merging_chunk = 0;
 925        s->num_merging_chunks = 0;
 926
 927        return bio_list_get(&s->bios_queued_during_merge);
 928}
 929
 930/*
 931 * Remove one chunk from the index of completed exceptions.
 932 */
 933static int __remove_single_exception_chunk(struct dm_snapshot *s,
 934                                           chunk_t old_chunk)
 935{
 936        struct dm_exception *e;
 937
 938        e = dm_lookup_exception(&s->complete, old_chunk);
 939        if (!e) {
 940                DMERR("Corruption detected: exception for block %llu is "
 941                      "on disk but not in memory",
 942                      (unsigned long long)old_chunk);
 943                return -EINVAL;
 944        }
 945
 946        /*
 947         * If this is the only chunk using this exception, remove exception.
 948         */
 949        if (!dm_consecutive_chunk_count(e)) {
 950                dm_remove_exception(e);
 951                free_completed_exception(e);
 952                return 0;
 953        }
 954
 955        /*
 956         * The chunk may be either at the beginning or the end of a
 957         * group of consecutive chunks - never in the middle.  We are
 958         * removing chunks in the opposite order to that in which they
 959         * were added, so this should always be true.
 960         * Decrement the consecutive chunk counter and adjust the
 961         * starting point if necessary.
 962         */
 963        if (old_chunk == e->old_chunk) {
 964                e->old_chunk++;
 965                e->new_chunk++;
 966        } else if (old_chunk != e->old_chunk +
 967                   dm_consecutive_chunk_count(e)) {
 968                DMERR("Attempt to merge block %llu from the "
 969                      "middle of a chunk range [%llu - %llu]",
 970                      (unsigned long long)old_chunk,
 971                      (unsigned long long)e->old_chunk,
 972                      (unsigned long long)
 973                      e->old_chunk + dm_consecutive_chunk_count(e));
 974                return -EINVAL;
 975        }
 976
 977        dm_consecutive_chunk_count_dec(e);
 978
 979        return 0;
 980}
 981
 982static void flush_bios(struct bio *bio);
 983
 984static int remove_single_exception_chunk(struct dm_snapshot *s)
 985{
 986        struct bio *b = NULL;
 987        int r;
 988        chunk_t old_chunk = s->first_merging_chunk + s->num_merging_chunks - 1;
 989
 990        down_write(&s->lock);
 991
 992        /*
 993         * Process chunks (and associated exceptions) in reverse order
 994         * so that dm_consecutive_chunk_count_dec() accounting works.
 995         */
 996        do {
 997                r = __remove_single_exception_chunk(s, old_chunk);
 998                if (r)
 999                        goto out;
1000        } while (old_chunk-- > s->first_merging_chunk);
1001
1002        b = __release_queued_bios_after_merge(s);
1003
1004out:
1005        up_write(&s->lock);
1006        if (b)
1007                flush_bios(b);
1008
1009        return r;
1010}
1011
1012static int origin_write_extent(struct dm_snapshot *merging_snap,
1013                               sector_t sector, unsigned chunk_size);
1014
1015static void merge_callback(int read_err, unsigned long write_err,
1016                           void *context);
1017
1018static uint64_t read_pending_exceptions_done_count(void)
1019{
1020        uint64_t pending_exceptions_done;
1021
1022        spin_lock(&_pending_exceptions_done_spinlock);
1023        pending_exceptions_done = _pending_exceptions_done_count;
1024        spin_unlock(&_pending_exceptions_done_spinlock);
1025
1026        return pending_exceptions_done;
1027}
1028
1029static void increment_pending_exceptions_done_count(void)
1030{
1031        spin_lock(&_pending_exceptions_done_spinlock);
1032        _pending_exceptions_done_count++;
1033        spin_unlock(&_pending_exceptions_done_spinlock);
1034
1035        wake_up_all(&_pending_exceptions_done);
1036}
1037
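     /*
      * Merge-back engine: ask the exception store for the next run of
      * chunks that are contiguous in both the origin and the COW device,
      * wait for any conflicting I/O, then issue a single kcopyd copy from
      * the COW device back to the origin.  merge_callback() commits the
      * result and re-enters this function until nothing is left to merge.
      */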
1038static void snapshot_merge_next_chunks(struct dm_snapshot *s)
1039{
1040        int i, linear_chunks;
1041        chunk_t old_chunk, new_chunk;
1042        struct dm_io_region src, dest;
1043        sector_t io_size;
1044        uint64_t previous_count;
1045
1046        BUG_ON(!test_bit(RUNNING_MERGE, &s->state_bits));
1047        if (unlikely(test_bit(SHUTDOWN_MERGE, &s->state_bits)))
1048                goto shut;
1049
1050        /*
1051         * valid flag never changes during merge, so no lock required.
1052         */
1053        if (!s->valid) {
1054                DMERR("Snapshot is invalid: can't merge");
1055                goto shut;
1056        }
1057
1058        linear_chunks = s->store->type->prepare_merge(s->store, &old_chunk,
1059                                                      &new_chunk);
1060        if (linear_chunks <= 0) {
1061                if (linear_chunks < 0) {
1062                        DMERR("Read error in exception store: "
1063                              "shutting down merge");
1064                        down_write(&s->lock);
1065                        s->merge_failed = 1;
1066                        up_write(&s->lock);
1067                }
1068                goto shut;
1069        }
1070
1071        /* Adjust old_chunk and new_chunk to reflect start of linear region */
1072        old_chunk = old_chunk + 1 - linear_chunks;
1073        new_chunk = new_chunk + 1 - linear_chunks;
1074
1075        /*
1076         * Use one (potentially large) I/O to copy all 'linear_chunks'
1077         * from the exception store to the origin
1078         */
1079        io_size = linear_chunks * s->store->chunk_size;
1080
1081        dest.bdev = s->origin->bdev;
1082        dest.sector = chunk_to_sector(s->store, old_chunk);
1083        dest.count = min(io_size, get_dev_size(dest.bdev) - dest.sector);
1084
1085        src.bdev = s->cow->bdev;
1086        src.sector = chunk_to_sector(s->store, new_chunk);
1087        src.count = dest.count;
1088
1089        /*
1090         * Reallocate any exceptions needed in other snapshots then
1091         * wait for the pending exceptions to complete.
1092         * Each time any pending exception (globally on the system)
1093         * completes we are woken and repeat the process to find out
1094         * if we can proceed.  While this may not seem a particularly
1095         * efficient algorithm, it is not expected to have any
1096         * significant impact on performance.
1097         */
1098        previous_count = read_pending_exceptions_done_count();
1099        while (origin_write_extent(s, dest.sector, io_size)) {
1100                wait_event(_pending_exceptions_done,
1101                           (read_pending_exceptions_done_count() !=
1102                            previous_count));
1103                /* Retry after the wait, until all exceptions are done. */
1104                previous_count = read_pending_exceptions_done_count();
1105        }
1106
1107        down_write(&s->lock);
1108        s->first_merging_chunk = old_chunk;
1109        s->num_merging_chunks = linear_chunks;
1110        up_write(&s->lock);
1111
1112        /* Wait until writes to all 'linear_chunks' drain */
1113        for (i = 0; i < linear_chunks; i++)
1114                __check_for_conflicting_io(s, old_chunk + i);
1115
1116        dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, merge_callback, s);
1117        return;
1118
1119shut:
1120        merge_shutdown(s);
1121}
1122
1123static void error_bios(struct bio *bio);
1124
1125static void merge_callback(int read_err, unsigned long write_err, void *context)
1126{
1127        struct dm_snapshot *s = context;
1128        struct bio *b = NULL;
1129
1130        if (read_err || write_err) {
1131                if (read_err)
1132                        DMERR("Read error: shutting down merge.");
1133                else
1134                        DMERR("Write error: shutting down merge.");
1135                goto shut;
1136        }
1137
1138        if (s->store->type->commit_merge(s->store,
1139                                         s->num_merging_chunks) < 0) {
1140                DMERR("Write error in exception store: shutting down merge");
1141                goto shut;
1142        }
1143
1144        if (remove_single_exception_chunk(s) < 0)
1145                goto shut;
1146
1147        snapshot_merge_next_chunks(s);
1148
1149        return;
1150
1151shut:
1152        down_write(&s->lock);
1153        s->merge_failed = 1;
1154        b = __release_queued_bios_after_merge(s);
1155        up_write(&s->lock);
1156        error_bios(b);
1157
1158        merge_shutdown(s);
1159}
1160
1161static void start_merge(struct dm_snapshot *s)
1162{
1163        if (!test_and_set_bit(RUNNING_MERGE, &s->state_bits))
1164                snapshot_merge_next_chunks(s);
1165}
1166
1167/*
1168 * Stop the merging process and wait until it finishes.
1169 */
1170static void stop_merge(struct dm_snapshot *s)
1171{
1172        set_bit(SHUTDOWN_MERGE, &s->state_bits);
1173        wait_on_bit(&s->state_bits, RUNNING_MERGE, TASK_UNINTERRUPTIBLE);
1174        clear_bit(SHUTDOWN_MERGE, &s->state_bits);
1175}
1176
1177static int parse_snapshot_features(struct dm_arg_set *as, struct dm_snapshot *s,
1178                                   struct dm_target *ti)
1179{
1180        int r;
1181        unsigned argc;
1182        const char *arg_name;
1183
1184        static const struct dm_arg _args[] = {
1185                {0, 2, "Invalid number of feature arguments"},
1186        };
1187
1188        /*
1189         * No feature arguments supplied.
1190         */
1191        if (!as->argc)
1192                return 0;
1193
1194        r = dm_read_arg_group(_args, as, &argc, &ti->error);
1195        if (r)
1196                return -EINVAL;
1197
1198        while (argc && !r) {
1199                arg_name = dm_shift_arg(as);
1200                argc--;
1201
1202                if (!strcasecmp(arg_name, "discard_zeroes_cow"))
1203                        s->discard_zeroes_cow = true;
1204
1205                else if (!strcasecmp(arg_name, "discard_passdown_origin"))
1206                        s->discard_passdown_origin = true;
1207
1208                else {
1209                        ti->error = "Unrecognised feature requested";
1210                        r = -EINVAL;
1211                        break;
1212                }
1213        }
1214
1215        if (!s->discard_zeroes_cow && s->discard_passdown_origin) {
1216                /*
 1217                 * TODO: really these are disjoint, but ti->num_discard_bios
1218                 * and dm_bio_get_target_bio_nr() require rigid constraints.
1219                 */
1220                ti->error = "discard_passdown_origin feature depends on discard_zeroes_cow";
1221                r = -EINVAL;
1222        }
1223
1224        return r;
1225}
1226
1227/*
1228 * Construct a snapshot mapping:
1229 * <origin_dev> <COW-dev> <p|po|n> <chunk-size> [<# feature args> [<arg>]*]
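      *
      * For example (illustrative device names and sizes only), a persistent
      * snapshot using 8-sector (4KiB) chunks could be set up with:
      *
      *   dmsetup create snap --table \
      *     "0 <origin_sectors> snapshot /dev/vg/origin /dev/vg/cow P 8"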
1230 */
1231static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1232{
1233        struct dm_snapshot *s;
1234        struct dm_arg_set as;
1235        int i;
1236        int r = -EINVAL;
1237        char *origin_path, *cow_path;
1238        dev_t origin_dev, cow_dev;
1239        unsigned args_used, num_flush_bios = 1;
1240        fmode_t origin_mode = FMODE_READ;
1241
1242        if (argc < 4) {
1243                ti->error = "requires 4 or more arguments";
1244                r = -EINVAL;
1245                goto bad;
1246        }
1247
1248        if (dm_target_is_snapshot_merge(ti)) {
1249                num_flush_bios = 2;
1250                origin_mode = FMODE_WRITE;
1251        }
1252
1253        s = kzalloc(sizeof(*s), GFP_KERNEL);
1254        if (!s) {
1255                ti->error = "Cannot allocate private snapshot structure";
1256                r = -ENOMEM;
1257                goto bad;
1258        }
1259
1260        as.argc = argc;
1261        as.argv = argv;
1262        dm_consume_args(&as, 4);
1263        r = parse_snapshot_features(&as, s, ti);
1264        if (r)
1265                goto bad_features;
1266
1267        origin_path = argv[0];
1268        argv++;
1269        argc--;
1270
1271        r = dm_get_device(ti, origin_path, origin_mode, &s->origin);
1272        if (r) {
1273                ti->error = "Cannot get origin device";
1274                goto bad_origin;
1275        }
1276        origin_dev = s->origin->bdev->bd_dev;
1277
1278        cow_path = argv[0];
1279        argv++;
1280        argc--;
1281
1282        cow_dev = dm_get_dev_t(cow_path);
1283        if (cow_dev && cow_dev == origin_dev) {
1284                ti->error = "COW device cannot be the same as origin device";
1285                r = -EINVAL;
1286                goto bad_cow;
1287        }
1288
1289        r = dm_get_device(ti, cow_path, dm_table_get_mode(ti->table), &s->cow);
1290        if (r) {
1291                ti->error = "Cannot get COW device";
1292                goto bad_cow;
1293        }
1294
1295        r = dm_exception_store_create(ti, argc, argv, s, &args_used, &s->store);
1296        if (r) {
1297                ti->error = "Couldn't create exception store";
1298                r = -EINVAL;
1299                goto bad_store;
1300        }
1301
1302        argv += args_used;
1303        argc -= args_used;
1304
1305        s->ti = ti;
1306        s->valid = 1;
1307        s->snapshot_overflowed = 0;
1308        s->active = 0;
1309        atomic_set(&s->pending_exceptions_count, 0);
1310        spin_lock_init(&s->pe_allocation_lock);
1311        s->exception_start_sequence = 0;
1312        s->exception_complete_sequence = 0;
1313        s->out_of_order_tree = RB_ROOT;
1314        init_rwsem(&s->lock);
1315        INIT_LIST_HEAD(&s->list);
1316        spin_lock_init(&s->pe_lock);
1317        s->state_bits = 0;
1318        s->merge_failed = 0;
1319        s->first_merging_chunk = 0;
1320        s->num_merging_chunks = 0;
1321        bio_list_init(&s->bios_queued_during_merge);
1322
1323        /* Allocate hash table for COW data */
1324        if (init_hash_tables(s)) {
1325                ti->error = "Unable to allocate hash table space";
1326                r = -ENOMEM;
1327                goto bad_hash_tables;
1328        }
1329
1330        sema_init(&s->cow_count, (cow_threshold > 0) ? cow_threshold : INT_MAX);
1331
1332        s->kcopyd_client = dm_kcopyd_client_create(&dm_kcopyd_throttle);
1333        if (IS_ERR(s->kcopyd_client)) {
1334                r = PTR_ERR(s->kcopyd_client);
1335                ti->error = "Could not create kcopyd client";
1336                goto bad_kcopyd;
1337        }
1338
1339        r = mempool_init_slab_pool(&s->pending_pool, MIN_IOS, pending_cache);
1340        if (r) {
1341                ti->error = "Could not allocate mempool for pending exceptions";
1342                goto bad_pending_pool;
1343        }
1344
1345        for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1346                INIT_HLIST_HEAD(&s->tracked_chunk_hash[i]);
1347
1348        spin_lock_init(&s->tracked_chunk_lock);
1349
1350        ti->private = s;
1351        ti->num_flush_bios = num_flush_bios;
1352        if (s->discard_zeroes_cow)
1353                ti->num_discard_bios = (s->discard_passdown_origin ? 2 : 1);
1354        ti->per_io_data_size = sizeof(struct dm_snap_tracked_chunk);
1355
1356        /* Add snapshot to the list of snapshots for this origin */
1357        /* Exceptions aren't triggered till snapshot_resume() is called */
1358        r = register_snapshot(s);
1359        if (r == -ENOMEM) {
1360                ti->error = "Snapshot origin struct allocation failed";
1361                goto bad_load_and_register;
1362        } else if (r < 0) {
1363                /* invalid handover, register_snapshot has set ti->error */
1364                goto bad_load_and_register;
1365        }
1366
1367        /*
1368         * Metadata must only be loaded into one table at once, so skip this
1369         * if metadata will be handed over during resume.
1370         * Chunk size will be set during the handover - set it to zero to
1371         * ensure it's ignored.
1372         */
1373        if (r > 0) {
1374                s->store->chunk_size = 0;
1375                return 0;
1376        }
1377
1378        r = s->store->type->read_metadata(s->store, dm_add_exception,
1379                                          (void *)s);
1380        if (r < 0) {
1381                ti->error = "Failed to read snapshot metadata";
1382                goto bad_read_metadata;
1383        } else if (r > 0) {
1384                s->valid = 0;
1385                DMWARN("Snapshot is marked invalid.");
1386        }
1387
1388        if (!s->store->chunk_size) {
1389                ti->error = "Chunk size not set";
1390                goto bad_read_metadata;
1391        }
1392
1393        r = dm_set_target_max_io_len(ti, s->store->chunk_size);
1394        if (r)
1395                goto bad_read_metadata;
1396
1397        return 0;
1398
1399bad_read_metadata:
1400        unregister_snapshot(s);
1401bad_load_and_register:
1402        mempool_exit(&s->pending_pool);
1403bad_pending_pool:
1404        dm_kcopyd_client_destroy(s->kcopyd_client);
1405bad_kcopyd:
1406        dm_exception_table_exit(&s->pending, pending_cache);
1407        dm_exception_table_exit(&s->complete, exception_cache);
1408bad_hash_tables:
1409        dm_exception_store_destroy(s->store);
1410bad_store:
1411        dm_put_device(ti, s->cow);
1412bad_cow:
1413        dm_put_device(ti, s->origin);
1414bad_origin:
1415bad_features:
1416        kfree(s);
1417bad:
1418        return r;
1419}
1420
1421static void __free_exceptions(struct dm_snapshot *s)
1422{
1423        dm_kcopyd_client_destroy(s->kcopyd_client);
1424        s->kcopyd_client = NULL;
1425
1426        dm_exception_table_exit(&s->pending, pending_cache);
1427        dm_exception_table_exit(&s->complete, exception_cache);
1428}
1429
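     /*
      * Hand the completed exception table and exception store over from the
      * snapshot being replaced (snap_src) to the newly loaded snapshot
      * (snap_dest), then invalidate the source so it takes no further I/O.
      */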
1430static void __handover_exceptions(struct dm_snapshot *snap_src,
1431                                  struct dm_snapshot *snap_dest)
1432{
1433        union {
1434                struct dm_exception_table table_swap;
1435                struct dm_exception_store *store_swap;
1436        } u;
1437
1438        /*
1439         * Swap all snapshot context information between the two instances.
1440         */
1441        u.table_swap = snap_dest->complete;
1442        snap_dest->complete = snap_src->complete;
1443        snap_src->complete = u.table_swap;
1444
1445        u.store_swap = snap_dest->store;
1446        snap_dest->store = snap_src->store;
1447        snap_dest->store->userspace_supports_overflow = u.store_swap->userspace_supports_overflow;
1448        snap_src->store = u.store_swap;
1449
1450        snap_dest->store->snap = snap_dest;
1451        snap_src->store->snap = snap_src;
1452
1453        snap_dest->ti->max_io_len = snap_dest->store->chunk_size;
1454        snap_dest->valid = snap_src->valid;
1455        snap_dest->snapshot_overflowed = snap_src->snapshot_overflowed;
1456
1457        /*
1458         * Set source invalid to ensure it receives no further I/O.
1459         */
1460        snap_src->valid = 0;
1461}
1462
1463static void snapshot_dtr(struct dm_target *ti)
1464{
1465#ifdef CONFIG_DM_DEBUG
1466        int i;
1467#endif
1468        struct dm_snapshot *s = ti->private;
1469        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
1470
1471        down_read(&_origins_lock);
1472        /* Check whether exception handover must be cancelled */
1473        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
1474        if (snap_src && snap_dest && (s == snap_src)) {
1475                down_write(&snap_dest->lock);
1476                snap_dest->valid = 0;
1477                up_write(&snap_dest->lock);
1478                DMERR("Cancelling snapshot handover.");
1479        }
1480        up_read(&_origins_lock);
1481
1482        if (dm_target_is_snapshot_merge(ti))
1483                stop_merge(s);
1484
1485        /* Prevent further origin writes from using this snapshot. */
1486        /* After this returns there can be no new kcopyd jobs. */
1487        unregister_snapshot(s);
1488
1489        while (atomic_read(&s->pending_exceptions_count))
1490                msleep(1);
1491        /*
1492         * Ensure instructions in mempool_exit aren't reordered
1493         * before atomic_read.
1494         */
1495        smp_mb();
1496
1497#ifdef CONFIG_DM_DEBUG
1498        for (i = 0; i < DM_TRACKED_CHUNK_HASH_SIZE; i++)
1499                BUG_ON(!hlist_empty(&s->tracked_chunk_hash[i]));
1500#endif
1501
1502        __free_exceptions(s);
1503
1504        mempool_exit(&s->pending_pool);
1505
1506        dm_exception_store_destroy(s->store);
1507
1508        dm_put_device(ti, s->cow);
1509
1510        dm_put_device(ti, s->origin);
1511
1512        kfree(s);
1513}
1514
1515/*
1516 * Flush a list of buffers.
1517 */
1518static void flush_bios(struct bio *bio)
1519{
1520        struct bio *n;
1521
1522        while (bio) {
1523                n = bio->bi_next;
1524                bio->bi_next = NULL;
1525                generic_make_request(bio);
1526                bio = n;
1527        }
1528}
1529
1530static int do_origin(struct dm_dev *origin, struct bio *bio);
1531
1532/*
 1533 * Retry a list of deferred origin buffers.
1534 */
1535static void retry_origin_bios(struct dm_snapshot *s, struct bio *bio)
1536{
1537        struct bio *n;
1538        int r;
1539
1540        while (bio) {
1541                n = bio->bi_next;
1542                bio->bi_next = NULL;
1543                r = do_origin(s->origin, bio);
1544                if (r == DM_MAPIO_REMAPPED)
1545                        generic_make_request(bio);
1546                bio = n;
1547        }
1548}
1549
1550/*
1551 * Error a list of buffers.
1552 */
1553static void error_bios(struct bio *bio)
1554{
1555        struct bio *n;
1556
1557        while (bio) {
1558                n = bio->bi_next;
1559                bio->bi_next = NULL;
1560                bio_io_error(bio);
1561                bio = n;
1562        }
1563}
1564
1565static void __invalidate_snapshot(struct dm_snapshot *s, int err)
1566{
1567        if (!s->valid)
1568                return;
1569
1570        if (err == -EIO)
1571                DMERR("Invalidating snapshot: Error reading/writing.");
1572        else if (err == -ENOMEM)
1573                DMERR("Invalidating snapshot: Unable to allocate exception.");
1574
1575        if (s->store->type->drop_snapshot)
1576                s->store->type->drop_snapshot(s->store);
1577
1578        s->valid = 0;
1579
1580        dm_table_event(s->ti->table);
1581}
1582
1583static void invalidate_snapshot(struct dm_snapshot *s, int err)
1584{
1585        down_write(&s->lock);
1586        __invalidate_snapshot(s, err);
1587        up_write(&s->lock);
1588}
1589
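     /*
      * Called by the exception store once a pending exception has been
      * committed (or has failed).  On success the exception is promoted
      * into the completed table; in either case the bios queued behind the
      * pending exception are released or errored accordingly.
      */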
1590static void pending_complete(void *context, int success)
1591{
1592        struct dm_snap_pending_exception *pe = context;
1593        struct dm_exception *e;
1594        struct dm_snapshot *s = pe->snap;
1595        struct bio *origin_bios = NULL;
1596        struct bio *snapshot_bios = NULL;
1597        struct bio *full_bio = NULL;
1598        struct dm_exception_table_lock lock;
1599        int error = 0;
1600
1601        dm_exception_table_lock_init(s, pe->e.old_chunk, &lock);
1602
1603        if (!success) {
1604                /* Read/write error - snapshot is unusable */
1605                invalidate_snapshot(s, -EIO);
1606                error = 1;
1607
1608                dm_exception_table_lock(&lock);
1609                goto out;
1610        }
1611
1612        e = alloc_completed_exception(GFP_NOIO);
1613        if (!e) {
1614                invalidate_snapshot(s, -ENOMEM);
1615                error = 1;
1616
1617                dm_exception_table_lock(&lock);
1618                goto out;
1619        }
1620        *e = pe->e;
1621
1622        down_read(&s->lock);
1623        dm_exception_table_lock(&lock);
1624        if (!s->valid) {
1625                up_read(&s->lock);
1626                free_completed_exception(e);
1627                error = 1;
1628
1629                goto out;
1630        }
1631
1632        /*
1633         * Add a proper exception. After inserting the completed exception all
1634         * subsequent snapshot reads to this chunk will be redirected to the
1635         * COW device.  This ensures that we do not starve. Moreover, as long
1636         * as the pending exception exists, neither origin writes nor snapshot
1637         * merging can overwrite the chunk in origin.
1638         */
1639        dm_insert_exception(&s->complete, e);
1640        up_read(&s->lock);
1641
1642        /* Wait for conflicting reads to drain */
1643        if (__chunk_is_tracked(s, pe->e.old_chunk)) {
1644                dm_exception_table_unlock(&lock);
1645                __check_for_conflicting_io(s, pe->e.old_chunk);
1646                dm_exception_table_lock(&lock);
1647        }
1648
1649out:
1650        /* Remove the in-flight exception from the list */
1651        dm_remove_exception(&pe->e);
1652
1653        dm_exception_table_unlock(&lock);
1654
1655        snapshot_bios = bio_list_get(&pe->snapshot_bios);
1656        origin_bios = bio_list_get(&pe->origin_bios);
1657        full_bio = pe->full_bio;
1658        if (full_bio)
1659                full_bio->bi_end_io = pe->full_bio_end_io;
1660        increment_pending_exceptions_done_count();
1661
1662        /* Submit any pending write bios */
1663        if (error) {
1664                if (full_bio)
1665                        bio_io_error(full_bio);
1666                error_bios(snapshot_bios);
1667        } else {
1668                if (full_bio)
1669                        bio_endio(full_bio);
1670                flush_bios(snapshot_bios);
1671        }
1672
1673        retry_origin_bios(s, origin_bios);
1674
1675        free_pending_exception(pe);
1676}
1677
1678static void complete_exception(struct dm_snap_pending_exception *pe)
1679{
1680        struct dm_snapshot *s = pe->snap;
1681
1682        /* Update the metadata if we are persistent */
1683        s->store->type->commit_exception(s->store, &pe->e, !pe->copy_error,
1684                                         pending_complete, pe);
1685}
1686
1687/*
1688 * Called when the copy I/O has finished.  kcopyd actually runs
1689 * this code so don't block.
1690 */
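/*
 * Exceptions are completed in the order in which they were allocated
 * (exception_sequence).  A copy that finishes early is parked in
 * out_of_order_tree and drained once the preceding sequence numbers
 * have completed.
 */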
1691static void copy_callback(int read_err, unsigned long write_err, void *context)
1692{
1693        struct dm_snap_pending_exception *pe = context;
1694        struct dm_snapshot *s = pe->snap;
1695
1696        pe->copy_error = read_err || write_err;
1697
1698        if (pe->exception_sequence == s->exception_complete_sequence) {
1699                struct rb_node *next;
1700
1701                s->exception_complete_sequence++;
1702                complete_exception(pe);
1703
1704                next = rb_first(&s->out_of_order_tree);
1705                while (next) {
1706                        pe = rb_entry(next, struct dm_snap_pending_exception,
1707                                        out_of_order_node);
1708                        if (pe->exception_sequence != s->exception_complete_sequence)
1709                                break;
1710                        next = rb_next(next);
1711                        s->exception_complete_sequence++;
1712                        rb_erase(&pe->out_of_order_node, &s->out_of_order_tree);
1713                        complete_exception(pe);
1714                        cond_resched();
1715                }
1716        } else {
1717                struct rb_node *parent = NULL;
1718                struct rb_node **p = &s->out_of_order_tree.rb_node;
1719                struct dm_snap_pending_exception *pe2;
1720
1721                while (*p) {
1722                        pe2 = rb_entry(*p, struct dm_snap_pending_exception, out_of_order_node);
1723                        parent = *p;
1724
1725                        BUG_ON(pe->exception_sequence == pe2->exception_sequence);
1726                        if (pe->exception_sequence < pe2->exception_sequence)
1727                                p = &((*p)->rb_left);
1728                        else
1729                                p = &((*p)->rb_right);
1730                }
1731
1732                rb_link_node(&pe->out_of_order_node, parent, p);
1733                rb_insert_color(&pe->out_of_order_node, &s->out_of_order_tree);
1734        }
1735        up(&s->cow_count);
1736}
1737
1738/*
1739 * Dispatches the copy operation to kcopyd.
1740 */
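/*
 * The source length is clamped in case the final chunk extends beyond the
 * end of the origin device.  cow_count limits the number of in-flight COW
 * jobs handed to kcopyd.
 */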
1741static void start_copy(struct dm_snap_pending_exception *pe)
1742{
1743        struct dm_snapshot *s = pe->snap;
1744        struct dm_io_region src, dest;
1745        struct block_device *bdev = s->origin->bdev;
1746        sector_t dev_size;
1747
1748        dev_size = get_dev_size(bdev);
1749
1750        src.bdev = bdev;
1751        src.sector = chunk_to_sector(s->store, pe->e.old_chunk);
1752        src.count = min((sector_t)s->store->chunk_size, dev_size - src.sector);
1753
1754        dest.bdev = s->cow->bdev;
1755        dest.sector = chunk_to_sector(s->store, pe->e.new_chunk);
1756        dest.count = src.count;
1757
1758        /* Hand over to kcopyd */
1759        down(&s->cow_count);
1760        dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
1761}
1762
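/*
 * Optimisation for writes that cover a whole chunk: instead of copying the
 * chunk with kcopyd, the (already remapped) bio itself is submitted to the
 * COW device.  Its original bi_end_io is saved and replaced so completion
 * is routed back through kcopyd's callback machinery
 * (dm_kcopyd_prepare_callback()/dm_kcopyd_do_callback()) into copy_callback().
 */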
1763static void full_bio_end_io(struct bio *bio)
1764{
1765        void *callback_data = bio->bi_private;
1766
1767        dm_kcopyd_do_callback(callback_data, 0, bio->bi_status ? 1 : 0);
1768}
1769
1770static void start_full_bio(struct dm_snap_pending_exception *pe,
1771                           struct bio *bio)
1772{
1773        struct dm_snapshot *s = pe->snap;
1774        void *callback_data;
1775
1776        pe->full_bio = bio;
1777        pe->full_bio_end_io = bio->bi_end_io;
1778
1779        down(&s->cow_count);
1780        callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
1781                                                   copy_callback, pe);
1782
1783        bio->bi_end_io = full_bio_end_io;
1784        bio->bi_private = callback_data;
1785
1786        generic_make_request(bio);
1787}
1788
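/*
 * Return the pending exception covering this chunk, or NULL if there is
 * none.  Callers hold the chunk's pending exception table lock.
 */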
1789static struct dm_snap_pending_exception *
1790__lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
1791{
1792        struct dm_exception *e = dm_lookup_exception(&s->pending, chunk);
1793
1794        if (!e)
1795                return NULL;
1796
1797        return container_of(e, struct dm_snap_pending_exception, e);
1798}
1799
1800/*
1801 * Inserts a pending exception into the pending table.
1802 *
1803 * NOTE: a write lock must be held on the chunk's pending exception table slot
1804 * before calling this.
1805 */
1806static struct dm_snap_pending_exception *
1807__insert_pending_exception(struct dm_snapshot *s,
1808                           struct dm_snap_pending_exception *pe, chunk_t chunk)
1809{
1810        pe->e.old_chunk = chunk;
1811        bio_list_init(&pe->origin_bios);
1812        bio_list_init(&pe->snapshot_bios);
1813        pe->started = 0;
1814        pe->full_bio = NULL;
1815
1816        spin_lock(&s->pe_allocation_lock);
1817        if (s->store->type->prepare_exception(s->store, &pe->e)) {
1818                spin_unlock(&s->pe_allocation_lock);
1819                free_pending_exception(pe);
1820                return NULL;
1821        }
1822
1823        pe->exception_sequence = s->exception_start_sequence++;
1824        spin_unlock(&s->pe_allocation_lock);
1825
1826        dm_insert_exception(&s->pending, &pe->e);
1827
1828        return pe;
1829}
1830
1831/*
1832 * Looks to see if this snapshot already has a pending exception
1833 * for this chunk, otherwise it allocates a new one and inserts
1834 * it into the pending table.
1835 *
1836 * NOTE: a write lock must be held on the chunk's pending exception table slot
1837 * before calling this.
1838 */
1839static struct dm_snap_pending_exception *
1840__find_pending_exception(struct dm_snapshot *s,
1841                         struct dm_snap_pending_exception *pe, chunk_t chunk)
1842{
1843        struct dm_snap_pending_exception *pe2;
1844
1845        pe2 = __lookup_pending_exception(s, chunk);
1846        if (pe2) {
1847                free_pending_exception(pe);
1848                return pe2;
1849        }
1850
1851        return __insert_pending_exception(s, pe, chunk);
1852}
1853
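/*
 * Redirect a bio to the COW device: the chunk is mapped through the
 * exception (dm_chunk_number() masks off the consecutive-chunk count and
 * (chunk - old_chunk) indexes into a run of consecutive chunks) while the
 * offset within the chunk is preserved.
 */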
1854static void remap_exception(struct dm_snapshot *s, struct dm_exception *e,
1855                            struct bio *bio, chunk_t chunk)
1856{
1857        bio_set_dev(bio, s->cow->bdev);
1858        bio->bi_iter.bi_sector =
1859                chunk_to_sector(s->store, dm_chunk_number(e->new_chunk) +
1860                                (chunk - e->old_chunk)) +
1861                (bio->bi_iter.bi_sector & s->store->chunk_mask);
1862}
1863
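/*
 * Zero out the COW copy of a chunk in response to a discard: the bio has
 * already been remapped to the COW device, so its sector gives the
 * destination.  kcopyd zeroes one chunk and zero_callback() then completes
 * the original discard bio with the write status.
 */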
1864static void zero_callback(int read_err, unsigned long write_err, void *context)
1865{
1866        struct bio *bio = context;
1867        struct dm_snapshot *s = bio->bi_private;
1868
1869        up(&s->cow_count);
1870        bio->bi_status = write_err ? BLK_STS_IOERR : 0;
1871        bio_endio(bio);
1872}
1873
1874static void zero_exception(struct dm_snapshot *s, struct dm_exception *e,
1875                           struct bio *bio, chunk_t chunk)
1876{
1877        struct dm_io_region dest;
1878
1879        dest.bdev = s->cow->bdev;
1880        dest.sector = bio->bi_iter.bi_sector;
1881        dest.count = s->store->chunk_size;
1882
1883        down(&s->cow_count);
1884        WARN_ON_ONCE(bio->bi_private);
1885        bio->bi_private = s;
1886        dm_kcopyd_zero(s->kcopyd_client, 1, &dest, 0, zero_callback, bio);
1887}
1888
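/*
 * Does the bio cover exactly one chunk's worth of data?
 */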
1889static bool io_overlaps_chunk(struct dm_snapshot *s, struct bio *bio)
1890{
1891        return bio->bi_iter.bi_size ==
1892                (s->store->chunk_size << SECTOR_SHIFT);
1893}
1894
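/*
 * Map I/O submitted to the snapshot device.
 *
 * Reads of chunks that already have a completed exception are remapped to
 * the COW device; other reads are passed through to the origin (and
 * tracked so a racing copy can wait for them).  A write either joins an
 * existing pending exception or allocates a new one; the bio is queued on
 * the exception and issued once the chunk copy completes (or, if it covers
 * the whole chunk, the bio itself is used as the copy).
 */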
1895static int snapshot_map(struct dm_target *ti, struct bio *bio)
1896{
1897        struct dm_exception *e;
1898        struct dm_snapshot *s = ti->private;
1899        int r = DM_MAPIO_REMAPPED;
1900        chunk_t chunk;
1901        struct dm_snap_pending_exception *pe = NULL;
1902        struct dm_exception_table_lock lock;
1903
1904        init_tracked_chunk(bio);
1905
1906        if (bio->bi_opf & REQ_PREFLUSH) {
1907                bio_set_dev(bio, s->cow->bdev);
1908                return DM_MAPIO_REMAPPED;
1909        }
1910
1911        chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
1912        dm_exception_table_lock_init(s, chunk, &lock);
1913
1914        /* Full snapshots are not usable */
1915        /* To get here the table must be live so s->active is always set. */
1916        if (!s->valid)
1917                return DM_MAPIO_KILL;
1918
1919        down_read(&s->lock);
1920        dm_exception_table_lock(&lock);
1921
1922        if (!s->valid || (unlikely(s->snapshot_overflowed) &&
1923            bio_data_dir(bio) == WRITE)) {
1924                r = DM_MAPIO_KILL;
1925                goto out_unlock;
1926        }
1927
1928        if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1929                if (s->discard_passdown_origin && dm_bio_get_target_bio_nr(bio)) {
1930                        /*
1931                         * passdown discard to origin (without triggering
1932                         * snapshot exceptions via do_origin; doing so would
1933                         * defeat the goal of freeing space in origin that is
1934                         * implied by the "discard_passdown_origin" feature)
1935                         */
1936                        bio_set_dev(bio, s->origin->bdev);
1937                        track_chunk(s, bio, chunk);
1938                        goto out_unlock;
1939                }
1940                /* discard to snapshot (target_bio_nr == 0) zeroes exceptions */
1941        }
1942
1943        /* If the block is already remapped - use that, else remap it */
1944        e = dm_lookup_exception(&s->complete, chunk);
1945        if (e) {
1946                remap_exception(s, e, bio, chunk);
1947                if (unlikely(bio_op(bio) == REQ_OP_DISCARD) &&
1948                    io_overlaps_chunk(s, bio)) {
1949                        dm_exception_table_unlock(&lock);
1950                        up_read(&s->lock);
1951                        zero_exception(s, e, bio, chunk);
1952                        r = DM_MAPIO_SUBMITTED; /* discard is not issued */
1953                        goto out;
1954                }
1955                goto out_unlock;
1956        }
1957
1958        if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
1959                /*
1960                 * If no exception exists, complete the discard immediately;
1961                 * otherwise it would trigger a copy-out.
1962                 */
1963                bio_endio(bio);
1964                r = DM_MAPIO_SUBMITTED;
1965                goto out_unlock;
1966        }
1967
1968        /*
1969         * Write to snapshot - the higher layers take care of the RW/RO
1970         * flags, so we should only get here if the device is
1971         * writable.
1972         */
1973        if (bio_data_dir(bio) == WRITE) {
1974                pe = __lookup_pending_exception(s, chunk);
1975                if (!pe) {
1976                        dm_exception_table_unlock(&lock);
1977                        pe = alloc_pending_exception(s);
1978                        dm_exception_table_lock(&lock);
1979
1980                        e = dm_lookup_exception(&s->complete, chunk);
1981                        if (e) {
1982                                free_pending_exception(pe);
1983                                remap_exception(s, e, bio, chunk);
1984                                goto out_unlock;
1985                        }
1986
1987                        pe = __find_pending_exception(s, pe, chunk);
1988                        if (!pe) {
1989                                dm_exception_table_unlock(&lock);
1990                                up_read(&s->lock);
1991
1992                                down_write(&s->lock);
1993
1994                                if (s->store->userspace_supports_overflow) {
1995                                        if (s->valid && !s->snapshot_overflowed) {
1996                                                s->snapshot_overflowed = 1;
1997                                                DMERR("Snapshot overflowed: Unable to allocate exception.");
1998                                        }
1999                                } else
2000                                        __invalidate_snapshot(s, -ENOMEM);
2001                                up_write(&s->lock);
2002
2003                                r = DM_MAPIO_KILL;
2004                                goto out;
2005                        }
2006                }
2007
2008                remap_exception(s, &pe->e, bio, chunk);
2009
2010                r = DM_MAPIO_SUBMITTED;
2011
2012                if (!pe->started && io_overlaps_chunk(s, bio)) {
2013                        pe->started = 1;
2014
2015                        dm_exception_table_unlock(&lock);
2016                        up_read(&s->lock);
2017
2018                        start_full_bio(pe, bio);
2019                        goto out;
2020                }
2021
2022                bio_list_add(&pe->snapshot_bios, bio);
2023
2024                if (!pe->started) {
2025                        /* this is protected by the exception table lock */
2026                        pe->started = 1;
2027
2028                        dm_exception_table_unlock(&lock);
2029                        up_read(&s->lock);
2030
2031                        start_copy(pe);
2032                        goto out;
2033                }
2034        } else {
2035                bio_set_dev(bio, s->origin->bdev);
2036                track_chunk(s, bio, chunk);
2037        }
2038
2039out_unlock:
2040        dm_exception_table_unlock(&lock);
2041        up_read(&s->lock);
2042out:
2043        return r;
2044}
2045
2046/*
2047 * A snapshot-merge target behaves like a combination of a snapshot
2048 * target and a snapshot-origin target.  It only generates new
2049 * exceptions in other snapshots and not in the one that is being
2050 * merged.
2051 *
2052 * For each chunk, if there is an existing exception, it is used to
2053 * redirect I/O to the cow device.  Otherwise I/O is sent to the origin,
2054 * which in turn might generate exceptions in other snapshots.
2055 * If merging is currently taking place on the chunk in question, the
2056 * I/O is deferred by adding it to s->bios_queued_during_merge.
2057 */
2058static int snapshot_merge_map(struct dm_target *ti, struct bio *bio)
2059{
2060        struct dm_exception *e;
2061        struct dm_snapshot *s = ti->private;
2062        int r = DM_MAPIO_REMAPPED;
2063        chunk_t chunk;
2064
2065        init_tracked_chunk(bio);
2066
2067        if (bio->bi_opf & REQ_PREFLUSH) {
2068                if (!dm_bio_get_target_bio_nr(bio))
2069                        bio_set_dev(bio, s->origin->bdev);
2070                else
2071                        bio_set_dev(bio, s->cow->bdev);
2072                return DM_MAPIO_REMAPPED;
2073        }
2074
2075        if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) {
2076                /* Once merging, discards no longer effect change */
2077                bio_endio(bio);
2078                return DM_MAPIO_SUBMITTED;
2079        }
2080
2081        chunk = sector_to_chunk(s->store, bio->bi_iter.bi_sector);
2082
2083        down_write(&s->lock);
2084
2085        /* Full merging snapshots are redirected to the origin */
2086        if (!s->valid)
2087                goto redirect_to_origin;
2088
2089        /* If the block is already remapped - use that */
2090        e = dm_lookup_exception(&s->complete, chunk);
2091        if (e) {
2092                /* Queue writes overlapping with chunks being merged */
2093                if (bio_data_dir(bio) == WRITE &&
2094                    chunk >= s->first_merging_chunk &&
2095                    chunk < (s->first_merging_chunk +
2096                             s->num_merging_chunks)) {
2097                        bio_set_dev(bio, s->origin->bdev);
2098                        bio_list_add(&s->bios_queued_during_merge, bio);
2099                        r = DM_MAPIO_SUBMITTED;
2100                        goto out_unlock;
2101                }
2102
2103                remap_exception(s, e, bio, chunk);
2104
2105                if (bio_data_dir(bio) == WRITE)
2106                        track_chunk(s, bio, chunk);
2107                goto out_unlock;
2108        }
2109
2110redirect_to_origin:
2111        bio_set_dev(bio, s->origin->bdev);
2112
2113        if (bio_data_dir(bio) == WRITE) {
2114                up_write(&s->lock);
2115                return do_origin(s->origin, bio);
2116        }
2117
2118out_unlock:
2119        up_write(&s->lock);
2120
2121        return r;
2122}
2123
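/*
 * Drop the tracked-chunk reference (if any) taken when the bio was mapped.
 */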
2124static int snapshot_end_io(struct dm_target *ti, struct bio *bio,
2125                blk_status_t *error)
2126{
2127        struct dm_snapshot *s = ti->private;
2128
2129        if (is_bio_tracked(bio))
2130                stop_tracking_chunk(s, bio);
2131
2132        return DM_ENDIO_DONE;
2133}
2134
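/*
 * Stop any merge in progress before the snapshot-merge target is suspended.
 */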
2135static void snapshot_merge_presuspend(struct dm_target *ti)
2136{
2137        struct dm_snapshot *s = ti->private;
2138
2139        stop_merge(s);
2140}
2141
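/*
 * If this snapshot shares its COW device with another, an exception
 * handover is pending.  Refuse to resume the handover source itself, and
 * refuse to resume the destination until the source has been suspended.
 */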
2142static int snapshot_preresume(struct dm_target *ti)
2143{
2144        int r = 0;
2145        struct dm_snapshot *s = ti->private;
2146        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2147
2148        down_read(&_origins_lock);
2149        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2150        if (snap_src && snap_dest) {
2151                down_read(&snap_src->lock);
2152                if (s == snap_src) {
2153                        DMERR("Unable to resume snapshot source until "
2154                              "handover completes.");
2155                        r = -EINVAL;
2156                } else if (!dm_suspended(snap_src->ti)) {
2157                        DMERR("Unable to perform snapshot handover until "
2158                              "source is suspended.");
2159                        r = -EINVAL;
2160                }
2161                up_read(&snap_src->lock);
2162        }
2163        up_read(&_origins_lock);
2164
2165        return r;
2166}
2167
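/*
 * Complete a pending exception handover, if any.  The origin's mapped
 * device (or that of a merging snapshot sharing the COW) is suspended
 * around the handover so no new exceptions are triggered while the
 * exception tables are transferred; a running merge is stopped and
 * restarted across it.  Finally the snapshot is re-registered and marked
 * active.
 */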
2168static void snapshot_resume(struct dm_target *ti)
2169{
2170        struct dm_snapshot *s = ti->private;
2171        struct dm_snapshot *snap_src = NULL, *snap_dest = NULL, *snap_merging = NULL;
2172        struct dm_origin *o;
2173        struct mapped_device *origin_md = NULL;
2174        bool must_restart_merging = false;
2175
2176        down_read(&_origins_lock);
2177
2178        o = __lookup_dm_origin(s->origin->bdev);
2179        if (o)
2180                origin_md = dm_table_get_md(o->ti->table);
2181        if (!origin_md) {
2182                (void) __find_snapshots_sharing_cow(s, NULL, NULL, &snap_merging);
2183                if (snap_merging)
2184                        origin_md = dm_table_get_md(snap_merging->ti->table);
2185        }
2186        if (origin_md == dm_table_get_md(ti->table))
2187                origin_md = NULL;
2188        if (origin_md) {
2189                if (dm_hold(origin_md))
2190                        origin_md = NULL;
2191        }
2192
2193        up_read(&_origins_lock);
2194
2195        if (origin_md) {
2196                dm_internal_suspend_fast(origin_md);
2197                if (snap_merging && test_bit(RUNNING_MERGE, &snap_merging->state_bits)) {
2198                        must_restart_merging = true;
2199                        stop_merge(snap_merging);
2200                }
2201        }
2202
2203        down_read(&_origins_lock);
2204
2205        (void) __find_snapshots_sharing_cow(s, &snap_src, &snap_dest, NULL);
2206        if (snap_src && snap_dest) {
2207                down_write(&snap_src->lock);
2208                down_write_nested(&snap_dest->lock, SINGLE_DEPTH_NESTING);
2209                __handover_exceptions(snap_src, snap_dest);
2210                up_write(&snap_dest->lock);
2211                up_write(&snap_src->lock);
2212        }
2213
2214        up_read(&_origins_lock);
2215
2216        if (origin_md) {
2217                if (must_restart_merging)
2218                        start_merge(snap_merging);
2219                dm_internal_resume_fast(origin_md);
2220                dm_put(origin_md);
2221        }
2222
2223        /* Now that we have the correct chunk size, re-register the snapshot */
2224        reregister_snapshot(s);
2225
2226        down_write(&s->lock);
2227        s->active = 1;
2228        up_write(&s->lock);
2229}
2230
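/*
 * Return the smallest chunk size of all the snapshots of the given origin
 * device; used as the boundary on which origin I/O is split.
 */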
2231static uint32_t get_origin_minimum_chunksize(struct block_device *bdev)
2232{
2233        uint32_t min_chunksize;
2234
2235        down_read(&_origins_lock);
2236        min_chunksize = __minimum_chunk_size(__lookup_origin(bdev));
2237        up_read(&_origins_lock);
2238
2239        return min_chunksize;
2240}
2241
2242static void snapshot_merge_resume(struct dm_target *ti)
2243{
2244        struct dm_snapshot *s = ti->private;
2245
2246        /*
2247         * Handover exceptions from existing snapshot.
2248         */
2249        snapshot_resume(ti);
2250
2251        /*
2252         * snapshot-merge acts as an origin, so set ti->max_io_len
2253         */
2254        ti->max_io_len = get_origin_minimum_chunksize(s->origin->bdev);
2255
2256        start_merge(s);
2257}
2258
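/*
 * STATUSTYPE_INFO reports either an error state (Invalid / Merge failed /
 * Overflow) or COW usage as <allocated>/<total> <metadata> sectors;
 * STATUSTYPE_TABLE reports the constructor arguments, including any
 * optional features.
 */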
2259static void snapshot_status(struct dm_target *ti, status_type_t type,
2260                            unsigned status_flags, char *result, unsigned maxlen)
2261{
2262        unsigned sz = 0;
2263        struct dm_snapshot *snap = ti->private;
2264        unsigned num_features;
2265
2266        switch (type) {
2267        case STATUSTYPE_INFO:
2268
2269                down_write(&snap->lock);
2270
2271                if (!snap->valid)
2272                        DMEMIT("Invalid");
2273                else if (snap->merge_failed)
2274                        DMEMIT("Merge failed");
2275                else if (snap->snapshot_overflowed)
2276                        DMEMIT("Overflow");
2277                else {
2278                        if (snap->store->type->usage) {
2279                                sector_t total_sectors, sectors_allocated,
2280                                         metadata_sectors;
2281                                snap->store->type->usage(snap->store,
2282                                                         &total_sectors,
2283                                                         &sectors_allocated,
2284                                                         &metadata_sectors);
2285                                DMEMIT("%llu/%llu %llu",
2286                                       (unsigned long long)sectors_allocated,
2287                                       (unsigned long long)total_sectors,
2288                                       (unsigned long long)metadata_sectors);
2289                        }
2290                        else
2291                                DMEMIT("Unknown");
2292                }
2293
2294                up_write(&snap->lock);
2295
2296                break;
2297
2298        case STATUSTYPE_TABLE:
2299                /*
2300                 * Report the origin and COW device names, the exception
2301                 * store status and any optional features that were
2302                 * enabled in the constructor.
2303                 */
2304                DMEMIT("%s %s", snap->origin->name, snap->cow->name);
2305                sz += snap->store->type->status(snap->store, type, result + sz,
2306                                                maxlen - sz);
2307                num_features = snap->discard_zeroes_cow + snap->discard_passdown_origin;
2308                if (num_features) {
2309                        DMEMIT(" %u", num_features);
2310                        if (snap->discard_zeroes_cow)
2311                                DMEMIT(" discard_zeroes_cow");
2312                        if (snap->discard_passdown_origin)
2313                                DMEMIT(" discard_passdown_origin");
2314                }
2315                break;
2316        }
2317}
2318
2319static int snapshot_iterate_devices(struct dm_target *ti,
2320                                    iterate_devices_callout_fn fn, void *data)
2321{
2322        struct dm_snapshot *snap = ti->private;
2323        int r;
2324
2325        r = fn(ti, snap->origin, 0, ti->len, data);
2326
2327        if (!r)
2328                r = fn(ti, snap->cow, 0, get_dev_size(snap->cow->bdev), data);
2329
2330        return r;
2331}
2332
2333static void snapshot_io_hints(struct dm_target *ti, struct queue_limits *limits)
2334{
2335        struct dm_snapshot *snap = ti->private;
2336
2337        if (snap->discard_zeroes_cow) {
2338                struct dm_snapshot *snap_src = NULL, *snap_dest = NULL;
2339
2340                down_read(&_origins_lock);
2341
2342                (void) __find_snapshots_sharing_cow(snap, &snap_src, &snap_dest, NULL);
2343                if (snap_src && snap_dest)
2344                        snap = snap_src;
2345
2346                /* All discards are split on a chunk_size boundary */
2347                limits->discard_granularity = snap->store->chunk_size;
2348                limits->max_discard_sectors = snap->store->chunk_size;
2349
2350                up_read(&_origins_lock);
2351        }
2352}
2353
2354/*-----------------------------------------------------------------
2355 * Origin methods
2356 *---------------------------------------------------------------*/
2357
2358/*
2359 * If no exceptions need creating, DM_MAPIO_REMAPPED is returned and any
2360 * supplied bio was ignored.  The caller may submit it immediately.
2361 * (No remapping actually occurs as the origin is always a direct linear
2362 * map.)
2363 *
2364 * If further exceptions are required, DM_MAPIO_SUBMITTED is returned
2365 * and any supplied bio is added to a list to be submitted once all
2366 * the necessary exceptions exist.
2367 */
2368static int __origin_write(struct list_head *snapshots, sector_t sector,
2369                          struct bio *bio)
2370{
2371        int r = DM_MAPIO_REMAPPED;
2372        struct dm_snapshot *snap;
2373        struct dm_exception *e;
2374        struct dm_snap_pending_exception *pe, *pe2;
2375        struct dm_snap_pending_exception *pe_to_start_now = NULL;
2376        struct dm_snap_pending_exception *pe_to_start_last = NULL;
2377        struct dm_exception_table_lock lock;
2378        chunk_t chunk;
2379
2380        /* Do all the snapshots on this origin */
2381        list_for_each_entry (snap, snapshots, list) {
2382                /*
2383                 * Don't make new exceptions in a merging snapshot
2384                 * because it has effectively been deleted
2385                 */
2386                if (dm_target_is_snapshot_merge(snap->ti))
2387                        continue;
2388
2389                /* Nothing to do if writing beyond end of snapshot */
2390                if (sector >= dm_table_get_size(snap->ti->table))
2391                        continue;
2392
2393                /*
2394                 * Remember, different snapshots can have
2395                 * different chunk sizes.
2396                 */
2397                chunk = sector_to_chunk(snap->store, sector);
2398                dm_exception_table_lock_init(snap, chunk, &lock);
2399
2400                down_read(&snap->lock);
2401                dm_exception_table_lock(&lock);
2402
2403                /* Only deal with valid and active snapshots */
2404                if (!snap->valid || !snap->active)
2405                        goto next_snapshot;
2406
2407                pe = __lookup_pending_exception(snap, chunk);
2408                if (!pe) {
2409                        /*
2410                         * Check exception table to see if block is already
2411                         * remapped in this snapshot and trigger an exception
2412                         * if not.
2413                         */
2414                        e = dm_lookup_exception(&snap->complete, chunk);
2415                        if (e)
2416                                goto next_snapshot;
2417
2418                        dm_exception_table_unlock(&lock);
2419                        pe = alloc_pending_exception(snap);
2420                        dm_exception_table_lock(&lock);
2421
2422                        pe2 = __lookup_pending_exception(snap, chunk);
2423
2424                        if (!pe2) {
2425                                e = dm_lookup_exception(&snap->complete, chunk);
2426                                if (e) {
2427                                        free_pending_exception(pe);
2428                                        goto next_snapshot;
2429                                }
2430
2431                                pe = __insert_pending_exception(snap, pe, chunk);
2432                                if (!pe) {
2433                                        dm_exception_table_unlock(&lock);
2434                                        up_read(&snap->lock);
2435
2436                                        invalidate_snapshot(snap, -ENOMEM);
2437                                        continue;
2438                                }
2439                        } else {
2440                                free_pending_exception(pe);
2441                                pe = pe2;
2442                        }
2443                }
2444
2445                r = DM_MAPIO_SUBMITTED;
2446
2447                /*
2448                 * If an origin bio was supplied, queue it to wait for the
2449                 * completion of this exception, and start this one last,
2450                 * at the end of the function.
2451                 */
2452                if (bio) {
2453                        bio_list_add(&pe->origin_bios, bio);
2454                        bio = NULL;
2455
2456                        if (!pe->started) {
2457                                pe->started = 1;
2458                                pe_to_start_last = pe;
2459                        }
2460                }
2461
2462                if (!pe->started) {
2463                        pe->started = 1;
2464                        pe_to_start_now = pe;
2465                }
2466
2467next_snapshot:
2468                dm_exception_table_unlock(&lock);
2469                up_read(&snap->lock);
2470
2471                if (pe_to_start_now) {
2472                        start_copy(pe_to_start_now);
2473                        pe_to_start_now = NULL;
2474                }
2475        }
2476
2477        /*
2478         * Submit the exception against which the bio is queued last,
2479         * to give the other exceptions a head start.
2480         */
2481        if (pe_to_start_last)
2482                start_copy(pe_to_start_last);
2483
2484        return r;
2485}
2486
2487/*
2488 * Called on a write from the origin driver.
2489 */
2490static int do_origin(struct dm_dev *origin, struct bio *bio)
2491{
2492        struct origin *o;
2493        int r = DM_MAPIO_REMAPPED;
2494
2495        down_read(&_origins_lock);
2496        o = __lookup_origin(origin->bdev);
2497        if (o)
2498                r = __origin_write(&o->snapshots, bio->bi_iter.bi_sector, bio);
2499        up_read(&_origins_lock);
2500
2501        return r;
2502}
2503
2504/*
2505 * Trigger exceptions in all non-merging snapshots.
2506 *
2507 * The chunk size of the merging snapshot may be larger than the chunk
2508 * size of some other snapshot so we may need to reallocate multiple
2509 * chunks in other snapshots.
2510 *
2511 * We scan all the overlapping exceptions in the other snapshots.
2512 * Returns 1 if anything was reallocated and must be waited for,
2513 * otherwise returns 0.
2514 *
2515 * size must be a multiple of merging_snap's chunk_size.
2516 */
2517static int origin_write_extent(struct dm_snapshot *merging_snap,
2518                               sector_t sector, unsigned size)
2519{
2520        int must_wait = 0;
2521        sector_t n;
2522        struct origin *o;
2523
2524        /*
2525         * The origin's __minimum_chunk_size() got stored in max_io_len
2526         * by snapshot_merge_resume().
2527         */
2528        down_read(&_origins_lock);
2529        o = __lookup_origin(merging_snap->origin->bdev);
2530        for (n = 0; n < size; n += merging_snap->ti->max_io_len)
2531                if (__origin_write(&o->snapshots, sector + n, NULL) ==
2532                    DM_MAPIO_SUBMITTED)
2533                        must_wait = 1;
2534        up_read(&_origins_lock);
2535
2536        return must_wait;
2537}
2538
2539/*
2540 * Origin: maps a linear range of a device, with hooks for snapshotting.
2541 */
2542
2543/*
2544 * Construct an origin mapping: <dev_path>
2545 * The context for an origin is merely a 'struct dm_dev *'
2546 * pointing to the real device.
2547 */
2548static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
2549{
2550        int r;
2551        struct dm_origin *o;
2552
2553        if (argc != 1) {
2554                ti->error = "origin: incorrect number of arguments";
2555                return -EINVAL;
2556        }
2557
2558        o = kmalloc(sizeof(struct dm_origin), GFP_KERNEL);
2559        if (!o) {
2560                ti->error = "Cannot allocate private origin structure";
2561                r = -ENOMEM;
2562                goto bad_alloc;
2563        }
2564
2565        r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &o->dev);
2566        if (r) {
2567                ti->error = "Cannot get target device";
2568                goto bad_open;
2569        }
2570
2571        o->ti = ti;
2572        ti->private = o;
2573        ti->num_flush_bios = 1;
2574
2575        return 0;
2576
2577bad_open:
2578        kfree(o);
2579bad_alloc:
2580        return r;
2581}
2582
2583static void origin_dtr(struct dm_target *ti)
2584{
2585        struct dm_origin *o = ti->private;
2586
2587        dm_put_device(ti, o->dev);
2588        kfree(o);
2589}
2590
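/*
 * Map I/O to the origin device.  Writes are split so they do not cross a
 * split_boundary (the minimum chunk size of all snapshots), and do_origin()
 * is called so that every snapshot can copy the chunk out before it is
 * overwritten.
 */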
2591static int origin_map(struct dm_target *ti, struct bio *bio)
2592{
2593        struct dm_origin *o = ti->private;
2594        unsigned available_sectors;
2595
2596        bio_set_dev(bio, o->dev->bdev);
2597
2598        if (unlikely(bio->bi_opf & REQ_PREFLUSH))
2599                return DM_MAPIO_REMAPPED;
2600
2601        if (bio_data_dir(bio) != WRITE)
2602                return DM_MAPIO_REMAPPED;
2603
2604        available_sectors = o->split_boundary -
2605                ((unsigned)bio->bi_iter.bi_sector & (o->split_boundary - 1));
2606
2607        if (bio_sectors(bio) > available_sectors)
2608                dm_accept_partial_bio(bio, available_sectors);
2609
2610        /* Only tell snapshots if this is a write */
2611        return do_origin(o->dev, bio);
2612}
2613
2614/*
2615 * Set the origin's "split_boundary" to the minimum of all the snapshots'
2616 * chunk sizes; origin_map() uses it to split writes.
2617 */
2618static void origin_resume(struct dm_target *ti)
2619{
2620        struct dm_origin *o = ti->private;
2621
2622        o->split_boundary = get_origin_minimum_chunksize(o->dev->bdev);
2623
2624        down_write(&_origins_lock);
2625        __insert_dm_origin(o);
2626        up_write(&_origins_lock);
2627}
2628
2629static void origin_postsuspend(struct dm_target *ti)
2630{
2631        struct dm_origin *o = ti->private;
2632
2633        down_write(&_origins_lock);
2634        __remove_dm_origin(o);
2635        up_write(&_origins_lock);
2636}
2637
2638static void origin_status(struct dm_target *ti, status_type_t type,
2639                          unsigned status_flags, char *result, unsigned maxlen)
2640{
2641        struct dm_origin *o = ti->private;
2642
2643        switch (type) {
2644        case STATUSTYPE_INFO:
2645                result[0] = '\0';
2646                break;
2647
2648        case STATUSTYPE_TABLE:
2649                snprintf(result, maxlen, "%s", o->dev->name);
2650                break;
2651        }
2652}
2653
2654static int origin_iterate_devices(struct dm_target *ti,
2655                                  iterate_devices_callout_fn fn, void *data)
2656{
2657        struct dm_origin *o = ti->private;
2658
2659        return fn(ti, o->dev, 0, ti->len, data);
2660}
2661
2662static struct target_type origin_target = {
2663        .name    = "snapshot-origin",
2664        .version = {1, 9, 0},
2665        .module  = THIS_MODULE,
2666        .ctr     = origin_ctr,
2667        .dtr     = origin_dtr,
2668        .map     = origin_map,
2669        .resume  = origin_resume,
2670        .postsuspend = origin_postsuspend,
2671        .status  = origin_status,
2672        .iterate_devices = origin_iterate_devices,
2673};
2674
2675static struct target_type snapshot_target = {
2676        .name    = "snapshot",
2677        .version = {1, 16, 0},
2678        .module  = THIS_MODULE,
2679        .ctr     = snapshot_ctr,
2680        .dtr     = snapshot_dtr,
2681        .map     = snapshot_map,
2682        .end_io  = snapshot_end_io,
2683        .preresume  = snapshot_preresume,
2684        .resume  = snapshot_resume,
2685        .status  = snapshot_status,
2686        .iterate_devices = snapshot_iterate_devices,
2687        .io_hints = snapshot_io_hints,
2688};
2689
2690static struct target_type merge_target = {
2691        .name    = dm_snapshot_merge_target_name,
2692        .version = {1, 5, 0},
2693        .module  = THIS_MODULE,
2694        .ctr     = snapshot_ctr,
2695        .dtr     = snapshot_dtr,
2696        .map     = snapshot_merge_map,
2697        .end_io  = snapshot_end_io,
2698        .presuspend = snapshot_merge_presuspend,
2699        .preresume  = snapshot_preresume,
2700        .resume  = snapshot_merge_resume,
2701        .status  = snapshot_status,
2702        .iterate_devices = snapshot_iterate_devices,
2703        .io_hints = snapshot_io_hints,
2704};
2705
2706static int __init dm_snapshot_init(void)
2707{
2708        int r;
2709
2710        r = dm_exception_store_init();
2711        if (r) {
2712                DMERR("Failed to initialize exception stores");
2713                return r;
2714        }
2715
2716        r = init_origin_hash();
2717        if (r) {
2718                DMERR("init_origin_hash failed.");
2719                goto bad_origin_hash;
2720        }
2721
2722        exception_cache = KMEM_CACHE(dm_exception, 0);
2723        if (!exception_cache) {
2724                DMERR("Couldn't create exception cache.");
2725                r = -ENOMEM;
2726                goto bad_exception_cache;
2727        }
2728
2729        pending_cache = KMEM_CACHE(dm_snap_pending_exception, 0);
2730        if (!pending_cache) {
2731                DMERR("Couldn't create pending cache.");
2732                r = -ENOMEM;
2733                goto bad_pending_cache;
2734        }
2735
2736        r = dm_register_target(&snapshot_target);
2737        if (r < 0) {
2738                DMERR("snapshot target register failed %d", r);
2739                goto bad_register_snapshot_target;
2740        }
2741
2742        r = dm_register_target(&origin_target);
2743        if (r < 0) {
2744                DMERR("Origin target register failed %d", r);
2745                goto bad_register_origin_target;
2746        }
2747
2748        r = dm_register_target(&merge_target);
2749        if (r < 0) {
2750                DMERR("Merge target register failed %d", r);
2751                goto bad_register_merge_target;
2752        }
2753
2754        return 0;
2755
2756bad_register_merge_target:
2757        dm_unregister_target(&origin_target);
2758bad_register_origin_target:
2759        dm_unregister_target(&snapshot_target);
2760bad_register_snapshot_target:
2761        kmem_cache_destroy(pending_cache);
2762bad_pending_cache:
2763        kmem_cache_destroy(exception_cache);
2764bad_exception_cache:
2765        exit_origin_hash();
2766bad_origin_hash:
2767        dm_exception_store_exit();
2768
2769        return r;
2770}
2771
2772static void __exit dm_snapshot_exit(void)
2773{
2774        dm_unregister_target(&snapshot_target);
2775        dm_unregister_target(&origin_target);
2776        dm_unregister_target(&merge_target);
2777
2778        exit_origin_hash();
2779        kmem_cache_destroy(pending_cache);
2780        kmem_cache_destroy(exception_cache);
2781
2782        dm_exception_store_exit();
2783}
2784
2785/* Module hooks */
2786module_init(dm_snapshot_init);
2787module_exit(dm_snapshot_exit);
2788
2789MODULE_DESCRIPTION(DM_NAME " snapshot target");
2790MODULE_AUTHOR("Joe Thornber");
2791MODULE_LICENSE("GPL");
2792MODULE_ALIAS("dm-snapshot-origin");
2793MODULE_ALIAS("dm-snapshot-merge");
2794