linux/drivers/md/dm-thin-metadata.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2011-2012 Red Hat, Inc.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "persistent-data/dm-btree.h"
   9#include "persistent-data/dm-space-map.h"
  10#include "persistent-data/dm-space-map-disk.h"
  11#include "persistent-data/dm-transaction-manager.h"
  12
  13#include <linux/list.h>
  14#include <linux/device-mapper.h>
  15#include <linux/workqueue.h>
  16
  17/*--------------------------------------------------------------------------
  18 * As far as the metadata goes, there is:
  19 *
  20 * - A superblock in block zero, taking up fewer than 512 bytes for
  21 *   atomic writes.
  22 *
  23 * - A space map managing the metadata blocks.
  24 *
  25 * - A space map managing the data blocks.
  26 *
  27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
  28 *
  29 * - A hierarchical btree, with 2 levels which effectively maps (thin
  30 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
   *   field holding the time in the low 24 bits, and block in the top 40
  32 *   bits.
  33 *
  34 * BTrees consist solely of btree_nodes, that fill a block.  Some are
  35 * internal nodes, as such their values are a __le64 pointing to other
  36 * nodes.  Leaf nodes can store data of any reasonable size (ie. much
  37 * smaller than the block size).  The nodes consist of the header,
  38 * followed by an array of keys, followed by an array of values.  We have
  39 * to binary search on the keys so they're all held together to help the
  40 * cpu cache.
  41 *
  42 * Space maps have 2 btrees:
  43 *
  44 * - One maps a uint64_t onto a struct index_entry.  Which points to a
  45 *   bitmap block, and has some details about how many free entries there
  46 *   are etc.
  47 *
  48 * - The bitmap blocks have a header (for the checksum).  Then the rest
  49 *   of the block is pairs of bits.  With the meaning being:
  50 *
  51 *   0 - ref count is 0
  52 *   1 - ref count is 1
  53 *   2 - ref count is 2
  54 *   3 - ref count is higher than 2
  55 *
  56 * - If the count is higher than 2 then the ref count is entered in a
  57 *   second btree that directly maps the block_address to a uint32_t ref
  58 *   count.
  59 *
  60 * The space map metadata variant doesn't have a bitmaps btree.  Instead
   * it has a single block's worth of index_entries.  This avoids
  62 * recursive issues with the bitmap btree needing to allocate space in
  63 * order to insert.  With a small data block size such as 64k the
   * metadata supports data devices that are hundreds of terabytes.
  65 *
  66 * The space maps allocate space linearly from front to back.  Space that
  67 * is freed in a transaction is never recycled within that transaction.
  68 * To try and avoid fragmenting _free_ space the allocator always goes
  69 * back and fills in gaps.
  70 *
  71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
  72 * from the block manager.
  73 *--------------------------------------------------------------------------*/
  74
  /*
   * Message prefix for this file; presumably consumed by the DMERR()/DMWARN()
   * logging macros used below (they are defined elsewhere - confirm).
   */
  #define DM_MSG_PREFIX   "thin metadata"

  /* Magic value sb_check() requires in the superblock. */
  #define THIN_SUPERBLOCK_MAGIC           27022010
  /* Metadata block number at which the superblock is locked. */
  #define THIN_SUPERBLOCK_LOCATION        0
  /* Version written into a freshly formatted superblock. */
  #define THIN_VERSION                    2
  /* Cache size argument handed to dm_block_manager_create(). */
  #define THIN_METADATA_CACHE_SIZE        64
  /* Right shift applied to a sector count to get metadata_nr_blocks. */
  #define SECTOR_TO_BLOCK_SHIFT           3

  /*
   * Maximum concurrent locks requested from the block manager:
   *  3 for btree insert +
   *  2 for btree lookup used within space map
   */
  #define THIN_MAX_CONCURRENT_LOCKS       5

  /* Bytes reserved for each space map root; this should be plenty. */
  #define SPACE_MAP_ROOT_SIZE             128
  91
  92/*
  93 * Little endian on-disk superblock and device details.
  94 */
  95struct thin_disk_superblock {
  96        __le32 csum;    /* Checksum of superblock except for this field. */
  97        __le32 flags;
  98        __le64 blocknr; /* This block number, dm_block_t. */
  99
 100        __u8 uuid[16];
 101        __le64 magic;
 102        __le32 version;
 103        __le32 time;
 104
 105        __le64 trans_id;
 106
 107        /*
 108         * Root held by userspace transactions.
 109         */
 110        __le64 held_root;
 111
 112        __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
 113        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 114
 115        /*
 116         * 2-level btree mapping (dev_id, (dev block, time)) -> data block
 117         */
 118        __le64 data_mapping_root;
 119
 120        /*
 121         * Device detail root mapping dev_id -> device_details
 122         */
 123        __le64 device_details_root;
 124
 125        __le32 data_block_size;         /* In 512-byte sectors. */
 126
 127        __le32 metadata_block_size;     /* In 512-byte sectors. */
 128        __le64 metadata_nr_blocks;
 129
 130        __le32 compat_flags;
 131        __le32 compat_ro_flags;
 132        __le32 incompat_flags;
 133} __packed;
 134
 135struct disk_device_details {
 136        __le64 mapped_blocks;
 137        __le64 transaction_id;          /* When created. */
 138        __le32 creation_time;
 139        __le32 snapshotted_time;
 140} __packed;
 141
 142struct dm_pool_metadata {
 143        struct hlist_node hash;
 144
 145        struct block_device *bdev;
 146        struct dm_block_manager *bm;
 147        struct dm_space_map *metadata_sm;
 148        struct dm_space_map *data_sm;
 149        struct dm_transaction_manager *tm;
 150        struct dm_transaction_manager *nb_tm;
 151
 152        /*
 153         * Two-level btree.
 154         * First level holds thin_dev_t.
 155         * Second level holds mappings.
 156         */
 157        struct dm_btree_info info;
 158
 159        /*
 160         * Non-blocking version of the above.
 161         */
 162        struct dm_btree_info nb_info;
 163
 164        /*
 165         * Just the top level for deleting whole devices.
 166         */
 167        struct dm_btree_info tl_info;
 168
 169        /*
 170         * Just the bottom level for creating new devices.
 171         */
 172        struct dm_btree_info bl_info;
 173
 174        /*
 175         * Describes the device details btree.
 176         */
 177        struct dm_btree_info details_info;
 178
 179        struct rw_semaphore root_lock;
 180        uint32_t time;
 181        dm_block_t root;
 182        dm_block_t details_root;
 183        struct list_head thin_devices;
 184        uint64_t trans_id;
 185        unsigned long flags;
 186        sector_t data_block_size;
 187
 188        /*
 189         * Set if a transaction has to be aborted but the attempt to roll back
 190         * to the previous (good) transaction failed.  The only pool metadata
 191         * operation possible in this state is the closing of the device.
 192         */
 193        bool fail_io:1;
 194
 195        /*
 196         * Reading the space map roots can fail, so we read it into these
 197         * buffers before the superblock is locked and updated.
 198         */
 199        __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
 200        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 201};
 202
 203struct dm_thin_device {
 204        struct list_head list;
 205        struct dm_pool_metadata *pmd;
 206        dm_thin_id id;
 207
 208        int open_count;
 209        bool changed:1;
 210        bool aborted_with_changes:1;
 211        uint64_t mapped_blocks;
 212        uint64_t transaction_id;
 213        uint32_t creation_time;
 214        uint32_t snapshotted_time;
 215};
 216
 217/*----------------------------------------------------------------
 218 * superblock validator
 219 *--------------------------------------------------------------*/
 220
 221#define SUPERBLOCK_CSUM_XOR 160774
 222
 223static void sb_prepare_for_write(struct dm_block_validator *v,
 224                                 struct dm_block *b,
 225                                 size_t block_size)
 226{
 227        struct thin_disk_superblock *disk_super = dm_block_data(b);
 228
 229        disk_super->blocknr = cpu_to_le64(dm_block_location(b));
 230        disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
 231                                                      block_size - sizeof(__le32),
 232                                                      SUPERBLOCK_CSUM_XOR));
 233}
 234
 235static int sb_check(struct dm_block_validator *v,
 236                    struct dm_block *b,
 237                    size_t block_size)
 238{
 239        struct thin_disk_superblock *disk_super = dm_block_data(b);
 240        __le32 csum_le;
 241
 242        if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
 243                DMERR("sb_check failed: blocknr %llu: "
 244                      "wanted %llu", le64_to_cpu(disk_super->blocknr),
 245                      (unsigned long long)dm_block_location(b));
 246                return -ENOTBLK;
 247        }
 248
 249        if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
 250                DMERR("sb_check failed: magic %llu: "
 251                      "wanted %llu", le64_to_cpu(disk_super->magic),
 252                      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
 253                return -EILSEQ;
 254        }
 255
 256        csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
 257                                             block_size - sizeof(__le32),
 258                                             SUPERBLOCK_CSUM_XOR));
 259        if (csum_le != disk_super->csum) {
 260                DMERR("sb_check failed: csum %u: wanted %u",
 261                      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
 262                return -EILSEQ;
 263        }
 264
 265        return 0;
 266}
 267
 268static struct dm_block_validator sb_validator = {
 269        .name = "superblock",
 270        .prepare_for_write = sb_prepare_for_write,
 271        .check = sb_check
 272};
 273
 274/*----------------------------------------------------------------
 275 * Methods for the btree value types
 276 *--------------------------------------------------------------*/
 277
 278static uint64_t pack_block_time(dm_block_t b, uint32_t t)
 279{
 280        return (b << 24) | t;
 281}
 282
 283static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
 284{
 285        *b = v >> 24;
 286        *t = v & ((1 << 24) - 1);
 287}
 288
 289static void data_block_inc(void *context, const void *value_le)
 290{
 291        struct dm_space_map *sm = context;
 292        __le64 v_le;
 293        uint64_t b;
 294        uint32_t t;
 295
 296        memcpy(&v_le, value_le, sizeof(v_le));
 297        unpack_block_time(le64_to_cpu(v_le), &b, &t);
 298        dm_sm_inc_block(sm, b);
 299}
 300
 301static void data_block_dec(void *context, const void *value_le)
 302{
 303        struct dm_space_map *sm = context;
 304        __le64 v_le;
 305        uint64_t b;
 306        uint32_t t;
 307
 308        memcpy(&v_le, value_le, sizeof(v_le));
 309        unpack_block_time(le64_to_cpu(v_le), &b, &t);
 310        dm_sm_dec_block(sm, b);
 311}
 312
 313static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
 314{
 315        __le64 v1_le, v2_le;
 316        uint64_t b1, b2;
 317        uint32_t t;
 318
 319        memcpy(&v1_le, value1_le, sizeof(v1_le));
 320        memcpy(&v2_le, value2_le, sizeof(v2_le));
 321        unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
 322        unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
 323
 324        return b1 == b2;
 325}
 326
 327static void subtree_inc(void *context, const void *value)
 328{
 329        struct dm_btree_info *info = context;
 330        __le64 root_le;
 331        uint64_t root;
 332
 333        memcpy(&root_le, value, sizeof(root_le));
 334        root = le64_to_cpu(root_le);
 335        dm_tm_inc(info->tm, root);
 336}
 337
 338static void subtree_dec(void *context, const void *value)
 339{
 340        struct dm_btree_info *info = context;
 341        __le64 root_le;
 342        uint64_t root;
 343
 344        memcpy(&root_le, value, sizeof(root_le));
 345        root = le64_to_cpu(root_le);
 346        if (dm_btree_del(info, root))
 347                DMERR("btree delete failed");
 348}
 349
 350static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
 351{
 352        __le64 v1_le, v2_le;
 353        memcpy(&v1_le, value1_le, sizeof(v1_le));
 354        memcpy(&v2_le, value2_le, sizeof(v2_le));
 355
 356        return v1_le == v2_le;
 357}
 358
 359/*----------------------------------------------------------------*/
 360
 361static int superblock_lock_zero(struct dm_pool_metadata *pmd,
 362                                struct dm_block **sblock)
 363{
 364        return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 365                                     &sb_validator, sblock);
 366}
 367
 368static int superblock_lock(struct dm_pool_metadata *pmd,
 369                           struct dm_block **sblock)
 370{
 371        return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 372                                &sb_validator, sblock);
 373}
 374
 375static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
 376{
 377        int r;
 378        unsigned i;
 379        struct dm_block *b;
 380        __le64 *data_le, zero = cpu_to_le64(0);
 381        unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
 382
 383        /*
 384         * We can't use a validator here - it may be all zeroes.
 385         */
 386        r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
 387        if (r)
 388                return r;
 389
 390        data_le = dm_block_data(b);
 391        *result = 1;
 392        for (i = 0; i < block_size; i++) {
 393                if (data_le[i] != zero) {
 394                        *result = 0;
 395                        break;
 396                }
 397        }
 398
 399        dm_bm_unlock(b);
 400
 401        return 0;
 402}
 403
 404static void __setup_btree_details(struct dm_pool_metadata *pmd)
 405{
 406        pmd->info.tm = pmd->tm;
 407        pmd->info.levels = 2;
 408        pmd->info.value_type.context = pmd->data_sm;
 409        pmd->info.value_type.size = sizeof(__le64);
 410        pmd->info.value_type.inc = data_block_inc;
 411        pmd->info.value_type.dec = data_block_dec;
 412        pmd->info.value_type.equal = data_block_equal;
 413
 414        memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
 415        pmd->nb_info.tm = pmd->nb_tm;
 416
 417        pmd->tl_info.tm = pmd->tm;
 418        pmd->tl_info.levels = 1;
 419        pmd->tl_info.value_type.context = &pmd->bl_info;
 420        pmd->tl_info.value_type.size = sizeof(__le64);
 421        pmd->tl_info.value_type.inc = subtree_inc;
 422        pmd->tl_info.value_type.dec = subtree_dec;
 423        pmd->tl_info.value_type.equal = subtree_equal;
 424
 425        pmd->bl_info.tm = pmd->tm;
 426        pmd->bl_info.levels = 1;
 427        pmd->bl_info.value_type.context = pmd->data_sm;
 428        pmd->bl_info.value_type.size = sizeof(__le64);
 429        pmd->bl_info.value_type.inc = data_block_inc;
 430        pmd->bl_info.value_type.dec = data_block_dec;
 431        pmd->bl_info.value_type.equal = data_block_equal;
 432
 433        pmd->details_info.tm = pmd->tm;
 434        pmd->details_info.levels = 1;
 435        pmd->details_info.value_type.context = NULL;
 436        pmd->details_info.value_type.size = sizeof(struct disk_device_details);
 437        pmd->details_info.value_type.inc = NULL;
 438        pmd->details_info.value_type.dec = NULL;
 439        pmd->details_info.value_type.equal = NULL;
 440}
 441
 442static int save_sm_roots(struct dm_pool_metadata *pmd)
 443{
 444        int r;
 445        size_t len;
 446
 447        r = dm_sm_root_size(pmd->metadata_sm, &len);
 448        if (r < 0)
 449                return r;
 450
 451        r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
 452        if (r < 0)
 453                return r;
 454
 455        r = dm_sm_root_size(pmd->data_sm, &len);
 456        if (r < 0)
 457                return r;
 458
 459        return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
 460}
 461
 462static void copy_sm_roots(struct dm_pool_metadata *pmd,
 463                          struct thin_disk_superblock *disk)
 464{
 465        memcpy(&disk->metadata_space_map_root,
 466               &pmd->metadata_space_map_root,
 467               sizeof(pmd->metadata_space_map_root));
 468
 469        memcpy(&disk->data_space_map_root,
 470               &pmd->data_space_map_root,
 471               sizeof(pmd->data_space_map_root));
 472}
 473
 474static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 475{
 476        int r;
 477        struct dm_block *sblock;
 478        struct thin_disk_superblock *disk_super;
 479        sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
 480
 481        if (bdev_size > THIN_METADATA_MAX_SECTORS)
 482                bdev_size = THIN_METADATA_MAX_SECTORS;
 483
 484        r = dm_sm_commit(pmd->data_sm);
 485        if (r < 0)
 486                return r;
 487
 488        r = save_sm_roots(pmd);
 489        if (r < 0)
 490                return r;
 491
 492        r = dm_tm_pre_commit(pmd->tm);
 493        if (r < 0)
 494                return r;
 495
 496        r = superblock_lock_zero(pmd, &sblock);
 497        if (r)
 498                return r;
 499
 500        disk_super = dm_block_data(sblock);
 501        disk_super->flags = 0;
 502        memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
 503        disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
 504        disk_super->version = cpu_to_le32(THIN_VERSION);
 505        disk_super->time = 0;
 506        disk_super->trans_id = 0;
 507        disk_super->held_root = 0;
 508
 509        copy_sm_roots(pmd, disk_super);
 510
 511        disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 512        disk_super->device_details_root = cpu_to_le64(pmd->details_root);
 513        disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
 514        disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 515        disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
 516
 517        return dm_tm_commit(pmd->tm, sblock);
 518}
 519
 520static int __format_metadata(struct dm_pool_metadata *pmd)
 521{
 522        int r;
 523
 524        r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 525                                 &pmd->tm, &pmd->metadata_sm);
 526        if (r < 0) {
 527                DMERR("tm_create_with_sm failed");
 528                return r;
 529        }
 530
 531        pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
 532        if (IS_ERR(pmd->data_sm)) {
 533                DMERR("sm_disk_create failed");
 534                r = PTR_ERR(pmd->data_sm);
 535                goto bad_cleanup_tm;
 536        }
 537
 538        pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
 539        if (!pmd->nb_tm) {
 540                DMERR("could not create non-blocking clone tm");
 541                r = -ENOMEM;
 542                goto bad_cleanup_data_sm;
 543        }
 544
 545        __setup_btree_details(pmd);
 546
 547        r = dm_btree_empty(&pmd->info, &pmd->root);
 548        if (r < 0)
 549                goto bad_cleanup_nb_tm;
 550
 551        r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
 552        if (r < 0) {
 553                DMERR("couldn't create devices root");
 554                goto bad_cleanup_nb_tm;
 555        }
 556
 557        r = __write_initial_superblock(pmd);
 558        if (r)
 559                goto bad_cleanup_nb_tm;
 560
 561        return 0;
 562
 563bad_cleanup_nb_tm:
 564        dm_tm_destroy(pmd->nb_tm);
 565bad_cleanup_data_sm:
 566        dm_sm_destroy(pmd->data_sm);
 567bad_cleanup_tm:
 568        dm_tm_destroy(pmd->tm);
 569        dm_sm_destroy(pmd->metadata_sm);
 570
 571        return r;
 572}
 573
 574static int __check_incompat_features(struct thin_disk_superblock *disk_super,
 575                                     struct dm_pool_metadata *pmd)
 576{
 577        uint32_t features;
 578
 579        features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
 580        if (features) {
 581                DMERR("could not access metadata due to unsupported optional features (%lx).",
 582                      (unsigned long)features);
 583                return -EINVAL;
 584        }
 585
 586        /*
 587         * Check for read-only metadata to skip the following RDWR checks.
 588         */
 589        if (get_disk_ro(pmd->bdev->bd_disk))
 590                return 0;
 591
 592        features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
 593        if (features) {
 594                DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
 595                      (unsigned long)features);
 596                return -EINVAL;
 597        }
 598
 599        return 0;
 600}
 601
 602static int __open_metadata(struct dm_pool_metadata *pmd)
 603{
 604        int r;
 605        struct dm_block *sblock;
 606        struct thin_disk_superblock *disk_super;
 607
 608        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 609                            &sb_validator, &sblock);
 610        if (r < 0) {
 611                DMERR("couldn't read superblock");
 612                return r;
 613        }
 614
 615        disk_super = dm_block_data(sblock);
 616
 617        /* Verify the data block size hasn't changed */
 618        if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
 619                DMERR("changing the data block size (from %u to %llu) is not supported",
 620                      le32_to_cpu(disk_super->data_block_size),
 621                      (unsigned long long)pmd->data_block_size);
 622                r = -EINVAL;
 623                goto bad_unlock_sblock;
 624        }
 625
 626        r = __check_incompat_features(disk_super, pmd);
 627        if (r < 0)
 628                goto bad_unlock_sblock;
 629
 630        r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 631                               disk_super->metadata_space_map_root,
 632                               sizeof(disk_super->metadata_space_map_root),
 633                               &pmd->tm, &pmd->metadata_sm);
 634        if (r < 0) {
 635                DMERR("tm_open_with_sm failed");
 636                goto bad_unlock_sblock;
 637        }
 638
 639        pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
 640                                       sizeof(disk_super->data_space_map_root));
 641        if (IS_ERR(pmd->data_sm)) {
 642                DMERR("sm_disk_open failed");
 643                r = PTR_ERR(pmd->data_sm);
 644                goto bad_cleanup_tm;
 645        }
 646
 647        pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
 648        if (!pmd->nb_tm) {
 649                DMERR("could not create non-blocking clone tm");
 650                r = -ENOMEM;
 651                goto bad_cleanup_data_sm;
 652        }
 653
 654        __setup_btree_details(pmd);
 655        dm_bm_unlock(sblock);
 656
 657        return 0;
 658
 659bad_cleanup_data_sm:
 660        dm_sm_destroy(pmd->data_sm);
 661bad_cleanup_tm:
 662        dm_tm_destroy(pmd->tm);
 663        dm_sm_destroy(pmd->metadata_sm);
 664bad_unlock_sblock:
 665        dm_bm_unlock(sblock);
 666
 667        return r;
 668}
 669
 670static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
 671{
 672        int r, unformatted;
 673
 674        r = __superblock_all_zeroes(pmd->bm, &unformatted);
 675        if (r)
 676                return r;
 677
 678        if (unformatted)
 679                return format_device ? __format_metadata(pmd) : -EPERM;
 680
 681        return __open_metadata(pmd);
 682}
 683
 684static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
 685{
 686        int r;
 687
 688        pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 689                                          THIN_METADATA_CACHE_SIZE,
 690                                          THIN_MAX_CONCURRENT_LOCKS);
 691        if (IS_ERR(pmd->bm)) {
 692                DMERR("could not create block manager");
 693                return PTR_ERR(pmd->bm);
 694        }
 695
 696        r = __open_or_format_metadata(pmd, format_device);
 697        if (r)
 698                dm_block_manager_destroy(pmd->bm);
 699
 700        return r;
 701}
 702
 703static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
 704{
 705        dm_sm_destroy(pmd->data_sm);
 706        dm_sm_destroy(pmd->metadata_sm);
 707        dm_tm_destroy(pmd->nb_tm);
 708        dm_tm_destroy(pmd->tm);
 709        dm_block_manager_destroy(pmd->bm);
 710}
 711
 712static int __begin_transaction(struct dm_pool_metadata *pmd)
 713{
 714        int r;
 715        struct thin_disk_superblock *disk_super;
 716        struct dm_block *sblock;
 717
 718        /*
 719         * We re-read the superblock every time.  Shouldn't need to do this
 720         * really.
 721         */
 722        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 723                            &sb_validator, &sblock);
 724        if (r)
 725                return r;
 726
 727        disk_super = dm_block_data(sblock);
 728        pmd->time = le32_to_cpu(disk_super->time);
 729        pmd->root = le64_to_cpu(disk_super->data_mapping_root);
 730        pmd->details_root = le64_to_cpu(disk_super->device_details_root);
 731        pmd->trans_id = le64_to_cpu(disk_super->trans_id);
 732        pmd->flags = le32_to_cpu(disk_super->flags);
 733        pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
 734
 735        dm_bm_unlock(sblock);
 736        return 0;
 737}
 738
 739static int __write_changed_details(struct dm_pool_metadata *pmd)
 740{
 741        int r;
 742        struct dm_thin_device *td, *tmp;
 743        struct disk_device_details details;
 744        uint64_t key;
 745
 746        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
 747                if (!td->changed)
 748                        continue;
 749
 750                key = td->id;
 751
 752                details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
 753                details.transaction_id = cpu_to_le64(td->transaction_id);
 754                details.creation_time = cpu_to_le32(td->creation_time);
 755                details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
 756                __dm_bless_for_disk(&details);
 757
 758                r = dm_btree_insert(&pmd->details_info, pmd->details_root,
 759                                    &key, &details, &pmd->details_root);
 760                if (r)
 761                        return r;
 762
 763                if (td->open_count)
 764                        td->changed = 0;
 765                else {
 766                        list_del(&td->list);
 767                        kfree(td);
 768                }
 769        }
 770
 771        return 0;
 772}
 773
 774static int __commit_transaction(struct dm_pool_metadata *pmd)
 775{
 776        int r;
 777        size_t metadata_len, data_len;
 778        struct thin_disk_superblock *disk_super;
 779        struct dm_block *sblock;
 780
 781        /*
 782         * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
 783         */
 784        BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
 785
 786        r = __write_changed_details(pmd);
 787        if (r < 0)
 788                return r;
 789
 790        r = dm_sm_commit(pmd->data_sm);
 791        if (r < 0)
 792                return r;
 793
 794        r = dm_tm_pre_commit(pmd->tm);
 795        if (r < 0)
 796                return r;
 797
 798        r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
 799        if (r < 0)
 800                return r;
 801
 802        r = dm_sm_root_size(pmd->data_sm, &data_len);
 803        if (r < 0)
 804                return r;
 805
 806        r = save_sm_roots(pmd);
 807        if (r < 0)
 808                return r;
 809
 810        r = superblock_lock(pmd, &sblock);
 811        if (r)
 812                return r;
 813
 814        disk_super = dm_block_data(sblock);
 815        disk_super->time = cpu_to_le32(pmd->time);
 816        disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 817        disk_super->device_details_root = cpu_to_le64(pmd->details_root);
 818        disk_super->trans_id = cpu_to_le64(pmd->trans_id);
 819        disk_super->flags = cpu_to_le32(pmd->flags);
 820
 821        copy_sm_roots(pmd, disk_super);
 822
 823        return dm_tm_commit(pmd->tm, sblock);
 824}
 825
 826struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 827                                               sector_t data_block_size,
 828                                               bool format_device)
 829{
 830        int r;
 831        struct dm_pool_metadata *pmd;
 832
 833        pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
 834        if (!pmd) {
 835                DMERR("could not allocate metadata struct");
 836                return ERR_PTR(-ENOMEM);
 837        }
 838
 839        init_rwsem(&pmd->root_lock);
 840        pmd->time = 0;
 841        INIT_LIST_HEAD(&pmd->thin_devices);
 842        pmd->fail_io = false;
 843        pmd->bdev = bdev;
 844        pmd->data_block_size = data_block_size;
 845
 846        r = __create_persistent_data_objects(pmd, format_device);
 847        if (r) {
 848                kfree(pmd);
 849                return ERR_PTR(r);
 850        }
 851
 852        r = __begin_transaction(pmd);
 853        if (r < 0) {
 854                if (dm_pool_metadata_close(pmd) < 0)
 855                        DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
 856                return ERR_PTR(r);
 857        }
 858
 859        return pmd;
 860}
 861
 862int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 863{
 864        int r;
 865        unsigned open_devices = 0;
 866        struct dm_thin_device *td, *tmp;
 867
 868        down_read(&pmd->root_lock);
 869        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
 870                if (td->open_count)
 871                        open_devices++;
 872                else {
 873                        list_del(&td->list);
 874                        kfree(td);
 875                }
 876        }
 877        up_read(&pmd->root_lock);
 878
 879        if (open_devices) {
 880                DMERR("attempt to close pmd when %u device(s) are still open",
 881                       open_devices);
 882                return -EBUSY;
 883        }
 884
 885        if (!dm_bm_is_read_only(pmd->bm) && !pmd->fail_io) {
 886                r = __commit_transaction(pmd);
 887                if (r < 0)
 888                        DMWARN("%s: __commit_transaction() failed, error = %d",
 889                               __func__, r);
 890        }
 891
 892        if (!pmd->fail_io)
 893                __destroy_persistent_data_objects(pmd);
 894
 895        kfree(pmd);
 896        return 0;
 897}
 898
 899/*
 900 * __open_device: Returns @td corresponding to device with id @dev,
 901 * creating it if @create is set and incrementing @td->open_count.
 902 * On failure, @td is undefined.
 903 */
 904static int __open_device(struct dm_pool_metadata *pmd,
 905                         dm_thin_id dev, int create,
 906                         struct dm_thin_device **td)
 907{
 908        int r, changed = 0;
 909        struct dm_thin_device *td2;
 910        uint64_t key = dev;
 911        struct disk_device_details details_le;
 912
 913        /*
 914         * If the device is already open, return it.
 915         */
 916        list_for_each_entry(td2, &pmd->thin_devices, list)
 917                if (td2->id == dev) {
 918                        /*
 919                         * May not create an already-open device.
 920                         */
 921                        if (create)
 922                                return -EEXIST;
 923
 924                        td2->open_count++;
 925                        *td = td2;
 926                        return 0;
 927                }
 928
 929        /*
 930         * Check the device exists.
 931         */
 932        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
 933                            &key, &details_le);
 934        if (r) {
 935                if (r != -ENODATA || !create)
 936                        return r;
 937
 938                /*
 939                 * Create new device.
 940                 */
 941                changed = 1;
 942                details_le.mapped_blocks = 0;
 943                details_le.transaction_id = cpu_to_le64(pmd->trans_id);
 944                details_le.creation_time = cpu_to_le32(pmd->time);
 945                details_le.snapshotted_time = cpu_to_le32(pmd->time);
 946        }
 947
 948        *td = kmalloc(sizeof(**td), GFP_NOIO);
 949        if (!*td)
 950                return -ENOMEM;
 951
 952        (*td)->pmd = pmd;
 953        (*td)->id = dev;
 954        (*td)->open_count = 1;
 955        (*td)->changed = changed;
 956        (*td)->aborted_with_changes = false;
 957        (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
 958        (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
 959        (*td)->creation_time = le32_to_cpu(details_le.creation_time);
 960        (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
 961
 962        list_add(&(*td)->list, &pmd->thin_devices);
 963
 964        return 0;
 965}
 966
 967static void __close_device(struct dm_thin_device *td)
 968{
 969        --td->open_count;
 970}
 971
 972static int __create_thin(struct dm_pool_metadata *pmd,
 973                         dm_thin_id dev)
 974{
 975        int r;
 976        dm_block_t dev_root;
 977        uint64_t key = dev;
 978        struct disk_device_details details_le;
 979        struct dm_thin_device *td;
 980        __le64 value;
 981
 982        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
 983                            &key, &details_le);
 984        if (!r)
 985                return -EEXIST;
 986
 987        /*
 988         * Create an empty btree for the mappings.
 989         */
 990        r = dm_btree_empty(&pmd->bl_info, &dev_root);
 991        if (r)
 992                return r;
 993
 994        /*
 995         * Insert it into the main mapping tree.
 996         */
 997        value = cpu_to_le64(dev_root);
 998        __dm_bless_for_disk(&value);
 999        r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1000        if (r) {
1001                dm_btree_del(&pmd->bl_info, dev_root);
1002                return r;
1003        }
1004
1005        r = __open_device(pmd, dev, 1, &td);
1006        if (r) {
1007                dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1008                dm_btree_del(&pmd->bl_info, dev_root);
1009                return r;
1010        }
1011        __close_device(td);
1012
1013        return r;
1014}
1015
1016int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1017{
1018        int r = -EINVAL;
1019
1020        down_write(&pmd->root_lock);
1021        if (!pmd->fail_io)
1022                r = __create_thin(pmd, dev);
1023        up_write(&pmd->root_lock);
1024
1025        return r;
1026}
1027
1028static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1029                                  struct dm_thin_device *snap,
1030                                  dm_thin_id origin, uint32_t time)
1031{
1032        int r;
1033        struct dm_thin_device *td;
1034
1035        r = __open_device(pmd, origin, 0, &td);
1036        if (r)
1037                return r;
1038
1039        td->changed = 1;
1040        td->snapshotted_time = time;
1041
1042        snap->mapped_blocks = td->mapped_blocks;
1043        snap->snapshotted_time = time;
1044        __close_device(td);
1045
1046        return 0;
1047}
1048
1049static int __create_snap(struct dm_pool_metadata *pmd,
1050                         dm_thin_id dev, dm_thin_id origin)
1051{
1052        int r;
1053        dm_block_t origin_root;
1054        uint64_t key = origin, dev_key = dev;
1055        struct dm_thin_device *td;
1056        struct disk_device_details details_le;
1057        __le64 value;
1058
1059        /* check this device is unused */
1060        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1061                            &dev_key, &details_le);
1062        if (!r)
1063                return -EEXIST;
1064
1065        /* find the mapping tree for the origin */
1066        r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1067        if (r)
1068                return r;
1069        origin_root = le64_to_cpu(value);
1070
1071        /* clone the origin, an inc will do */
1072        dm_tm_inc(pmd->tm, origin_root);
1073
1074        /* insert into the main mapping tree */
1075        value = cpu_to_le64(origin_root);
1076        __dm_bless_for_disk(&value);
1077        key = dev;
1078        r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1079        if (r) {
1080                dm_tm_dec(pmd->tm, origin_root);
1081                return r;
1082        }
1083
1084        pmd->time++;
1085
1086        r = __open_device(pmd, dev, 1, &td);
1087        if (r)
1088                goto bad;
1089
1090        r = __set_snapshot_details(pmd, td, origin, pmd->time);
1091        __close_device(td);
1092
1093        if (r)
1094                goto bad;
1095
1096        return 0;
1097
1098bad:
1099        dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1100        dm_btree_remove(&pmd->details_info, pmd->details_root,
1101                        &key, &pmd->details_root);
1102        return r;
1103}
1104
1105int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1106                                 dm_thin_id dev,
1107                                 dm_thin_id origin)
1108{
1109        int r = -EINVAL;
1110
1111        down_write(&pmd->root_lock);
1112        if (!pmd->fail_io)
1113                r = __create_snap(pmd, dev, origin);
1114        up_write(&pmd->root_lock);
1115
1116        return r;
1117}
1118
1119static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1120{
1121        int r;
1122        uint64_t key = dev;
1123        struct dm_thin_device *td;
1124
1125        /* TODO: failure should mark the transaction invalid */
1126        r = __open_device(pmd, dev, 0, &td);
1127        if (r)
1128                return r;
1129
1130        if (td->open_count > 1) {
1131                __close_device(td);
1132                return -EBUSY;
1133        }
1134
1135        list_del(&td->list);
1136        kfree(td);
1137        r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1138                            &key, &pmd->details_root);
1139        if (r)
1140                return r;
1141
1142        r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1143        if (r)
1144                return r;
1145
1146        return 0;
1147}
1148
1149int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1150                               dm_thin_id dev)
1151{
1152        int r = -EINVAL;
1153
1154        down_write(&pmd->root_lock);
1155        if (!pmd->fail_io)
1156                r = __delete_device(pmd, dev);
1157        up_write(&pmd->root_lock);
1158
1159        return r;
1160}
1161
1162int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1163                                        uint64_t current_id,
1164                                        uint64_t new_id)
1165{
1166        int r = -EINVAL;
1167
1168        down_write(&pmd->root_lock);
1169
1170        if (pmd->fail_io)
1171                goto out;
1172
1173        if (pmd->trans_id != current_id) {
1174                DMERR("mismatched transaction id");
1175                goto out;
1176        }
1177
1178        pmd->trans_id = new_id;
1179        r = 0;
1180
1181out:
1182        up_write(&pmd->root_lock);
1183
1184        return r;
1185}
1186
1187int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1188                                        uint64_t *result)
1189{
1190        int r = -EINVAL;
1191
1192        down_read(&pmd->root_lock);
1193        if (!pmd->fail_io) {
1194                *result = pmd->trans_id;
1195                r = 0;
1196        }
1197        up_read(&pmd->root_lock);
1198
1199        return r;
1200}
1201
1202static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1203{
1204        int r, inc;
1205        struct thin_disk_superblock *disk_super;
1206        struct dm_block *copy, *sblock;
1207        dm_block_t held_root;
1208
1209        /*
1210         * We commit to ensure the btree roots which we increment in a
1211         * moment are up to date.
1212         */
1213        __commit_transaction(pmd);
1214
1215        /*
1216         * Copy the superblock.
1217         */
1218        dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1219        r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1220                               &sb_validator, &copy, &inc);
1221        if (r)
1222                return r;
1223
1224        BUG_ON(!inc);
1225
1226        held_root = dm_block_location(copy);
1227        disk_super = dm_block_data(copy);
1228
1229        if (le64_to_cpu(disk_super->held_root)) {
1230                DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1231
1232                dm_tm_dec(pmd->tm, held_root);
1233                dm_tm_unlock(pmd->tm, copy);
1234                return -EBUSY;
1235        }
1236
1237        /*
1238         * Wipe the spacemap since we're not publishing this.
1239         */
1240        memset(&disk_super->data_space_map_root, 0,
1241               sizeof(disk_super->data_space_map_root));
1242        memset(&disk_super->metadata_space_map_root, 0,
1243               sizeof(disk_super->metadata_space_map_root));
1244
1245        /*
1246         * Increment the data structures that need to be preserved.
1247         */
1248        dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1249        dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1250        dm_tm_unlock(pmd->tm, copy);
1251
1252        /*
1253         * Write the held root into the superblock.
1254         */
1255        r = superblock_lock(pmd, &sblock);
1256        if (r) {
1257                dm_tm_dec(pmd->tm, held_root);
1258                return r;
1259        }
1260
1261        disk_super = dm_block_data(sblock);
1262        disk_super->held_root = cpu_to_le64(held_root);
1263        dm_bm_unlock(sblock);
1264        return 0;
1265}
1266
1267int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1268{
1269        int r = -EINVAL;
1270
1271        down_write(&pmd->root_lock);
1272        if (!pmd->fail_io)
1273                r = __reserve_metadata_snap(pmd);
1274        up_write(&pmd->root_lock);
1275
1276        return r;
1277}
1278
1279static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1280{
1281        int r;
1282        struct thin_disk_superblock *disk_super;
1283        struct dm_block *sblock, *copy;
1284        dm_block_t held_root;
1285
1286        r = superblock_lock(pmd, &sblock);
1287        if (r)
1288                return r;
1289
1290        disk_super = dm_block_data(sblock);
1291        held_root = le64_to_cpu(disk_super->held_root);
1292        disk_super->held_root = cpu_to_le64(0);
1293
1294        dm_bm_unlock(sblock);
1295
1296        if (!held_root) {
1297                DMWARN("No pool metadata snapshot found: nothing to release.");
1298                return -EINVAL;
1299        }
1300
1301        r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1302        if (r)
1303                return r;
1304
1305        disk_super = dm_block_data(copy);
1306        dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
1307        dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
1308        dm_sm_dec_block(pmd->metadata_sm, held_root);
1309
1310        dm_tm_unlock(pmd->tm, copy);
1311
1312        return 0;
1313}
1314
1315int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1316{
1317        int r = -EINVAL;
1318
1319        down_write(&pmd->root_lock);
1320        if (!pmd->fail_io)
1321                r = __release_metadata_snap(pmd);
1322        up_write(&pmd->root_lock);
1323
1324        return r;
1325}
1326
1327static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1328                               dm_block_t *result)
1329{
1330        int r;
1331        struct thin_disk_superblock *disk_super;
1332        struct dm_block *sblock;
1333
1334        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1335                            &sb_validator, &sblock);
1336        if (r)
1337                return r;
1338
1339        disk_super = dm_block_data(sblock);
1340        *result = le64_to_cpu(disk_super->held_root);
1341
1342        dm_bm_unlock(sblock);
1343
1344        return 0;
1345}
1346
1347int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1348                              dm_block_t *result)
1349{
1350        int r = -EINVAL;
1351
1352        down_read(&pmd->root_lock);
1353        if (!pmd->fail_io)
1354                r = __get_metadata_snap(pmd, result);
1355        up_read(&pmd->root_lock);
1356
1357        return r;
1358}
1359
1360int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1361                             struct dm_thin_device **td)
1362{
1363        int r = -EINVAL;
1364
1365        down_write(&pmd->root_lock);
1366        if (!pmd->fail_io)
1367                r = __open_device(pmd, dev, 0, td);
1368        up_write(&pmd->root_lock);
1369
1370        return r;
1371}
1372
1373int dm_pool_close_thin_device(struct dm_thin_device *td)
1374{
1375        down_write(&td->pmd->root_lock);
1376        __close_device(td);
1377        up_write(&td->pmd->root_lock);
1378
1379        return 0;
1380}
1381
1382dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1383{
1384        return td->id;
1385}
1386
1387/*
1388 * Check whether @time (of block creation) is older than @td's last snapshot.
1389 * If so then the associated block is shared with the last snapshot device.
1390 * Any block on a device created *after* the device last got snapshotted is
1391 * necessarily not shared.
1392 */
1393static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1394{
1395        return td->snapshotted_time > time;
1396}
1397
1398static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
1399                                 struct dm_thin_lookup_result *result)
1400{
1401        uint64_t block_time = 0;
1402        dm_block_t exception_block;
1403        uint32_t exception_time;
1404
1405        block_time = le64_to_cpu(value);
1406        unpack_block_time(block_time, &exception_block, &exception_time);
1407        result->block = exception_block;
1408        result->shared = __snapshotted_since(td, exception_time);
1409}
1410
1411static int __find_block(struct dm_thin_device *td, dm_block_t block,
1412                        int can_issue_io, struct dm_thin_lookup_result *result)
1413{
1414        int r;
1415        __le64 value;
1416        struct dm_pool_metadata *pmd = td->pmd;
1417        dm_block_t keys[2] = { td->id, block };
1418        struct dm_btree_info *info;
1419
1420        if (can_issue_io) {
1421                info = &pmd->info;
1422        } else
1423                info = &pmd->nb_info;
1424
1425        r = dm_btree_lookup(info, pmd->root, keys, &value);
1426        if (!r)
1427                unpack_lookup_result(td, value, result);
1428
1429        return r;
1430}
1431
1432int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1433                       int can_issue_io, struct dm_thin_lookup_result *result)
1434{
1435        int r;
1436        struct dm_pool_metadata *pmd = td->pmd;
1437
1438        down_read(&pmd->root_lock);
1439        if (pmd->fail_io) {
1440                up_read(&pmd->root_lock);
1441                return -EINVAL;
1442        }
1443
1444        r = __find_block(td, block, can_issue_io, result);
1445
1446        up_read(&pmd->root_lock);
1447        return r;
1448}
1449
1450static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
1451                                          dm_block_t *vblock,
1452                                          struct dm_thin_lookup_result *result)
1453{
1454        int r;
1455        __le64 value;
1456        struct dm_pool_metadata *pmd = td->pmd;
1457        dm_block_t keys[2] = { td->id, block };
1458
1459        r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
1460        if (!r)
1461                unpack_lookup_result(td, value, result);
1462
1463        return r;
1464}
1465
1466static int __find_mapped_range(struct dm_thin_device *td,
1467                               dm_block_t begin, dm_block_t end,
1468                               dm_block_t *thin_begin, dm_block_t *thin_end,
1469                               dm_block_t *pool_begin, bool *maybe_shared)
1470{
1471        int r;
1472        dm_block_t pool_end;
1473        struct dm_thin_lookup_result lookup;
1474
1475        if (end < begin)
1476                return -ENODATA;
1477
1478        r = __find_next_mapped_block(td, begin, &begin, &lookup);
1479        if (r)
1480                return r;
1481
1482        if (begin >= end)
1483                return -ENODATA;
1484
1485        *thin_begin = begin;
1486        *pool_begin = lookup.block;
1487        *maybe_shared = lookup.shared;
1488
1489        begin++;
1490        pool_end = *pool_begin + 1;
1491        while (begin != end) {
1492                r = __find_block(td, begin, true, &lookup);
1493                if (r) {
1494                        if (r == -ENODATA)
1495                                break;
1496                        else
1497                                return r;
1498                }
1499
1500                if ((lookup.block != pool_end) ||
1501                    (lookup.shared != *maybe_shared))
1502                        break;
1503
1504                pool_end++;
1505                begin++;
1506        }
1507
1508        *thin_end = begin;
1509        return 0;
1510}
1511
1512int dm_thin_find_mapped_range(struct dm_thin_device *td,
1513                              dm_block_t begin, dm_block_t end,
1514                              dm_block_t *thin_begin, dm_block_t *thin_end,
1515                              dm_block_t *pool_begin, bool *maybe_shared)
1516{
1517        int r = -EINVAL;
1518        struct dm_pool_metadata *pmd = td->pmd;
1519
1520        down_read(&pmd->root_lock);
1521        if (!pmd->fail_io) {
1522                r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
1523                                        pool_begin, maybe_shared);
1524        }
1525        up_read(&pmd->root_lock);
1526
1527        return r;
1528}
1529
1530static int __insert(struct dm_thin_device *td, dm_block_t block,
1531                    dm_block_t data_block)
1532{
1533        int r, inserted;
1534        __le64 value;
1535        struct dm_pool_metadata *pmd = td->pmd;
1536        dm_block_t keys[2] = { td->id, block };
1537
1538        value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1539        __dm_bless_for_disk(&value);
1540
1541        r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1542                                   &pmd->root, &inserted);
1543        if (r)
1544                return r;
1545
1546        td->changed = 1;
1547        if (inserted)
1548                td->mapped_blocks++;
1549
1550        return 0;
1551}
1552
1553int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1554                         dm_block_t data_block)
1555{
1556        int r = -EINVAL;
1557
1558        down_write(&td->pmd->root_lock);
1559        if (!td->pmd->fail_io)
1560                r = __insert(td, block, data_block);
1561        up_write(&td->pmd->root_lock);
1562
1563        return r;
1564}
1565
1566static int __remove(struct dm_thin_device *td, dm_block_t block)
1567{
1568        int r;
1569        struct dm_pool_metadata *pmd = td->pmd;
1570        dm_block_t keys[2] = { td->id, block };
1571
1572        r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1573        if (r)
1574                return r;
1575
1576        td->mapped_blocks--;
1577        td->changed = 1;
1578
1579        return 0;
1580}
1581
1582static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
1583{
1584        int r;
1585        unsigned count, total_count = 0;
1586        struct dm_pool_metadata *pmd = td->pmd;
1587        dm_block_t keys[1] = { td->id };
1588        __le64 value;
1589        dm_block_t mapping_root;
1590
1591        /*
1592         * Find the mapping tree
1593         */
1594        r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
1595        if (r)
1596                return r;
1597
1598        /*
1599         * Remove from the mapping tree, taking care to inc the
1600         * ref count so it doesn't get deleted.
1601         */
1602        mapping_root = le64_to_cpu(value);
1603        dm_tm_inc(pmd->tm, mapping_root);
1604        r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
1605        if (r)
1606                return r;
1607
1608        /*
1609         * Remove leaves stops at the first unmapped entry, so we have to
1610         * loop round finding mapped ranges.
1611         */
1612        while (begin < end) {
1613                r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
1614                if (r == -ENODATA)
1615                        break;
1616
1617                if (r)
1618                        return r;
1619
1620                if (begin >= end)
1621                        break;
1622
1623                r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
1624                if (r)
1625                        return r;
1626
1627                total_count += count;
1628        }
1629
1630        td->mapped_blocks -= total_count;
1631        td->changed = 1;
1632
1633        /*
1634         * Reinsert the mapping tree.
1635         */
1636        value = cpu_to_le64(mapping_root);
1637        __dm_bless_for_disk(&value);
1638        return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
1639}
1640
1641int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1642{
1643        int r = -EINVAL;
1644
1645        down_write(&td->pmd->root_lock);
1646        if (!td->pmd->fail_io)
1647                r = __remove(td, block);
1648        up_write(&td->pmd->root_lock);
1649
1650        return r;
1651}
1652
1653int dm_thin_remove_range(struct dm_thin_device *td,
1654                         dm_block_t begin, dm_block_t end)
1655{
1656        int r = -EINVAL;
1657
1658        down_write(&td->pmd->root_lock);
1659        if (!td->pmd->fail_io)
1660                r = __remove_range(td, begin, end);
1661        up_write(&td->pmd->root_lock);
1662
1663        return r;
1664}
1665
1666int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1667{
1668        int r;
1669        uint32_t ref_count;
1670
1671        down_read(&pmd->root_lock);
1672        r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1673        if (!r)
1674                *result = (ref_count != 0);
1675        up_read(&pmd->root_lock);
1676
1677        return r;
1678}
1679
1680bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1681{
1682        int r;
1683
1684        down_read(&td->pmd->root_lock);
1685        r = td->changed;
1686        up_read(&td->pmd->root_lock);
1687
1688        return r;
1689}
1690
1691bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1692{
1693        bool r = false;
1694        struct dm_thin_device *td, *tmp;
1695
1696        down_read(&pmd->root_lock);
1697        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1698                if (td->changed) {
1699                        r = td->changed;
1700                        break;
1701                }
1702        }
1703        up_read(&pmd->root_lock);
1704
1705        return r;
1706}
1707
1708bool dm_thin_aborted_changes(struct dm_thin_device *td)
1709{
1710        bool r;
1711
1712        down_read(&td->pmd->root_lock);
1713        r = td->aborted_with_changes;
1714        up_read(&td->pmd->root_lock);
1715
1716        return r;
1717}
1718
1719int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1720{
1721        int r = -EINVAL;
1722
1723        down_write(&pmd->root_lock);
1724        if (!pmd->fail_io)
1725                r = dm_sm_new_block(pmd->data_sm, result);
1726        up_write(&pmd->root_lock);
1727
1728        return r;
1729}
1730
1731int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1732{
1733        int r = -EINVAL;
1734
1735        down_write(&pmd->root_lock);
1736        if (pmd->fail_io)
1737                goto out;
1738
1739        r = __commit_transaction(pmd);
1740        if (r <= 0)
1741                goto out;
1742
1743        /*
1744         * Open the next transaction.
1745         */
1746        r = __begin_transaction(pmd);
1747out:
1748        up_write(&pmd->root_lock);
1749        return r;
1750}
1751
1752static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1753{
1754        struct dm_thin_device *td;
1755
1756        list_for_each_entry(td, &pmd->thin_devices, list)
1757                td->aborted_with_changes = td->changed;
1758}
1759
1760int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1761{
1762        int r = -EINVAL;
1763
1764        down_write(&pmd->root_lock);
1765        if (pmd->fail_io)
1766                goto out;
1767
1768        __set_abort_with_changes_flags(pmd);
1769        __destroy_persistent_data_objects(pmd);
1770        r = __create_persistent_data_objects(pmd, false);
1771        if (r)
1772                pmd->fail_io = true;
1773
1774out:
1775        up_write(&pmd->root_lock);
1776
1777        return r;
1778}
1779
1780int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1781{
1782        int r = -EINVAL;
1783
1784        down_read(&pmd->root_lock);
1785        if (!pmd->fail_io)
1786                r = dm_sm_get_nr_free(pmd->data_sm, result);
1787        up_read(&pmd->root_lock);
1788
1789        return r;
1790}
1791
1792int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1793                                          dm_block_t *result)
1794{
1795        int r = -EINVAL;
1796
1797        down_read(&pmd->root_lock);
1798        if (!pmd->fail_io)
1799                r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1800        up_read(&pmd->root_lock);
1801
1802        return r;
1803}
1804
1805int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1806                                  dm_block_t *result)
1807{
1808        int r = -EINVAL;
1809
1810        down_read(&pmd->root_lock);
1811        if (!pmd->fail_io)
1812                r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1813        up_read(&pmd->root_lock);
1814
1815        return r;
1816}
1817
1818int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1819{
1820        int r = -EINVAL;
1821
1822        down_read(&pmd->root_lock);
1823        if (!pmd->fail_io)
1824                r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1825        up_read(&pmd->root_lock);
1826
1827        return r;
1828}
1829
1830int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1831{
1832        int r = -EINVAL;
1833        struct dm_pool_metadata *pmd = td->pmd;
1834
1835        down_read(&pmd->root_lock);
1836        if (!pmd->fail_io) {
1837                *result = td->mapped_blocks;
1838                r = 0;
1839        }
1840        up_read(&pmd->root_lock);
1841
1842        return r;
1843}
1844
1845static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1846{
1847        int r;
1848        __le64 value_le;
1849        dm_block_t thin_root;
1850        struct dm_pool_metadata *pmd = td->pmd;
1851
1852        r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1853        if (r)
1854                return r;
1855
1856        thin_root = le64_to_cpu(value_le);
1857
1858        return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1859}
1860
1861int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1862                                     dm_block_t *result)
1863{
1864        int r = -EINVAL;
1865        struct dm_pool_metadata *pmd = td->pmd;
1866
1867        down_read(&pmd->root_lock);
1868        if (!pmd->fail_io)
1869                r = __highest_block(td, result);
1870        up_read(&pmd->root_lock);
1871
1872        return r;
1873}
1874
1875static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
1876{
1877        int r;
1878        dm_block_t old_count;
1879
1880        r = dm_sm_get_nr_blocks(sm, &old_count);
1881        if (r)
1882                return r;
1883
1884        if (new_count == old_count)
1885                return 0;
1886
1887        if (new_count < old_count) {
1888                DMERR("cannot reduce size of space map");
1889                return -EINVAL;
1890        }
1891
1892        return dm_sm_extend(sm, new_count - old_count);
1893}
1894
1895int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1896{
1897        int r = -EINVAL;
1898
1899        down_write(&pmd->root_lock);
1900        if (!pmd->fail_io)
1901                r = __resize_space_map(pmd->data_sm, new_count);
1902        up_write(&pmd->root_lock);
1903
1904        return r;
1905}
1906
1907int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1908{
1909        int r = -EINVAL;
1910
1911        down_write(&pmd->root_lock);
1912        if (!pmd->fail_io)
1913                r = __resize_space_map(pmd->metadata_sm, new_count);
1914        up_write(&pmd->root_lock);
1915
1916        return r;
1917}
1918
1919void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1920{
1921        down_write(&pmd->root_lock);
1922        dm_bm_set_read_only(pmd->bm);
1923        up_write(&pmd->root_lock);
1924}
1925
1926void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
1927{
1928        down_write(&pmd->root_lock);
1929        dm_bm_set_read_write(pmd->bm);
1930        up_write(&pmd->root_lock);
1931}
1932
1933int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1934                                        dm_block_t threshold,
1935                                        dm_sm_threshold_fn fn,
1936                                        void *context)
1937{
1938        int r;
1939
1940        down_write(&pmd->root_lock);
1941        r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
1942        up_write(&pmd->root_lock);
1943
1944        return r;
1945}
1946
1947int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1948{
1949        int r;
1950        struct dm_block *sblock;
1951        struct thin_disk_superblock *disk_super;
1952
1953        down_write(&pmd->root_lock);
1954        pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1955
1956        r = superblock_lock(pmd, &sblock);
1957        if (r) {
1958                DMERR("couldn't read superblock");
1959                goto out;
1960        }
1961
1962        disk_super = dm_block_data(sblock);
1963        disk_super->flags = cpu_to_le32(pmd->flags);
1964
1965        dm_bm_unlock(sblock);
1966out:
1967        up_write(&pmd->root_lock);
1968        return r;
1969}
1970
1971bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1972{
1973        bool needs_check;
1974
1975        down_read(&pmd->root_lock);
1976        needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1977        up_read(&pmd->root_lock);
1978
1979        return needs_check;
1980}
1981
1982void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
1983{
1984        down_read(&pmd->root_lock);
1985        if (!pmd->fail_io)
1986                dm_tm_issue_prefetches(pmd->tm);
1987        up_read(&pmd->root_lock);
1988}
1989