linux/drivers/md/dm-thin-metadata.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) 2011-2012 Red Hat, Inc.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-thin-metadata.h"
   8#include "persistent-data/dm-btree.h"
   9#include "persistent-data/dm-space-map.h"
  10#include "persistent-data/dm-space-map-disk.h"
  11#include "persistent-data/dm-transaction-manager.h"
  12
  13#include <linux/list.h>
  14#include <linux/device-mapper.h>
  15#include <linux/workqueue.h>
  16
  17/*--------------------------------------------------------------------------
  18 * As far as the metadata goes, there is:
  19 *
  20 * - A superblock in block zero, taking up fewer than 512 bytes for
  21 *   atomic writes.
  22 *
  23 * - A space map managing the metadata blocks.
  24 *
  25 * - A space map managing the data blocks.
  26 *
  27 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
  28 *
  29 * - A hierarchical btree, with 2 levels which effectively maps (thin
  30 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
  31 * field holding the time in the low 24 bits, and block in the top 40
  32 *   bits.
  33 *
  34 * BTrees consist solely of btree_nodes, that fill a block.  Some are
  35 * internal nodes, as such their values are a __le64 pointing to other
  36 * nodes.  Leaf nodes can store data of any reasonable size (ie. much
  37 * smaller than the block size).  The nodes consist of the header,
  38 * followed by an array of keys, followed by an array of values.  We have
  39 * to binary search on the keys so they're all held together to help the
  40 * cpu cache.
  41 *
  42 * Space maps have 2 btrees:
  43 *
  44 * - One maps a uint64_t onto a struct index_entry.  Which points to a
  45 *   bitmap block, and has some details about how many free entries there
  46 *   are etc.
  47 *
  48 * - The bitmap blocks have a header (for the checksum).  Then the rest
  49 *   of the block is pairs of bits.  With the meaning being:
  50 *
  51 *   0 - ref count is 0
  52 *   1 - ref count is 1
  53 *   2 - ref count is 2
  54 *   3 - ref count is higher than 2
  55 *
  56 * - If the count is higher than 2 then the ref count is entered in a
  57 *   second btree that directly maps the block_address to a uint32_t ref
  58 *   count.
  59 *
  60 * The space map metadata variant doesn't have a bitmaps btree.  Instead
  61 * it has one single blocks worth of index_entries.  This avoids
  62 * recursive issues with the bitmap btree needing to allocate space in
  63 * order to insert.  With a small data block size such as 64k the
  64 * metadata supports data devices that are hundreds of terabytes.
  65 *
  66 * The space maps allocate space linearly from front to back.  Space that
  67 * is freed in a transaction is never recycled within that transaction.
  68 * To try and avoid fragmenting _free_ space the allocator always goes
  69 * back and fills in gaps.
  70 *
  71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
  72 * from the block manager.
  73 *--------------------------------------------------------------------------*/
  74
  75#define DM_MSG_PREFIX   "thin metadata"
  76
  77#define THIN_SUPERBLOCK_MAGIC 27022010
  78#define THIN_SUPERBLOCK_LOCATION 0
  79#define THIN_VERSION 2
  80#define THIN_METADATA_CACHE_SIZE 64
  81#define SECTOR_TO_BLOCK_SHIFT 3
  82
  83/*
  84 *  3 for btree insert +
  85 *  2 for btree lookup used within space map
  86 */
  87#define THIN_MAX_CONCURRENT_LOCKS 5
  88
  89/* This should be plenty */
  90#define SPACE_MAP_ROOT_SIZE 128
  91
  92/*
  93 * Little endian on-disk superblock and device details.
  94 */
  95struct thin_disk_superblock {
  96        __le32 csum;    /* Checksum of superblock except for this field. */
  97        __le32 flags;
  98        __le64 blocknr; /* This block number, dm_block_t. */
  99
 100        __u8 uuid[16];
 101        __le64 magic;
 102        __le32 version;
 103        __le32 time;
 104
 105        __le64 trans_id;
 106
 107        /*
 108         * Root held by userspace transactions.
 109         */
 110        __le64 held_root;
 111
 112        __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
 113        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 114
 115        /*
 116         * 2-level btree mapping (dev_id, (dev block, time)) -> data block
 117         */
 118        __le64 data_mapping_root;
 119
 120        /*
 121         * Device detail root mapping dev_id -> device_details
 122         */
 123        __le64 device_details_root;
 124
 125        __le32 data_block_size;         /* In 512-byte sectors. */
 126
 127        __le32 metadata_block_size;     /* In 512-byte sectors. */
 128        __le64 metadata_nr_blocks;
 129
 130        __le32 compat_flags;
 131        __le32 compat_ro_flags;
 132        __le32 incompat_flags;
 133} __packed;
 134
 135struct disk_device_details {
 136        __le64 mapped_blocks;
 137        __le64 transaction_id;          /* When created. */
 138        __le32 creation_time;
 139        __le32 snapshotted_time;
 140} __packed;
 141
 142struct dm_pool_metadata {
 143        struct hlist_node hash;
 144
 145        struct block_device *bdev;
 146        struct dm_block_manager *bm;
 147        struct dm_space_map *metadata_sm;
 148        struct dm_space_map *data_sm;
 149        struct dm_transaction_manager *tm;
 150        struct dm_transaction_manager *nb_tm;
 151
 152        /*
 153         * Two-level btree.
 154         * First level holds thin_dev_t.
 155         * Second level holds mappings.
 156         */
 157        struct dm_btree_info info;
 158
 159        /*
 160         * Non-blocking version of the above.
 161         */
 162        struct dm_btree_info nb_info;
 163
 164        /*
 165         * Just the top level for deleting whole devices.
 166         */
 167        struct dm_btree_info tl_info;
 168
 169        /*
 170         * Just the bottom level for creating new devices.
 171         */
 172        struct dm_btree_info bl_info;
 173
 174        /*
 175         * Describes the device details btree.
 176         */
 177        struct dm_btree_info details_info;
 178
 179        struct rw_semaphore root_lock;
 180        uint32_t time;
 181        dm_block_t root;
 182        dm_block_t details_root;
 183        struct list_head thin_devices;
 184        uint64_t trans_id;
 185        unsigned long flags;
 186        sector_t data_block_size;
 187        bool read_only:1;
 188
 189        /*
 190         * Set if a transaction has to be aborted but the attempt to roll back
 191         * to the previous (good) transaction failed.  The only pool metadata
 192         * operation possible in this state is the closing of the device.
 193         */
 194        bool fail_io:1;
 195
 196        /*
 197         * Reading the space map roots can fail, so we read it into these
 198         * buffers before the superblock is locked and updated.
 199         */
 200        __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
 201        __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
 202};
 203
 204struct dm_thin_device {
 205        struct list_head list;
 206        struct dm_pool_metadata *pmd;
 207        dm_thin_id id;
 208
 209        int open_count;
 210        bool changed:1;
 211        bool aborted_with_changes:1;
 212        uint64_t mapped_blocks;
 213        uint64_t transaction_id;
 214        uint32_t creation_time;
 215        uint32_t snapshotted_time;
 216};
 217
 218/*----------------------------------------------------------------
 219 * superblock validator
 220 *--------------------------------------------------------------*/
 221
 222#define SUPERBLOCK_CSUM_XOR 160774
 223
 224static void sb_prepare_for_write(struct dm_block_validator *v,
 225                                 struct dm_block *b,
 226                                 size_t block_size)
 227{
 228        struct thin_disk_superblock *disk_super = dm_block_data(b);
 229
 230        disk_super->blocknr = cpu_to_le64(dm_block_location(b));
 231        disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
 232                                                      block_size - sizeof(__le32),
 233                                                      SUPERBLOCK_CSUM_XOR));
 234}
 235
 236static int sb_check(struct dm_block_validator *v,
 237                    struct dm_block *b,
 238                    size_t block_size)
 239{
 240        struct thin_disk_superblock *disk_super = dm_block_data(b);
 241        __le32 csum_le;
 242
 243        if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
 244                DMERR("sb_check failed: blocknr %llu: "
 245                      "wanted %llu", le64_to_cpu(disk_super->blocknr),
 246                      (unsigned long long)dm_block_location(b));
 247                return -ENOTBLK;
 248        }
 249
 250        if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
 251                DMERR("sb_check failed: magic %llu: "
 252                      "wanted %llu", le64_to_cpu(disk_super->magic),
 253                      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
 254                return -EILSEQ;
 255        }
 256
 257        csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
 258                                             block_size - sizeof(__le32),
 259                                             SUPERBLOCK_CSUM_XOR));
 260        if (csum_le != disk_super->csum) {
 261                DMERR("sb_check failed: csum %u: wanted %u",
 262                      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
 263                return -EILSEQ;
 264        }
 265
 266        return 0;
 267}
 268
 269static struct dm_block_validator sb_validator = {
 270        .name = "superblock",
 271        .prepare_for_write = sb_prepare_for_write,
 272        .check = sb_check
 273};
 274
 275/*----------------------------------------------------------------
 276 * Methods for the btree value types
 277 *--------------------------------------------------------------*/
 278
 279static uint64_t pack_block_time(dm_block_t b, uint32_t t)
 280{
 281        return (b << 24) | t;
 282}
 283
 284static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
 285{
 286        *b = v >> 24;
 287        *t = v & ((1 << 24) - 1);
 288}
 289
 290static void data_block_inc(void *context, const void *value_le)
 291{
 292        struct dm_space_map *sm = context;
 293        __le64 v_le;
 294        uint64_t b;
 295        uint32_t t;
 296
 297        memcpy(&v_le, value_le, sizeof(v_le));
 298        unpack_block_time(le64_to_cpu(v_le), &b, &t);
 299        dm_sm_inc_block(sm, b);
 300}
 301
 302static void data_block_dec(void *context, const void *value_le)
 303{
 304        struct dm_space_map *sm = context;
 305        __le64 v_le;
 306        uint64_t b;
 307        uint32_t t;
 308
 309        memcpy(&v_le, value_le, sizeof(v_le));
 310        unpack_block_time(le64_to_cpu(v_le), &b, &t);
 311        dm_sm_dec_block(sm, b);
 312}
 313
 314static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
 315{
 316        __le64 v1_le, v2_le;
 317        uint64_t b1, b2;
 318        uint32_t t;
 319
 320        memcpy(&v1_le, value1_le, sizeof(v1_le));
 321        memcpy(&v2_le, value2_le, sizeof(v2_le));
 322        unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
 323        unpack_block_time(le64_to_cpu(v2_le), &b2, &t);
 324
 325        return b1 == b2;
 326}
 327
 328static void subtree_inc(void *context, const void *value)
 329{
 330        struct dm_btree_info *info = context;
 331        __le64 root_le;
 332        uint64_t root;
 333
 334        memcpy(&root_le, value, sizeof(root_le));
 335        root = le64_to_cpu(root_le);
 336        dm_tm_inc(info->tm, root);
 337}
 338
 339static void subtree_dec(void *context, const void *value)
 340{
 341        struct dm_btree_info *info = context;
 342        __le64 root_le;
 343        uint64_t root;
 344
 345        memcpy(&root_le, value, sizeof(root_le));
 346        root = le64_to_cpu(root_le);
 347        if (dm_btree_del(info, root))
 348                DMERR("btree delete failed\n");
 349}
 350
 351static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
 352{
 353        __le64 v1_le, v2_le;
 354        memcpy(&v1_le, value1_le, sizeof(v1_le));
 355        memcpy(&v2_le, value2_le, sizeof(v2_le));
 356
 357        return v1_le == v2_le;
 358}
 359
 360/*----------------------------------------------------------------*/
 361
 362static int superblock_lock_zero(struct dm_pool_metadata *pmd,
 363                                struct dm_block **sblock)
 364{
 365        return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 366                                     &sb_validator, sblock);
 367}
 368
 369static int superblock_lock(struct dm_pool_metadata *pmd,
 370                           struct dm_block **sblock)
 371{
 372        return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 373                                &sb_validator, sblock);
 374}
 375
 376static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
 377{
 378        int r;
 379        unsigned i;
 380        struct dm_block *b;
 381        __le64 *data_le, zero = cpu_to_le64(0);
 382        unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);
 383
 384        /*
 385         * We can't use a validator here - it may be all zeroes.
 386         */
 387        r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
 388        if (r)
 389                return r;
 390
 391        data_le = dm_block_data(b);
 392        *result = 1;
 393        for (i = 0; i < block_size; i++) {
 394                if (data_le[i] != zero) {
 395                        *result = 0;
 396                        break;
 397                }
 398        }
 399
 400        return dm_bm_unlock(b);
 401}
 402
 403static void __setup_btree_details(struct dm_pool_metadata *pmd)
 404{
 405        pmd->info.tm = pmd->tm;
 406        pmd->info.levels = 2;
 407        pmd->info.value_type.context = pmd->data_sm;
 408        pmd->info.value_type.size = sizeof(__le64);
 409        pmd->info.value_type.inc = data_block_inc;
 410        pmd->info.value_type.dec = data_block_dec;
 411        pmd->info.value_type.equal = data_block_equal;
 412
 413        memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
 414        pmd->nb_info.tm = pmd->nb_tm;
 415
 416        pmd->tl_info.tm = pmd->tm;
 417        pmd->tl_info.levels = 1;
 418        pmd->tl_info.value_type.context = &pmd->bl_info;
 419        pmd->tl_info.value_type.size = sizeof(__le64);
 420        pmd->tl_info.value_type.inc = subtree_inc;
 421        pmd->tl_info.value_type.dec = subtree_dec;
 422        pmd->tl_info.value_type.equal = subtree_equal;
 423
 424        pmd->bl_info.tm = pmd->tm;
 425        pmd->bl_info.levels = 1;
 426        pmd->bl_info.value_type.context = pmd->data_sm;
 427        pmd->bl_info.value_type.size = sizeof(__le64);
 428        pmd->bl_info.value_type.inc = data_block_inc;
 429        pmd->bl_info.value_type.dec = data_block_dec;
 430        pmd->bl_info.value_type.equal = data_block_equal;
 431
 432        pmd->details_info.tm = pmd->tm;
 433        pmd->details_info.levels = 1;
 434        pmd->details_info.value_type.context = NULL;
 435        pmd->details_info.value_type.size = sizeof(struct disk_device_details);
 436        pmd->details_info.value_type.inc = NULL;
 437        pmd->details_info.value_type.dec = NULL;
 438        pmd->details_info.value_type.equal = NULL;
 439}
 440
 441static int save_sm_roots(struct dm_pool_metadata *pmd)
 442{
 443        int r;
 444        size_t len;
 445
 446        r = dm_sm_root_size(pmd->metadata_sm, &len);
 447        if (r < 0)
 448                return r;
 449
 450        r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
 451        if (r < 0)
 452                return r;
 453
 454        r = dm_sm_root_size(pmd->data_sm, &len);
 455        if (r < 0)
 456                return r;
 457
 458        return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
 459}
 460
 461static void copy_sm_roots(struct dm_pool_metadata *pmd,
 462                          struct thin_disk_superblock *disk)
 463{
 464        memcpy(&disk->metadata_space_map_root,
 465               &pmd->metadata_space_map_root,
 466               sizeof(pmd->metadata_space_map_root));
 467
 468        memcpy(&disk->data_space_map_root,
 469               &pmd->data_space_map_root,
 470               sizeof(pmd->data_space_map_root));
 471}
 472
 473static int __write_initial_superblock(struct dm_pool_metadata *pmd)
 474{
 475        int r;
 476        struct dm_block *sblock;
 477        struct thin_disk_superblock *disk_super;
 478        sector_t bdev_size = i_size_read(pmd->bdev->bd_inode) >> SECTOR_SHIFT;
 479
 480        if (bdev_size > THIN_METADATA_MAX_SECTORS)
 481                bdev_size = THIN_METADATA_MAX_SECTORS;
 482
 483        r = dm_sm_commit(pmd->data_sm);
 484        if (r < 0)
 485                return r;
 486
 487        r = save_sm_roots(pmd);
 488        if (r < 0)
 489                return r;
 490
 491        r = dm_tm_pre_commit(pmd->tm);
 492        if (r < 0)
 493                return r;
 494
 495        r = superblock_lock_zero(pmd, &sblock);
 496        if (r)
 497                return r;
 498
 499        disk_super = dm_block_data(sblock);
 500        disk_super->flags = 0;
 501        memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
 502        disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
 503        disk_super->version = cpu_to_le32(THIN_VERSION);
 504        disk_super->time = 0;
 505        disk_super->trans_id = 0;
 506        disk_super->held_root = 0;
 507
 508        copy_sm_roots(pmd, disk_super);
 509
 510        disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 511        disk_super->device_details_root = cpu_to_le64(pmd->details_root);
 512        disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
 513        disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
 514        disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);
 515
 516        return dm_tm_commit(pmd->tm, sblock);
 517}
 518
 519static int __format_metadata(struct dm_pool_metadata *pmd)
 520{
 521        int r;
 522
 523        r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 524                                 &pmd->tm, &pmd->metadata_sm);
 525        if (r < 0) {
 526                DMERR("tm_create_with_sm failed");
 527                return r;
 528        }
 529
 530        pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
 531        if (IS_ERR(pmd->data_sm)) {
 532                DMERR("sm_disk_create failed");
 533                r = PTR_ERR(pmd->data_sm);
 534                goto bad_cleanup_tm;
 535        }
 536
 537        pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
 538        if (!pmd->nb_tm) {
 539                DMERR("could not create non-blocking clone tm");
 540                r = -ENOMEM;
 541                goto bad_cleanup_data_sm;
 542        }
 543
 544        __setup_btree_details(pmd);
 545
 546        r = dm_btree_empty(&pmd->info, &pmd->root);
 547        if (r < 0)
 548                goto bad_cleanup_nb_tm;
 549
 550        r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
 551        if (r < 0) {
 552                DMERR("couldn't create devices root");
 553                goto bad_cleanup_nb_tm;
 554        }
 555
 556        r = __write_initial_superblock(pmd);
 557        if (r)
 558                goto bad_cleanup_nb_tm;
 559
 560        return 0;
 561
 562bad_cleanup_nb_tm:
 563        dm_tm_destroy(pmd->nb_tm);
 564bad_cleanup_data_sm:
 565        dm_sm_destroy(pmd->data_sm);
 566bad_cleanup_tm:
 567        dm_tm_destroy(pmd->tm);
 568        dm_sm_destroy(pmd->metadata_sm);
 569
 570        return r;
 571}
 572
 573static int __check_incompat_features(struct thin_disk_superblock *disk_super,
 574                                     struct dm_pool_metadata *pmd)
 575{
 576        uint32_t features;
 577
 578        features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
 579        if (features) {
 580                DMERR("could not access metadata due to unsupported optional features (%lx).",
 581                      (unsigned long)features);
 582                return -EINVAL;
 583        }
 584
 585        /*
 586         * Check for read-only metadata to skip the following RDWR checks.
 587         */
 588        if (get_disk_ro(pmd->bdev->bd_disk))
 589                return 0;
 590
 591        features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
 592        if (features) {
 593                DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
 594                      (unsigned long)features);
 595                return -EINVAL;
 596        }
 597
 598        return 0;
 599}
 600
 601static int __open_metadata(struct dm_pool_metadata *pmd)
 602{
 603        int r;
 604        struct dm_block *sblock;
 605        struct thin_disk_superblock *disk_super;
 606
 607        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 608                            &sb_validator, &sblock);
 609        if (r < 0) {
 610                DMERR("couldn't read superblock");
 611                return r;
 612        }
 613
 614        disk_super = dm_block_data(sblock);
 615
 616        /* Verify the data block size hasn't changed */
 617        if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
 618                DMERR("changing the data block size (from %u to %llu) is not supported",
 619                      le32_to_cpu(disk_super->data_block_size),
 620                      (unsigned long long)pmd->data_block_size);
 621                r = -EINVAL;
 622                goto bad_unlock_sblock;
 623        }
 624
 625        r = __check_incompat_features(disk_super, pmd);
 626        if (r < 0)
 627                goto bad_unlock_sblock;
 628
 629        r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 630                               disk_super->metadata_space_map_root,
 631                               sizeof(disk_super->metadata_space_map_root),
 632                               &pmd->tm, &pmd->metadata_sm);
 633        if (r < 0) {
 634                DMERR("tm_open_with_sm failed");
 635                goto bad_unlock_sblock;
 636        }
 637
 638        pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
 639                                       sizeof(disk_super->data_space_map_root));
 640        if (IS_ERR(pmd->data_sm)) {
 641                DMERR("sm_disk_open failed");
 642                r = PTR_ERR(pmd->data_sm);
 643                goto bad_cleanup_tm;
 644        }
 645
 646        pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
 647        if (!pmd->nb_tm) {
 648                DMERR("could not create non-blocking clone tm");
 649                r = -ENOMEM;
 650                goto bad_cleanup_data_sm;
 651        }
 652
 653        __setup_btree_details(pmd);
 654        return dm_bm_unlock(sblock);
 655
 656bad_cleanup_data_sm:
 657        dm_sm_destroy(pmd->data_sm);
 658bad_cleanup_tm:
 659        dm_tm_destroy(pmd->tm);
 660        dm_sm_destroy(pmd->metadata_sm);
 661bad_unlock_sblock:
 662        dm_bm_unlock(sblock);
 663
 664        return r;
 665}
 666
 667static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
 668{
 669        int r, unformatted;
 670
 671        r = __superblock_all_zeroes(pmd->bm, &unformatted);
 672        if (r)
 673                return r;
 674
 675        if (unformatted)
 676                return format_device ? __format_metadata(pmd) : -EPERM;
 677
 678        return __open_metadata(pmd);
 679}
 680
 681static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
 682{
 683        int r;
 684
 685        pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
 686                                          THIN_METADATA_CACHE_SIZE,
 687                                          THIN_MAX_CONCURRENT_LOCKS);
 688        if (IS_ERR(pmd->bm)) {
 689                DMERR("could not create block manager");
 690                return PTR_ERR(pmd->bm);
 691        }
 692
 693        r = __open_or_format_metadata(pmd, format_device);
 694        if (r)
 695                dm_block_manager_destroy(pmd->bm);
 696
 697        return r;
 698}
 699
 700static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd)
 701{
 702        dm_sm_destroy(pmd->data_sm);
 703        dm_sm_destroy(pmd->metadata_sm);
 704        dm_tm_destroy(pmd->nb_tm);
 705        dm_tm_destroy(pmd->tm);
 706        dm_block_manager_destroy(pmd->bm);
 707}
 708
 709static int __begin_transaction(struct dm_pool_metadata *pmd)
 710{
 711        int r;
 712        struct thin_disk_superblock *disk_super;
 713        struct dm_block *sblock;
 714
 715        /*
 716         * We re-read the superblock every time.  Shouldn't need to do this
 717         * really.
 718         */
 719        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
 720                            &sb_validator, &sblock);
 721        if (r)
 722                return r;
 723
 724        disk_super = dm_block_data(sblock);
 725        pmd->time = le32_to_cpu(disk_super->time);
 726        pmd->root = le64_to_cpu(disk_super->data_mapping_root);
 727        pmd->details_root = le64_to_cpu(disk_super->device_details_root);
 728        pmd->trans_id = le64_to_cpu(disk_super->trans_id);
 729        pmd->flags = le32_to_cpu(disk_super->flags);
 730        pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);
 731
 732        dm_bm_unlock(sblock);
 733        return 0;
 734}
 735
 736static int __write_changed_details(struct dm_pool_metadata *pmd)
 737{
 738        int r;
 739        struct dm_thin_device *td, *tmp;
 740        struct disk_device_details details;
 741        uint64_t key;
 742
 743        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
 744                if (!td->changed)
 745                        continue;
 746
 747                key = td->id;
 748
 749                details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
 750                details.transaction_id = cpu_to_le64(td->transaction_id);
 751                details.creation_time = cpu_to_le32(td->creation_time);
 752                details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
 753                __dm_bless_for_disk(&details);
 754
 755                r = dm_btree_insert(&pmd->details_info, pmd->details_root,
 756                                    &key, &details, &pmd->details_root);
 757                if (r)
 758                        return r;
 759
 760                if (td->open_count)
 761                        td->changed = 0;
 762                else {
 763                        list_del(&td->list);
 764                        kfree(td);
 765                }
 766        }
 767
 768        return 0;
 769}
 770
 771static int __commit_transaction(struct dm_pool_metadata *pmd)
 772{
 773        int r;
 774        size_t metadata_len, data_len;
 775        struct thin_disk_superblock *disk_super;
 776        struct dm_block *sblock;
 777
 778        /*
 779         * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
 780         */
 781        BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
 782
 783        r = __write_changed_details(pmd);
 784        if (r < 0)
 785                return r;
 786
 787        r = dm_sm_commit(pmd->data_sm);
 788        if (r < 0)
 789                return r;
 790
 791        r = dm_tm_pre_commit(pmd->tm);
 792        if (r < 0)
 793                return r;
 794
 795        r = dm_sm_root_size(pmd->metadata_sm, &metadata_len);
 796        if (r < 0)
 797                return r;
 798
 799        r = dm_sm_root_size(pmd->data_sm, &data_len);
 800        if (r < 0)
 801                return r;
 802
 803        r = save_sm_roots(pmd);
 804        if (r < 0)
 805                return r;
 806
 807        r = superblock_lock(pmd, &sblock);
 808        if (r)
 809                return r;
 810
 811        disk_super = dm_block_data(sblock);
 812        disk_super->time = cpu_to_le32(pmd->time);
 813        disk_super->data_mapping_root = cpu_to_le64(pmd->root);
 814        disk_super->device_details_root = cpu_to_le64(pmd->details_root);
 815        disk_super->trans_id = cpu_to_le64(pmd->trans_id);
 816        disk_super->flags = cpu_to_le32(pmd->flags);
 817
 818        copy_sm_roots(pmd, disk_super);
 819
 820        return dm_tm_commit(pmd->tm, sblock);
 821}
 822
 823struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
 824                                               sector_t data_block_size,
 825                                               bool format_device)
 826{
 827        int r;
 828        struct dm_pool_metadata *pmd;
 829
 830        pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
 831        if (!pmd) {
 832                DMERR("could not allocate metadata struct");
 833                return ERR_PTR(-ENOMEM);
 834        }
 835
 836        init_rwsem(&pmd->root_lock);
 837        pmd->time = 0;
 838        INIT_LIST_HEAD(&pmd->thin_devices);
 839        pmd->read_only = false;
 840        pmd->fail_io = false;
 841        pmd->bdev = bdev;
 842        pmd->data_block_size = data_block_size;
 843
 844        r = __create_persistent_data_objects(pmd, format_device);
 845        if (r) {
 846                kfree(pmd);
 847                return ERR_PTR(r);
 848        }
 849
 850        r = __begin_transaction(pmd);
 851        if (r < 0) {
 852                if (dm_pool_metadata_close(pmd) < 0)
 853                        DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
 854                return ERR_PTR(r);
 855        }
 856
 857        return pmd;
 858}
 859
 860int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
 861{
 862        int r;
 863        unsigned open_devices = 0;
 864        struct dm_thin_device *td, *tmp;
 865
 866        down_read(&pmd->root_lock);
 867        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
 868                if (td->open_count)
 869                        open_devices++;
 870                else {
 871                        list_del(&td->list);
 872                        kfree(td);
 873                }
 874        }
 875        up_read(&pmd->root_lock);
 876
 877        if (open_devices) {
 878                DMERR("attempt to close pmd when %u device(s) are still open",
 879                       open_devices);
 880                return -EBUSY;
 881        }
 882
 883        if (!pmd->read_only && !pmd->fail_io) {
 884                r = __commit_transaction(pmd);
 885                if (r < 0)
 886                        DMWARN("%s: __commit_transaction() failed, error = %d",
 887                               __func__, r);
 888        }
 889
 890        if (!pmd->fail_io)
 891                __destroy_persistent_data_objects(pmd);
 892
 893        kfree(pmd);
 894        return 0;
 895}
 896
 897/*
 898 * __open_device: Returns @td corresponding to device with id @dev,
 899 * creating it if @create is set and incrementing @td->open_count.
 900 * On failure, @td is undefined.
 901 */
 902static int __open_device(struct dm_pool_metadata *pmd,
 903                         dm_thin_id dev, int create,
 904                         struct dm_thin_device **td)
 905{
 906        int r, changed = 0;
 907        struct dm_thin_device *td2;
 908        uint64_t key = dev;
 909        struct disk_device_details details_le;
 910
 911        /*
 912         * If the device is already open, return it.
 913         */
 914        list_for_each_entry(td2, &pmd->thin_devices, list)
 915                if (td2->id == dev) {
 916                        /*
 917                         * May not create an already-open device.
 918                         */
 919                        if (create)
 920                                return -EEXIST;
 921
 922                        td2->open_count++;
 923                        *td = td2;
 924                        return 0;
 925                }
 926
 927        /*
 928         * Check the device exists.
 929         */
 930        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
 931                            &key, &details_le);
 932        if (r) {
 933                if (r != -ENODATA || !create)
 934                        return r;
 935
 936                /*
 937                 * Create new device.
 938                 */
 939                changed = 1;
 940                details_le.mapped_blocks = 0;
 941                details_le.transaction_id = cpu_to_le64(pmd->trans_id);
 942                details_le.creation_time = cpu_to_le32(pmd->time);
 943                details_le.snapshotted_time = cpu_to_le32(pmd->time);
 944        }
 945
 946        *td = kmalloc(sizeof(**td), GFP_NOIO);
 947        if (!*td)
 948                return -ENOMEM;
 949
 950        (*td)->pmd = pmd;
 951        (*td)->id = dev;
 952        (*td)->open_count = 1;
 953        (*td)->changed = changed;
 954        (*td)->aborted_with_changes = false;
 955        (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
 956        (*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
 957        (*td)->creation_time = le32_to_cpu(details_le.creation_time);
 958        (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);
 959
 960        list_add(&(*td)->list, &pmd->thin_devices);
 961
 962        return 0;
 963}
 964
 965static void __close_device(struct dm_thin_device *td)
 966{
 967        --td->open_count;
 968}
 969
 970static int __create_thin(struct dm_pool_metadata *pmd,
 971                         dm_thin_id dev)
 972{
 973        int r;
 974        dm_block_t dev_root;
 975        uint64_t key = dev;
 976        struct disk_device_details details_le;
 977        struct dm_thin_device *td;
 978        __le64 value;
 979
 980        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
 981                            &key, &details_le);
 982        if (!r)
 983                return -EEXIST;
 984
 985        /*
 986         * Create an empty btree for the mappings.
 987         */
 988        r = dm_btree_empty(&pmd->bl_info, &dev_root);
 989        if (r)
 990                return r;
 991
 992        /*
 993         * Insert it into the main mapping tree.
 994         */
 995        value = cpu_to_le64(dev_root);
 996        __dm_bless_for_disk(&value);
 997        r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
 998        if (r) {
 999                dm_btree_del(&pmd->bl_info, dev_root);
1000                return r;
1001        }
1002
1003        r = __open_device(pmd, dev, 1, &td);
1004        if (r) {
1005                dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1006                dm_btree_del(&pmd->bl_info, dev_root);
1007                return r;
1008        }
1009        __close_device(td);
1010
1011        return r;
1012}
1013
1014int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
1015{
1016        int r = -EINVAL;
1017
1018        down_write(&pmd->root_lock);
1019        if (!pmd->fail_io)
1020                r = __create_thin(pmd, dev);
1021        up_write(&pmd->root_lock);
1022
1023        return r;
1024}
1025
1026static int __set_snapshot_details(struct dm_pool_metadata *pmd,
1027                                  struct dm_thin_device *snap,
1028                                  dm_thin_id origin, uint32_t time)
1029{
1030        int r;
1031        struct dm_thin_device *td;
1032
1033        r = __open_device(pmd, origin, 0, &td);
1034        if (r)
1035                return r;
1036
1037        td->changed = 1;
1038        td->snapshotted_time = time;
1039
1040        snap->mapped_blocks = td->mapped_blocks;
1041        snap->snapshotted_time = time;
1042        __close_device(td);
1043
1044        return 0;
1045}
1046
1047static int __create_snap(struct dm_pool_metadata *pmd,
1048                         dm_thin_id dev, dm_thin_id origin)
1049{
1050        int r;
1051        dm_block_t origin_root;
1052        uint64_t key = origin, dev_key = dev;
1053        struct dm_thin_device *td;
1054        struct disk_device_details details_le;
1055        __le64 value;
1056
1057        /* check this device is unused */
1058        r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
1059                            &dev_key, &details_le);
1060        if (!r)
1061                return -EEXIST;
1062
1063        /* find the mapping tree for the origin */
1064        r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
1065        if (r)
1066                return r;
1067        origin_root = le64_to_cpu(value);
1068
1069        /* clone the origin, an inc will do */
1070        dm_tm_inc(pmd->tm, origin_root);
1071
1072        /* insert into the main mapping tree */
1073        value = cpu_to_le64(origin_root);
1074        __dm_bless_for_disk(&value);
1075        key = dev;
1076        r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
1077        if (r) {
1078                dm_tm_dec(pmd->tm, origin_root);
1079                return r;
1080        }
1081
1082        pmd->time++;
1083
1084        r = __open_device(pmd, dev, 1, &td);
1085        if (r)
1086                goto bad;
1087
1088        r = __set_snapshot_details(pmd, td, origin, pmd->time);
1089        __close_device(td);
1090
1091        if (r)
1092                goto bad;
1093
1094        return 0;
1095
1096bad:
1097        dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1098        dm_btree_remove(&pmd->details_info, pmd->details_root,
1099                        &key, &pmd->details_root);
1100        return r;
1101}
1102
1103int dm_pool_create_snap(struct dm_pool_metadata *pmd,
1104                                 dm_thin_id dev,
1105                                 dm_thin_id origin)
1106{
1107        int r = -EINVAL;
1108
1109        down_write(&pmd->root_lock);
1110        if (!pmd->fail_io)
1111                r = __create_snap(pmd, dev, origin);
1112        up_write(&pmd->root_lock);
1113
1114        return r;
1115}
1116
1117static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
1118{
1119        int r;
1120        uint64_t key = dev;
1121        struct dm_thin_device *td;
1122
1123        /* TODO: failure should mark the transaction invalid */
1124        r = __open_device(pmd, dev, 0, &td);
1125        if (r)
1126                return r;
1127
1128        if (td->open_count > 1) {
1129                __close_device(td);
1130                return -EBUSY;
1131        }
1132
1133        list_del(&td->list);
1134        kfree(td);
1135        r = dm_btree_remove(&pmd->details_info, pmd->details_root,
1136                            &key, &pmd->details_root);
1137        if (r)
1138                return r;
1139
1140        r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
1141        if (r)
1142                return r;
1143
1144        return 0;
1145}
1146
1147int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
1148                               dm_thin_id dev)
1149{
1150        int r = -EINVAL;
1151
1152        down_write(&pmd->root_lock);
1153        if (!pmd->fail_io)
1154                r = __delete_device(pmd, dev);
1155        up_write(&pmd->root_lock);
1156
1157        return r;
1158}
1159
1160int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
1161                                        uint64_t current_id,
1162                                        uint64_t new_id)
1163{
1164        int r = -EINVAL;
1165
1166        down_write(&pmd->root_lock);
1167
1168        if (pmd->fail_io)
1169                goto out;
1170
1171        if (pmd->trans_id != current_id) {
1172                DMERR("mismatched transaction id");
1173                goto out;
1174        }
1175
1176        pmd->trans_id = new_id;
1177        r = 0;
1178
1179out:
1180        up_write(&pmd->root_lock);
1181
1182        return r;
1183}
1184
1185int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
1186                                        uint64_t *result)
1187{
1188        int r = -EINVAL;
1189
1190        down_read(&pmd->root_lock);
1191        if (!pmd->fail_io) {
1192                *result = pmd->trans_id;
1193                r = 0;
1194        }
1195        up_read(&pmd->root_lock);
1196
1197        return r;
1198}
1199
1200static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
1201{
1202        int r, inc;
1203        struct thin_disk_superblock *disk_super;
1204        struct dm_block *copy, *sblock;
1205        dm_block_t held_root;
1206
1207        /*
1208         * Copy the superblock.
1209         */
1210        dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
1211        r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
1212                               &sb_validator, &copy, &inc);
1213        if (r)
1214                return r;
1215
1216        BUG_ON(!inc);
1217
1218        held_root = dm_block_location(copy);
1219        disk_super = dm_block_data(copy);
1220
1221        if (le64_to_cpu(disk_super->held_root)) {
1222                DMWARN("Pool metadata snapshot already exists: release this before taking another.");
1223
1224                dm_tm_dec(pmd->tm, held_root);
1225                dm_tm_unlock(pmd->tm, copy);
1226                return -EBUSY;
1227        }
1228
1229        /*
1230         * Wipe the spacemap since we're not publishing this.
1231         */
1232        memset(&disk_super->data_space_map_root, 0,
1233               sizeof(disk_super->data_space_map_root));
1234        memset(&disk_super->metadata_space_map_root, 0,
1235               sizeof(disk_super->metadata_space_map_root));
1236
1237        /*
1238         * Increment the data structures that need to be preserved.
1239         */
1240        dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
1241        dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
1242        dm_tm_unlock(pmd->tm, copy);
1243
1244        /*
1245         * Write the held root into the superblock.
1246         */
1247        r = superblock_lock(pmd, &sblock);
1248        if (r) {
1249                dm_tm_dec(pmd->tm, held_root);
1250                return r;
1251        }
1252
1253        disk_super = dm_block_data(sblock);
1254        disk_super->held_root = cpu_to_le64(held_root);
1255        dm_bm_unlock(sblock);
1256        return 0;
1257}
1258
1259int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
1260{
1261        int r = -EINVAL;
1262
1263        down_write(&pmd->root_lock);
1264        if (!pmd->fail_io)
1265                r = __reserve_metadata_snap(pmd);
1266        up_write(&pmd->root_lock);
1267
1268        return r;
1269}
1270
1271static int __release_metadata_snap(struct dm_pool_metadata *pmd)
1272{
1273        int r;
1274        struct thin_disk_superblock *disk_super;
1275        struct dm_block *sblock, *copy;
1276        dm_block_t held_root;
1277
1278        r = superblock_lock(pmd, &sblock);
1279        if (r)
1280                return r;
1281
1282        disk_super = dm_block_data(sblock);
1283        held_root = le64_to_cpu(disk_super->held_root);
1284        disk_super->held_root = cpu_to_le64(0);
1285
1286        dm_bm_unlock(sblock);
1287
1288        if (!held_root) {
1289                DMWARN("No pool metadata snapshot found: nothing to release.");
1290                return -EINVAL;
1291        }
1292
1293        r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
1294        if (r)
1295                return r;
1296
1297        disk_super = dm_block_data(copy);
1298        dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root));
1299        dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root));
1300        dm_sm_dec_block(pmd->metadata_sm, held_root);
1301
1302        return dm_tm_unlock(pmd->tm, copy);
1303}
1304
1305int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
1306{
1307        int r = -EINVAL;
1308
1309        down_write(&pmd->root_lock);
1310        if (!pmd->fail_io)
1311                r = __release_metadata_snap(pmd);
1312        up_write(&pmd->root_lock);
1313
1314        return r;
1315}
1316
1317static int __get_metadata_snap(struct dm_pool_metadata *pmd,
1318                               dm_block_t *result)
1319{
1320        int r;
1321        struct thin_disk_superblock *disk_super;
1322        struct dm_block *sblock;
1323
1324        r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
1325                            &sb_validator, &sblock);
1326        if (r)
1327                return r;
1328
1329        disk_super = dm_block_data(sblock);
1330        *result = le64_to_cpu(disk_super->held_root);
1331
1332        return dm_bm_unlock(sblock);
1333}
1334
1335int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
1336                              dm_block_t *result)
1337{
1338        int r = -EINVAL;
1339
1340        down_read(&pmd->root_lock);
1341        if (!pmd->fail_io)
1342                r = __get_metadata_snap(pmd, result);
1343        up_read(&pmd->root_lock);
1344
1345        return r;
1346}
1347
1348int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
1349                             struct dm_thin_device **td)
1350{
1351        int r = -EINVAL;
1352
1353        down_write(&pmd->root_lock);
1354        if (!pmd->fail_io)
1355                r = __open_device(pmd, dev, 0, td);
1356        up_write(&pmd->root_lock);
1357
1358        return r;
1359}
1360
1361int dm_pool_close_thin_device(struct dm_thin_device *td)
1362{
1363        down_write(&td->pmd->root_lock);
1364        __close_device(td);
1365        up_write(&td->pmd->root_lock);
1366
1367        return 0;
1368}
1369
1370dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
1371{
1372        return td->id;
1373}
1374
1375/*
1376 * Check whether @time (of block creation) is older than @td's last snapshot.
1377 * If so then the associated block is shared with the last snapshot device.
1378 * Any block on a device created *after* the device last got snapshotted is
1379 * necessarily not shared.
1380 */
1381static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
1382{
1383        return td->snapshotted_time > time;
1384}
1385
1386int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
1387                       int can_issue_io, struct dm_thin_lookup_result *result)
1388{
1389        int r;
1390        __le64 value;
1391        struct dm_pool_metadata *pmd = td->pmd;
1392        dm_block_t keys[2] = { td->id, block };
1393        struct dm_btree_info *info;
1394
1395        if (pmd->fail_io)
1396                return -EINVAL;
1397
1398        down_read(&pmd->root_lock);
1399
1400        if (can_issue_io) {
1401                info = &pmd->info;
1402        } else
1403                info = &pmd->nb_info;
1404
1405        r = dm_btree_lookup(info, pmd->root, keys, &value);
1406        if (!r) {
1407                uint64_t block_time = 0;
1408                dm_block_t exception_block;
1409                uint32_t exception_time;
1410
1411                block_time = le64_to_cpu(value);
1412                unpack_block_time(block_time, &exception_block,
1413                                  &exception_time);
1414                result->block = exception_block;
1415                result->shared = __snapshotted_since(td, exception_time);
1416        }
1417
1418        up_read(&pmd->root_lock);
1419        return r;
1420}
1421
1422static int __insert(struct dm_thin_device *td, dm_block_t block,
1423                    dm_block_t data_block)
1424{
1425        int r, inserted;
1426        __le64 value;
1427        struct dm_pool_metadata *pmd = td->pmd;
1428        dm_block_t keys[2] = { td->id, block };
1429
1430        value = cpu_to_le64(pack_block_time(data_block, pmd->time));
1431        __dm_bless_for_disk(&value);
1432
1433        r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
1434                                   &pmd->root, &inserted);
1435        if (r)
1436                return r;
1437
1438        td->changed = 1;
1439        if (inserted)
1440                td->mapped_blocks++;
1441
1442        return 0;
1443}
1444
1445int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
1446                         dm_block_t data_block)
1447{
1448        int r = -EINVAL;
1449
1450        down_write(&td->pmd->root_lock);
1451        if (!td->pmd->fail_io)
1452                r = __insert(td, block, data_block);
1453        up_write(&td->pmd->root_lock);
1454
1455        return r;
1456}
1457
1458static int __remove(struct dm_thin_device *td, dm_block_t block)
1459{
1460        int r;
1461        struct dm_pool_metadata *pmd = td->pmd;
1462        dm_block_t keys[2] = { td->id, block };
1463
1464        r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root);
1465        if (r)
1466                return r;
1467
1468        td->mapped_blocks--;
1469        td->changed = 1;
1470
1471        return 0;
1472}
1473
1474int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block)
1475{
1476        int r = -EINVAL;
1477
1478        down_write(&td->pmd->root_lock);
1479        if (!td->pmd->fail_io)
1480                r = __remove(td, block);
1481        up_write(&td->pmd->root_lock);
1482
1483        return r;
1484}
1485
1486int dm_pool_block_is_used(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
1487{
1488        int r;
1489        uint32_t ref_count;
1490
1491        down_read(&pmd->root_lock);
1492        r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
1493        if (!r)
1494                *result = (ref_count != 0);
1495        up_read(&pmd->root_lock);
1496
1497        return r;
1498}
1499
1500bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
1501{
1502        int r;
1503
1504        down_read(&td->pmd->root_lock);
1505        r = td->changed;
1506        up_read(&td->pmd->root_lock);
1507
1508        return r;
1509}
1510
1511bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
1512{
1513        bool r = false;
1514        struct dm_thin_device *td, *tmp;
1515
1516        down_read(&pmd->root_lock);
1517        list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
1518                if (td->changed) {
1519                        r = td->changed;
1520                        break;
1521                }
1522        }
1523        up_read(&pmd->root_lock);
1524
1525        return r;
1526}
1527
1528bool dm_thin_aborted_changes(struct dm_thin_device *td)
1529{
1530        bool r;
1531
1532        down_read(&td->pmd->root_lock);
1533        r = td->aborted_with_changes;
1534        up_read(&td->pmd->root_lock);
1535
1536        return r;
1537}
1538
1539int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
1540{
1541        int r = -EINVAL;
1542
1543        down_write(&pmd->root_lock);
1544        if (!pmd->fail_io)
1545                r = dm_sm_new_block(pmd->data_sm, result);
1546        up_write(&pmd->root_lock);
1547
1548        return r;
1549}
1550
1551int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
1552{
1553        int r = -EINVAL;
1554
1555        down_write(&pmd->root_lock);
1556        if (pmd->fail_io)
1557                goto out;
1558
1559        r = __commit_transaction(pmd);
1560        if (r <= 0)
1561                goto out;
1562
1563        /*
1564         * Open the next transaction.
1565         */
1566        r = __begin_transaction(pmd);
1567out:
1568        up_write(&pmd->root_lock);
1569        return r;
1570}
1571
1572static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
1573{
1574        struct dm_thin_device *td;
1575
1576        list_for_each_entry(td, &pmd->thin_devices, list)
1577                td->aborted_with_changes = td->changed;
1578}
1579
1580int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
1581{
1582        int r = -EINVAL;
1583
1584        down_write(&pmd->root_lock);
1585        if (pmd->fail_io)
1586                goto out;
1587
1588        __set_abort_with_changes_flags(pmd);
1589        __destroy_persistent_data_objects(pmd);
1590        r = __create_persistent_data_objects(pmd, false);
1591        if (r)
1592                pmd->fail_io = true;
1593
1594out:
1595        up_write(&pmd->root_lock);
1596
1597        return r;
1598}
1599
1600int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
1601{
1602        int r = -EINVAL;
1603
1604        down_read(&pmd->root_lock);
1605        if (!pmd->fail_io)
1606                r = dm_sm_get_nr_free(pmd->data_sm, result);
1607        up_read(&pmd->root_lock);
1608
1609        return r;
1610}
1611
1612int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
1613                                          dm_block_t *result)
1614{
1615        int r = -EINVAL;
1616
1617        down_read(&pmd->root_lock);
1618        if (!pmd->fail_io)
1619                r = dm_sm_get_nr_free(pmd->metadata_sm, result);
1620        up_read(&pmd->root_lock);
1621
1622        return r;
1623}
1624
1625int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
1626                                  dm_block_t *result)
1627{
1628        int r = -EINVAL;
1629
1630        down_read(&pmd->root_lock);
1631        if (!pmd->fail_io)
1632                r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
1633        up_read(&pmd->root_lock);
1634
1635        return r;
1636}
1637
1638int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result)
1639{
1640        down_read(&pmd->root_lock);
1641        *result = pmd->data_block_size;
1642        up_read(&pmd->root_lock);
1643
1644        return 0;
1645}
1646
1647int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
1648{
1649        int r = -EINVAL;
1650
1651        down_read(&pmd->root_lock);
1652        if (!pmd->fail_io)
1653                r = dm_sm_get_nr_blocks(pmd->data_sm, result);
1654        up_read(&pmd->root_lock);
1655
1656        return r;
1657}
1658
1659int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
1660{
1661        int r = -EINVAL;
1662        struct dm_pool_metadata *pmd = td->pmd;
1663
1664        down_read(&pmd->root_lock);
1665        if (!pmd->fail_io) {
1666                *result = td->mapped_blocks;
1667                r = 0;
1668        }
1669        up_read(&pmd->root_lock);
1670
1671        return r;
1672}
1673
1674static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
1675{
1676        int r;
1677        __le64 value_le;
1678        dm_block_t thin_root;
1679        struct dm_pool_metadata *pmd = td->pmd;
1680
1681        r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
1682        if (r)
1683                return r;
1684
1685        thin_root = le64_to_cpu(value_le);
1686
1687        return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
1688}
1689
1690int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
1691                                     dm_block_t *result)
1692{
1693        int r = -EINVAL;
1694        struct dm_pool_metadata *pmd = td->pmd;
1695
1696        down_read(&pmd->root_lock);
1697        if (!pmd->fail_io)
1698                r = __highest_block(td, result);
1699        up_read(&pmd->root_lock);
1700
1701        return r;
1702}
1703
1704static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
1705{
1706        int r;
1707        dm_block_t old_count;
1708
1709        r = dm_sm_get_nr_blocks(sm, &old_count);
1710        if (r)
1711                return r;
1712
1713        if (new_count == old_count)
1714                return 0;
1715
1716        if (new_count < old_count) {
1717                DMERR("cannot reduce size of space map");
1718                return -EINVAL;
1719        }
1720
1721        return dm_sm_extend(sm, new_count - old_count);
1722}
1723
1724int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1725{
1726        int r = -EINVAL;
1727
1728        down_write(&pmd->root_lock);
1729        if (!pmd->fail_io)
1730                r = __resize_space_map(pmd->data_sm, new_count);
1731        up_write(&pmd->root_lock);
1732
1733        return r;
1734}
1735
1736int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
1737{
1738        int r = -EINVAL;
1739
1740        down_write(&pmd->root_lock);
1741        if (!pmd->fail_io)
1742                r = __resize_space_map(pmd->metadata_sm, new_count);
1743        up_write(&pmd->root_lock);
1744
1745        return r;
1746}
1747
1748void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
1749{
1750        down_write(&pmd->root_lock);
1751        pmd->read_only = true;
1752        dm_bm_set_read_only(pmd->bm);
1753        up_write(&pmd->root_lock);
1754}
1755
1756void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
1757{
1758        down_write(&pmd->root_lock);
1759        pmd->read_only = false;
1760        dm_bm_set_read_write(pmd->bm);
1761        up_write(&pmd->root_lock);
1762}
1763
1764int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
1765                                        dm_block_t threshold,
1766                                        dm_sm_threshold_fn fn,
1767                                        void *context)
1768{
1769        int r;
1770
1771        down_write(&pmd->root_lock);
1772        r = dm_sm_register_threshold_callback(pmd->metadata_sm, threshold, fn, context);
1773        up_write(&pmd->root_lock);
1774
1775        return r;
1776}
1777
1778int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
1779{
1780        int r;
1781        struct dm_block *sblock;
1782        struct thin_disk_superblock *disk_super;
1783
1784        down_write(&pmd->root_lock);
1785        pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;
1786
1787        r = superblock_lock(pmd, &sblock);
1788        if (r) {
1789                DMERR("couldn't read superblock");
1790                goto out;
1791        }
1792
1793        disk_super = dm_block_data(sblock);
1794        disk_super->flags = cpu_to_le32(pmd->flags);
1795
1796        dm_bm_unlock(sblock);
1797out:
1798        up_write(&pmd->root_lock);
1799        return r;
1800}
1801
1802bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
1803{
1804        bool needs_check;
1805
1806        down_read(&pmd->root_lock);
1807        needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
1808        up_read(&pmd->root_lock);
1809
1810        return needs_check;
1811}
1812
1813void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
1814{
1815        dm_tm_issue_prefetches(pmd->tm);
1816}
1817