linux/drivers/md/dm-zoned-metadata.c
   1/*
   2 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
   3 *
   4 * This file is released under the GPL.
   5 */
   6
   7#include "dm-zoned.h"
   8
   9#include <linux/module.h>
  10#include <linux/crc32.h>
  11
  12#define DM_MSG_PREFIX           "zoned metadata"
  13
  14/*
  15 * Metadata version.
  16 */
  17#define DMZ_META_VER    1
  18
  19/*
  20 * On-disk super block magic.
  21 */
  22#define DMZ_MAGIC       ((((unsigned int)('D')) << 24) | \
  23                         (((unsigned int)('Z')) << 16) | \
  24                         (((unsigned int)('B')) <<  8) | \
  25                         ((unsigned int)('D')))
  26
  27/*
  28 * On disk super block.
   29 * The super block itself uses only 512 B but is written to disk as a full
   30 * 4KB block. This block is followed on disk by the chunk-to-zone mapping
   31 * table and by the bitmap blocks indicating zone block validity.
  32 * The overall resulting metadata format is:
  33 *    (1) Super block (1 block)
  34 *    (2) Chunk mapping table (nr_map_blocks)
  35 *    (3) Bitmap blocks (nr_bitmap_blocks)
   36 * All metadata blocks are stored in conventional zones, starting from
   37 * the first conventional zone found on disk.
  38 */
  39struct dmz_super {
  40        /* Magic number */
  41        __le32          magic;                  /*   4 */
  42
  43        /* Metadata version number */
  44        __le32          version;                /*   8 */
  45
  46        /* Generation number */
  47        __le64          gen;                    /*  16 */
  48
  49        /* This block number */
  50        __le64          sb_block;               /*  24 */
  51
  52        /* The number of metadata blocks, including this super block */
  53        __le32          nr_meta_blocks;         /*  28 */
  54
  55        /* The number of sequential zones reserved for reclaim */
  56        __le32          nr_reserved_seq;        /*  32 */
  57
  58        /* The number of entries in the mapping table */
  59        __le32          nr_chunks;              /*  36 */
  60
  61        /* The number of blocks used for the chunk mapping table */
  62        __le32          nr_map_blocks;          /*  40 */
  63
  64        /* The number of blocks used for the block bitmaps */
  65        __le32          nr_bitmap_blocks;       /*  44 */
  66
  67        /* Checksum */
  68        __le32          crc;                    /*  48 */
  69
  70        /* Padding to full 512B sector */
  71        u8              reserved[464];          /* 512 */
  72};
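
/*
 * Rough sizing sketch (not from the original source; assumes a drive with
 * 256 MiB zones and the 4 KiB metadata block size, i.e. 65536 blocks per
 * zone): each zone needs 65536 / 8 = 8192 B of validity bitmap, that is,
 * 2 bitmap blocks per zone, and each 4 KiB mapping block holds 512 chunk
 * entries (see DMZ_MAP_ENTRIES below). A target with 10000 chunks would
 * therefore use 1 super block plus ceil(10000 / 512) = 20 mapping blocks,
 * followed by the bitmap blocks, all packed from the start of the first
 * conventional zone.
 */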
  73
  74/*
  75 * Chunk mapping entry: entries are indexed by chunk number
  76 * and give the zone ID (dzone_id) mapping the chunk on disk.
  77 * This zone may be sequential or random. If it is a sequential
  78 * zone, a second zone (bzone_id) used as a write buffer may
  79 * also be specified. This second zone will always be a randomly
  80 * writeable zone.
  81 */
  82struct dmz_map {
  83        __le32                  dzone_id;
  84        __le32                  bzone_id;
  85};
  86
  87/*
   88 * Chunk mapping table metadata: 512 8-byte entries per 4KB block.
  89 */
  90#define DMZ_MAP_ENTRIES         (DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
  91#define DMZ_MAP_ENTRIES_SHIFT   (ilog2(DMZ_MAP_ENTRIES))
  92#define DMZ_MAP_ENTRIES_MASK    (DMZ_MAP_ENTRIES - 1)
  93#define DMZ_MAP_UNMAPPED        UINT_MAX
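
/*
 * For example (hypothetical chunk number): the mapping entry of chunk 1000
 * lives in mapping block 1000 >> DMZ_MAP_ENTRIES_SHIFT = 1000 >> 9 = 1
 * (the second mapping block), at entry 1000 & DMZ_MAP_ENTRIES_MASK =
 * 1000 & 511 = 488. dmz_set_chunk_mapping() below uses exactly this
 * decomposition.
 */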
  94
  95/*
   96 * Metadata block descriptor (for cached metadata blocks).
  97 */
  98struct dmz_mblock {
  99        struct rb_node          node;
 100        struct list_head        link;
 101        sector_t                no;
 102        unsigned int            ref;
 103        unsigned long           state;
 104        struct page             *page;
 105        void                    *data;
 106};
 107
 108/*
 109 * Metadata block state flags.
 110 */
 111enum {
 112        DMZ_META_DIRTY,
 113        DMZ_META_READING,
 114        DMZ_META_WRITING,
 115        DMZ_META_ERROR,
 116};
 117
 118/*
 119 * Super block information (one per metadata set).
 120 */
 121struct dmz_sb {
 122        sector_t                block;
 123        struct dmz_mblock       *mblk;
 124        struct dmz_super        *sb;
 125};
 126
 127/*
 128 * In-memory metadata.
 129 */
 130struct dmz_metadata {
 131        struct dmz_dev          *dev;
 132
 133        sector_t                zone_bitmap_size;
 134        unsigned int            zone_nr_bitmap_blocks;
 135
 136        unsigned int            nr_bitmap_blocks;
 137        unsigned int            nr_map_blocks;
 138
 139        unsigned int            nr_useable_zones;
 140        unsigned int            nr_meta_blocks;
 141        unsigned int            nr_meta_zones;
 142        unsigned int            nr_data_zones;
 143        unsigned int            nr_rnd_zones;
 144        unsigned int            nr_reserved_seq;
 145        unsigned int            nr_chunks;
 146
 147        /* Zone information array */
 148        struct dm_zone          *zones;
 149
 150        struct dm_zone          *sb_zone;
 151        struct dmz_sb           sb[2];
 152        unsigned int            mblk_primary;
 153        u64                     sb_gen;
 154        unsigned int            min_nr_mblks;
 155        unsigned int            max_nr_mblks;
 156        atomic_t                nr_mblks;
 157        struct rw_semaphore     mblk_sem;
 158        struct mutex            mblk_flush_lock;
 159        spinlock_t              mblk_lock;
 160        struct rb_root          mblk_rbtree;
 161        struct list_head        mblk_lru_list;
 162        struct list_head        mblk_dirty_list;
 163        struct shrinker         mblk_shrinker;
 164
 165        /* Zone allocation management */
 166        struct mutex            map_lock;
 167        struct dmz_mblock       **map_mblk;
 168        unsigned int            nr_rnd;
 169        atomic_t                unmap_nr_rnd;
 170        struct list_head        unmap_rnd_list;
 171        struct list_head        map_rnd_list;
 172
 173        unsigned int            nr_seq;
 174        atomic_t                unmap_nr_seq;
 175        struct list_head        unmap_seq_list;
 176        struct list_head        map_seq_list;
 177
 178        atomic_t                nr_reserved_seq_zones;
 179        struct list_head        reserved_seq_zones_list;
 180
 181        wait_queue_head_t       free_wq;
 182};
 183
 184/*
 185 * Various accessors
 186 */
 187unsigned int dmz_id(struct dmz_metadata *zmd, struct dm_zone *zone)
 188{
 189        return ((unsigned int)(zone - zmd->zones));
 190}
 191
 192sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
 193{
 194        return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_sectors_shift;
 195}
 196
 197sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
 198{
 199        return (sector_t)dmz_id(zmd, zone) << zmd->dev->zone_nr_blocks_shift;
 200}
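
/*
 * Example (hypothetical geometry, assuming 256 MiB zones, i.e. 524288
 * sectors or 65536 blocks per zone): zone 3 starts at sector 3 << 19 =
 * 1572864 and at block 3 << 16 = 196608. The zone ID itself is simply
 * the zone's index in the zmd->zones array.
 */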
 201
 202unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
 203{
 204        return zmd->nr_chunks;
 205}
 206
 207unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd)
 208{
 209        return zmd->nr_rnd;
 210}
 211
 212unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd)
 213{
 214        return atomic_read(&zmd->unmap_nr_rnd);
 215}
 216
 217/*
 218 * Lock/unlock mapping table.
 219 * The map lock also protects all the zone lists.
 220 */
 221void dmz_lock_map(struct dmz_metadata *zmd)
 222{
 223        mutex_lock(&zmd->map_lock);
 224}
 225
 226void dmz_unlock_map(struct dmz_metadata *zmd)
 227{
 228        mutex_unlock(&zmd->map_lock);
 229}
 230
 231/*
 232 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 233 * that prevents metadata flush from running while metadata are being
 234 * modified. The actual metadata write mutual exclusion is achieved with
  235 * the map lock and zone state management (active and reclaim state are
 236 * mutually exclusive).
 237 */
 238void dmz_lock_metadata(struct dmz_metadata *zmd)
 239{
 240        down_read(&zmd->mblk_sem);
 241}
 242
 243void dmz_unlock_metadata(struct dmz_metadata *zmd)
 244{
 245        up_read(&zmd->mblk_sem);
 246}
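
/*
 * Usage sketch (the callers live outside this file, in the target BIO
 * processing and reclaim code): metadata updates are bracketed as
 *
 *	dmz_lock_metadata(zmd);
 *	... modify mappings and block bitmaps, e.g. dmz_set_chunk_mapping(),
 *	    dmz_invalidate_blocks() ...
 *	dmz_unlock_metadata(zmd);
 *
 * so that dmz_flush_metadata(), which takes the write side of mblk_sem,
 * only ever writes out a stable set of dirty metadata blocks.
 */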
 247
 248/*
 249 * Lock/unlock flush: prevent concurrent executions
 250 * of dmz_flush_metadata as well as metadata modification in reclaim
 251 * while flush is being executed.
 252 */
 253void dmz_lock_flush(struct dmz_metadata *zmd)
 254{
 255        mutex_lock(&zmd->mblk_flush_lock);
 256}
 257
 258void dmz_unlock_flush(struct dmz_metadata *zmd)
 259{
 260        mutex_unlock(&zmd->mblk_flush_lock);
 261}
 262
 263/*
 264 * Allocate a metadata block.
 265 */
 266static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
 267                                           sector_t mblk_no)
 268{
 269        struct dmz_mblock *mblk = NULL;
 270
 271        /* See if we can reuse cached blocks */
 272        if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
 273                spin_lock(&zmd->mblk_lock);
 274                mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
 275                                                struct dmz_mblock, link);
 276                if (mblk) {
 277                        list_del_init(&mblk->link);
 278                        rb_erase(&mblk->node, &zmd->mblk_rbtree);
 279                        mblk->no = mblk_no;
 280                }
 281                spin_unlock(&zmd->mblk_lock);
 282                if (mblk)
 283                        return mblk;
 284        }
 285
 286        /* Allocate a new block */
 287        mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
 288        if (!mblk)
 289                return NULL;
 290
 291        mblk->page = alloc_page(GFP_NOIO);
 292        if (!mblk->page) {
 293                kfree(mblk);
 294                return NULL;
 295        }
 296
 297        RB_CLEAR_NODE(&mblk->node);
 298        INIT_LIST_HEAD(&mblk->link);
 299        mblk->ref = 0;
 300        mblk->state = 0;
 301        mblk->no = mblk_no;
 302        mblk->data = page_address(mblk->page);
 303
 304        atomic_inc(&zmd->nr_mblks);
 305
 306        return mblk;
 307}
 308
 309/*
 310 * Free a metadata block.
 311 */
 312static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
 313{
 314        __free_pages(mblk->page, 0);
 315        kfree(mblk);
 316
 317        atomic_dec(&zmd->nr_mblks);
 318}
 319
 320/*
 321 * Insert a metadata block in the rbtree.
 322 */
 323static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
 324{
 325        struct rb_root *root = &zmd->mblk_rbtree;
 326        struct rb_node **new = &(root->rb_node), *parent = NULL;
 327        struct dmz_mblock *b;
 328
 329        /* Figure out where to put the new node */
 330        while (*new) {
 331                b = container_of(*new, struct dmz_mblock, node);
 332                parent = *new;
 333                new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
 334        }
 335
 336        /* Add new node and rebalance tree */
 337        rb_link_node(&mblk->node, parent, new);
 338        rb_insert_color(&mblk->node, root);
 339}
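
/*
 * Note: the comparison above sends larger block numbers to the left
 * subtree, the mirror image of the usual rbtree insertion example. This
 * is harmless because dmz_get_mblock_fast() below walks the tree with
 * the same inverted comparison, so inserts and lookups agree.
 */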
 340
 341/*
 342 * Lookup a metadata block in the rbtree. If the block is found, increment
 343 * its reference count.
 344 */
 345static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
 346                                              sector_t mblk_no)
 347{
 348        struct rb_root *root = &zmd->mblk_rbtree;
 349        struct rb_node *node = root->rb_node;
 350        struct dmz_mblock *mblk;
 351
 352        while (node) {
 353                mblk = container_of(node, struct dmz_mblock, node);
 354                if (mblk->no == mblk_no) {
 355                        /*
 356                         * If this is the first reference to the block,
 357                         * remove it from the LRU list.
 358                         */
 359                        mblk->ref++;
 360                        if (mblk->ref == 1 &&
 361                            !test_bit(DMZ_META_DIRTY, &mblk->state))
 362                                list_del_init(&mblk->link);
 363                        return mblk;
 364                }
 365                node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
 366        }
 367
 368        return NULL;
 369}
 370
 371/*
 372 * Metadata block BIO end callback.
 373 */
 374static void dmz_mblock_bio_end_io(struct bio *bio)
 375{
 376        struct dmz_mblock *mblk = bio->bi_private;
 377        int flag;
 378
 379        if (bio->bi_status)
 380                set_bit(DMZ_META_ERROR, &mblk->state);
 381
 382        if (bio_op(bio) == REQ_OP_WRITE)
 383                flag = DMZ_META_WRITING;
 384        else
 385                flag = DMZ_META_READING;
 386
 387        clear_bit_unlock(flag, &mblk->state);
 388        smp_mb__after_atomic();
 389        wake_up_bit(&mblk->state, flag);
 390
 391        bio_put(bio);
 392}
 393
 394/*
 395 * Read an uncached metadata block from disk and add it to the cache.
 396 */
 397static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
 398                                              sector_t mblk_no)
 399{
 400        struct dmz_mblock *mblk, *m;
 401        sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
 402        struct bio *bio;
 403
 404        /* Get a new block and a BIO to read it */
 405        mblk = dmz_alloc_mblock(zmd, mblk_no);
 406        if (!mblk)
 407                return NULL;
 408
 409        bio = bio_alloc(GFP_NOIO, 1);
 410        if (!bio) {
 411                dmz_free_mblock(zmd, mblk);
 412                return NULL;
 413        }
 414
 415        spin_lock(&zmd->mblk_lock);
 416
 417        /*
 418         * Make sure that another context did not start reading
 419         * the block already.
 420         */
 421        m = dmz_get_mblock_fast(zmd, mblk_no);
 422        if (m) {
 423                spin_unlock(&zmd->mblk_lock);
 424                dmz_free_mblock(zmd, mblk);
 425                bio_put(bio);
 426                return m;
 427        }
 428
 429        mblk->ref++;
 430        set_bit(DMZ_META_READING, &mblk->state);
 431        dmz_insert_mblock(zmd, mblk);
 432
 433        spin_unlock(&zmd->mblk_lock);
 434
 435        /* Submit read BIO */
 436        bio->bi_iter.bi_sector = dmz_blk2sect(block);
 437        bio_set_dev(bio, zmd->dev->bdev);
 438        bio->bi_private = mblk;
 439        bio->bi_end_io = dmz_mblock_bio_end_io;
 440        bio_set_op_attrs(bio, REQ_OP_READ, REQ_META | REQ_PRIO);
 441        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
 442        submit_bio(bio);
 443
 444        return mblk;
 445}
 446
 447/*
 448 * Free metadata blocks.
 449 */
 450static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
 451                                             unsigned long limit)
 452{
 453        struct dmz_mblock *mblk;
 454        unsigned long count = 0;
 455
 456        if (!zmd->max_nr_mblks)
 457                return 0;
 458
 459        while (!list_empty(&zmd->mblk_lru_list) &&
 460               atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
 461               count < limit) {
 462                mblk = list_first_entry(&zmd->mblk_lru_list,
 463                                        struct dmz_mblock, link);
 464                list_del_init(&mblk->link);
 465                rb_erase(&mblk->node, &zmd->mblk_rbtree);
 466                dmz_free_mblock(zmd, mblk);
 467                count++;
 468        }
 469
 470        return count;
 471}
 472
 473/*
 474 * For mblock shrinker: get the number of unused metadata blocks in the cache.
 475 */
 476static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
 477                                               struct shrink_control *sc)
 478{
 479        struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
 480
 481        return atomic_read(&zmd->nr_mblks);
 482}
 483
 484/*
 485 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
 486 */
 487static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
 488                                              struct shrink_control *sc)
 489{
 490        struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
 491        unsigned long count;
 492
 493        spin_lock(&zmd->mblk_lock);
 494        count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
 495        spin_unlock(&zmd->mblk_lock);
 496
 497        return count ? count : SHRINK_STOP;
 498}
 499
 500/*
 501 * Release a metadata block.
 502 */
 503static void dmz_release_mblock(struct dmz_metadata *zmd,
 504                               struct dmz_mblock *mblk)
 505{
 506
 507        if (!mblk)
 508                return;
 509
 510        spin_lock(&zmd->mblk_lock);
 511
 512        mblk->ref--;
 513        if (mblk->ref == 0) {
 514                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
 515                        rb_erase(&mblk->node, &zmd->mblk_rbtree);
 516                        dmz_free_mblock(zmd, mblk);
 517                } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
 518                        list_add_tail(&mblk->link, &zmd->mblk_lru_list);
 519                        dmz_shrink_mblock_cache(zmd, 1);
 520                }
 521        }
 522
 523        spin_unlock(&zmd->mblk_lock);
 524}
 525
 526/*
 527 * Get a metadata block from the rbtree. If the block
 528 * is not present, read it from disk.
 529 */
 530static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
 531                                         sector_t mblk_no)
 532{
 533        struct dmz_mblock *mblk;
 534
 535        /* Check rbtree */
 536        spin_lock(&zmd->mblk_lock);
 537        mblk = dmz_get_mblock_fast(zmd, mblk_no);
 538        spin_unlock(&zmd->mblk_lock);
 539
 540        if (!mblk) {
 541                /* Cache miss: read the block from disk */
 542                mblk = dmz_get_mblock_slow(zmd, mblk_no);
 543                if (!mblk)
 544                        return ERR_PTR(-ENOMEM);
 545        }
 546
 547        /* Wait for on-going read I/O and check for error */
 548        wait_on_bit_io(&mblk->state, DMZ_META_READING,
 549                       TASK_UNINTERRUPTIBLE);
 550        if (test_bit(DMZ_META_ERROR, &mblk->state)) {
 551                dmz_release_mblock(zmd, mblk);
 552                return ERR_PTR(-EIO);
 553        }
 554
 555        return mblk;
 556}
 557
 558/*
 559 * Mark a metadata block dirty.
 560 */
 561static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
 562{
 563        spin_lock(&zmd->mblk_lock);
 564        if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
 565                list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
 566        spin_unlock(&zmd->mblk_lock);
 567}
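
/*
 * Usage sketch of the metadata block cache (callers follow this pattern
 * elsewhere in dm-zoned):
 *
 *	mblk = dmz_get_mblock(zmd, mblk_no);
 *	if (IS_ERR(mblk))
 *		return PTR_ERR(mblk);
 *	... modify mblk->data ...
 *	dmz_dirty_mblock(zmd, mblk);
 *	dmz_release_mblock(zmd, mblk);
 *
 * A dirty block stays off the LRU list until dmz_flush_metadata() has
 * written it out, so it cannot be evicted by the shrinker in the meantime.
 */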
 568
 569/*
 570 * Issue a metadata block write BIO.
 571 */
 572static void dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
 573                             unsigned int set)
 574{
 575        sector_t block = zmd->sb[set].block + mblk->no;
 576        struct bio *bio;
 577
 578        bio = bio_alloc(GFP_NOIO, 1);
 579        if (!bio) {
 580                set_bit(DMZ_META_ERROR, &mblk->state);
 581                return;
 582        }
 583
 584        set_bit(DMZ_META_WRITING, &mblk->state);
 585
 586        bio->bi_iter.bi_sector = dmz_blk2sect(block);
 587        bio_set_dev(bio, zmd->dev->bdev);
 588        bio->bi_private = mblk;
 589        bio->bi_end_io = dmz_mblock_bio_end_io;
 590        bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_META | REQ_PRIO);
 591        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
 592        submit_bio(bio);
 593}
 594
 595/*
 596 * Read/write a metadata block.
 597 */
 598static int dmz_rdwr_block(struct dmz_metadata *zmd, int op, sector_t block,
 599                          struct page *page)
 600{
 601        struct bio *bio;
 602        int ret;
 603
 604        bio = bio_alloc(GFP_NOIO, 1);
 605        if (!bio)
 606                return -ENOMEM;
 607
 608        bio->bi_iter.bi_sector = dmz_blk2sect(block);
 609        bio_set_dev(bio, zmd->dev->bdev);
 610        bio_set_op_attrs(bio, op, REQ_SYNC | REQ_META | REQ_PRIO);
 611        bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
 612        ret = submit_bio_wait(bio);
 613        bio_put(bio);
 614
 615        return ret;
 616}
 617
 618/*
 619 * Write super block of the specified metadata set.
 620 */
 621static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
 622{
 623        sector_t block = zmd->sb[set].block;
 624        struct dmz_mblock *mblk = zmd->sb[set].mblk;
 625        struct dmz_super *sb = zmd->sb[set].sb;
 626        u64 sb_gen = zmd->sb_gen + 1;
 627        int ret;
 628
 629        sb->magic = cpu_to_le32(DMZ_MAGIC);
 630        sb->version = cpu_to_le32(DMZ_META_VER);
 631
 632        sb->gen = cpu_to_le64(sb_gen);
 633
 634        sb->sb_block = cpu_to_le64(block);
 635        sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
 636        sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
 637        sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);
 638
 639        sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
 640        sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);
 641
 642        sb->crc = 0;
 643        sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));
 644
 645        ret = dmz_rdwr_block(zmd, REQ_OP_WRITE, block, mblk->page);
 646        if (ret == 0)
 647                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 648
 649        return ret;
 650}
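
/*
 * Note that the CRC above is computed over the full 4KB block with the
 * crc field zeroed, and is seeded with the super block generation number.
 * dmz_check_sb() recomputes it the same way, so a super block is accepted
 * only if both its content and its generation are consistent.
 */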
 651
 652/*
 653 * Write dirty metadata blocks to the specified set.
 654 */
 655static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
 656                                   struct list_head *write_list,
 657                                   unsigned int set)
 658{
 659        struct dmz_mblock *mblk;
 660        struct blk_plug plug;
 661        int ret = 0;
 662
 663        /* Issue writes */
 664        blk_start_plug(&plug);
 665        list_for_each_entry(mblk, write_list, link)
 666                dmz_write_mblock(zmd, mblk, set);
 667        blk_finish_plug(&plug);
 668
 669        /* Wait for completion */
 670        list_for_each_entry(mblk, write_list, link) {
 671                wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
 672                               TASK_UNINTERRUPTIBLE);
 673                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
 674                        clear_bit(DMZ_META_ERROR, &mblk->state);
 675                        ret = -EIO;
 676                }
 677        }
 678
 679        /* Flush drive cache (this will also sync data) */
 680        if (ret == 0)
 681                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 682
 683        return ret;
 684}
 685
 686/*
 687 * Log dirty metadata blocks.
 688 */
 689static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
 690                                 struct list_head *write_list)
 691{
 692        unsigned int log_set = zmd->mblk_primary ^ 0x1;
 693        int ret;
 694
 695        /* Write dirty blocks to the log */
 696        ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
 697        if (ret)
 698                return ret;
 699
 700        /*
 701         * No error so far: now validate the log by updating the
 702         * log index super block generation.
 703         */
 704        ret = dmz_write_sb(zmd, log_set);
 705        if (ret)
 706                return ret;
 707
 708        return 0;
 709}
 710
 711/*
 712 * Flush dirty metadata blocks.
 713 */
 714int dmz_flush_metadata(struct dmz_metadata *zmd)
 715{
 716        struct dmz_mblock *mblk;
 717        struct list_head write_list;
 718        int ret;
 719
 720        if (WARN_ON(!zmd))
 721                return 0;
 722
 723        INIT_LIST_HEAD(&write_list);
 724
 725        /*
 726         * Make sure that metadata blocks are stable before logging: take
 727         * the write lock on the metadata semaphore to prevent target BIOs
 728         * from modifying metadata.
 729         */
 730        down_write(&zmd->mblk_sem);
 731
 732        /*
 733         * This is called from the target flush work and reclaim work.
 734         * Concurrent execution is not allowed.
 735         */
 736        dmz_lock_flush(zmd);
 737
 738        /* Get dirty blocks */
 739        spin_lock(&zmd->mblk_lock);
 740        list_splice_init(&zmd->mblk_dirty_list, &write_list);
 741        spin_unlock(&zmd->mblk_lock);
 742
 743        /* If there are no dirty metadata blocks, just flush the device cache */
 744        if (list_empty(&write_list)) {
 745                ret = blkdev_issue_flush(zmd->dev->bdev, GFP_NOIO, NULL);
 746                goto out;
 747        }
 748
 749        /*
 750         * The primary metadata set is still clean. Keep it this way until
 751         * all updates are successful in the secondary set. That is, use
 752         * the secondary set as a log.
 753         */
 754        ret = dmz_log_dirty_mblocks(zmd, &write_list);
 755        if (ret)
 756                goto out;
 757
 758        /*
 759         * The log is on disk. It is now safe to update in place
 760         * in the primary metadata set.
 761         */
 762        ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
 763        if (ret)
 764                goto out;
 765
 766        ret = dmz_write_sb(zmd, zmd->mblk_primary);
 767        if (ret)
 768                goto out;
 769
 770        while (!list_empty(&write_list)) {
 771                mblk = list_first_entry(&write_list, struct dmz_mblock, link);
 772                list_del_init(&mblk->link);
 773
 774                spin_lock(&zmd->mblk_lock);
 775                clear_bit(DMZ_META_DIRTY, &mblk->state);
 776                if (mblk->ref == 0)
 777                        list_add_tail(&mblk->link, &zmd->mblk_lru_list);
 778                spin_unlock(&zmd->mblk_lock);
 779        }
 780
 781        zmd->sb_gen++;
 782out:
 783        if (ret && !list_empty(&write_list)) {
 784                spin_lock(&zmd->mblk_lock);
 785                list_splice(&write_list, &zmd->mblk_dirty_list);
 786                spin_unlock(&zmd->mblk_lock);
 787        }
 788
 789        dmz_unlock_flush(zmd);
 790        up_write(&zmd->mblk_sem);
 791
 792        return ret;
 793}
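
/*
 * To summarize the commit sequence implemented above: dirty blocks are
 * first written to the secondary metadata set together with its super
 * block carrying generation sb_gen + 1 (the "log"), and only then written
 * in place in the primary set, whose super block is updated last. If the
 * system crashes at any point, dmz_load_sb() picks the set with the
 * highest valid generation and recovers the other set from it.
 */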
 794
 795/*
 796 * Check super block.
 797 */
 798static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_super *sb)
 799{
 800        unsigned int nr_meta_zones, nr_data_zones;
 801        struct dmz_dev *dev = zmd->dev;
 802        u32 crc, stored_crc;
 803        u64 gen;
 804
 805        gen = le64_to_cpu(sb->gen);
 806        stored_crc = le32_to_cpu(sb->crc);
 807        sb->crc = 0;
 808        crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
 809        if (crc != stored_crc) {
 810                dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
 811                            crc, stored_crc);
 812                return -ENXIO;
 813        }
 814
 815        if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
 816                dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
 817                            DMZ_MAGIC, le32_to_cpu(sb->magic));
 818                return -ENXIO;
 819        }
 820
 821        if (le32_to_cpu(sb->version) != DMZ_META_VER) {
 822                dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
 823                            DMZ_META_VER, le32_to_cpu(sb->version));
 824                return -ENXIO;
 825        }
 826
 827        nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + dev->zone_nr_blocks - 1)
 828                >> dev->zone_nr_blocks_shift;
 829        if (!nr_meta_zones ||
 830            nr_meta_zones >= zmd->nr_rnd_zones) {
 831                dmz_dev_err(dev, "Invalid number of metadata blocks");
 832                return -ENXIO;
 833        }
 834
 835        if (!le32_to_cpu(sb->nr_reserved_seq) ||
 836            le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
 837                dmz_dev_err(dev, "Invalid number of reserved sequential zones");
 838                return -ENXIO;
 839        }
 840
 841        nr_data_zones = zmd->nr_useable_zones -
 842                (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
 843        if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
 844                dmz_dev_err(dev, "Invalid number of chunks %u / %u",
 845                            le32_to_cpu(sb->nr_chunks), nr_data_zones);
 846                return -ENXIO;
 847        }
 848
 849        /* OK */
 850        zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
 851        zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
 852        zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
 853        zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
 854        zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
 855        zmd->nr_meta_zones = nr_meta_zones;
 856        zmd->nr_data_zones = nr_data_zones;
 857
 858        return 0;
 859}
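
/*
 * Worked example of the checks above (hypothetical numbers, assuming
 * 65536 blocks per zone): with nr_meta_blocks = 100000, the metadata
 * occupies nr_meta_zones = 2 zones. With 1000 useable zones and
 * nr_reserved_seq = 16, nr_data_zones = 1000 - (2 * 2 + 16) = 980, so
 * nr_chunks may be at most 980. The factor of 2 accounts for the two
 * copies of the metadata (primary and secondary sets).
 */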
 860
 861/*
 862 * Read the first or second super block from disk.
 863 */
 864static int dmz_read_sb(struct dmz_metadata *zmd, unsigned int set)
 865{
 866        return dmz_rdwr_block(zmd, REQ_OP_READ, zmd->sb[set].block,
 867                              zmd->sb[set].mblk->page);
 868}
 869
 870/*
  871 * Determine the position of the secondary super block on disk.
 872 * This is used only if a corruption of the primary super block
 873 * is detected.
 874 */
 875static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
 876{
 877        unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
 878        struct dmz_mblock *mblk;
 879        int i;
 880
 881        /* Allocate a block */
 882        mblk = dmz_alloc_mblock(zmd, 0);
 883        if (!mblk)
 884                return -ENOMEM;
 885
 886        zmd->sb[1].mblk = mblk;
 887        zmd->sb[1].sb = mblk->data;
 888
 889        /* Bad first super block: search for the second one */
 890        zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
 891        for (i = 0; i < zmd->nr_rnd_zones - 1; i++) {
 892                if (dmz_read_sb(zmd, 1) != 0)
 893                        break;
 894                if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
 895                        return 0;
 896                zmd->sb[1].block += zone_nr_blocks;
 897        }
 898
 899        dmz_free_mblock(zmd, mblk);
 900        zmd->sb[1].mblk = NULL;
 901
 902        return -EIO;
 903}
 904
 905/*
 906 * Read the first or second super block from disk.
 907 */
 908static int dmz_get_sb(struct dmz_metadata *zmd, unsigned int set)
 909{
 910        struct dmz_mblock *mblk;
 911        int ret;
 912
 913        /* Allocate a block */
 914        mblk = dmz_alloc_mblock(zmd, 0);
 915        if (!mblk)
 916                return -ENOMEM;
 917
 918        zmd->sb[set].mblk = mblk;
 919        zmd->sb[set].sb = mblk->data;
 920
 921        /* Read super block */
 922        ret = dmz_read_sb(zmd, set);
 923        if (ret) {
 924                dmz_free_mblock(zmd, mblk);
 925                zmd->sb[set].mblk = NULL;
 926                return ret;
 927        }
 928
 929        return 0;
 930}
 931
 932/*
 933 * Recover a metadata set.
 934 */
 935static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
 936{
 937        unsigned int src_set = dst_set ^ 0x1;
 938        struct page *page;
 939        int i, ret;
 940
 941        dmz_dev_warn(zmd->dev, "Metadata set %u invalid: recovering", dst_set);
 942
 943        if (dst_set == 0)
 944                zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
 945        else {
 946                zmd->sb[1].block = zmd->sb[0].block +
 947                        (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
 948        }
 949
 950        page = alloc_page(GFP_NOIO);
 951        if (!page)
 952                return -ENOMEM;
 953
 954        /* Copy metadata blocks */
 955        for (i = 1; i < zmd->nr_meta_blocks; i++) {
 956                ret = dmz_rdwr_block(zmd, REQ_OP_READ,
 957                                     zmd->sb[src_set].block + i, page);
 958                if (ret)
 959                        goto out;
 960                ret = dmz_rdwr_block(zmd, REQ_OP_WRITE,
 961                                     zmd->sb[dst_set].block + i, page);
 962                if (ret)
 963                        goto out;
 964        }
 965
 966        /* Finalize with the super block */
 967        if (!zmd->sb[dst_set].mblk) {
 968                zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
 969                if (!zmd->sb[dst_set].mblk) {
 970                        ret = -ENOMEM;
 971                        goto out;
 972                }
 973                zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
 974        }
 975
 976        ret = dmz_write_sb(zmd, dst_set);
 977out:
 978        __free_pages(page, 0);
 979
 980        return ret;
 981}
 982
 983/*
 984 * Get super block from disk.
 985 */
 986static int dmz_load_sb(struct dmz_metadata *zmd)
 987{
 988        bool sb_good[2] = {false, false};
 989        u64 sb_gen[2] = {0, 0};
 990        int ret;
 991
 992        /* Read and check the primary super block */
 993        zmd->sb[0].block = dmz_start_block(zmd, zmd->sb_zone);
 994        ret = dmz_get_sb(zmd, 0);
 995        if (ret) {
 996                dmz_dev_err(zmd->dev, "Read primary super block failed");
 997                return ret;
 998        }
 999
1000        ret = dmz_check_sb(zmd, zmd->sb[0].sb);
1001
1002        /* Read and check secondary super block */
1003        if (ret == 0) {
1004                sb_good[0] = true;
1005                zmd->sb[1].block = zmd->sb[0].block +
1006                        (zmd->nr_meta_zones << zmd->dev->zone_nr_blocks_shift);
1007                ret = dmz_get_sb(zmd, 1);
1008        } else
1009                ret = dmz_lookup_secondary_sb(zmd);
1010
1011        if (ret) {
1012                dmz_dev_err(zmd->dev, "Read secondary super block failed");
1013                return ret;
1014        }
1015
1016        ret = dmz_check_sb(zmd, zmd->sb[1].sb);
1017        if (ret == 0)
1018                sb_good[1] = true;
1019
1020        /* Use highest generation sb first */
1021        if (!sb_good[0] && !sb_good[1]) {
1022                dmz_dev_err(zmd->dev, "No valid super block found");
1023                return -EIO;
1024        }
1025
1026        if (sb_good[0])
1027                sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
1028        else
1029                ret = dmz_recover_mblocks(zmd, 0);
1030
1031        if (sb_good[1])
1032                sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
1033        else
1034                ret = dmz_recover_mblocks(zmd, 1);
1035
1036        if (ret) {
1037                dmz_dev_err(zmd->dev, "Recovery failed");
1038                return -EIO;
1039        }
1040
1041        if (sb_gen[0] >= sb_gen[1]) {
1042                zmd->sb_gen = sb_gen[0];
1043                zmd->mblk_primary = 0;
1044        } else {
1045                zmd->sb_gen = sb_gen[1];
1046                zmd->mblk_primary = 1;
1047        }
1048
1049        dmz_dev_debug(zmd->dev, "Using super block %u (gen %llu)",
1050                      zmd->mblk_primary, zmd->sb_gen);
1051
1052        return 0;
1053}
1054
1055/*
1056 * Initialize a zone descriptor.
1057 */
1058static int dmz_init_zone(struct dmz_metadata *zmd, struct dm_zone *zone,
1059                         struct blk_zone *blkz)
1060{
1061        struct dmz_dev *dev = zmd->dev;
1062
 1063        /* Ignore a possible runt (smaller) last zone */
1064        if (blkz->len != dev->zone_nr_sectors) {
1065                if (blkz->start + blkz->len == dev->capacity)
1066                        return 0;
1067                return -ENXIO;
1068        }
1069
1070        INIT_LIST_HEAD(&zone->link);
1071        atomic_set(&zone->refcount, 0);
1072        zone->chunk = DMZ_MAP_UNMAPPED;
1073
1074        if (blkz->type == BLK_ZONE_TYPE_CONVENTIONAL) {
1075                set_bit(DMZ_RND, &zone->flags);
1076                zmd->nr_rnd_zones++;
1077        } else if (blkz->type == BLK_ZONE_TYPE_SEQWRITE_REQ ||
1078                   blkz->type == BLK_ZONE_TYPE_SEQWRITE_PREF) {
1079                set_bit(DMZ_SEQ, &zone->flags);
1080        } else
1081                return -ENXIO;
1082
1083        if (blkz->cond == BLK_ZONE_COND_OFFLINE)
1084                set_bit(DMZ_OFFLINE, &zone->flags);
1085        else if (blkz->cond == BLK_ZONE_COND_READONLY)
1086                set_bit(DMZ_READ_ONLY, &zone->flags);
1087
1088        if (dmz_is_rnd(zone))
1089                zone->wp_block = 0;
1090        else
1091                zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
1092
1093        if (!dmz_is_offline(zone) && !dmz_is_readonly(zone)) {
1094                zmd->nr_useable_zones++;
1095                if (dmz_is_rnd(zone)) {
1096                        zmd->nr_rnd_zones++;
1097                        if (!zmd->sb_zone) {
1098                                /* Super block zone */
1099                                zmd->sb_zone = zone;
1100                        }
1101                }
1102        }
1103
1104        return 0;
1105}
1106
1107/*
 1108 * Free zone descriptors.
1109 */
1110static void dmz_drop_zones(struct dmz_metadata *zmd)
1111{
1112        kfree(zmd->zones);
1113        zmd->zones = NULL;
1114}
1115
1116/*
1117 * The size of a zone report in number of zones.
1118 * This results in 4096*64B=256KB report zones commands.
1119 */
1120#define DMZ_REPORT_NR_ZONES     4096
1121
1122/*
1123 * Allocate and initialize zone descriptors using the zone
1124 * information from disk.
1125 */
1126static int dmz_init_zones(struct dmz_metadata *zmd)
1127{
1128        struct dmz_dev *dev = zmd->dev;
1129        struct dm_zone *zone;
1130        struct blk_zone *blkz;
1131        unsigned int nr_blkz;
1132        sector_t sector = 0;
1133        int i, ret = 0;
1134
1135        /* Init */
1136        zmd->zone_bitmap_size = dev->zone_nr_blocks >> 3;
1137        zmd->zone_nr_bitmap_blocks = zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT;
1138
1139        /* Allocate zone array */
1140        zmd->zones = kcalloc(dev->nr_zones, sizeof(struct dm_zone), GFP_KERNEL);
1141        if (!zmd->zones)
1142                return -ENOMEM;
1143
1144        dmz_dev_info(dev, "Using %zu B for zone information",
1145                     sizeof(struct dm_zone) * dev->nr_zones);
1146
1147        /* Get zone information */
1148        nr_blkz = DMZ_REPORT_NR_ZONES;
1149        blkz = kcalloc(nr_blkz, sizeof(struct blk_zone), GFP_KERNEL);
1150        if (!blkz) {
1151                ret = -ENOMEM;
1152                goto out;
1153        }
1154
1155        /*
1156         * Get zone information and initialize zone descriptors.
1157         * At the same time, determine where the super block
1158         * should be: first block of the first randomly writable
1159         * zone.
1160         */
1161        zone = zmd->zones;
1162        while (sector < dev->capacity) {
1163                /* Get zone information */
1164                nr_blkz = DMZ_REPORT_NR_ZONES;
1165                ret = blkdev_report_zones(dev->bdev, sector, blkz,
1166                                          &nr_blkz, GFP_KERNEL);
1167                if (ret) {
1168                        dmz_dev_err(dev, "Report zones failed %d", ret);
1169                        goto out;
1170                }
1171
1172                /* Process report */
1173                for (i = 0; i < nr_blkz; i++) {
1174                        ret = dmz_init_zone(zmd, zone, &blkz[i]);
1175                        if (ret)
1176                                goto out;
1177                        sector += dev->zone_nr_sectors;
1178                        zone++;
1179                }
1180        }
1181
1182        /* The entire zone configuration of the disk should now be known */
1183        if (sector < dev->capacity) {
1184                dmz_dev_err(dev, "Failed to get correct zone information");
1185                ret = -ENXIO;
1186        }
1187out:
1188        kfree(blkz);
1189        if (ret)
1190                dmz_drop_zones(zmd);
1191
1192        return ret;
1193}
1194
1195/*
 1196 * Update a zone's information.
1197 */
1198static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1199{
1200        unsigned int nr_blkz = 1;
1201        struct blk_zone blkz;
1202        int ret;
1203
1204        /* Get zone information from disk */
1205        ret = blkdev_report_zones(zmd->dev->bdev, dmz_start_sect(zmd, zone),
1206                                  &blkz, &nr_blkz, GFP_NOIO);
1207        if (ret) {
1208                dmz_dev_err(zmd->dev, "Get zone %u report failed",
1209                            dmz_id(zmd, zone));
1210                return ret;
1211        }
1212
1213        clear_bit(DMZ_OFFLINE, &zone->flags);
1214        clear_bit(DMZ_READ_ONLY, &zone->flags);
1215        if (blkz.cond == BLK_ZONE_COND_OFFLINE)
1216                set_bit(DMZ_OFFLINE, &zone->flags);
1217        else if (blkz.cond == BLK_ZONE_COND_READONLY)
1218                set_bit(DMZ_READ_ONLY, &zone->flags);
1219
1220        if (dmz_is_seq(zone))
1221                zone->wp_block = dmz_sect2blk(blkz.wp - blkz.start);
1222        else
1223                zone->wp_block = 0;
1224
1225        return 0;
1226}
1227
1228/*
1229 * Check a zone write pointer position when the zone is marked
1230 * with the sequential write error flag.
1231 */
1232static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
1233                                    struct dm_zone *zone)
1234{
1235        unsigned int wp = 0;
1236        int ret;
1237
1238        wp = zone->wp_block;
1239        ret = dmz_update_zone(zmd, zone);
1240        if (ret)
1241                return ret;
1242
1243        dmz_dev_warn(zmd->dev, "Processing zone %u write error (zone wp %u/%u)",
1244                     dmz_id(zmd, zone), zone->wp_block, wp);
1245
1246        if (zone->wp_block < wp) {
1247                dmz_invalidate_blocks(zmd, zone, zone->wp_block,
1248                                      wp - zone->wp_block);
1249        }
1250
1251        return 0;
1252}
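
/*
 * Note on the handling above: dmz_update_zone() refreshes zone->wp_block
 * from the drive's view of the write pointer. If the drive reports a
 * write pointer lower than the one previously recorded, the blocks in
 * between were never written by the drive, so dmz_invalidate_blocks()
 * marks them invalid in the zone's block bitmap.
 */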
1253
1254static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
1255{
1256        return &zmd->zones[zone_id];
1257}
1258
1259/*
1260 * Reset a zone write pointer.
1261 */
1262static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1263{
1264        int ret;
1265
1266        /*
1267         * Ignore offline zones, read only zones,
1268         * and conventional zones.
1269         */
1270        if (dmz_is_offline(zone) ||
1271            dmz_is_readonly(zone) ||
1272            dmz_is_rnd(zone))
1273                return 0;
1274
1275        if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
1276                struct dmz_dev *dev = zmd->dev;
1277
1278                ret = blkdev_reset_zones(dev->bdev,
1279                                         dmz_start_sect(zmd, zone),
1280                                         dev->zone_nr_sectors, GFP_NOIO);
1281                if (ret) {
1282                        dmz_dev_err(dev, "Reset zone %u failed %d",
1283                                    dmz_id(zmd, zone), ret);
1284                        return ret;
1285                }
1286        }
1287
1288        /* Clear write error bit and rewind write pointer position */
1289        clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
1290        zone->wp_block = 0;
1291
1292        return 0;
1293}
1294
1295static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
1296
1297/*
1298 * Initialize chunk mapping.
1299 */
1300static int dmz_load_mapping(struct dmz_metadata *zmd)
1301{
1302        struct dmz_dev *dev = zmd->dev;
1303        struct dm_zone *dzone, *bzone;
1304        struct dmz_mblock *dmap_mblk = NULL;
1305        struct dmz_map *dmap;
1306        unsigned int i = 0, e = 0, chunk = 0;
1307        unsigned int dzone_id;
1308        unsigned int bzone_id;
1309
1310        /* Metadata block array for the chunk mapping table */
1311        zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
 1312                                sizeof(struct dmz_mblock *), GFP_KERNEL);
1313        if (!zmd->map_mblk)
1314                return -ENOMEM;
1315
1316        /* Get chunk mapping table blocks and initialize zone mapping */
1317        while (chunk < zmd->nr_chunks) {
1318                if (!dmap_mblk) {
1319                        /* Get mapping block */
1320                        dmap_mblk = dmz_get_mblock(zmd, i + 1);
1321                        if (IS_ERR(dmap_mblk))
1322                                return PTR_ERR(dmap_mblk);
1323                        zmd->map_mblk[i] = dmap_mblk;
1324                        dmap = (struct dmz_map *) dmap_mblk->data;
1325                        i++;
1326                        e = 0;
1327                }
1328
1329                /* Check data zone */
1330                dzone_id = le32_to_cpu(dmap[e].dzone_id);
1331                if (dzone_id == DMZ_MAP_UNMAPPED)
1332                        goto next;
1333
1334                if (dzone_id >= dev->nr_zones) {
1335                        dmz_dev_err(dev, "Chunk %u mapping: invalid data zone ID %u",
1336                                    chunk, dzone_id);
1337                        return -EIO;
1338                }
1339
1340                dzone = dmz_get(zmd, dzone_id);
1341                set_bit(DMZ_DATA, &dzone->flags);
1342                dzone->chunk = chunk;
1343                dmz_get_zone_weight(zmd, dzone);
1344
1345                if (dmz_is_rnd(dzone))
1346                        list_add_tail(&dzone->link, &zmd->map_rnd_list);
1347                else
1348                        list_add_tail(&dzone->link, &zmd->map_seq_list);
1349
1350                /* Check buffer zone */
1351                bzone_id = le32_to_cpu(dmap[e].bzone_id);
1352                if (bzone_id == DMZ_MAP_UNMAPPED)
1353                        goto next;
1354
1355                if (bzone_id >= dev->nr_zones) {
1356                        dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone ID %u",
1357                                    chunk, bzone_id);
1358                        return -EIO;
1359                }
1360
1361                bzone = dmz_get(zmd, bzone_id);
1362                if (!dmz_is_rnd(bzone)) {
1363                        dmz_dev_err(dev, "Chunk %u mapping: invalid buffer zone %u",
1364                                    chunk, bzone_id);
1365                        return -EIO;
1366                }
1367
1368                set_bit(DMZ_DATA, &bzone->flags);
1369                set_bit(DMZ_BUF, &bzone->flags);
1370                bzone->chunk = chunk;
1371                bzone->bzone = dzone;
1372                dzone->bzone = bzone;
1373                dmz_get_zone_weight(zmd, bzone);
1374                list_add_tail(&bzone->link, &zmd->map_rnd_list);
1375next:
1376                chunk++;
1377                e++;
1378                if (e >= DMZ_MAP_ENTRIES)
1379                        dmap_mblk = NULL;
1380        }
1381
1382        /*
1383         * At this point, only meta zones and mapped data zones were
1384         * fully initialized. All remaining zones are unmapped data
1385         * zones. Finish initializing those here.
1386         */
1387        for (i = 0; i < dev->nr_zones; i++) {
1388                dzone = dmz_get(zmd, i);
1389                if (dmz_is_meta(dzone))
1390                        continue;
1391
1392                if (dmz_is_rnd(dzone))
1393                        zmd->nr_rnd++;
1394                else
1395                        zmd->nr_seq++;
1396
1397                if (dmz_is_data(dzone)) {
1398                        /* Already initialized */
1399                        continue;
1400                }
1401
1402                /* Unmapped data zone */
1403                set_bit(DMZ_DATA, &dzone->flags);
1404                dzone->chunk = DMZ_MAP_UNMAPPED;
1405                if (dmz_is_rnd(dzone)) {
1406                        list_add_tail(&dzone->link, &zmd->unmap_rnd_list);
1407                        atomic_inc(&zmd->unmap_nr_rnd);
1408                } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
1409                        list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
1410                        atomic_inc(&zmd->nr_reserved_seq_zones);
1411                        zmd->nr_seq--;
1412                } else {
1413                        list_add_tail(&dzone->link, &zmd->unmap_seq_list);
1414                        atomic_inc(&zmd->unmap_nr_seq);
1415                }
1416        }
1417
1418        return 0;
1419}
1420
1421/*
1422 * Set a data chunk mapping.
1423 */
1424static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
1425                                  unsigned int dzone_id, unsigned int bzone_id)
1426{
1427        struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
1428        struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
1429        int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;
1430
1431        dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
1432        dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
1433        dmz_dirty_mblock(zmd, dmap_mblk);
1434}
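
/*
 * For example (hypothetical IDs), mapping chunk 1000 to data zone 57
 * with no write buffer zone would be:
 *
 *	dmz_set_chunk_mapping(zmd, 1000, 57, DMZ_MAP_UNMAPPED);
 *
 * The mapping block is only marked dirty here; it reaches the disk on
 * the next dmz_flush_metadata() call.
 */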
1435
1436/*
1437 * The list of mapped zones is maintained in LRU order.
1438 * This rotates a zone at the end of its map list.
1439 */
1440static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1441{
1442        if (list_empty(&zone->link))
1443                return;
1444
1445        list_del_init(&zone->link);
1446        if (dmz_is_seq(zone)) {
1447                /* LRU rotate sequential zone */
1448                list_add_tail(&zone->link, &zmd->map_seq_list);
1449        } else {
1450                /* LRU rotate random zone */
1451                list_add_tail(&zone->link, &zmd->map_rnd_list);
1452        }
1453}
1454
1455/*
1456 * The list of mapped random zones is maintained
1457 * in LRU order. This rotates a zone at the end of the list.
1458 */
1459static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1460{
1461        __dmz_lru_zone(zmd, zone);
1462        if (zone->bzone)
1463                __dmz_lru_zone(zmd, zone->bzone);
1464}
1465
1466/*
1467 * Wait for any zone to be freed.
1468 */
1469static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
1470{
1471        DEFINE_WAIT(wait);
1472
1473        prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
1474        dmz_unlock_map(zmd);
1475        dmz_unlock_metadata(zmd);
1476
1477        io_schedule_timeout(HZ);
1478
1479        dmz_lock_metadata(zmd);
1480        dmz_lock_map(zmd);
1481        finish_wait(&zmd->free_wq, &wait);
1482}
1483
1484/*
1485 * Lock a zone for reclaim (set the zone RECLAIM bit).
 1486 * Returns false if the zone cannot be locked or if it is already locked,
 1487 * and true otherwise.
1488 */
1489int dmz_lock_zone_reclaim(struct dm_zone *zone)
1490{
1491        /* Active zones cannot be reclaimed */
1492        if (dmz_is_active(zone))
1493                return 0;
1494
1495        return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
1496}
1497
1498/*
1499 * Clear a zone reclaim flag.
1500 */
1501void dmz_unlock_zone_reclaim(struct dm_zone *zone)
1502{
1503        WARN_ON(dmz_is_active(zone));
1504        WARN_ON(!dmz_in_reclaim(zone));
1505
1506        clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
1507        smp_mb__after_atomic();
1508        wake_up_bit(&zone->flags, DMZ_RECLAIM);
1509}
1510
1511/*
1512 * Wait for a zone reclaim to complete.
1513 */
1514static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
1515{
1516        dmz_unlock_map(zmd);
1517        dmz_unlock_metadata(zmd);
1518        wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
1519        dmz_lock_metadata(zmd);
1520        dmz_lock_map(zmd);
1521}
1522
1523/*
1524 * Select a random write zone for reclaim.
1525 */
1526static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd)
1527{
1528        struct dm_zone *dzone = NULL;
1529        struct dm_zone *zone;
1530
1531        if (list_empty(&zmd->map_rnd_list))
1532                return NULL;
1533
1534        list_for_each_entry(zone, &zmd->map_rnd_list, link) {
1535                if (dmz_is_buf(zone))
1536                        dzone = zone->bzone;
1537                else
1538                        dzone = zone;
1539                if (dmz_lock_zone_reclaim(dzone))
1540                        return dzone;
1541        }
1542
1543        return NULL;
1544}
1545
1546/*
1547 * Select a buffered sequential zone for reclaim.
1548 */
1549static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd)
1550{
1551        struct dm_zone *zone;
1552
1553        if (list_empty(&zmd->map_seq_list))
1554                return NULL;
1555
1556        list_for_each_entry(zone, &zmd->map_seq_list, link) {
1557                if (!zone->bzone)
1558                        continue;
1559                if (dmz_lock_zone_reclaim(zone))
1560                        return zone;
1561        }
1562
1563        return NULL;
1564}
1565
1566/*
1567 * Select a zone for reclaim.
1568 */
1569struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd)
1570{
1571        struct dm_zone *zone;
1572
1573        /*
1574         * Search for a zone candidate to reclaim: 2 cases are possible.
 1575         * (1) There are no free sequential zones. Then a random data zone
1576         *     cannot be reclaimed. So choose a sequential zone to reclaim so
1577         *     that afterward a random zone can be reclaimed.
 1578         * (2) At least one free sequential zone is available. Then choose
1579         *     the oldest random zone (data or buffer) that can be locked.
1580         */
1581        dmz_lock_map(zmd);
1582        if (list_empty(&zmd->reserved_seq_zones_list))
1583                zone = dmz_get_seq_zone_for_reclaim(zmd);
1584        else
1585                zone = dmz_get_rnd_zone_for_reclaim(zmd);
1586        dmz_unlock_map(zmd);
1587
1588        return zone;
1589}
1590
1591/*
1592 * Activate a zone (increment its reference count).
1593 */
1594void dmz_activate_zone(struct dm_zone *zone)
1595{
1596        set_bit(DMZ_ACTIVE, &zone->flags);
1597        atomic_inc(&zone->refcount);
1598}
1599
1600/*
 1601 * Deactivate a zone. This decrements the zone reference counter
 1602 * and clears the active state of the zone once the count reaches 0,
 1603 * indicating that all BIOs to the zone have completed.
1605 */
1606void dmz_deactivate_zone(struct dm_zone *zone)
1607{
1608        if (atomic_dec_and_test(&zone->refcount)) {
1609                WARN_ON(!test_bit(DMZ_ACTIVE, &zone->flags));
1610                clear_bit_unlock(DMZ_ACTIVE, &zone->flags);
1611                smp_mb__after_atomic();
1612        }
1613}
1614
1615/*
1616 * Get the zone mapping a chunk, if the chunk is mapped already.
 1617 * If no mapping exists and the operation is WRITE, a zone is
1618 * allocated and used to map the chunk.
1619 * The zone returned will be set to the active state.
1620 */
1621struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
1622{
1623        struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
1624        struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
1625        int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
1626        unsigned int dzone_id;
1627        struct dm_zone *dzone = NULL;
1628        int ret = 0;
1629
1630        dmz_lock_map(zmd);
1631again:
1632        /* Get the chunk mapping */
1633        dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
1634        if (dzone_id == DMZ_MAP_UNMAPPED) {
1635                /*
1636                 * Read or discard in unmapped chunks are fine. But for
1637                 * writes, we need a mapping, so get one.
1638                 */
1639                if (op != REQ_OP_WRITE)
1640                        goto out;
1641
 1642                /* Allocate a random zone */
1643                dzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
1644                if (!dzone) {
1645                        dmz_wait_for_free_zones(zmd);
1646                        goto again;
1647                }
1648
1649                dmz_map_zone(zmd, dzone, chunk);
1650
1651        } else {
1652                /* The chunk is already mapped: get the mapping zone */
1653                dzone = dmz_get(zmd, dzone_id);
1654                if (dzone->chunk != chunk) {
1655                        dzone = ERR_PTR(-EIO);
1656                        goto out;
1657                }
1658
1659                /* Repair the write pointer if the sequential dzone had a write error */
1660                if (dmz_seq_write_err(dzone)) {
1661                        ret = dmz_handle_seq_write_err(zmd, dzone);
1662                        if (ret) {
1663                                dzone = ERR_PTR(-EIO);
1664                                goto out;
1665                        }
1666                        clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
1667                }
1668        }
1669
1670        /*
1671         * If the zone is being reclaimed, the chunk mapping may change
1672         * to a different zone. So wait for reclaim and retry. Otherwise,
1673         * activate the zone (this will prevent reclaim from touching it).
1674         */
1675        if (dmz_in_reclaim(dzone)) {
1676                dmz_wait_for_reclaim(zmd, dzone);
1677                goto again;
1678        }
1679        dmz_activate_zone(dzone);
1680        dmz_lru_zone(zmd, dzone);
1681out:
1682        dmz_unlock_map(zmd);
1683
1684        return dzone;
1685}
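
/*
 * Summary of the dmz_get_chunk_mapping() return values, as implemented
 * above: NULL when a read or discard targets an unmapped chunk,
 * ERR_PTR(-EIO) when the mapping is inconsistent or a write pointer
 * repair fails, and otherwise a data zone that has been activated and
 * refreshed in its LRU list with dmz_lru_zone().
 */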
1686
1687/*
1688 * Writes and discards change the block validity of data zones and their buffer
1689 * zones. Check here that valid blocks are still present. If all blocks are
1690 * invalid, the zones can be unmapped on the fly without waiting for reclaim
1691 * to do it.
1692 */
1693void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
1694{
1695        struct dm_zone *bzone;
1696
1697        dmz_lock_map(zmd);
1698
1699        bzone = dzone->bzone;
1700        if (bzone) {
1701                if (dmz_weight(bzone))
1702                        dmz_lru_zone(zmd, bzone);
1703                else {
1704                        /* Empty buffer zone: unmap and free it */
1705                        dmz_unmap_zone(zmd, bzone);
1706                        dmz_free_zone(zmd, bzone);
1707                        bzone = NULL;
1708                }
1709        }
1710
1711        /* Deactivate the data zone */
1712        dmz_deactivate_zone(dzone);
1713        if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
1714                dmz_lru_zone(zmd, dzone);
1715        else {
1716                /* Unbuffered inactive empty data zone: unmap and free it */
1717                dmz_unmap_zone(zmd, dzone);
1718                dmz_free_zone(zmd, dzone);
1719        }
1720
1721        dmz_unlock_map(zmd);
1722}
1723
1724/*
1725 * Allocate and map a random zone to buffer a chunk
1726 * already mapped to a sequential zone.
1727 */
1728struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
1729                                     struct dm_zone *dzone)
1730{
1731        struct dm_zone *bzone;
1732
1733        dmz_lock_map(zmd);
1734again:
1735        bzone = dzone->bzone;
1736        if (bzone)
1737                goto out;
1738
1739        /* Allocate a random zone */
1740        bzone = dmz_alloc_zone(zmd, DMZ_ALLOC_RND);
1741        if (!bzone) {
1742                dmz_wait_for_free_zones(zmd);
1743                goto again;
1744        }
1745
1746        /* Update the chunk mapping */
1747        dmz_set_chunk_mapping(zmd, dzone->chunk, dmz_id(zmd, dzone),
1748                              dmz_id(zmd, bzone));
1749
1750        set_bit(DMZ_BUF, &bzone->flags);
1751        bzone->chunk = dzone->chunk;
1752        bzone->bzone = dzone;
1753        dzone->bzone = bzone;
1754        list_add_tail(&bzone->link, &zmd->map_rnd_list);
1755out:
1756        dmz_unlock_map(zmd);
1757
1758        return bzone;
1759}
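
/*
 * After dmz_get_chunk_buffer() returns, the chunk, its sequential data
 * zone and the random buffer zone are cross-linked as follows (summary
 * of the code above):
 *
 *        dmap[chunk].dzone_id == dmz_id(zmd, dzone)
 *        dmap[chunk].bzone_id == dmz_id(zmd, bzone)
 *        dzone->bzone == bzone and bzone->bzone == dzone
 *        bzone->chunk == dzone->chunk == chunk
 *
 * Note that the bzone field of the buffer zone is reused to point back
 * at the data zone being buffered.
 */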
1760
1761/*
1762 * Get an unmapped (free) zone.
1763 * This must be called with the mapping lock held.
1764 */
1765struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned long flags)
1766{
1767        struct list_head *list;
1768        struct dm_zone *zone;
1769
1770        if (flags & DMZ_ALLOC_RND)
1771                list = &zmd->unmap_rnd_list;
1772        else
1773                list = &zmd->unmap_seq_list;
1774again:
1775        if (list_empty(list)) {
1776                /*
1777                 * No free zone: if this is for reclaim, allow using the
1778                 * reserved sequential zones.
1779                 */
1780                if (!(flags & DMZ_ALLOC_RECLAIM) ||
1781                    list_empty(&zmd->reserved_seq_zones_list))
1782                        return NULL;
1783
1784                zone = list_first_entry(&zmd->reserved_seq_zones_list,
1785                                        struct dm_zone, link);
1786                list_del_init(&zone->link);
1787                atomic_dec(&zmd->nr_reserved_seq_zones);
1788                return zone;
1789        }
1790
1791        zone = list_first_entry(list, struct dm_zone, link);
1792        list_del_init(&zone->link);
1793
1794        if (dmz_is_rnd(zone))
1795                atomic_dec(&zmd->unmap_nr_rnd);
1796        else
1797                atomic_dec(&zmd->unmap_nr_seq);
1798
1799        if (dmz_is_offline(zone)) {
1800                dmz_dev_warn(zmd->dev, "Zone %u is offline", dmz_id(zmd, zone));
1801                zone = NULL;
1802                goto again;
1803        }
1804
1805        return zone;
1806}
1807
1808/*
1809 * Free a zone.
1810 * This must be called with the mapping lock held.
1811 */
1812void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1813{
1814        /* If this is a sequential zone, reset it */
1815        if (dmz_is_seq(zone))
1816                dmz_reset_zone(zmd, zone);
1817
1818        /* Return the zone to its type unmap list */
1819        if (dmz_is_rnd(zone)) {
1820                list_add_tail(&zone->link, &zmd->unmap_rnd_list);
1821                atomic_inc(&zmd->unmap_nr_rnd);
1822        } else if (atomic_read(&zmd->nr_reserved_seq_zones) <
1823                   zmd->nr_reserved_seq) {
1824                list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
1825                atomic_inc(&zmd->nr_reserved_seq_zones);
1826        } else {
1827                list_add_tail(&zone->link, &zmd->unmap_seq_list);
1828                atomic_inc(&zmd->unmap_nr_seq);
1829        }
1830
1831        wake_up_all(&zmd->free_wq);
1832}
1833
1834/*
1835 * Map a chunk to a zone.
1836 * This must be called with the mapping lock held.
1837 */
1838void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
1839                  unsigned int chunk)
1840{
1841        /* Set the chunk mapping */
1842        dmz_set_chunk_mapping(zmd, chunk, dmz_id(zmd, dzone),
1843                              DMZ_MAP_UNMAPPED);
1844        dzone->chunk = chunk;
1845        if (dmz_is_rnd(dzone))
1846                list_add_tail(&dzone->link, &zmd->map_rnd_list);
1847        else
1848                list_add_tail(&dzone->link, &zmd->map_seq_list);
1849}
1850
1851/*
1852 * Unmap a zone.
1853 * This must be called with the mapping lock held.
1854 */
1855void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1856{
1857        unsigned int chunk = zone->chunk;
1858        unsigned int dzone_id;
1859
1860        if (chunk == DMZ_MAP_UNMAPPED) {
1861                /* Already unmapped */
1862                return;
1863        }
1864
1865        if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
1866                /*
1867                 * Unmapping the chunk buffer zone: clear only
1868                 * the chunk buffer mapping
1869                 */
1870                dzone_id = dmz_id(zmd, zone->bzone);
1871                zone->bzone->bzone = NULL;
1872                zone->bzone = NULL;
1873
1874        } else {
1875                /*
1876                 * Unmapping the chunk data zone: the zone must
1877                 * not be buffered.
1878                 */
1879                if (WARN_ON(zone->bzone)) {
1880                        zone->bzone->bzone = NULL;
1881                        zone->bzone = NULL;
1882                }
1883                dzone_id = DMZ_MAP_UNMAPPED;
1884        }
1885
1886        dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
1887
1888        zone->chunk = DMZ_MAP_UNMAPPED;
1889        list_del_init(&zone->link);
1890}
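
/*
 * Summary of the chunk mapping entry states handled by dmz_map_zone()
 * and dmz_unmap_zone():
 *
 *        unmapped chunk:      dzone_id == DMZ_MAP_UNMAPPED, bzone_id == DMZ_MAP_UNMAPPED
 *        mapped, unbuffered:  dzone_id == data zone ID,     bzone_id == DMZ_MAP_UNMAPPED
 *        mapped and buffered: dzone_id == data zone ID,     bzone_id == buffer zone ID
 *
 * Unmapping a buffer zone only clears bzone_id and keeps the data zone
 * mapping; unmapping a data zone clears the whole entry.
 */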
1891
1892/*
1893 * Set @nr_bits bits in @bitmap starting from @bit.
1894 * Return the number of bits changed from 0 to 1.
1895 */
1896static unsigned int dmz_set_bits(unsigned long *bitmap,
1897                                 unsigned int bit, unsigned int nr_bits)
1898{
1899        unsigned long *addr;
1900        unsigned int end = bit + nr_bits;
1901        unsigned int n = 0;
1902
1903        while (bit < end) {
1904                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
1905                    ((end - bit) >= BITS_PER_LONG)) {
1906                        /* Try to set the whole word at once */
1907                        addr = bitmap + BIT_WORD(bit);
1908                        if (*addr == 0) {
1909                                *addr = ULONG_MAX;
1910                                n += BITS_PER_LONG;
1911                                bit += BITS_PER_LONG;
1912                                continue;
1913                        }
1914                }
1915
1916                if (!test_and_set_bit(bit, bitmap))
1917                        n++;
1918                bit++;
1919        }
1920
1921        return n;
1922}
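
/*
 * Example (assuming BITS_PER_LONG == 64): dmz_set_bits(bitmap, 64, 130)
 * covers bits 64..193. Bits 64..127 and 128..191 can each be set with a
 * single word store if the corresponding word is currently 0; bits 192
 * and 193, and any word that is only partially clear, fall back to
 * test_and_set_bit().
 */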
1923
1924/*
1925 * Get the bitmap block storing the bit for chunk_block in zone.
1926 */
1927static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
1928                                         struct dm_zone *zone,
1929                                         sector_t chunk_block)
1930{
1931        sector_t bitmap_block = 1 + zmd->nr_map_blocks +
1932                (sector_t)(dmz_id(zmd, zone) * zmd->zone_nr_bitmap_blocks) +
1933                (chunk_block >> DMZ_BLOCK_SHIFT_BITS);
1934
1935        return dmz_get_mblock(zmd, bitmap_block);
1936}
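
/*
 * Worked example of the bitmap block index computation (the zone size
 * used here is only an assumption for illustration): with 4KB metadata
 * blocks, DMZ_BLOCK_SIZE_BITS == 32768 and DMZ_BLOCK_SHIFT_BITS == 15.
 * For a device with 256MB zones, zone_nr_blocks == 65536 and
 * zone_nr_bitmap_blocks == 2, so the validity bit of block 40000 of
 * zone 10 is stored in metadata block
 * 1 + nr_map_blocks + 10 * 2 + (40000 >> 15) == 1 + nr_map_blocks + 21.
 */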
1937
1938/*
1939 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
1940 */
1941int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
1942                          struct dm_zone *to_zone)
1943{
1944        struct dmz_mblock *from_mblk, *to_mblk;
1945        sector_t chunk_block = 0;
1946
1947        /* Get both zones' bitmap blocks and copy the source into the destination */
1948        while (chunk_block < zmd->dev->zone_nr_blocks) {
1949                from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
1950                if (IS_ERR(from_mblk))
1951                        return PTR_ERR(from_mblk);
1952                to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
1953                if (IS_ERR(to_mblk)) {
1954                        dmz_release_mblock(zmd, from_mblk);
1955                        return PTR_ERR(to_mblk);
1956                }
1957
1958                memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
1959                dmz_dirty_mblock(zmd, to_mblk);
1960
1961                dmz_release_mblock(zmd, to_mblk);
1962                dmz_release_mblock(zmd, from_mblk);
1963
1964                chunk_block += DMZ_BLOCK_SIZE_BITS;
1965        }
1966
1967        to_zone->weight = from_zone->weight;
1968
1969        return 0;
1970}
1971
1972/*
1973 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
1974 * starting from chunk_block.
1975 */
1976int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
1977                           struct dm_zone *to_zone, sector_t chunk_block)
1978{
1979        unsigned int nr_blocks;
1980        int ret;
1981
1982        /* Validate in the destination zone all valid block regions of the source zone */
1983        while (chunk_block < zmd->dev->zone_nr_blocks) {
1984                /* Get a valid region from the source zone */
1985                ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
1986                if (ret <= 0)
1987                        return ret;
1988
1989                nr_blocks = ret;
1990                ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
1991                if (ret)
1992                        return ret;
1993
1994                chunk_block += nr_blocks;
1995        }
1996
1997        return 0;
1998}
1999
2000/*
2001 * Validate all the blocks in the range [chunk_block..chunk_block+nr_blocks-1].
2002 */
2003int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2004                        sector_t chunk_block, unsigned int nr_blocks)
2005{
2006        unsigned int count, bit, nr_bits;
2007        unsigned int zone_nr_blocks = zmd->dev->zone_nr_blocks;
2008        struct dmz_mblock *mblk;
2009        unsigned int n = 0;
2010
2011        dmz_dev_debug(zmd->dev, "=> VALIDATE zone %u, block %llu, %u blocks",
2012                      dmz_id(zmd, zone), (unsigned long long)chunk_block,
2013                      nr_blocks);
2014
2015        WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
2016
2017        while (nr_blocks) {
2018                /* Get bitmap block */
2019                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2020                if (IS_ERR(mblk))
2021                        return PTR_ERR(mblk);
2022
2023                /* Set bits */
2024                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2025                nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2026
2027                count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
2028                if (count) {
2029                        dmz_dirty_mblock(zmd, mblk);
2030                        n += count;
2031                }
2032                dmz_release_mblock(zmd, mblk);
2033
2034                nr_blocks -= nr_bits;
2035                chunk_block += nr_bits;
2036        }
2037
2038        if (likely(zone->weight + n <= zone_nr_blocks))
2039                zone->weight += n;
2040        else {
2041                dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be <= %u",
2042                             dmz_id(zmd, zone), zone->weight,
2043                             zone_nr_blocks - n);
2044                zone->weight = zone_nr_blocks;
2045        }
2046
2047        return 0;
2048}
2049
2050/*
2051 * Clear nr_bits bits in bitmap starting from bit.
2052 * Return the number of bits cleared.
2053 */
2054static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
2055{
2056        unsigned long *addr;
2057        int end = bit + nr_bits;
2058        int n = 0;
2059
2060        while (bit < end) {
2061                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2062                    ((end - bit) >= BITS_PER_LONG)) {
2063                        /* Try to clear whole word at once */
2064                        addr = bitmap + BIT_WORD(bit);
2065                        if (*addr == ULONG_MAX) {
2066                                *addr = 0;
2067                                n += BITS_PER_LONG;
2068                                bit += BITS_PER_LONG;
2069                                continue;
2070                        }
2071                }
2072
2073                if (test_and_clear_bit(bit, bitmap))
2074                        n++;
2075                bit++;
2076        }
2077
2078        return n;
2079}
2080
2081/*
2082 * Invalidate all the blocks in the range [chunk_block..chunk_block+nr_blocks-1].
2083 */
2084int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2085                          sector_t chunk_block, unsigned int nr_blocks)
2086{
2087        unsigned int count, bit, nr_bits;
2088        struct dmz_mblock *mblk;
2089        unsigned int n = 0;
2090
2091        dmz_dev_debug(zmd->dev, "=> INVALIDATE zone %u, block %llu, %u blocks",
2092                      dmz_id(zmd, zone), (u64)chunk_block, nr_blocks);
2093
2094        WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
2095
2096        while (nr_blocks) {
2097                /* Get bitmap block */
2098                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2099                if (IS_ERR(mblk))
2100                        return PTR_ERR(mblk);
2101
2102                /* Clear bits */
2103                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2104                nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2105
2106                count = dmz_clear_bits((unsigned long *)mblk->data,
2107                                       bit, nr_bits);
2108                if (count) {
2109                        dmz_dirty_mblock(zmd, mblk);
2110                        n += count;
2111                }
2112                dmz_release_mblock(zmd, mblk);
2113
2114                nr_blocks -= nr_bits;
2115                chunk_block += nr_bits;
2116        }
2117
2118        if (zone->weight >= n)
2119                zone->weight -= n;
2120        else {
2121                dmz_dev_warn(zmd->dev, "Zone %u: weight %u should be >= %u",
2122                             dmz_id(zmd, zone), zone->weight, n);
2123                zone->weight = 0;
2124        }
2125
2126        return 0;
2127}
2128
2129/*
2130 * Get a block bit value.
2131 */
2132static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2133                          sector_t chunk_block)
2134{
2135        struct dmz_mblock *mblk;
2136        int ret;
2137
2138        WARN_ON(chunk_block >= zmd->dev->zone_nr_blocks);
2139
2140        /* Get bitmap block */
2141        mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2142        if (IS_ERR(mblk))
2143                return PTR_ERR(mblk);
2144
2145        /* Get offset */
2146        ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
2147                       (unsigned long *) mblk->data) != 0;
2148
2149        dmz_release_mblock(zmd, mblk);
2150
2151        return ret;
2152}
2153
2154/*
2155 * Return the number of blocks from chunk_block to the first block with a bit
2156 * value specified by set. Search at most nr_blocks blocks from chunk_block.
2157 */
2158static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2159                                 sector_t chunk_block, unsigned int nr_blocks,
2160                                 int set)
2161{
2162        struct dmz_mblock *mblk;
2163        unsigned int bit, set_bit, nr_bits;
2164        unsigned long *bitmap;
2165        int n = 0;
2166
2167        WARN_ON(chunk_block + nr_blocks > zmd->dev->zone_nr_blocks);
2168
2169        while (nr_blocks) {
2170                /* Get bitmap block */
2171                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2172                if (IS_ERR(mblk))
2173                        return PTR_ERR(mblk);
2174
2175                /* Get offset */
2176                bitmap = (unsigned long *) mblk->data;
2177                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2178                nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2179                if (set)
2180                        set_bit = find_next_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
2181                else
2182                        set_bit = find_next_zero_bit(bitmap, DMZ_BLOCK_SIZE_BITS, bit);
2183                dmz_release_mblock(zmd, mblk);
2184
2185                n += set_bit - bit;
2186                if (set_bit < DMZ_BLOCK_SIZE_BITS)
2187                        break;
2188
2189                nr_blocks -= nr_bits;
2190                chunk_block += nr_bits;
2191        }
2192
2193        return n;
2194}
2195
2196/*
2197 * Test if chunk_block is valid. If it is, the number of consecutive
2198 * valid blocks from chunk_block will be returned.
2199 */
2200int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
2201                    sector_t chunk_block)
2202{
2203        int valid;
2204
2205        valid = dmz_test_block(zmd, zone, chunk_block);
2206        if (valid <= 0)
2207                return valid;
2208
2209        /* The block is valid: count the consecutive valid blocks from this block */
2210        return dmz_to_next_set_block(zmd, zone, chunk_block,
2211                                     zmd->dev->zone_nr_blocks - chunk_block, 0);
2212}
2213
2214/*
2215 * Find the first valid block from @chunk_block in @zone.
2216 * If such a block is found, its block number is returned through
2217 * @chunk_block and the number of consecutive valid blocks starting
2218 * at that block is returned.
2219 */
2220int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2221                          sector_t *chunk_block)
2222{
2223        sector_t start_block = *chunk_block;
2224        int ret;
2225
2226        ret = dmz_to_next_set_block(zmd, zone, start_block,
2227                                    zmd->dev->zone_nr_blocks - start_block, 1);
2228        if (ret < 0)
2229                return ret;
2230
2231        start_block += ret;
2232        *chunk_block = start_block;
2233
2234        return dmz_to_next_set_block(zmd, zone, start_block,
2235                                     zmd->dev->zone_nr_blocks - start_block, 0);
2236}
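
/*
 * Example (hypothetical bitmap contents): if the validity bits of a zone
 * are 0 0 1 1 1 0 ... starting at block 0 and the caller passes in
 * *chunk_block == 0, the first dmz_to_next_set_block() call (set == 1)
 * skips the two invalid blocks so that *chunk_block becomes 2, and the
 * second call (set == 0) returns 3, the length of the valid extent.
 */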
2237
2238/*
2239 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
2240 */
2241static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
2242{
2243        unsigned long *addr;
2244        int end = bit + nr_bits;
2245        int n = 0;
2246
2247        while (bit < end) {
2248                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2249                    ((end - bit) >= BITS_PER_LONG)) {
2250                        addr = (unsigned long *)bitmap + BIT_WORD(bit);
2251                        if (*addr == ULONG_MAX) {
2252                                n += BITS_PER_LONG;
2253                                bit += BITS_PER_LONG;
2254                                continue;
2255                        }
2256                }
2257
2258                if (test_bit(bit, bitmap))
2259                        n++;
2260                bit++;
2261        }
2262
2263        return n;
2264}
2265
2266/*
2267 * Get a zone's weight, that is, the number of valid blocks in the zone.
2268 */
2269static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
2270{
2271        struct dmz_mblock *mblk;
2272        sector_t chunk_block = 0;
2273        unsigned int bit, nr_bits;
2274        unsigned int nr_blocks = zmd->dev->zone_nr_blocks;
2275        void *bitmap;
2276        int n = 0;
2277
2278        while (nr_blocks) {
2279                /* Get bitmap block */
2280                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2281                if (IS_ERR(mblk)) {
2282                        n = 0;
2283                        break;
2284                }
2285
2286                /* Count bits in this block */
2287                bitmap = mblk->data;
2288                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2289                nr_bits = min(nr_blocks, DMZ_BLOCK_SIZE_BITS - bit);
2290                n += dmz_count_bits(bitmap, bit, nr_bits);
2291
2292                dmz_release_mblock(zmd, mblk);
2293
2294                nr_blocks -= nr_bits;
2295                chunk_block += nr_bits;
2296        }
2297
2298        zone->weight = n;
2299}
2300
2301/*
2302 * Cleanup the zoned metadata resources.
2303 */
2304static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
2305{
2306        struct rb_root *root;
2307        struct dmz_mblock *mblk, *next;
2308        int i;
2309
2310        /* Release zone mapping resources */
2311        if (zmd->map_mblk) {
2312                for (i = 0; i < zmd->nr_map_blocks; i++)
2313                        dmz_release_mblock(zmd, zmd->map_mblk[i]);
2314                kfree(zmd->map_mblk);
2315                zmd->map_mblk = NULL;
2316        }
2317
2318        /* Release super blocks */
2319        for (i = 0; i < 2; i++) {
2320                if (zmd->sb[i].mblk) {
2321                        dmz_free_mblock(zmd, zmd->sb[i].mblk);
2322                        zmd->sb[i].mblk = NULL;
2323                }
2324        }
2325
2326        /* Free cached blocks */
2327        while (!list_empty(&zmd->mblk_dirty_list)) {
2328                mblk = list_first_entry(&zmd->mblk_dirty_list,
2329                                        struct dmz_mblock, link);
2330                dmz_dev_warn(zmd->dev, "mblock %llu still in dirty list (ref %u)",
2331                             (u64)mblk->no, mblk->ref);
2332                list_del_init(&mblk->link);
2333                rb_erase(&mblk->node, &zmd->mblk_rbtree);
2334                dmz_free_mblock(zmd, mblk);
2335        }
2336
2337        while (!list_empty(&zmd->mblk_lru_list)) {
2338                mblk = list_first_entry(&zmd->mblk_lru_list,
2339                                        struct dmz_mblock, link);
2340                list_del_init(&mblk->link);
2341                rb_erase(&mblk->node, &zmd->mblk_rbtree);
2342                dmz_free_mblock(zmd, mblk);
2343        }
2344
2345        /* Sanity checks: the mblock rbtree should now be empty */
2346        root = &zmd->mblk_rbtree;
2347        rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
2348                dmz_dev_warn(zmd->dev, "mblock %llu ref %u still in rbtree",
2349                             (u64)mblk->no, mblk->ref);
2350                mblk->ref = 0;
2351                dmz_free_mblock(zmd, mblk);
2352        }
2353
2354        /* Free the zone descriptors */
2355        dmz_drop_zones(zmd);
2356
2357        mutex_destroy(&zmd->mblk_flush_lock);
2358        mutex_destroy(&zmd->map_lock);
2359}
2360
2361/*
2362 * Initialize the zoned metadata.
2363 */
2364int dmz_ctr_metadata(struct dmz_dev *dev, struct dmz_metadata **metadata)
2365{
2366        struct dmz_metadata *zmd;
2367        unsigned int i, zid;
2368        struct dm_zone *zone;
2369        int ret;
2370
2371        zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
2372        if (!zmd)
2373                return -ENOMEM;
2374
2375        zmd->dev = dev;
2376        zmd->mblk_rbtree = RB_ROOT;
2377        init_rwsem(&zmd->mblk_sem);
2378        mutex_init(&zmd->mblk_flush_lock);
2379        spin_lock_init(&zmd->mblk_lock);
2380        INIT_LIST_HEAD(&zmd->mblk_lru_list);
2381        INIT_LIST_HEAD(&zmd->mblk_dirty_list);
2382
2383        mutex_init(&zmd->map_lock);
2384        atomic_set(&zmd->unmap_nr_rnd, 0);
2385        INIT_LIST_HEAD(&zmd->unmap_rnd_list);
2386        INIT_LIST_HEAD(&zmd->map_rnd_list);
2387
2388        atomic_set(&zmd->unmap_nr_seq, 0);
2389        INIT_LIST_HEAD(&zmd->unmap_seq_list);
2390        INIT_LIST_HEAD(&zmd->map_seq_list);
2391
2392        atomic_set(&zmd->nr_reserved_seq_zones, 0);
2393        INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
2394
2395        init_waitqueue_head(&zmd->free_wq);
2396
2397        /* Initialize zone descriptors */
2398        ret = dmz_init_zones(zmd);
2399        if (ret)
2400                goto err;
2401
2402        /* Get super block */
2403        ret = dmz_load_sb(zmd);
2404        if (ret)
2405                goto err;
2406
2407        /* Set metadata zones starting from sb_zone */
2408        zid = dmz_id(zmd, zmd->sb_zone);
2409        for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
2410                zone = dmz_get(zmd, zid + i);
2411                if (!dmz_is_rnd(zone))
2412                        goto err;
2413                set_bit(DMZ_META, &zone->flags);
2414        }
2415
2416        /* Load mapping table */
2417        ret = dmz_load_mapping(zmd);
2418        if (ret)
2419                goto err;
2420
2421        /*
2422         * Cache size boundaries: when idle, keep at least the 2 super blocks,
2423         * the chunk mapping blocks and enough blocks to cache the bitmap
2424         * blocks of up to 16 zones (min_nr_mblks). When busy, allow the cache
2425         * to grow by up to 512 additional metadata blocks (max_nr_mblks).
2426         */
2427        zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
2428        zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
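        /*
         * Sizing example (the counts used here are hypothetical): with
         * nr_map_blocks == 8 and zone_nr_bitmap_blocks == 2, this gives
         * min_nr_mblks = 2 + 8 + 2 * 16 = 42 and
         * max_nr_mblks = 42 + 512 = 554.
         */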
2429        zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
2430        zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
2431        zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
2432
2433        /* Metadata cache shrinker */
2434        ret = register_shrinker(&zmd->mblk_shrinker);
2435        if (ret) {
2436                dmz_dev_err(dev, "Register metadata cache shrinker failed");
2437                goto err;
2438        }
2439
2440        dmz_dev_info(dev, "Host-%s zoned block device",
2441                     bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
2442                     "aware" : "managed");
2443        dmz_dev_info(dev, "  %llu 512-byte logical sectors",
2444                     (u64)dev->capacity);
2445        dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
2446                     dev->nr_zones, (u64)dev->zone_nr_sectors);
2447        dmz_dev_info(dev, "  %u metadata zones",
2448                     zmd->nr_meta_zones * 2);
2449        dmz_dev_info(dev, "  %u data zones for %u chunks",
2450                     zmd->nr_data_zones, zmd->nr_chunks);
2451        dmz_dev_info(dev, "    %u random zones (%u unmapped)",
2452                     zmd->nr_rnd, atomic_read(&zmd->unmap_nr_rnd));
2453        dmz_dev_info(dev, "    %u sequential zones (%u unmapped)",
2454                     zmd->nr_seq, atomic_read(&zmd->unmap_nr_seq));
2455        dmz_dev_info(dev, "  %u reserved sequential data zones",
2456                     zmd->nr_reserved_seq);
2457
2458        dmz_dev_debug(dev, "Format:");
2459        dmz_dev_debug(dev, "%u metadata blocks per set (%u max cache)",
2460                      zmd->nr_meta_blocks, zmd->max_nr_mblks);
2461        dmz_dev_debug(dev, "  %u data zone mapping blocks",
2462                      zmd->nr_map_blocks);
2463        dmz_dev_debug(dev, "  %u bitmap blocks",
2464                      zmd->nr_bitmap_blocks);
2465
2466        *metadata = zmd;
2467
2468        return 0;
2469err:
2470        dmz_cleanup_metadata(zmd);
2471        kfree(zmd);
2472        *metadata = NULL;
2473
2474        return ret;
2475}
2476
2477/*
2478 * Cleanup the zoned metadata resources.
2479 */
2480void dmz_dtr_metadata(struct dmz_metadata *zmd)
2481{
2482        unregister_shrinker(&zmd->mblk_shrinker);
2483        dmz_cleanup_metadata(zmd);
2484        kfree(zmd);
2485}
2486
2487/*
2488 * Check zone information on resume.
2489 */
2490int dmz_resume_metadata(struct dmz_metadata *zmd)
2491{
2492        struct dmz_dev *dev = zmd->dev;
2493        struct dm_zone *zone;
2494        sector_t wp_block;
2495        unsigned int i;
2496        int ret;
2497
2498        /* Check zones */
2499        for (i = 0; i < dev->nr_zones; i++) {
2500                zone = dmz_get(zmd, i);
2501                if (!zone) {
2502                        dmz_dev_err(dev, "Unable to get zone %u", i);
2503                        return -EIO;
2504                }
2505
2506                wp_block = zone->wp_block;
2507
2508                ret = dmz_update_zone(zmd, zone);
2509                if (ret) {
2510                        dmz_dev_err(dev, "Broken zone %u", i);
2511                        return ret;
2512                }
2513
2514                if (dmz_is_offline(zone)) {
2515                        dmz_dev_warn(dev, "Zone %u is offline", i);
2516                        continue;
2517                }
2518
2519                /* Check write pointer */
2520                if (!dmz_is_seq(zone))
2521                        zone->wp_block = 0;
2522                else if (zone->wp_block != wp_block) {
2523                        dmz_dev_err(dev, "Zone %u: Invalid wp (%llu / %llu)",
2524                                    i, (u64)zone->wp_block, (u64)wp_block);
2525                        zone->wp_block = wp_block;
2526                        dmz_invalidate_blocks(zmd, zone, zone->wp_block,
2527                                              dev->zone_nr_blocks - zone->wp_block);
2528                }
2529        }
2530
2531        return 0;
2532}
2533