linux/drivers/md/dm-zoned-metadata.c
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
 *
 * This file is released under the GPL.
 */

#include "dm-zoned.h"

#include <linux/module.h>
#include <linux/crc32.h>
#include <linux/sched/mm.h>

#define DM_MSG_PREFIX           "zoned metadata"

/*
 * Metadata version.
 */
#define DMZ_META_VER    2

/*
 * On-disk super block magic.
 */
#define DMZ_MAGIC       ((((unsigned int)('D')) << 24) | \
                         (((unsigned int)('Z')) << 16) | \
                         (((unsigned int)('B')) <<  8) | \
                         ((unsigned int)('D')))

/*
 * On-disk super block.
 * The super block uses only 512 B but occupies a full 4 KB block on disk.
 * This block is followed on disk by the mapping table of chunks to zones
 * and the bitmap blocks indicating zone block validity.
 * The overall resulting metadata format is:
 *    (1) Super block (1 block)
 *    (2) Chunk mapping table (nr_map_blocks)
 *    (3) Bitmap blocks (nr_bitmap_blocks)
 * All metadata blocks are stored in conventional zones, starting from
 * the first conventional zone found on disk.
 */
struct dmz_super {
        /* Magic number */
        __le32          magic;                  /*   4 */

        /* Metadata version number */
        __le32          version;                /*   8 */

        /* Generation number */
        __le64          gen;                    /*  16 */

        /* This block number */
        __le64          sb_block;               /*  24 */

        /* The number of metadata blocks, including this super block */
        __le32          nr_meta_blocks;         /*  28 */

        /* The number of sequential zones reserved for reclaim */
        __le32          nr_reserved_seq;        /*  32 */

        /* The number of entries in the mapping table */
        __le32          nr_chunks;              /*  36 */

        /* The number of blocks used for the chunk mapping table */
        __le32          nr_map_blocks;          /*  40 */

        /* The number of blocks used for the block bitmaps */
        __le32          nr_bitmap_blocks;       /*  44 */

        /* Checksum */
        __le32          crc;                    /*  48 */

        /* DM-Zoned label */
        u8              dmz_label[32];          /*  80 */

        /* DM-Zoned UUID */
        u8              dmz_uuid[16];           /*  96 */

        /* Device UUID */
        u8              dev_uuid[16];           /* 112 */

        /* Padding to full 512B sector */
        u8              reserved[400];          /* 512 */
};

/*
 * Chunk mapping entry: entries are indexed by chunk number
 * and give the zone ID (dzone_id) mapping the chunk on disk.
 * This zone may be sequential or random. If it is a sequential
 * zone, a second zone (bzone_id) used as a write buffer may
 * also be specified. This second zone will always be a randomly
 * writeable zone.
 */
struct dmz_map {
        __le32                  dzone_id;
        __le32                  bzone_id;
};

/*
 * Chunk mapping table metadata: 512 8-byte entries per 4 KB block.
 */
#define DMZ_MAP_ENTRIES         (DMZ_BLOCK_SIZE / sizeof(struct dmz_map))
#define DMZ_MAP_ENTRIES_SHIFT   (ilog2(DMZ_MAP_ENTRIES))
#define DMZ_MAP_ENTRIES_MASK    (DMZ_MAP_ENTRIES - 1)
#define DMZ_MAP_UNMAPPED        UINT_MAX
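/*
 * With 4 KB metadata blocks and 8-byte entries, DMZ_MAP_ENTRIES is 512:
 * the mapping entry for chunk N lives in the map block at index
 * (N >> DMZ_MAP_ENTRIES_SHIFT), at entry offset (N & DMZ_MAP_ENTRIES_MASK).
 */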

/*
 * Metadata block descriptor (for cached metadata blocks).
 */
struct dmz_mblock {
        struct rb_node          node;
        struct list_head        link;
        sector_t                no;
        unsigned int            ref;
        unsigned long           state;
        struct page             *page;
        void                    *data;
};

/*
 * Metadata block state flags.
 */
enum {
        DMZ_META_DIRTY,
        DMZ_META_READING,
        DMZ_META_WRITING,
        DMZ_META_ERROR,
};

/*
 * Super block information (one per metadata set).
 */
struct dmz_sb {
        sector_t                block;
        struct dmz_dev          *dev;
        struct dmz_mblock       *mblk;
        struct dmz_super        *sb;
        struct dm_zone          *zone;
};

/*
 * In-memory metadata.
 */
struct dmz_metadata {
        struct dmz_dev          *dev;
        unsigned int            nr_devs;

        char                    devname[BDEVNAME_SIZE];
        char                    label[BDEVNAME_SIZE];
        uuid_t                  uuid;

        sector_t                zone_bitmap_size;
        unsigned int            zone_nr_bitmap_blocks;
        unsigned int            zone_bits_per_mblk;

        sector_t                zone_nr_blocks;
        sector_t                zone_nr_blocks_shift;

        sector_t                zone_nr_sectors;
        sector_t                zone_nr_sectors_shift;

        unsigned int            nr_bitmap_blocks;
        unsigned int            nr_map_blocks;

        unsigned int            nr_zones;
        unsigned int            nr_useable_zones;
        unsigned int            nr_meta_blocks;
        unsigned int            nr_meta_zones;
        unsigned int            nr_data_zones;
        unsigned int            nr_cache_zones;
        unsigned int            nr_rnd_zones;
        unsigned int            nr_reserved_seq;
        unsigned int            nr_chunks;

        /* Zone information array */
        struct xarray           zones;

        struct dmz_sb           sb[2];
        unsigned int            mblk_primary;
        unsigned int            sb_version;
        u64                     sb_gen;
        unsigned int            min_nr_mblks;
        unsigned int            max_nr_mblks;
        atomic_t                nr_mblks;
        struct rw_semaphore     mblk_sem;
        struct mutex            mblk_flush_lock;
        spinlock_t              mblk_lock;
        struct rb_root          mblk_rbtree;
        struct list_head        mblk_lru_list;
        struct list_head        mblk_dirty_list;
        struct shrinker         mblk_shrinker;

        /* Zone allocation management */
        struct mutex            map_lock;
        struct dmz_mblock       **map_mblk;

        unsigned int            nr_cache;
        atomic_t                unmap_nr_cache;
        struct list_head        unmap_cache_list;
        struct list_head        map_cache_list;

        atomic_t                nr_reserved_seq_zones;
        struct list_head        reserved_seq_zones_list;

        wait_queue_head_t       free_wq;
};

#define dmz_zmd_info(zmd, format, args...)      \
        DMINFO("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_err(zmd, format, args...)       \
        DMERR("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_warn(zmd, format, args...)      \
        DMWARN("(%s): " format, (zmd)->label, ## args)

#define dmz_zmd_debug(zmd, format, args...)     \
        DMDEBUG("(%s): " format, (zmd)->label, ## args)

/*
 * Various accessors
 */
static unsigned int dmz_dev_zone_id(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        if (WARN_ON(!zone))
                return 0;

        return zone->id - zone->dev->zone_offset;
}

sector_t dmz_start_sect(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

        return (sector_t)zone_id << zmd->zone_nr_sectors_shift;
}

sector_t dmz_start_block(struct dmz_metadata *zmd, struct dm_zone *zone)
{
        unsigned int zone_id = dmz_dev_zone_id(zmd, zone);

        return (sector_t)zone_id << zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_blocks(struct dmz_metadata *zmd)
{
        return zmd->zone_nr_blocks;
}

unsigned int dmz_zone_nr_blocks_shift(struct dmz_metadata *zmd)
{
        return zmd->zone_nr_blocks_shift;
}

unsigned int dmz_zone_nr_sectors(struct dmz_metadata *zmd)
{
        return zmd->zone_nr_sectors;
}

unsigned int dmz_zone_nr_sectors_shift(struct dmz_metadata *zmd)
{
        return zmd->zone_nr_sectors_shift;
}

unsigned int dmz_nr_zones(struct dmz_metadata *zmd)
{
        return zmd->nr_zones;
}

unsigned int dmz_nr_chunks(struct dmz_metadata *zmd)
{
        return zmd->nr_chunks;
}

unsigned int dmz_nr_rnd_zones(struct dmz_metadata *zmd, int idx)
{
        return zmd->dev[idx].nr_rnd;
}

unsigned int dmz_nr_unmap_rnd_zones(struct dmz_metadata *zmd, int idx)
{
        return atomic_read(&zmd->dev[idx].unmap_nr_rnd);
}

unsigned int dmz_nr_cache_zones(struct dmz_metadata *zmd)
{
        return zmd->nr_cache;
}

unsigned int dmz_nr_unmap_cache_zones(struct dmz_metadata *zmd)
{
        return atomic_read(&zmd->unmap_nr_cache);
}

unsigned int dmz_nr_seq_zones(struct dmz_metadata *zmd, int idx)
{
        return zmd->dev[idx].nr_seq;
}

unsigned int dmz_nr_unmap_seq_zones(struct dmz_metadata *zmd, int idx)
{
        return atomic_read(&zmd->dev[idx].unmap_nr_seq);
}

static struct dm_zone *dmz_get(struct dmz_metadata *zmd, unsigned int zone_id)
{
        return xa_load(&zmd->zones, zone_id);
}

static struct dm_zone *dmz_insert(struct dmz_metadata *zmd,
                                  unsigned int zone_id, struct dmz_dev *dev)
{
        struct dm_zone *zone = kzalloc(sizeof(struct dm_zone), GFP_KERNEL);

        if (!zone)
                return ERR_PTR(-ENOMEM);

        if (xa_insert(&zmd->zones, zone_id, zone, GFP_KERNEL)) {
                kfree(zone);
                return ERR_PTR(-EBUSY);
        }

        INIT_LIST_HEAD(&zone->link);
        atomic_set(&zone->refcount, 0);
        zone->id = zone_id;
        zone->chunk = DMZ_MAP_UNMAPPED;
        zone->dev = dev;

        return zone;
}

const char *dmz_metadata_label(struct dmz_metadata *zmd)
{
        return (const char *)zmd->label;
}

bool dmz_check_dev(struct dmz_metadata *zmd)
{
        unsigned int i;

        for (i = 0; i < zmd->nr_devs; i++) {
                if (!dmz_check_bdev(&zmd->dev[i]))
                        return false;
        }
        return true;
}

bool dmz_dev_is_dying(struct dmz_metadata *zmd)
{
        unsigned int i;

        for (i = 0; i < zmd->nr_devs; i++) {
                if (dmz_bdev_is_dying(&zmd->dev[i]))
                        return true;
        }
        return false;
}

/*
 * Lock/unlock mapping table.
 * The map lock also protects all the zone lists.
 */
void dmz_lock_map(struct dmz_metadata *zmd)
{
        mutex_lock(&zmd->map_lock);
}

void dmz_unlock_map(struct dmz_metadata *zmd)
{
        mutex_unlock(&zmd->map_lock);
}

/*
 * Lock/unlock metadata access. This is a "read" lock on a semaphore
 * that prevents metadata flush from running while metadata are being
 * modified. The actual metadata write mutual exclusion is achieved with
 * the map lock and zone state management (active and reclaim state are
 * mutually exclusive).
 */
void dmz_lock_metadata(struct dmz_metadata *zmd)
{
        down_read(&zmd->mblk_sem);
}

void dmz_unlock_metadata(struct dmz_metadata *zmd)
{
        up_read(&zmd->mblk_sem);
}

/*
 * Lock/unlock flush: prevent concurrent executions of dmz_flush_metadata,
 * as well as metadata modification by reclaim while a flush is being
 * executed.
 */
void dmz_lock_flush(struct dmz_metadata *zmd)
{
        mutex_lock(&zmd->mblk_flush_lock);
}

void dmz_unlock_flush(struct dmz_metadata *zmd)
{
        mutex_unlock(&zmd->mblk_flush_lock);
}

/*
 * Allocate a metadata block.
 */
static struct dmz_mblock *dmz_alloc_mblock(struct dmz_metadata *zmd,
                                           sector_t mblk_no)
{
        struct dmz_mblock *mblk = NULL;

        /* See if we can reuse cached blocks */
        if (zmd->max_nr_mblks && atomic_read(&zmd->nr_mblks) > zmd->max_nr_mblks) {
                spin_lock(&zmd->mblk_lock);
                mblk = list_first_entry_or_null(&zmd->mblk_lru_list,
                                                struct dmz_mblock, link);
                if (mblk) {
                        list_del_init(&mblk->link);
                        rb_erase(&mblk->node, &zmd->mblk_rbtree);
                        mblk->no = mblk_no;
                }
                spin_unlock(&zmd->mblk_lock);
                if (mblk)
                        return mblk;
        }

        /* Allocate a new block */
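        /*
         * GFP_NOIO: this allocation can happen while servicing I/O,
         * so it must not recurse into memory reclaim that could
         * itself issue I/O to this device.
         */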
        mblk = kmalloc(sizeof(struct dmz_mblock), GFP_NOIO);
        if (!mblk)
                return NULL;

        mblk->page = alloc_page(GFP_NOIO);
        if (!mblk->page) {
                kfree(mblk);
                return NULL;
        }

        RB_CLEAR_NODE(&mblk->node);
        INIT_LIST_HEAD(&mblk->link);
        mblk->ref = 0;
        mblk->state = 0;
        mblk->no = mblk_no;
        mblk->data = page_address(mblk->page);

        atomic_inc(&zmd->nr_mblks);

        return mblk;
}

/*
 * Free a metadata block.
 */
static void dmz_free_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
        __free_pages(mblk->page, 0);
        kfree(mblk);

        atomic_dec(&zmd->nr_mblks);
}

/*
 * Insert a metadata block in the rbtree.
 */
static void dmz_insert_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
        struct rb_root *root = &zmd->mblk_rbtree;
        struct rb_node **new = &(root->rb_node), *parent = NULL;
        struct dmz_mblock *b;

        /* Figure out where to put the new node */
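        /*
         * Note: the comparison below keeps the tree ordered by
         * descending block number; dmz_get_mblock_fast() descends
         * the tree with the same convention, so lookups and inserts
         * stay consistent.
         */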
        while (*new) {
                b = container_of(*new, struct dmz_mblock, node);
                parent = *new;
                new = (b->no < mblk->no) ? &((*new)->rb_left) : &((*new)->rb_right);
        }

        /* Add new node and rebalance tree */
        rb_link_node(&mblk->node, parent, new);
        rb_insert_color(&mblk->node, root);
}

/*
 * Look up a metadata block in the rbtree. If the block is found, increment
 * its reference count.
 */
static struct dmz_mblock *dmz_get_mblock_fast(struct dmz_metadata *zmd,
                                              sector_t mblk_no)
{
        struct rb_root *root = &zmd->mblk_rbtree;
        struct rb_node *node = root->rb_node;
        struct dmz_mblock *mblk;

        while (node) {
                mblk = container_of(node, struct dmz_mblock, node);
                if (mblk->no == mblk_no) {
                        /*
                         * If this is the first reference to the block,
                         * remove it from the LRU list.
                         */
                        mblk->ref++;
                        if (mblk->ref == 1 &&
                            !test_bit(DMZ_META_DIRTY, &mblk->state))
                                list_del_init(&mblk->link);
                        return mblk;
                }
                node = (mblk->no < mblk_no) ? node->rb_left : node->rb_right;
        }

        return NULL;
}

/*
 * Metadata block BIO end callback.
 */
static void dmz_mblock_bio_end_io(struct bio *bio)
{
        struct dmz_mblock *mblk = bio->bi_private;
        int flag;

        if (bio->bi_status)
                set_bit(DMZ_META_ERROR, &mblk->state);

        if (bio_op(bio) == REQ_OP_WRITE)
                flag = DMZ_META_WRITING;
        else
                flag = DMZ_META_READING;

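        /*
         * clear_bit_unlock() provides release semantics; the barrier
         * and wake_up_bit() pair with the wait_on_bit_io() calls that
         * wait for this block's read or write to complete.
         */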
        clear_bit_unlock(flag, &mblk->state);
        smp_mb__after_atomic();
        wake_up_bit(&mblk->state, flag);

        bio_put(bio);
}

/*
 * Read an uncached metadata block from disk and add it to the cache.
 */
static struct dmz_mblock *dmz_get_mblock_slow(struct dmz_metadata *zmd,
                                              sector_t mblk_no)
{
        struct dmz_mblock *mblk, *m;
        sector_t block = zmd->sb[zmd->mblk_primary].block + mblk_no;
        struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;
        struct bio *bio;

        if (dmz_bdev_is_dying(dev))
                return ERR_PTR(-EIO);

        /* Get a new block and a BIO to read it */
        mblk = dmz_alloc_mblock(zmd, mblk_no);
        if (!mblk)
                return ERR_PTR(-ENOMEM);

        bio = bio_alloc(dev->bdev, 1, REQ_OP_READ | REQ_META | REQ_PRIO,
                        GFP_NOIO);

        spin_lock(&zmd->mblk_lock);

        /*
         * Make sure that another context did not start reading
         * the block already.
         */
        m = dmz_get_mblock_fast(zmd, mblk_no);
        if (m) {
                spin_unlock(&zmd->mblk_lock);
                dmz_free_mblock(zmd, mblk);
                bio_put(bio);
                return m;
        }

        mblk->ref++;
        set_bit(DMZ_META_READING, &mblk->state);
        dmz_insert_mblock(zmd, mblk);

        spin_unlock(&zmd->mblk_lock);

        /* Submit read BIO */
        bio->bi_iter.bi_sector = dmz_blk2sect(block);
        bio->bi_private = mblk;
        bio->bi_end_io = dmz_mblock_bio_end_io;
        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
        submit_bio(bio);

        return mblk;
}

/*
 * Free unused metadata blocks from the cache LRU list.
 */
static unsigned long dmz_shrink_mblock_cache(struct dmz_metadata *zmd,
                                             unsigned long limit)
{
        struct dmz_mblock *mblk;
        unsigned long count = 0;

        if (!zmd->max_nr_mblks)
                return 0;

        while (!list_empty(&zmd->mblk_lru_list) &&
               atomic_read(&zmd->nr_mblks) > zmd->min_nr_mblks &&
               count < limit) {
                mblk = list_first_entry(&zmd->mblk_lru_list,
                                        struct dmz_mblock, link);
                list_del_init(&mblk->link);
                rb_erase(&mblk->node, &zmd->mblk_rbtree);
                dmz_free_mblock(zmd, mblk);
                count++;
        }

        return count;
}

/*
 * For mblock shrinker: get the total number of cached metadata blocks.
 * Only unused blocks on the LRU list are actually freed by the scan.
 */
static unsigned long dmz_mblock_shrinker_count(struct shrinker *shrink,
                                               struct shrink_control *sc)
{
        struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);

        return atomic_read(&zmd->nr_mblks);
}

/*
 * For mblock shrinker: scan unused metadata blocks and shrink the cache.
 */
static unsigned long dmz_mblock_shrinker_scan(struct shrinker *shrink,
                                              struct shrink_control *sc)
{
        struct dmz_metadata *zmd = container_of(shrink, struct dmz_metadata, mblk_shrinker);
        unsigned long count;

        spin_lock(&zmd->mblk_lock);
        count = dmz_shrink_mblock_cache(zmd, sc->nr_to_scan);
        spin_unlock(&zmd->mblk_lock);

        return count ? count : SHRINK_STOP;
}

/*
 * Release a metadata block.
 */
static void dmz_release_mblock(struct dmz_metadata *zmd,
                               struct dmz_mblock *mblk)
{
        if (!mblk)
                return;

        spin_lock(&zmd->mblk_lock);

        mblk->ref--;
        if (mblk->ref == 0) {
                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
                        rb_erase(&mblk->node, &zmd->mblk_rbtree);
                        dmz_free_mblock(zmd, mblk);
                } else if (!test_bit(DMZ_META_DIRTY, &mblk->state)) {
                        list_add_tail(&mblk->link, &zmd->mblk_lru_list);
                        dmz_shrink_mblock_cache(zmd, 1);
                }
        }

        spin_unlock(&zmd->mblk_lock);
}

/*
 * Get a metadata block from the rbtree. If the block
 * is not present, read it from disk.
 */
static struct dmz_mblock *dmz_get_mblock(struct dmz_metadata *zmd,
                                         sector_t mblk_no)
{
        struct dmz_mblock *mblk;
        struct dmz_dev *dev = zmd->sb[zmd->mblk_primary].dev;

        /* Check rbtree */
        spin_lock(&zmd->mblk_lock);
        mblk = dmz_get_mblock_fast(zmd, mblk_no);
        spin_unlock(&zmd->mblk_lock);

        if (!mblk) {
                /* Cache miss: read the block from disk */
                mblk = dmz_get_mblock_slow(zmd, mblk_no);
                if (IS_ERR(mblk))
                        return mblk;
        }

        /* Wait for on-going read I/O and check for error */
        wait_on_bit_io(&mblk->state, DMZ_META_READING,
                       TASK_UNINTERRUPTIBLE);
        if (test_bit(DMZ_META_ERROR, &mblk->state)) {
                dmz_release_mblock(zmd, mblk);
                dmz_check_bdev(dev);
                return ERR_PTR(-EIO);
        }

        return mblk;
}

/*
 * Mark a metadata block dirty.
 */
static void dmz_dirty_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk)
{
        spin_lock(&zmd->mblk_lock);
        if (!test_and_set_bit(DMZ_META_DIRTY, &mblk->state))
                list_add_tail(&mblk->link, &zmd->mblk_dirty_list);
        spin_unlock(&zmd->mblk_lock);
}

/*
 * Issue a metadata block write BIO.
 */
static int dmz_write_mblock(struct dmz_metadata *zmd, struct dmz_mblock *mblk,
                            unsigned int set)
{
        struct dmz_dev *dev = zmd->sb[set].dev;
        sector_t block = zmd->sb[set].block + mblk->no;
        struct bio *bio;

        if (dmz_bdev_is_dying(dev))
                return -EIO;

        bio = bio_alloc(dev->bdev, 1, REQ_OP_WRITE | REQ_META | REQ_PRIO,
                        GFP_NOIO);

        set_bit(DMZ_META_WRITING, &mblk->state);

        bio->bi_iter.bi_sector = dmz_blk2sect(block);
        bio->bi_private = mblk;
        bio->bi_end_io = dmz_mblock_bio_end_io;
        bio_add_page(bio, mblk->page, DMZ_BLOCK_SIZE, 0);
        submit_bio(bio);

        return 0;
}

/*
 * Read/write a metadata block synchronously.
 */
static int dmz_rdwr_block(struct dmz_dev *dev, int op,
                          sector_t block, struct page *page)
{
        struct bio *bio;
        int ret;

        if (WARN_ON(!dev))
                return -EIO;

        if (dmz_bdev_is_dying(dev))
                return -EIO;

        bio = bio_alloc(dev->bdev, 1, op | REQ_SYNC | REQ_META | REQ_PRIO,
                        GFP_NOIO);
        bio->bi_iter.bi_sector = dmz_blk2sect(block);
        bio_add_page(bio, page, DMZ_BLOCK_SIZE, 0);
        ret = submit_bio_wait(bio);
        bio_put(bio);

        if (ret)
                dmz_check_bdev(dev);
        return ret;
}

/*
 * Write the super block of the specified metadata set.
 */
static int dmz_write_sb(struct dmz_metadata *zmd, unsigned int set)
{
        struct dmz_mblock *mblk = zmd->sb[set].mblk;
        struct dmz_super *sb = zmd->sb[set].sb;
        struct dmz_dev *dev = zmd->sb[set].dev;
        sector_t sb_block;
        u64 sb_gen = zmd->sb_gen + 1;
        int ret;

        sb->magic = cpu_to_le32(DMZ_MAGIC);

        sb->version = cpu_to_le32(zmd->sb_version);
        if (zmd->sb_version > 1) {
                BUILD_BUG_ON(UUID_SIZE != 16);
                export_uuid(sb->dmz_uuid, &zmd->uuid);
                memcpy(sb->dmz_label, zmd->label, BDEVNAME_SIZE);
                export_uuid(sb->dev_uuid, &dev->uuid);
        }

        sb->gen = cpu_to_le64(sb_gen);

        /*
         * The metadata always references the absolute block address,
         * i.e. relative to the entire block range, not the per-device
         * block address.
         */
        sb_block = zmd->sb[set].zone->id << zmd->zone_nr_blocks_shift;
        sb->sb_block = cpu_to_le64(sb_block);
        sb->nr_meta_blocks = cpu_to_le32(zmd->nr_meta_blocks);
        sb->nr_reserved_seq = cpu_to_le32(zmd->nr_reserved_seq);
        sb->nr_chunks = cpu_to_le32(zmd->nr_chunks);

        sb->nr_map_blocks = cpu_to_le32(zmd->nr_map_blocks);
        sb->nr_bitmap_blocks = cpu_to_le32(zmd->nr_bitmap_blocks);

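        /*
         * The CRC is computed over the whole 4 KB block with the crc
         * field zeroed, seeded with the super block generation number;
         * dmz_check_sb() recomputes it the same way.
         */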
        sb->crc = 0;
        sb->crc = cpu_to_le32(crc32_le(sb_gen, (unsigned char *)sb, DMZ_BLOCK_SIZE));

        ret = dmz_rdwr_block(dev, REQ_OP_WRITE, zmd->sb[set].block,
                             mblk->page);
        if (ret == 0)
                ret = blkdev_issue_flush(dev->bdev);

        return ret;
}

/*
 * Write dirty metadata blocks to the specified set.
 */
static int dmz_write_dirty_mblocks(struct dmz_metadata *zmd,
                                   struct list_head *write_list,
                                   unsigned int set)
{
        struct dmz_mblock *mblk;
        struct dmz_dev *dev = zmd->sb[set].dev;
        struct blk_plug plug;
        int ret = 0, nr_mblks_submitted = 0;

        /* Issue writes */
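        /*
         * Plugging defers BIO dispatch so the block layer can batch
         * the per-block writes, merging adjacent ones into larger
         * requests before they reach the device.
         */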
        blk_start_plug(&plug);
        list_for_each_entry(mblk, write_list, link) {
                ret = dmz_write_mblock(zmd, mblk, set);
                if (ret)
                        break;
                nr_mblks_submitted++;
        }
        blk_finish_plug(&plug);

        /* Wait for completion */
        list_for_each_entry(mblk, write_list, link) {
                if (!nr_mblks_submitted)
                        break;
                wait_on_bit_io(&mblk->state, DMZ_META_WRITING,
                               TASK_UNINTERRUPTIBLE);
                if (test_bit(DMZ_META_ERROR, &mblk->state)) {
                        clear_bit(DMZ_META_ERROR, &mblk->state);
                        dmz_check_bdev(dev);
                        ret = -EIO;
                }
                nr_mblks_submitted--;
        }

        /* Flush drive cache (this will also sync data) */
        if (ret == 0)
                ret = blkdev_issue_flush(dev->bdev);

        return ret;
}

/*
 * Log dirty metadata blocks.
 */
static int dmz_log_dirty_mblocks(struct dmz_metadata *zmd,
                                 struct list_head *write_list)
{
        unsigned int log_set = zmd->mblk_primary ^ 0x1;
        int ret;

        /* Write dirty blocks to the log */
        ret = dmz_write_dirty_mblocks(zmd, write_list, log_set);
        if (ret)
                return ret;

        /*
         * No error so far: now validate the log by updating the
         * log index super block generation.
         */
        ret = dmz_write_sb(zmd, log_set);
        if (ret)
                return ret;

        return 0;
}

/*
 * Flush dirty metadata blocks.
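 *
 * Dirty blocks are first written to the secondary (log) set and the
 * log is validated by writing that set's super block; only then is
 * the primary set updated in place and its super block written with
 * the new generation. A crash at any point thus leaves at least one
 * complete, consistent metadata set on disk.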
 */
int dmz_flush_metadata(struct dmz_metadata *zmd)
{
        struct dmz_mblock *mblk;
        struct list_head write_list;
        struct dmz_dev *dev;
        int ret;

        if (WARN_ON(!zmd))
                return 0;

        INIT_LIST_HEAD(&write_list);

        /*
         * Make sure that metadata blocks are stable before logging: take
         * the write lock on the metadata semaphore to prevent target BIOs
         * from modifying metadata.
         */
        down_write(&zmd->mblk_sem);
        dev = zmd->sb[zmd->mblk_primary].dev;

        /*
         * This is called from the target flush work and reclaim work.
         * Concurrent execution is not allowed.
         */
        dmz_lock_flush(zmd);

        if (dmz_bdev_is_dying(dev)) {
                ret = -EIO;
                goto out;
        }

        /* Get dirty blocks */
        spin_lock(&zmd->mblk_lock);
        list_splice_init(&zmd->mblk_dirty_list, &write_list);
        spin_unlock(&zmd->mblk_lock);

        /* If there are no dirty metadata blocks, just flush the device cache */
        if (list_empty(&write_list)) {
                ret = blkdev_issue_flush(dev->bdev);
                goto err;
        }

        /*
         * The primary metadata set is still clean. Keep it this way until
         * all updates are successful in the secondary set. That is, use
         * the secondary set as a log.
         */
        ret = dmz_log_dirty_mblocks(zmd, &write_list);
        if (ret)
                goto err;

        /*
         * The log is on disk. It is now safe to update in place
         * in the primary metadata set.
         */
        ret = dmz_write_dirty_mblocks(zmd, &write_list, zmd->mblk_primary);
        if (ret)
                goto err;

        ret = dmz_write_sb(zmd, zmd->mblk_primary);
        if (ret)
                goto err;

        while (!list_empty(&write_list)) {
                mblk = list_first_entry(&write_list, struct dmz_mblock, link);
                list_del_init(&mblk->link);

                spin_lock(&zmd->mblk_lock);
                clear_bit(DMZ_META_DIRTY, &mblk->state);
                if (mblk->ref == 0)
                        list_add_tail(&mblk->link, &zmd->mblk_lru_list);
                spin_unlock(&zmd->mblk_lock);
        }

        zmd->sb_gen++;
out:
        dmz_unlock_flush(zmd);
        up_write(&zmd->mblk_sem);

        return ret;

err:
        if (!list_empty(&write_list)) {
                spin_lock(&zmd->mblk_lock);
                list_splice(&write_list, &zmd->mblk_dirty_list);
                spin_unlock(&zmd->mblk_lock);
        }
        if (!dmz_check_bdev(dev))
                ret = -EIO;
        goto out;
}

/*
 * Check super block.
 */
static int dmz_check_sb(struct dmz_metadata *zmd, struct dmz_sb *dsb,
                        bool tertiary)
{
        struct dmz_super *sb = dsb->sb;
        struct dmz_dev *dev = dsb->dev;
        unsigned int nr_meta_zones, nr_data_zones;
        u32 crc, stored_crc;
        u64 gen, sb_block;

        if (le32_to_cpu(sb->magic) != DMZ_MAGIC) {
                dmz_dev_err(dev, "Invalid meta magic (needed 0x%08x, got 0x%08x)",
                            DMZ_MAGIC, le32_to_cpu(sb->magic));
                return -ENXIO;
        }

        zmd->sb_version = le32_to_cpu(sb->version);
        if (zmd->sb_version > DMZ_META_VER) {
                dmz_dev_err(dev, "Invalid meta version (needed %d, got %d)",
                            DMZ_META_VER, zmd->sb_version);
                return -EINVAL;
        }
        if (zmd->sb_version < 2 && tertiary) {
                dmz_dev_err(dev, "Tertiary superblocks are not supported");
                return -EINVAL;
        }

        gen = le64_to_cpu(sb->gen);
        stored_crc = le32_to_cpu(sb->crc);
        sb->crc = 0;
        crc = crc32_le(gen, (unsigned char *)sb, DMZ_BLOCK_SIZE);
        if (crc != stored_crc) {
                dmz_dev_err(dev, "Invalid checksum (needed 0x%08x, got 0x%08x)",
                            crc, stored_crc);
                return -ENXIO;
        }

        sb_block = le64_to_cpu(sb->sb_block);
        if (sb_block != (u64)dsb->zone->id << zmd->zone_nr_blocks_shift) {
                dmz_dev_err(dev, "Invalid superblock position (is %llu expected %llu)",
                            sb_block,
                            (u64)dsb->zone->id << zmd->zone_nr_blocks_shift);
                return -EINVAL;
        }
        if (zmd->sb_version > 1) {
                uuid_t sb_uuid;

                import_uuid(&sb_uuid, sb->dmz_uuid);
                if (uuid_is_null(&sb_uuid)) {
                        dmz_dev_err(dev, "NULL DM-Zoned uuid");
                        return -ENXIO;
                } else if (uuid_is_null(&zmd->uuid)) {
                        uuid_copy(&zmd->uuid, &sb_uuid);
                } else if (!uuid_equal(&zmd->uuid, &sb_uuid)) {
                        dmz_dev_err(dev, "mismatching DM-Zoned uuid, is %pUl expected %pUl",
                                    &sb_uuid, &zmd->uuid);
                        return -ENXIO;
                }
                if (!strlen(zmd->label))
                        memcpy(zmd->label, sb->dmz_label, BDEVNAME_SIZE);
                else if (memcmp(zmd->label, sb->dmz_label, BDEVNAME_SIZE)) {
                        dmz_dev_err(dev, "mismatching DM-Zoned label, is %s expected %s",
                                    sb->dmz_label, zmd->label);
                        return -ENXIO;
                }
                import_uuid(&dev->uuid, sb->dev_uuid);
                if (uuid_is_null(&dev->uuid)) {
                        dmz_dev_err(dev, "NULL device uuid");
                        return -ENXIO;
                }

                if (tertiary) {
                        /*
                         * Generation number should be 0, but it doesn't
                         * really matter if it isn't.
                         */
                        if (gen != 0)
                                dmz_dev_warn(dev, "Invalid generation %llu",
                                             gen);
                        return 0;
                }
        }

        nr_meta_zones = (le32_to_cpu(sb->nr_meta_blocks) + zmd->zone_nr_blocks - 1)
                >> zmd->zone_nr_blocks_shift;
        if (!nr_meta_zones ||
            (zmd->nr_devs <= 1 && nr_meta_zones >= zmd->nr_rnd_zones) ||
            (zmd->nr_devs > 1 && nr_meta_zones >= zmd->nr_cache_zones)) {
                dmz_dev_err(dev, "Invalid number of metadata blocks");
                return -ENXIO;
        }

        if (!le32_to_cpu(sb->nr_reserved_seq) ||
            le32_to_cpu(sb->nr_reserved_seq) >= (zmd->nr_useable_zones - nr_meta_zones)) {
                dmz_dev_err(dev, "Invalid number of reserved sequential zones");
                return -ENXIO;
        }

        nr_data_zones = zmd->nr_useable_zones -
                (nr_meta_zones * 2 + le32_to_cpu(sb->nr_reserved_seq));
        if (le32_to_cpu(sb->nr_chunks) > nr_data_zones) {
                dmz_dev_err(dev, "Invalid number of chunks %u / %u",
                            le32_to_cpu(sb->nr_chunks), nr_data_zones);
                return -ENXIO;
        }

        /* OK */
        zmd->nr_meta_blocks = le32_to_cpu(sb->nr_meta_blocks);
        zmd->nr_reserved_seq = le32_to_cpu(sb->nr_reserved_seq);
        zmd->nr_chunks = le32_to_cpu(sb->nr_chunks);
        zmd->nr_map_blocks = le32_to_cpu(sb->nr_map_blocks);
        zmd->nr_bitmap_blocks = le32_to_cpu(sb->nr_bitmap_blocks);
        zmd->nr_meta_zones = nr_meta_zones;
        zmd->nr_data_zones = nr_data_zones;

        return 0;
}

/*
 * Read the first or second super block from disk.
 */
static int dmz_read_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
        dmz_zmd_debug(zmd, "read superblock set %d dev %pg block %llu",
                      set, sb->dev->bdev, sb->block);

        return dmz_rdwr_block(sb->dev, REQ_OP_READ,
                              sb->block, sb->mblk->page);
}

/*
 * Determine the position of the secondary super block on disk.
 * This is used only if a corruption of the primary super block
 * is detected.
 */
static int dmz_lookup_secondary_sb(struct dmz_metadata *zmd)
{
        unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
        struct dmz_mblock *mblk;
        unsigned int zone_id = zmd->sb[0].zone->id;
        int i;

        /* Allocate a block */
        mblk = dmz_alloc_mblock(zmd, 0);
        if (!mblk)
                return -ENOMEM;

        zmd->sb[1].mblk = mblk;
        zmd->sb[1].sb = mblk->data;

        /* Bad first super block: search for the second one */
        zmd->sb[1].block = zmd->sb[0].block + zone_nr_blocks;
        zmd->sb[1].zone = dmz_get(zmd, zone_id + 1);
        zmd->sb[1].dev = zmd->sb[0].dev;
        for (i = 1; i < zmd->nr_rnd_zones; i++) {
                if (dmz_read_sb(zmd, &zmd->sb[1], 1) != 0)
                        break;
                if (le32_to_cpu(zmd->sb[1].sb->magic) == DMZ_MAGIC)
                        return 0;
                zmd->sb[1].block += zone_nr_blocks;
                zmd->sb[1].zone = dmz_get(zmd, zone_id + i);
        }

        dmz_free_mblock(zmd, mblk);
        zmd->sb[1].mblk = NULL;
        zmd->sb[1].zone = NULL;
        zmd->sb[1].dev = NULL;

        return -EIO;
}

/*
 * Read a super block from disk.
 */
static int dmz_get_sb(struct dmz_metadata *zmd, struct dmz_sb *sb, int set)
{
        struct dmz_mblock *mblk;
        int ret;

        /* Allocate a block */
        mblk = dmz_alloc_mblock(zmd, 0);
        if (!mblk)
                return -ENOMEM;

        sb->mblk = mblk;
        sb->sb = mblk->data;

        /* Read super block */
        ret = dmz_read_sb(zmd, sb, set);
        if (ret) {
                dmz_free_mblock(zmd, mblk);
                sb->mblk = NULL;
                return ret;
        }

        return 0;
}

/*
 * Recover a metadata set.
 */
static int dmz_recover_mblocks(struct dmz_metadata *zmd, unsigned int dst_set)
{
        unsigned int src_set = dst_set ^ 0x1;
        struct page *page;
        int i, ret;

        dmz_dev_warn(zmd->sb[dst_set].dev,
                     "Metadata set %u invalid: recovering", dst_set);

        if (dst_set == 0)
                zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
        else
                zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);

        page = alloc_page(GFP_NOIO);
        if (!page)
                return -ENOMEM;

        /* Copy metadata blocks */
        for (i = 1; i < zmd->nr_meta_blocks; i++) {
                ret = dmz_rdwr_block(zmd->sb[src_set].dev, REQ_OP_READ,
                                     zmd->sb[src_set].block + i, page);
                if (ret)
                        goto out;
                ret = dmz_rdwr_block(zmd->sb[dst_set].dev, REQ_OP_WRITE,
                                     zmd->sb[dst_set].block + i, page);
                if (ret)
                        goto out;
        }

        /* Finalize with the super block */
        if (!zmd->sb[dst_set].mblk) {
                zmd->sb[dst_set].mblk = dmz_alloc_mblock(zmd, 0);
                if (!zmd->sb[dst_set].mblk) {
                        ret = -ENOMEM;
                        goto out;
                }
                zmd->sb[dst_set].sb = zmd->sb[dst_set].mblk->data;
        }

        ret = dmz_write_sb(zmd, dst_set);
out:
        __free_pages(page, 0);

        return ret;
}

/*
 * Get super block from disk.
 */
static int dmz_load_sb(struct dmz_metadata *zmd)
{
        bool sb_good[2] = {false, false};
        u64 sb_gen[2] = {0, 0};
        int ret;

        if (!zmd->sb[0].zone) {
                dmz_zmd_err(zmd, "Primary super block zone not set");
                return -ENXIO;
        }

        /* Read and check the primary super block */
        zmd->sb[0].block = dmz_start_block(zmd, zmd->sb[0].zone);
        zmd->sb[0].dev = zmd->sb[0].zone->dev;
        ret = dmz_get_sb(zmd, &zmd->sb[0], 0);
        if (ret) {
                dmz_dev_err(zmd->sb[0].dev, "Read primary super block failed");
                return ret;
        }

        ret = dmz_check_sb(zmd, &zmd->sb[0], false);

        /* Read and check secondary super block */
        if (ret == 0) {
                sb_good[0] = true;
                if (!zmd->sb[1].zone) {
                        unsigned int zone_id =
                                zmd->sb[0].zone->id + zmd->nr_meta_zones;

                        zmd->sb[1].zone = dmz_get(zmd, zone_id);
                }
                zmd->sb[1].block = dmz_start_block(zmd, zmd->sb[1].zone);
                zmd->sb[1].dev = zmd->sb[0].dev;
                ret = dmz_get_sb(zmd, &zmd->sb[1], 1);
        } else
                ret = dmz_lookup_secondary_sb(zmd);

        if (ret) {
                dmz_dev_err(zmd->sb[1].dev, "Read secondary super block failed");
                return ret;
        }

        ret = dmz_check_sb(zmd, &zmd->sb[1], false);
        if (ret == 0)
                sb_good[1] = true;

        /* Use highest generation sb first */
        if (!sb_good[0] && !sb_good[1]) {
                dmz_zmd_err(zmd, "No valid super block found");
                return -EIO;
        }

        if (sb_good[0])
                sb_gen[0] = le64_to_cpu(zmd->sb[0].sb->gen);
        else {
                ret = dmz_recover_mblocks(zmd, 0);
                if (ret) {
                        dmz_dev_err(zmd->sb[0].dev,
                                    "Recovery of superblock 0 failed");
                        return -EIO;
                }
        }

        if (sb_good[1])
                sb_gen[1] = le64_to_cpu(zmd->sb[1].sb->gen);
        else {
                ret = dmz_recover_mblocks(zmd, 1);
                if (ret) {
                        dmz_dev_err(zmd->sb[1].dev,
                                    "Recovery of superblock 1 failed");
                        return -EIO;
                }
        }

        if (sb_gen[0] >= sb_gen[1]) {
                zmd->sb_gen = sb_gen[0];
                zmd->mblk_primary = 0;
        } else {
                zmd->sb_gen = sb_gen[1];
                zmd->mblk_primary = 1;
        }

        dmz_dev_debug(zmd->sb[zmd->mblk_primary].dev,
                      "Using super block %u (gen %llu)",
                      zmd->mblk_primary, zmd->sb_gen);

        if (zmd->sb_version > 1) {
                int i;
                struct dmz_sb *sb;

                sb = kzalloc(sizeof(struct dmz_sb), GFP_KERNEL);
                if (!sb)
                        return -ENOMEM;
                for (i = 1; i < zmd->nr_devs; i++) {
                        sb->block = 0;
                        sb->zone = dmz_get(zmd, zmd->dev[i].zone_offset);
                        sb->dev = &zmd->dev[i];
                        if (!dmz_is_meta(sb->zone)) {
                                dmz_dev_err(sb->dev,
                                            "Tertiary super block zone %u not marked as metadata zone",
                                            sb->zone->id);
                                ret = -EINVAL;
                                goto out_kfree;
                        }
                        ret = dmz_get_sb(zmd, sb, i + 1);
                        if (ret) {
                                dmz_dev_err(sb->dev,
                                            "Read tertiary super block failed");
                                dmz_free_mblock(zmd, sb->mblk);
                                goto out_kfree;
                        }
                        ret = dmz_check_sb(zmd, sb, true);
                        dmz_free_mblock(zmd, sb->mblk);
                        if (ret == -EINVAL)
                                goto out_kfree;
                }
        out_kfree:
                kfree(sb);
        }
        return ret;
}

/*
 * Initialize a zone descriptor.
 */
static int dmz_init_zone(struct blk_zone *blkz, unsigned int num, void *data)
{
        struct dmz_dev *dev = data;
        struct dmz_metadata *zmd = dev->metadata;
        int idx = num + dev->zone_offset;
        struct dm_zone *zone;

        zone = dmz_insert(zmd, idx, dev);
        if (IS_ERR(zone))
                return PTR_ERR(zone);

        if (blkz->len != zmd->zone_nr_sectors) {
                if (zmd->sb_version > 1) {
                        /* Ignore a possible runt (smaller) zone */
                        set_bit(DMZ_OFFLINE, &zone->flags);
                        return 0;
                } else if (blkz->start + blkz->len == dev->capacity)
                        return 0;
                return -ENXIO;
        }

        /*
         * Devices that have zones with a capacity smaller than the zone size
         * (e.g. NVMe zoned namespaces) are not supported.
         */
        if (blkz->capacity != blkz->len)
                return -ENXIO;

        switch (blkz->type) {
        case BLK_ZONE_TYPE_CONVENTIONAL:
                set_bit(DMZ_RND, &zone->flags);
                break;
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                set_bit(DMZ_SEQ, &zone->flags);
                break;
        default:
                return -ENXIO;
        }

        if (dmz_is_rnd(zone))
                zone->wp_block = 0;
        else
                zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);

        if (blkz->cond == BLK_ZONE_COND_OFFLINE)
                set_bit(DMZ_OFFLINE, &zone->flags);
        else if (blkz->cond == BLK_ZONE_COND_READONLY)
                set_bit(DMZ_READ_ONLY, &zone->flags);
        else {
                zmd->nr_useable_zones++;
                if (dmz_is_rnd(zone)) {
                        zmd->nr_rnd_zones++;
                        if (zmd->nr_devs == 1 && !zmd->sb[0].zone) {
                                /* Primary super block zone */
                                zmd->sb[0].zone = zone;
                        }
                }
                if (zmd->nr_devs > 1 && num == 0) {
                        /*
                         * Tertiary superblock zones are always at the
                         * start of the zoned devices, so mark them
                         * as metadata zone.
                         */
                        set_bit(DMZ_META, &zone->flags);
                }
        }
        return 0;
}

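/*
 * With multiple devices, the first (regular) device holds no native
 * zones: carve its capacity into emulated cache zones of the configured
 * zone size, disabling a trailing runt zone if the capacity is not a
 * multiple of the zone size.
 */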
static int dmz_emulate_zones(struct dmz_metadata *zmd, struct dmz_dev *dev)
{
        int idx;
        sector_t zone_offset = 0;

        for (idx = 0; idx < dev->nr_zones; idx++) {
                struct dm_zone *zone;

                zone = dmz_insert(zmd, idx, dev);
                if (IS_ERR(zone))
                        return PTR_ERR(zone);
                set_bit(DMZ_CACHE, &zone->flags);
                zone->wp_block = 0;
                zmd->nr_cache_zones++;
                zmd->nr_useable_zones++;
                if (dev->capacity - zone_offset < zmd->zone_nr_sectors) {
                        /* Disable runt zone */
                        set_bit(DMZ_OFFLINE, &zone->flags);
                        break;
                }
                zone_offset += zmd->zone_nr_sectors;
        }
        return 0;
}

/*
 * Free zone descriptors.
 */
static void dmz_drop_zones(struct dmz_metadata *zmd)
{
        int idx;

        for (idx = 0; idx < zmd->nr_zones; idx++) {
                struct dm_zone *zone = xa_load(&zmd->zones, idx);

                kfree(zone);
                xa_erase(&zmd->zones, idx);
        }
        xa_destroy(&zmd->zones);
}

/*
 * Allocate and initialize zone descriptors using the zone
 * information from disk.
 */
static int dmz_init_zones(struct dmz_metadata *zmd)
{
        int i, ret;
        struct dmz_dev *zoned_dev = &zmd->dev[0];

        /* Init */
        zmd->zone_nr_sectors = zmd->dev[0].zone_nr_sectors;
        zmd->zone_nr_sectors_shift = ilog2(zmd->zone_nr_sectors);
        zmd->zone_nr_blocks = dmz_sect2blk(zmd->zone_nr_sectors);
        zmd->zone_nr_blocks_shift = ilog2(zmd->zone_nr_blocks);
        zmd->zone_bitmap_size = zmd->zone_nr_blocks >> 3;
        zmd->zone_nr_bitmap_blocks =
                max_t(sector_t, 1, zmd->zone_bitmap_size >> DMZ_BLOCK_SHIFT);
        zmd->zone_bits_per_mblk = min_t(sector_t, zmd->zone_nr_blocks,
                                        DMZ_BLOCK_SIZE_BITS);
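        /*
         * For example, with 256 MB zones: 524288 512 B sectors per
         * zone, 65536 4 KB blocks per zone, and an 8 KB validity
         * bitmap per zone, i.e. two bitmap blocks of 32768 bits each.
         */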

        /* Allocate zone array */
        zmd->nr_zones = 0;
        for (i = 0; i < zmd->nr_devs; i++) {
                struct dmz_dev *dev = &zmd->dev[i];

                dev->metadata = zmd;
                zmd->nr_zones += dev->nr_zones;

                atomic_set(&dev->unmap_nr_rnd, 0);
                INIT_LIST_HEAD(&dev->unmap_rnd_list);
                INIT_LIST_HEAD(&dev->map_rnd_list);

                atomic_set(&dev->unmap_nr_seq, 0);
                INIT_LIST_HEAD(&dev->unmap_seq_list);
                INIT_LIST_HEAD(&dev->map_seq_list);
        }

        if (!zmd->nr_zones) {
                DMERR("(%s): No zones found", zmd->devname);
                return -ENXIO;
        }
        xa_init(&zmd->zones);

        DMDEBUG("(%s): Using %zu B for zone information",
                zmd->devname, sizeof(struct dm_zone) * zmd->nr_zones);

        if (zmd->nr_devs > 1) {
                ret = dmz_emulate_zones(zmd, &zmd->dev[0]);
                if (ret < 0) {
                        DMDEBUG("(%s): Failed to emulate zones, error %d",
                                zmd->devname, ret);
                        dmz_drop_zones(zmd);
                        return ret;
                }

                /*
                 * Primary superblock zone is always at zone 0 when multiple
                 * drives are present.
                 */
                zmd->sb[0].zone = dmz_get(zmd, 0);

                for (i = 1; i < zmd->nr_devs; i++) {
                        zoned_dev = &zmd->dev[i];

                        ret = blkdev_report_zones(zoned_dev->bdev, 0,
                                                  BLK_ALL_ZONES,
                                                  dmz_init_zone, zoned_dev);
                        if (ret < 0) {
                                DMDEBUG("(%s): Failed to report zones, error %d",
                                        zmd->devname, ret);
                                dmz_drop_zones(zmd);
                                return ret;
                        }
                }
                return 0;
        }

        /*
         * Get zone information and initialize zone descriptors.  At the same
         * time, determine where the super block should be: first block of the
         * first randomly writable zone.
         */
        ret = blkdev_report_zones(zoned_dev->bdev, 0, BLK_ALL_ZONES,
                                  dmz_init_zone, zoned_dev);
        if (ret < 0) {
                DMDEBUG("(%s): Failed to report zones, error %d",
                        zmd->devname, ret);
                dmz_drop_zones(zmd);
                return ret;
        }

        return 0;
}

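/*
 * Callback for dmz_update_zone(): refresh the zone offline/read-only
 * flags and the write pointer position from a fresh zone report.
 */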
1563static int dmz_update_zone_cb(struct blk_zone *blkz, unsigned int idx,
1564                              void *data)
1565{
1566        struct dm_zone *zone = data;
1567
1568        clear_bit(DMZ_OFFLINE, &zone->flags);
1569        clear_bit(DMZ_READ_ONLY, &zone->flags);
1570        if (blkz->cond == BLK_ZONE_COND_OFFLINE)
1571                set_bit(DMZ_OFFLINE, &zone->flags);
1572        else if (blkz->cond == BLK_ZONE_COND_READONLY)
1573                set_bit(DMZ_READ_ONLY, &zone->flags);
1574
1575        if (dmz_is_seq(zone))
1576                zone->wp_block = dmz_sect2blk(blkz->wp - blkz->start);
1577        else
1578                zone->wp_block = 0;
1579        return 0;
1580}
1581
1582/*
1583 * Update a zone information.
1584 */
1585static int dmz_update_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1586{
1587        struct dmz_dev *dev = zone->dev;
1588        unsigned int noio_flag;
1589        int ret;
1590
1591        if (dev->flags & DMZ_BDEV_REGULAR)
1592                return 0;
1593
1594        /*
1595         * Get zone information from disk. Since blkdev_report_zones() uses
1596         * GFP_KERNEL by default for memory allocations, set the per-task
1597         * PF_MEMALLOC_NOIO flag so that all allocations are done as if
1598         * GFP_NOIO was specified.
1599         */
1600        noio_flag = memalloc_noio_save();
1601        ret = blkdev_report_zones(dev->bdev, dmz_start_sect(zmd, zone), 1,
1602                                  dmz_update_zone_cb, zone);
1603        memalloc_noio_restore(noio_flag);
1604
1605        if (ret == 0)
1606                ret = -EIO;
1607        if (ret < 0) {
1608                dmz_dev_err(dev, "Get zone %u report failed",
1609                            zone->id);
1610                dmz_check_bdev(dev);
1611                return ret;
1612        }
1613
1614        return 0;
1615}
1616
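/*
 * Editor's note: the memalloc_noio_save()/memalloc_noio_restore() pairing
 * in dmz_update_zone() is the standard idiom for forcing GFP_NOIO behavior
 * on allocations made by a callee that takes no gfp flags. A minimal sketch
 * of the pattern (api_that_allocates() is a hypothetical callee):
 *
 *	unsigned int noio_flag = memalloc_noio_save();
 *	ret = api_that_allocates();
 *	memalloc_noio_restore(noio_flag);
 *
 * Between save and restore, all allocations behave as if GFP_NOIO was
 * specified. Omitting the restore would leave PF_MEMALLOC_NOIO set on the
 * task and silently weaken all of its later allocations.
 */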
1617/*
1618 * Check a zone write pointer position when the zone is marked
1619 * with the sequential write error flag.
1620 */
1621static int dmz_handle_seq_write_err(struct dmz_metadata *zmd,
1622                                    struct dm_zone *zone)
1623{
1624        struct dmz_dev *dev = zone->dev;
1625        unsigned int wp = 0;
1626        int ret;
1627
1628        wp = zone->wp_block;
1629        ret = dmz_update_zone(zmd, zone);
1630        if (ret)
1631                return ret;
1632
1633        dmz_dev_warn(dev, "Processing zone %u write error (zone wp %u/%u)",
1634                     zone->id, zone->wp_block, wp);
1635
1636        if (zone->wp_block < wp) {
1637                dmz_invalidate_blocks(zmd, zone, zone->wp_block,
1638                                      wp - zone->wp_block);
1639        }
1640
1641        return 0;
1642}
1643
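/*
 * Worked example: if the cached write pointer was at block 100 but the
 * device now reports the zone write pointer at block 60, the failed writes
 * lost blocks [60, 99], and the code above invalidates exactly
 * wp - zone->wp_block == 40 blocks starting at block 60.
 */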
1644/*
1645 * Reset a zone write pointer.
1646 */
1647static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1648{
1649        int ret;
1650
1651        /*
1652         * Ignore offline zones, read-only zones,
1653         * and conventional zones.
1654         */
1655        if (dmz_is_offline(zone) ||
1656            dmz_is_readonly(zone) ||
1657            dmz_is_rnd(zone))
1658                return 0;
1659
1660        if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
1661                struct dmz_dev *dev = zone->dev;
1662
1663                ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
1664                                       dmz_start_sect(zmd, zone),
1665                                       zmd->zone_nr_sectors, GFP_NOIO);
1666                if (ret) {
1667                        dmz_dev_err(dev, "Reset zone %u failed %d",
1668                                    zone->id, ret);
1669                        return ret;
1670                }
1671        }
1672
1673        /* Clear write error bit and rewind write pointer position */
1674        clear_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
1675        zone->wp_block = 0;
1676
1677        return 0;
1678}
1679
1680static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone);
1681
1682/*
1683 * Initialize chunk mapping.
1684 */
1685static int dmz_load_mapping(struct dmz_metadata *zmd)
1686{
1687        struct dm_zone *dzone, *bzone;
1688        struct dmz_mblock *dmap_mblk = NULL;
1689        struct dmz_map *dmap;
1690        unsigned int i = 0, e = 0, chunk = 0;
1691        unsigned int dzone_id;
1692        unsigned int bzone_id;
1693
1694        /* Metadata block array for the chunk mapping table */
1695        zmd->map_mblk = kcalloc(zmd->nr_map_blocks,
1696                                sizeof(struct dmz_mblock *), GFP_KERNEL);
1697        if (!zmd->map_mblk)
1698                return -ENOMEM;
1699
1700        /* Get chunk mapping table blocks and initialize zone mapping */
1701        while (chunk < zmd->nr_chunks) {
1702                if (!dmap_mblk) {
1703                        /* Get mapping block */
1704                        dmap_mblk = dmz_get_mblock(zmd, i + 1);
1705                        if (IS_ERR(dmap_mblk))
1706                                return PTR_ERR(dmap_mblk);
1707                        zmd->map_mblk[i] = dmap_mblk;
1708                        dmap = (struct dmz_map *) dmap_mblk->data;
1709                        i++;
1710                        e = 0;
1711                }
1712
1713                /* Check data zone */
1714                dzone_id = le32_to_cpu(dmap[e].dzone_id);
1715                if (dzone_id == DMZ_MAP_UNMAPPED)
1716                        goto next;
1717
1718                if (dzone_id >= zmd->nr_zones) {
1719                        dmz_zmd_err(zmd, "Chunk %u mapping: invalid data zone ID %u",
1720                                    chunk, dzone_id);
1721                        return -EIO;
1722                }
1723
1724                dzone = dmz_get(zmd, dzone_id);
1725                if (!dzone) {
1726                        dmz_zmd_err(zmd, "Chunk %u mapping: data zone %u not present",
1727                                    chunk, dzone_id);
1728                        return -EIO;
1729                }
1730                set_bit(DMZ_DATA, &dzone->flags);
1731                dzone->chunk = chunk;
1732                dmz_get_zone_weight(zmd, dzone);
1733
1734                if (dmz_is_cache(dzone))
1735                        list_add_tail(&dzone->link, &zmd->map_cache_list);
1736                else if (dmz_is_rnd(dzone))
1737                        list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
1738                else
1739                        list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
1740
1741                /* Check buffer zone */
1742                bzone_id = le32_to_cpu(dmap[e].bzone_id);
1743                if (bzone_id == DMZ_MAP_UNMAPPED)
1744                        goto next;
1745
1746                if (bzone_id >= zmd->nr_zones) {
1747                        dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone ID %u",
1748                                    chunk, bzone_id);
1749                        return -EIO;
1750                }
1751
1752                bzone = dmz_get(zmd, bzone_id);
1753                if (!bzone) {
1754                        dmz_zmd_err(zmd, "Chunk %u mapping: buffer zone %u not present",
1755                                    chunk, bzone_id);
1756                        return -EIO;
1757                }
1758                if (!dmz_is_rnd(bzone) && !dmz_is_cache(bzone)) {
1759                        dmz_zmd_err(zmd, "Chunk %u mapping: invalid buffer zone %u",
1760                                    chunk, bzone_id);
1761                        return -EIO;
1762                }
1763
1764                set_bit(DMZ_DATA, &bzone->flags);
1765                set_bit(DMZ_BUF, &bzone->flags);
1766                bzone->chunk = chunk;
1767                bzone->bzone = dzone;
1768                dzone->bzone = bzone;
1769                dmz_get_zone_weight(zmd, bzone);
1770                if (dmz_is_cache(bzone))
1771                        list_add_tail(&bzone->link, &zmd->map_cache_list);
1772                else
1773                        list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
1774next:
1775                chunk++;
1776                e++;
1777                if (e >= DMZ_MAP_ENTRIES)
1778                        dmap_mblk = NULL;
1779        }
1780
1781        /*
1782         * At this point, only meta zones and mapped data zones have been
1783         * fully initialized. All remaining zones are unmapped data
1784         * zones. Finish initializing those here.
1785         */
1786        for (i = 0; i < zmd->nr_zones; i++) {
1787                dzone = dmz_get(zmd, i);
1788                if (!dzone)
1789                        continue;
1790                if (dmz_is_meta(dzone))
1791                        continue;
1792                if (dmz_is_offline(dzone))
1793                        continue;
1794
1795                if (dmz_is_cache(dzone))
1796                        zmd->nr_cache++;
1797                else if (dmz_is_rnd(dzone))
1798                        dzone->dev->nr_rnd++;
1799                else
1800                        dzone->dev->nr_seq++;
1801
1802                if (dmz_is_data(dzone)) {
1803                        /* Already initialized */
1804                        continue;
1805                }
1806
1807                /* Unmapped data zone */
1808                set_bit(DMZ_DATA, &dzone->flags);
1809                dzone->chunk = DMZ_MAP_UNMAPPED;
1810                if (dmz_is_cache(dzone)) {
1811                        list_add_tail(&dzone->link, &zmd->unmap_cache_list);
1812                        atomic_inc(&zmd->unmap_nr_cache);
1813                } else if (dmz_is_rnd(dzone)) {
1814                        list_add_tail(&dzone->link,
1815                                      &dzone->dev->unmap_rnd_list);
1816                        atomic_inc(&dzone->dev->unmap_nr_rnd);
1817                } else if (atomic_read(&zmd->nr_reserved_seq_zones) < zmd->nr_reserved_seq) {
1818                        list_add_tail(&dzone->link, &zmd->reserved_seq_zones_list);
1819                        set_bit(DMZ_RESERVED, &dzone->flags);
1820                        atomic_inc(&zmd->nr_reserved_seq_zones);
1821                        dzone->dev->nr_seq--;
1822                } else {
1823                        list_add_tail(&dzone->link,
1824                                      &dzone->dev->unmap_seq_list);
1825                        atomic_inc(&dzone->dev->unmap_nr_seq);
1826                }
1827        }
1828
1829        return 0;
1830}
1831
1832/*
1833 * Set a data chunk mapping.
1834 */
1835static void dmz_set_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk,
1836                                  unsigned int dzone_id, unsigned int bzone_id)
1837{
1838        struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
1839        struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
1840        int map_idx = chunk & DMZ_MAP_ENTRIES_MASK;
1841
1842        dmap[map_idx].dzone_id = cpu_to_le32(dzone_id);
1843        dmap[map_idx].bzone_id = cpu_to_le32(bzone_id);
1844        dmz_dirty_mblock(zmd, dmap_mblk);
1845}
1846
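/*
 * Index arithmetic above: with the driver's 4 KB metadata blocks and
 * 8-byte mapping entries, DMZ_MAP_ENTRIES is 512, so DMZ_MAP_ENTRIES_SHIFT
 * is 9 and DMZ_MAP_ENTRIES_MASK is 511. Chunk 1000, for example, lives in
 * cached mapping block map_mblk[1000 >> 9], that is map_mblk[1], at entry
 * 1000 & 511 == 488.
 */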
1847/*
1848 * The list of mapped zones is maintained in LRU order.
1849 * This rotates a zone to the end of its map list.
1850 */
1851static void __dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1852{
1853        if (list_empty(&zone->link))
1854                return;
1855
1856        list_del_init(&zone->link);
1857        if (dmz_is_seq(zone)) {
1858                /* LRU rotate sequential zone */
1859                list_add_tail(&zone->link, &zone->dev->map_seq_list);
1860        } else if (dmz_is_cache(zone)) {
1861                /* LRU rotate cache zone */
1862                list_add_tail(&zone->link, &zmd->map_cache_list);
1863        } else {
1864                /* LRU rotate random zone */
1865                list_add_tail(&zone->link, &zone->dev->map_rnd_list);
1866        }
1867}
1868
1869/*
1870 * The lists of mapped zones are maintained in LRU order. This rotates
1871 * a zone, and its buffer zone if any, to the end of its list.
1872 */
1873static void dmz_lru_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
1874{
1875        __dmz_lru_zone(zmd, zone);
1876        if (zone->bzone)
1877                __dmz_lru_zone(zmd, zone->bzone);
1878}
1879
1880/*
1881 * Wait for any zone to be freed.
1882 */
1883static void dmz_wait_for_free_zones(struct dmz_metadata *zmd)
1884{
1885        DEFINE_WAIT(wait);
1886
1887        prepare_to_wait(&zmd->free_wq, &wait, TASK_UNINTERRUPTIBLE);
1888        dmz_unlock_map(zmd);
1889        dmz_unlock_metadata(zmd);
1890
1891        io_schedule_timeout(HZ);
1892
1893        dmz_lock_metadata(zmd);
1894        dmz_lock_map(zmd);
1895        finish_wait(&zmd->free_wq, &wait);
1896}
1897
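/*
 * Note the ordering in dmz_wait_for_free_zones(): prepare_to_wait() runs
 * before the locks are dropped, so a wake_up_all() issued from
 * dmz_free_zone() in the window before io_schedule_timeout() is not lost.
 * The locks are re-acquired in the same order used elsewhere (metadata
 * lock first, then map lock) to keep the lock ordering deadlock-free.
 */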
1898/*
1899 * Lock a zone for reclaim (set the zone RECLAIM bit).
1900 * Returns 0 if the zone cannot be locked or if it is already locked,
1901 * and 1 otherwise.
1902 */
1903int dmz_lock_zone_reclaim(struct dm_zone *zone)
1904{
1905        /* Active zones cannot be reclaimed */
1906        if (dmz_is_active(zone))
1907                return 0;
1908
1909        return !test_and_set_bit(DMZ_RECLAIM, &zone->flags);
1910}
1911
1912/*
1913 * Clear a zone reclaim flag.
1914 */
1915void dmz_unlock_zone_reclaim(struct dm_zone *zone)
1916{
1917        WARN_ON(dmz_is_active(zone));
1918        WARN_ON(!dmz_in_reclaim(zone));
1919
1920        clear_bit_unlock(DMZ_RECLAIM, &zone->flags);
1921        smp_mb__after_atomic();
1922        wake_up_bit(&zone->flags, DMZ_RECLAIM);
1923}
1924
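/*
 * Together, dmz_lock_zone_reclaim() and dmz_unlock_zone_reclaim() form a
 * simple bit lock: test_and_set_bit() acquires DMZ_RECLAIM, while
 * clear_bit_unlock() plus smp_mb__after_atomic() and wake_up_bit() release
 * it and wake waiters sleeping in wait_on_bit_timeout() (see
 * dmz_wait_for_reclaim() below).
 */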
1925/*
1926 * Wait for a zone reclaim to complete.
1927 */
1928static void dmz_wait_for_reclaim(struct dmz_metadata *zmd, struct dm_zone *zone)
1929{
1930        dmz_unlock_map(zmd);
1931        dmz_unlock_metadata(zmd);
1932        set_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
1933        wait_on_bit_timeout(&zone->flags, DMZ_RECLAIM, TASK_UNINTERRUPTIBLE, HZ);
1934        clear_bit(DMZ_RECLAIM_TERMINATE, &zone->flags);
1935        dmz_lock_metadata(zmd);
1936        dmz_lock_map(zmd);
1937}
1938
1939/*
1940 * Select a cache or random write zone for reclaim.
1941 */
1942static struct dm_zone *dmz_get_rnd_zone_for_reclaim(struct dmz_metadata *zmd,
1943                                                    unsigned int idx, bool idle)
1944{
1945        struct dm_zone *dzone = NULL;
1946        struct dm_zone *zone, *maxw_z = NULL;
1947        struct list_head *zone_list;
1948
1949        /* If we have cache zones select from the cache zone list */
1950        if (zmd->nr_cache) {
1951                zone_list = &zmd->map_cache_list;
1952                /* Try to reclaim random zones, too, when idle */
1953                if (idle && list_empty(zone_list))
1954                        zone_list = &zmd->dev[idx].map_rnd_list;
1955        } else
1956                zone_list = &zmd->dev[idx].map_rnd_list;
1957
1958        /*
1959         * Find the buffered data zone with the heaviest weight, or the first
1960         * (oldest) unbuffered data zone that can be locked for reclaim.
1961         */
1962        list_for_each_entry(zone, zone_list, link) {
1963                if (dmz_is_buf(zone)) {
1964                        dzone = zone->bzone;
1965                        if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
1966                                continue;
1967                        if (!maxw_z || maxw_z->weight < dzone->weight)
1968                                maxw_z = dzone;
1969                } else {
1970                        dzone = zone;
1971                        if (dmz_lock_zone_reclaim(dzone))
1972                                return dzone;
1973                }
1974        }
1975
1976        if (maxw_z && dmz_lock_zone_reclaim(maxw_z))
1977                return maxw_z;
1978
1979        /*
1980         * If we come here, none of the zones inspected could be locked for
1981         * reclaim. Try again, being more aggressive: find the first
1982         * zone that can be reclaimed regardless of its weight.
1983         */
1984        list_for_each_entry(zone, zone_list, link) {
1985                if (dmz_is_buf(zone)) {
1986                        dzone = zone->bzone;
1987                        if (dmz_is_rnd(dzone) && dzone->dev->dev_idx != idx)
1988                                continue;
1989                } else
1990                        dzone = zone;
1991                if (dmz_lock_zone_reclaim(dzone))
1992                        return dzone;
1993        }
1994
1995        return NULL;
1996}
1997
1998/*
1999 * Select a buffered sequential zone for reclaim.
2000 */
2001static struct dm_zone *dmz_get_seq_zone_for_reclaim(struct dmz_metadata *zmd,
2002                                                    unsigned int idx)
2003{
2004        struct dm_zone *zone;
2005
2006        list_for_each_entry(zone, &zmd->dev[idx].map_seq_list, link) {
2007                if (!zone->bzone)
2008                        continue;
2009                if (dmz_lock_zone_reclaim(zone))
2010                        return zone;
2011        }
2012
2013        return NULL;
2014}
2015
2016/*
2017 * Select a zone for reclaim.
2018 */
2019struct dm_zone *dmz_get_zone_for_reclaim(struct dmz_metadata *zmd,
2020                                         unsigned int dev_idx, bool idle)
2021{
2022        struct dm_zone *zone = NULL;
2023
2024        /*
2025         * Search for a zone candidate to reclaim: 2 cases are possible.
2026         * (1) There are no free sequential zones. Then a random data zone
2027         *     cannot be reclaimed. So choose a sequential zone to reclaim so
2028         *     that afterward a random zone can be reclaimed.
2029         * (2) At least one free sequential zone is available: choose
2030         *     the oldest random zone (data or buffer) that can be locked.
2031         */
2032        dmz_lock_map(zmd);
2033        if (list_empty(&zmd->reserved_seq_zones_list))
2034                zone = dmz_get_seq_zone_for_reclaim(zmd, dev_idx);
2035        if (!zone)
2036                zone = dmz_get_rnd_zone_for_reclaim(zmd, dev_idx, idle);
2037        dmz_unlock_map(zmd);
2038
2039        return zone;
2040}
2041
2042/*
2043 * Get the zone mapping a chunk, if the chunk is mapped already.
2044 * If no mapping exists and the operation is WRITE, a zone is
2045 * allocated and used to map the chunk.
2046 * The zone returned will be set to the active state.
2047 */
2048struct dm_zone *dmz_get_chunk_mapping(struct dmz_metadata *zmd, unsigned int chunk, int op)
2049{
2050        struct dmz_mblock *dmap_mblk = zmd->map_mblk[chunk >> DMZ_MAP_ENTRIES_SHIFT];
2051        struct dmz_map *dmap = (struct dmz_map *) dmap_mblk->data;
2052        int dmap_idx = chunk & DMZ_MAP_ENTRIES_MASK;
2053        unsigned int dzone_id;
2054        struct dm_zone *dzone = NULL;
2055        int ret = 0;
2056        int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
2057
2058        dmz_lock_map(zmd);
2059again:
2060        /* Get the chunk mapping */
2061        dzone_id = le32_to_cpu(dmap[dmap_idx].dzone_id);
2062        if (dzone_id == DMZ_MAP_UNMAPPED) {
2063                /*
2064                 * Reads and discards of unmapped chunks are fine. But for
2065                 * writes, we need a mapping, so get one.
2066                 */
2067                if (op != REQ_OP_WRITE)
2068                        goto out;
2069
2070                /* Allocate a random zone */
2071                dzone = dmz_alloc_zone(zmd, 0, alloc_flags);
2072                if (!dzone) {
2073                        if (dmz_dev_is_dying(zmd)) {
2074                                dzone = ERR_PTR(-EIO);
2075                                goto out;
2076                        }
2077                        dmz_wait_for_free_zones(zmd);
2078                        goto again;
2079                }
2080
2081                dmz_map_zone(zmd, dzone, chunk);
2082
2083        } else {
2084                /* The chunk is already mapped: get the mapping zone */
2085                dzone = dmz_get(zmd, dzone_id);
2086                if (!dzone) {
2087                        dzone = ERR_PTR(-EIO);
2088                        goto out;
2089                }
2090                if (dzone->chunk != chunk) {
2091                        dzone = ERR_PTR(-EIO);
2092                        goto out;
2093                }
2094
2095                /* Repair write pointer if the sequential dzone has error */
2096                if (dmz_seq_write_err(dzone)) {
2097                        ret = dmz_handle_seq_write_err(zmd, dzone);
2098                        if (ret) {
2099                                dzone = ERR_PTR(-EIO);
2100                                goto out;
2101                        }
2102                        clear_bit(DMZ_SEQ_WRITE_ERR, &dzone->flags);
2103                }
2104        }
2105
2106        /*
2107         * If the zone is being reclaimed, the chunk mapping may change
2108         * to a different zone. So wait for reclaim and retry. Otherwise,
2109         * activate the zone (this will prevent reclaim from touching it).
2110         */
2111        if (dmz_in_reclaim(dzone)) {
2112                dmz_wait_for_reclaim(zmd, dzone);
2113                goto again;
2114        }
2115        dmz_activate_zone(dzone);
2116        dmz_lru_zone(zmd, dzone);
2117out:
2118        dmz_unlock_map(zmd);
2119
2120        return dzone;
2121}
2122
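/*
 * Hedged usage sketch (illustrative of the dm-zoned I/O path): callers
 * pair this function with dmz_put_chunk_mapping() so the zone stays
 * active for the duration of the chunk I/O:
 *
 *	zone = dmz_get_chunk_mapping(zmd, chunk, bio_op(bio));
 *	if (IS_ERR(zone))
 *		return PTR_ERR(zone);
 *	if (zone) {
 *		... submit the chunk I/O to the zone ...
 *		dmz_put_chunk_mapping(zmd, zone);
 *	}
 */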
2123/*
2124 * Writes and discards change the block validity of data zones and their buffer
2125 * zones. Check here whether valid blocks are still present. If all blocks are
2126 * invalid, the zones can be unmapped on the fly without waiting for reclaim
2127 * to do it.
2128 */
2129void dmz_put_chunk_mapping(struct dmz_metadata *zmd, struct dm_zone *dzone)
2130{
2131        struct dm_zone *bzone;
2132
2133        dmz_lock_map(zmd);
2134
2135        bzone = dzone->bzone;
2136        if (bzone) {
2137                if (dmz_weight(bzone))
2138                        dmz_lru_zone(zmd, bzone);
2139                else {
2140                        /* Empty buffer zone: reclaim it */
2141                        dmz_unmap_zone(zmd, bzone);
2142                        dmz_free_zone(zmd, bzone);
2143                        bzone = NULL;
2144                }
2145        }
2146
2147        /* Deactivate the data zone */
2148        dmz_deactivate_zone(dzone);
2149        if (dmz_is_active(dzone) || bzone || dmz_weight(dzone))
2150                dmz_lru_zone(zmd, dzone);
2151        else {
2152                /* Unbuffered inactive empty data zone: reclaim it */
2153                dmz_unmap_zone(zmd, dzone);
2154                dmz_free_zone(zmd, dzone);
2155        }
2156
2157        dmz_unlock_map(zmd);
2158}
2159
2160/*
2161 * Allocate and map a random zone to buffer a chunk
2162 * already mapped to a sequential zone.
2163 */
2164struct dm_zone *dmz_get_chunk_buffer(struct dmz_metadata *zmd,
2165                                     struct dm_zone *dzone)
2166{
2167        struct dm_zone *bzone;
2168        int alloc_flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
2169
2170        dmz_lock_map(zmd);
2171again:
2172        bzone = dzone->bzone;
2173        if (bzone)
2174                goto out;
2175
2176        /* Allocate a random zone */
2177        bzone = dmz_alloc_zone(zmd, 0, alloc_flags);
2178        if (!bzone) {
2179                if (dmz_dev_is_dying(zmd)) {
2180                        bzone = ERR_PTR(-EIO);
2181                        goto out;
2182                }
2183                dmz_wait_for_free_zones(zmd);
2184                goto again;
2185        }
2186
2187        /* Update the chunk mapping */
2188        dmz_set_chunk_mapping(zmd, dzone->chunk, dzone->id, bzone->id);
2189
2190        set_bit(DMZ_BUF, &bzone->flags);
2191        bzone->chunk = dzone->chunk;
2192        bzone->bzone = dzone;
2193        dzone->bzone = bzone;
2194        if (dmz_is_cache(bzone))
2195                list_add_tail(&bzone->link, &zmd->map_cache_list);
2196        else
2197                list_add_tail(&bzone->link, &bzone->dev->map_rnd_list);
2198out:
2199        dmz_unlock_map(zmd);
2200
2201        return bzone;
2202}
2203
2204/*
2205 * Get an unmapped (free) zone.
2206 * This must be called with the mapping lock held.
2207 */
2208struct dm_zone *dmz_alloc_zone(struct dmz_metadata *zmd, unsigned int dev_idx,
2209                               unsigned long flags)
2210{
2211        struct list_head *list;
2212        struct dm_zone *zone;
2213        int i;
2214
2215        /* Schedule reclaim to ensure free zones are available */
2216        if (!(flags & DMZ_ALLOC_RECLAIM)) {
2217                for (i = 0; i < zmd->nr_devs; i++)
2218                        dmz_schedule_reclaim(zmd->dev[i].reclaim);
2219        }
2220
2221        i = 0;
2222again:
2223        if (flags & DMZ_ALLOC_CACHE)
2224                list = &zmd->unmap_cache_list;
2225        else if (flags & DMZ_ALLOC_RND)
2226                list = &zmd->dev[dev_idx].unmap_rnd_list;
2227        else
2228                list = &zmd->dev[dev_idx].unmap_seq_list;
2229
2230        if (list_empty(list)) {
2231                /*
2232                 * No free zone: return NULL if this is not for reclaim.
2233                 */
2234                if (!(flags & DMZ_ALLOC_RECLAIM))
2235                        return NULL;
2236                /*
2237                 * Try to allocate from other devices
2238                 */
2239                if (i < zmd->nr_devs) {
2240                        dev_idx = (dev_idx + 1) % zmd->nr_devs;
2241                        i++;
2242                        goto again;
2243                }
2244
2245                /*
2246                 * Fall back to the reserved sequential zones
2247                 */
2248                zone = list_first_entry_or_null(&zmd->reserved_seq_zones_list,
2249                                                struct dm_zone, link);
2250                if (zone) {
2251                        list_del_init(&zone->link);
2252                        atomic_dec(&zmd->nr_reserved_seq_zones);
2253                }
2254                return zone;
2255        }
2256
2257        zone = list_first_entry(list, struct dm_zone, link);
2258        list_del_init(&zone->link);
2259
2260        if (dmz_is_cache(zone))
2261                atomic_dec(&zmd->unmap_nr_cache);
2262        else if (dmz_is_rnd(zone))
2263                atomic_dec(&zone->dev->unmap_nr_rnd);
2264        else
2265                atomic_dec(&zone->dev->unmap_nr_seq);
2266
2267        if (dmz_is_offline(zone)) {
2268                dmz_zmd_warn(zmd, "Zone %u is offline", zone->id);
2269                zone = NULL;
2270                goto again;
2271        }
2272        if (dmz_is_meta(zone)) {
2273                dmz_zmd_warn(zmd, "Zone %u has metadata", zone->id);
2274                zone = NULL;
2275                goto again;
2276        }
2277        return zone;
2278}
2279
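/*
 * Typical call, as used in dmz_get_chunk_mapping() and
 * dmz_get_chunk_buffer() above: prefer cache zones when the setup has any,
 * else random zones on the first device:
 *
 *	int flags = zmd->nr_cache ? DMZ_ALLOC_CACHE : DMZ_ALLOC_RND;
 *	struct dm_zone *zone = dmz_alloc_zone(zmd, 0, flags);
 *
 * Reclaim passes DMZ_ALLOC_RECLAIM instead, which enables the cross-device
 * retry and the reserved sequential zone fallback seen above.
 */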
2280/*
2281 * Free a zone.
2282 * This must be called with the mapping lock held.
2283 */
2284void dmz_free_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
2285{
2286        /* If this is a sequential zone, reset it */
2287        if (dmz_is_seq(zone))
2288                dmz_reset_zone(zmd, zone);
2289
2290        /* Return the zone to its type unmap list */
2291        if (dmz_is_cache(zone)) {
2292                list_add_tail(&zone->link, &zmd->unmap_cache_list);
2293                atomic_inc(&zmd->unmap_nr_cache);
2294        } else if (dmz_is_rnd(zone)) {
2295                list_add_tail(&zone->link, &zone->dev->unmap_rnd_list);
2296                atomic_inc(&zone->dev->unmap_nr_rnd);
2297        } else if (dmz_is_reserved(zone)) {
2298                list_add_tail(&zone->link, &zmd->reserved_seq_zones_list);
2299                atomic_inc(&zmd->nr_reserved_seq_zones);
2300        } else {
2301                list_add_tail(&zone->link, &zone->dev->unmap_seq_list);
2302                atomic_inc(&zone->dev->unmap_nr_seq);
2303        }
2304
2305        wake_up_all(&zmd->free_wq);
2306}
2307
2308/*
2309 * Map a chunk to a zone.
2310 * This must be called with the mapping lock held.
2311 */
2312void dmz_map_zone(struct dmz_metadata *zmd, struct dm_zone *dzone,
2313                  unsigned int chunk)
2314{
2315        /* Set the chunk mapping */
2316        dmz_set_chunk_mapping(zmd, chunk, dzone->id,
2317                              DMZ_MAP_UNMAPPED);
2318        dzone->chunk = chunk;
2319        if (dmz_is_cache(dzone))
2320                list_add_tail(&dzone->link, &zmd->map_cache_list);
2321        else if (dmz_is_rnd(dzone))
2322                list_add_tail(&dzone->link, &dzone->dev->map_rnd_list);
2323        else
2324                list_add_tail(&dzone->link, &dzone->dev->map_seq_list);
2325}
2326
2327/*
2328 * Unmap a zone.
2329 * This must be called with the mapping lock held.
2330 */
2331void dmz_unmap_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
2332{
2333        unsigned int chunk = zone->chunk;
2334        unsigned int dzone_id;
2335
2336        if (chunk == DMZ_MAP_UNMAPPED) {
2337                /* Already unmapped */
2338                return;
2339        }
2340
2341        if (test_and_clear_bit(DMZ_BUF, &zone->flags)) {
2342                /*
2343                 * Unmapping the chunk buffer zone: clear only
2344                 * the chunk buffer mapping
2345                 */
2346                dzone_id = zone->bzone->id;
2347                zone->bzone->bzone = NULL;
2348                zone->bzone = NULL;
2349
2350        } else {
2351                /*
2352                 * Unmapping the chunk data zone: the zone must
2353                 * not be buffered.
2354                 */
2355                if (WARN_ON(zone->bzone)) {
2356                        zone->bzone->bzone = NULL;
2357                        zone->bzone = NULL;
2358                }
2359                dzone_id = DMZ_MAP_UNMAPPED;
2360        }
2361
2362        dmz_set_chunk_mapping(zmd, chunk, dzone_id, DMZ_MAP_UNMAPPED);
2363
2364        zone->chunk = DMZ_MAP_UNMAPPED;
2365        list_del_init(&zone->link);
2366}
2367
2368/*
2369 * Set @nr_bits bits in @bitmap starting from @bit.
2370 * Return the number of bits changed from 0 to 1.
2371 */
2372static unsigned int dmz_set_bits(unsigned long *bitmap,
2373                                 unsigned int bit, unsigned int nr_bits)
2374{
2375        unsigned long *addr;
2376        unsigned int end = bit + nr_bits;
2377        unsigned int n = 0;
2378
2379        while (bit < end) {
2380                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2381                    ((end - bit) >= BITS_PER_LONG)) {
2382                        /* Try to set the whole word at once */
2383                        addr = bitmap + BIT_WORD(bit);
2384                        if (*addr == 0) {
2385                                *addr = ULONG_MAX;
2386                                n += BITS_PER_LONG;
2387                                bit += BITS_PER_LONG;
2388                                continue;
2389                        }
2390                }
2391
2392                if (!test_and_set_bit(bit, bitmap))
2393                        n++;
2394                bit++;
2395        }
2396
2397        return n;
2398}
2399
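/*
 * The whole-word fast path above only triggers when the word is currently
 * all zeroes: setting bits [0, 63] of a zeroed 64-bit word costs a single
 * store of ULONG_MAX instead of 64 test_and_set_bit() calls. Any partially
 * set word falls through to the bit-by-bit loop so that the returned count
 * of 0-to-1 transitions stays exact.
 */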
2400/*
2401 * Get the bitmap block storing the bit for chunk_block in zone.
2402 */
2403static struct dmz_mblock *dmz_get_bitmap(struct dmz_metadata *zmd,
2404                                         struct dm_zone *zone,
2405                                         sector_t chunk_block)
2406{
2407        sector_t bitmap_block = 1 + zmd->nr_map_blocks +
2408                (sector_t)(zone->id * zmd->zone_nr_bitmap_blocks) +
2409                (chunk_block >> DMZ_BLOCK_SHIFT_BITS);
2410
2411        return dmz_get_mblock(zmd, bitmap_block);
2412}
2413
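/*
 * Layout recap for the computation above: bitmap blocks start right after
 * the super block (block 0) and the nr_map_blocks mapping blocks, and each
 * zone owns zone_nr_bitmap_blocks consecutive blocks. With 4 KB metadata
 * blocks, one bitmap block holds 32768 bits, so chunk_block >>
 * DMZ_BLOCK_SHIFT_BITS (a shift by 15) selects the bitmap block within
 * the zone.
 */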
2414/*
2415 * Copy the valid blocks bitmap of from_zone to the bitmap of to_zone.
2416 */
2417int dmz_copy_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
2418                          struct dm_zone *to_zone)
2419{
2420        struct dmz_mblock *from_mblk, *to_mblk;
2421        sector_t chunk_block = 0;
2422
2423        /* Get the zones bitmap blocks */
2424        while (chunk_block < zmd->zone_nr_blocks) {
2425                from_mblk = dmz_get_bitmap(zmd, from_zone, chunk_block);
2426                if (IS_ERR(from_mblk))
2427                        return PTR_ERR(from_mblk);
2428                to_mblk = dmz_get_bitmap(zmd, to_zone, chunk_block);
2429                if (IS_ERR(to_mblk)) {
2430                        dmz_release_mblock(zmd, from_mblk);
2431                        return PTR_ERR(to_mblk);
2432                }
2433
2434                memcpy(to_mblk->data, from_mblk->data, DMZ_BLOCK_SIZE);
2435                dmz_dirty_mblock(zmd, to_mblk);
2436
2437                dmz_release_mblock(zmd, to_mblk);
2438                dmz_release_mblock(zmd, from_mblk);
2439
2440                chunk_block += zmd->zone_bits_per_mblk;
2441        }
2442
2443        to_zone->weight = from_zone->weight;
2444
2445        return 0;
2446}
2447
2448/*
2449 * Merge the valid blocks bitmap of from_zone into the bitmap of to_zone,
2450 * starting from chunk_block.
2451 */
2452int dmz_merge_valid_blocks(struct dmz_metadata *zmd, struct dm_zone *from_zone,
2453                           struct dm_zone *to_zone, sector_t chunk_block)
2454{
2455        unsigned int nr_blocks;
2456        int ret;
2457
2458        /* Get the zones bitmap blocks */
2459        while (chunk_block < zmd->zone_nr_blocks) {
2460                /* Get a valid region from the source zone */
2461                ret = dmz_first_valid_block(zmd, from_zone, &chunk_block);
2462                if (ret <= 0)
2463                        return ret;
2464
2465                nr_blocks = ret;
2466                ret = dmz_validate_blocks(zmd, to_zone, chunk_block, nr_blocks);
2467                if (ret)
2468                        return ret;
2469
2470                chunk_block += nr_blocks;
2471        }
2472
2473        return 0;
2474}
2475
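/*
 * The merge walks valid extents rather than whole bitmap blocks: each
 * dmz_first_valid_block() call yields the start and length of the next
 * valid run in from_zone, and only those runs are marked valid in to_zone.
 * Reclaim uses this when folding a buffer zone's valid blocks into the
 * data zone it buffers.
 */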
2476/*
2477 * Validate all the blocks in the range [block..block+nr_blocks-1].
2478 */
2479int dmz_validate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2480                        sector_t chunk_block, unsigned int nr_blocks)
2481{
2482        unsigned int count, bit, nr_bits;
2483        unsigned int zone_nr_blocks = zmd->zone_nr_blocks;
2484        struct dmz_mblock *mblk;
2485        unsigned int n = 0;
2486
2487        dmz_zmd_debug(zmd, "=> VALIDATE zone %u, block %llu, %u blocks",
2488                      zone->id, (unsigned long long)chunk_block,
2489                      nr_blocks);
2490
2491        WARN_ON(chunk_block + nr_blocks > zone_nr_blocks);
2492
2493        while (nr_blocks) {
2494                /* Get bitmap block */
2495                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2496                if (IS_ERR(mblk))
2497                        return PTR_ERR(mblk);
2498
2499                /* Set bits */
2500                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2501                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
2502
2503                count = dmz_set_bits((unsigned long *)mblk->data, bit, nr_bits);
2504                if (count) {
2505                        dmz_dirty_mblock(zmd, mblk);
2506                        n += count;
2507                }
2508                dmz_release_mblock(zmd, mblk);
2509
2510                nr_blocks -= nr_bits;
2511                chunk_block += nr_bits;
2512        }
2513
2514        if (likely(zone->weight + n <= zone_nr_blocks))
2515                zone->weight += n;
2516        else {
2517                dmz_zmd_warn(zmd, "Zone %u: weight %u should be <= %u",
2518                             zone->id, zone->weight,
2519                             zone_nr_blocks - n);
2520                zone->weight = zone_nr_blocks;
2521        }
2522
2523        return 0;
2524}
2525
2526/*
2527 * Clear nr_bits bits in bitmap starting from bit.
2528 * Return the number of bits cleared.
2529 */
2530static int dmz_clear_bits(unsigned long *bitmap, int bit, int nr_bits)
2531{
2532        unsigned long *addr;
2533        int end = bit + nr_bits;
2534        int n = 0;
2535
2536        while (bit < end) {
2537                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2538                    ((end - bit) >= BITS_PER_LONG)) {
2539                        /* Try to clear whole word at once */
2540                        addr = bitmap + BIT_WORD(bit);
2541                        if (*addr == ULONG_MAX) {
2542                                *addr = 0;
2543                                n += BITS_PER_LONG;
2544                                bit += BITS_PER_LONG;
2545                                continue;
2546                        }
2547                }
2548
2549                if (test_and_clear_bit(bit, bitmap))
2550                        n++;
2551                bit++;
2552        }
2553
2554        return n;
2555}
2556
2557/*
2558 * Invalidate all the blocks in the range [block..block+nr_blocks-1].
2559 */
2560int dmz_invalidate_blocks(struct dmz_metadata *zmd, struct dm_zone *zone,
2561                          sector_t chunk_block, unsigned int nr_blocks)
2562{
2563        unsigned int count, bit, nr_bits;
2564        struct dmz_mblock *mblk;
2565        unsigned int n = 0;
2566
2567        dmz_zmd_debug(zmd, "=> INVALIDATE zone %u, block %llu, %u blocks",
2568                      zone->id, (u64)chunk_block, nr_blocks);
2569
2570        WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
2571
2572        while (nr_blocks) {
2573                /* Get bitmap block */
2574                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2575                if (IS_ERR(mblk))
2576                        return PTR_ERR(mblk);
2577
2578                /* Clear bits */
2579                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2580                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
2581
2582                count = dmz_clear_bits((unsigned long *)mblk->data,
2583                                       bit, nr_bits);
2584                if (count) {
2585                        dmz_dirty_mblock(zmd, mblk);
2586                        n += count;
2587                }
2588                dmz_release_mblock(zmd, mblk);
2589
2590                nr_blocks -= nr_bits;
2591                chunk_block += nr_bits;
2592        }
2593
2594        if (zone->weight >= n)
2595                zone->weight -= n;
2596        else {
2597                dmz_zmd_warn(zmd, "Zone %u: weight %u should be >= %u",
2598                             zone->id, zone->weight, n);
2599                zone->weight = 0;
2600        }
2601
2602        return 0;
2603}
2604
2605/*
2606 * Get a block bit value.
2607 */
2608static int dmz_test_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2609                          sector_t chunk_block)
2610{
2611        struct dmz_mblock *mblk;
2612        int ret;
2613
2614        WARN_ON(chunk_block >= zmd->zone_nr_blocks);
2615
2616        /* Get bitmap block */
2617        mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2618        if (IS_ERR(mblk))
2619                return PTR_ERR(mblk);
2620
2621        /* Get offset */
2622        ret = test_bit(chunk_block & DMZ_BLOCK_MASK_BITS,
2623                       (unsigned long *) mblk->data) != 0;
2624
2625        dmz_release_mblock(zmd, mblk);
2626
2627        return ret;
2628}
2629
2630/*
2631 * Return the number of blocks from chunk_block to the first block with a bit
2632 * value specified by set. Search at most nr_blocks blocks from chunk_block.
2633 */
2634static int dmz_to_next_set_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2635                                 sector_t chunk_block, unsigned int nr_blocks,
2636                                 int set)
2637{
2638        struct dmz_mblock *mblk;
2639        unsigned int bit, set_bit, nr_bits;
2640        unsigned int zone_bits = zmd->zone_bits_per_mblk;
2641        unsigned long *bitmap;
2642        int n = 0;
2643
2644        WARN_ON(chunk_block + nr_blocks > zmd->zone_nr_blocks);
2645
2646        while (nr_blocks) {
2647                /* Get bitmap block */
2648                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2649                if (IS_ERR(mblk))
2650                        return PTR_ERR(mblk);
2651
2652                /* Get offset */
2653                bitmap = (unsigned long *) mblk->data;
2654                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2655                nr_bits = min(nr_blocks, zone_bits - bit);
2656                if (set)
2657                        set_bit = find_next_bit(bitmap, zone_bits, bit);
2658                else
2659                        set_bit = find_next_zero_bit(bitmap, zone_bits, bit);
2660                dmz_release_mblock(zmd, mblk);
2661
2662                n += set_bit - bit;
2663                if (set_bit < zone_bits)
2664                        break;
2665
2666                nr_blocks -= nr_bits;
2667                chunk_block += nr_bits;
2668        }
2669
2670        return n;
2671}
2672
2673/*
2674 * Test if chunk_block is valid. If it is, the number of consecutive
2675 * valid blocks from chunk_block will be returned.
2676 */
2677int dmz_block_valid(struct dmz_metadata *zmd, struct dm_zone *zone,
2678                    sector_t chunk_block)
2679{
2680        int valid;
2681
2682        valid = dmz_test_block(zmd, zone, chunk_block);
2683        if (valid <= 0)
2684                return valid;
2685
2686        /* The block is valid: get the number of valid blocks from block */
2687        return dmz_to_next_set_block(zmd, zone, chunk_block,
2688                                     zmd->zone_nr_blocks - chunk_block, 0);
2689}
2690
2691/*
2692 * Find the first valid block from @chunk_block in @zone.
2693 * If such a block is found, its number is returned using
2694 * @chunk_block and the total number of valid blocks from @chunk_block
2695 * is returned.
2696 */
2697int dmz_first_valid_block(struct dmz_metadata *zmd, struct dm_zone *zone,
2698                          sector_t *chunk_block)
2699{
2700        sector_t start_block = *chunk_block;
2701        int ret;
2702
2703        ret = dmz_to_next_set_block(zmd, zone, start_block,
2704                                    zmd->zone_nr_blocks - start_block, 1);
2705        if (ret < 0)
2706                return ret;
2707
2708        start_block += ret;
2709        *chunk_block = start_block;
2710
2711        return dmz_to_next_set_block(zmd, zone, start_block,
2712                                     zmd->zone_nr_blocks - start_block, 0);
2713}
2714
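/*
 * Worked example: with per-block validity bits 0,0,1,1,1,0,... and
 * *chunk_block == 0, the first dmz_to_next_set_block() call (set == 1)
 * skips the two invalid blocks so *chunk_block becomes 2, and the second
 * call (set == 0) measures the valid run, making the function return 3.
 */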
2715/*
2716 * Count the number of bits set starting from bit up to bit + nr_bits - 1.
2717 */
2718static int dmz_count_bits(void *bitmap, int bit, int nr_bits)
2719{
2720        unsigned long *addr;
2721        int end = bit + nr_bits;
2722        int n = 0;
2723
2724        while (bit < end) {
2725                if (((bit & (BITS_PER_LONG - 1)) == 0) &&
2726                    ((end - bit) >= BITS_PER_LONG)) {
2727                        addr = (unsigned long *)bitmap + BIT_WORD(bit);
2728                        if (*addr == ULONG_MAX) {
2729                                n += BITS_PER_LONG;
2730                                bit += BITS_PER_LONG;
2731                                continue;
2732                        }
2733                }
2734
2735                if (test_bit(bit, bitmap))
2736                        n++;
2737                bit++;
2738        }
2739
2740        return n;
2741}
2742
2743/*
2744 * Get a zone weight.
2745 */
2746static void dmz_get_zone_weight(struct dmz_metadata *zmd, struct dm_zone *zone)
2747{
2748        struct dmz_mblock *mblk;
2749        sector_t chunk_block = 0;
2750        unsigned int bit, nr_bits;
2751        unsigned int nr_blocks = zmd->zone_nr_blocks;
2752        void *bitmap;
2753        int n = 0;
2754
2755        while (nr_blocks) {
2756                /* Get bitmap block */
2757                mblk = dmz_get_bitmap(zmd, zone, chunk_block);
2758                if (IS_ERR(mblk)) {
2759                        n = 0;
2760                        break;
2761                }
2762
2763                /* Count bits in this block */
2764                bitmap = mblk->data;
2765                bit = chunk_block & DMZ_BLOCK_MASK_BITS;
2766                nr_bits = min(nr_blocks, zmd->zone_bits_per_mblk - bit);
2767                n += dmz_count_bits(bitmap, bit, nr_bits);
2768
2769                dmz_release_mblock(zmd, mblk);
2770
2771                nr_blocks -= nr_bits;
2772                chunk_block += nr_bits;
2773        }
2774
2775        zone->weight = n;
2776}
2777
2778/*
2779 * Cleanup the zoned metadata resources.
2780 */
2781static void dmz_cleanup_metadata(struct dmz_metadata *zmd)
2782{
2783        struct rb_root *root;
2784        struct dmz_mblock *mblk, *next;
2785        int i;
2786
2787        /* Release zone mapping resources */
2788        if (zmd->map_mblk) {
2789                for (i = 0; i < zmd->nr_map_blocks; i++)
2790                        dmz_release_mblock(zmd, zmd->map_mblk[i]);
2791                kfree(zmd->map_mblk);
2792                zmd->map_mblk = NULL;
2793        }
2794
2795        /* Release super blocks */
2796        for (i = 0; i < 2; i++) {
2797                if (zmd->sb[i].mblk) {
2798                        dmz_free_mblock(zmd, zmd->sb[i].mblk);
2799                        zmd->sb[i].mblk = NULL;
2800                }
2801        }
2802
2803        /* Free cached blocks */
2804        while (!list_empty(&zmd->mblk_dirty_list)) {
2805                mblk = list_first_entry(&zmd->mblk_dirty_list,
2806                                        struct dmz_mblock, link);
2807                dmz_zmd_warn(zmd, "mblock %llu still in dirty list (ref %u)",
2808                             (u64)mblk->no, mblk->ref);
2809                list_del_init(&mblk->link);
2810                rb_erase(&mblk->node, &zmd->mblk_rbtree);
2811                dmz_free_mblock(zmd, mblk);
2812        }
2813
2814        while (!list_empty(&zmd->mblk_lru_list)) {
2815                mblk = list_first_entry(&zmd->mblk_lru_list,
2816                                        struct dmz_mblock, link);
2817                list_del_init(&mblk->link);
2818                rb_erase(&mblk->node, &zmd->mblk_rbtree);
2819                dmz_free_mblock(zmd, mblk);
2820        }
2821
2822        /* Sanity checks: the mblock rbtree should now be empty */
2823        root = &zmd->mblk_rbtree;
2824        rbtree_postorder_for_each_entry_safe(mblk, next, root, node) {
2825                dmz_zmd_warn(zmd, "mblock %llu ref %u still in rbtree",
2826                             (u64)mblk->no, mblk->ref);
2827                mblk->ref = 0;
2828                dmz_free_mblock(zmd, mblk);
2829        }
2830
2831        /* Free the zone descriptors */
2832        dmz_drop_zones(zmd);
2833
2834        mutex_destroy(&zmd->mblk_flush_lock);
2835        mutex_destroy(&zmd->map_lock);
2836}
2837
2838static void dmz_print_dev(struct dmz_metadata *zmd, int num)
2839{
2840        struct dmz_dev *dev = &zmd->dev[num];
2841
2842        if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE)
2843                dmz_dev_info(dev, "Regular block device");
2844        else
2845                dmz_dev_info(dev, "Host-%s zoned block device",
2846                             bdev_zoned_model(dev->bdev) == BLK_ZONED_HA ?
2847                             "aware" : "managed");
2848        if (zmd->sb_version > 1) {
2849                sector_t sector_offset =
2850                        dev->zone_offset << zmd->zone_nr_sectors_shift;
2851
2852                dmz_dev_info(dev, "  %llu 512-byte logical sectors (offset %llu)",
2853                             (u64)dev->capacity, (u64)sector_offset);
2854                dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors (offset %llu)",
2855                             dev->nr_zones, (u64)zmd->zone_nr_sectors,
2856                             (u64)dev->zone_offset);
2857        } else {
2858                dmz_dev_info(dev, "  %llu 512-byte logical sectors",
2859                             (u64)dev->capacity);
2860                dmz_dev_info(dev, "  %u zones of %llu 512-byte logical sectors",
2861                             dev->nr_zones, (u64)zmd->zone_nr_sectors);
2862        }
2863}
2864
2865/*
2866 * Initialize the zoned metadata.
2867 */
2868int dmz_ctr_metadata(struct dmz_dev *dev, int num_dev,
2869                     struct dmz_metadata **metadata,
2870                     const char *devname)
2871{
2872        struct dmz_metadata *zmd;
2873        unsigned int i;
2874        struct dm_zone *zone;
2875        int ret;
2876
2877        zmd = kzalloc(sizeof(struct dmz_metadata), GFP_KERNEL);
2878        if (!zmd)
2879                return -ENOMEM;
2880
2881        strcpy(zmd->devname, devname);
2882        zmd->dev = dev;
2883        zmd->nr_devs = num_dev;
2884        zmd->mblk_rbtree = RB_ROOT;
2885        init_rwsem(&zmd->mblk_sem);
2886        mutex_init(&zmd->mblk_flush_lock);
2887        spin_lock_init(&zmd->mblk_lock);
2888        INIT_LIST_HEAD(&zmd->mblk_lru_list);
2889        INIT_LIST_HEAD(&zmd->mblk_dirty_list);
2890
2891        mutex_init(&zmd->map_lock);
2892
2893        atomic_set(&zmd->unmap_nr_cache, 0);
2894        INIT_LIST_HEAD(&zmd->unmap_cache_list);
2895        INIT_LIST_HEAD(&zmd->map_cache_list);
2896
2897        atomic_set(&zmd->nr_reserved_seq_zones, 0);
2898        INIT_LIST_HEAD(&zmd->reserved_seq_zones_list);
2899
2900        init_waitqueue_head(&zmd->free_wq);
2901
2902        /* Initialize zone descriptors */
2903        ret = dmz_init_zones(zmd);
2904        if (ret)
2905                goto err;
2906
2907        /* Get super block */
2908        ret = dmz_load_sb(zmd);
2909        if (ret)
2910                goto err;
2911
2912        /* Set metadata zones starting from sb_zone */
2913        for (i = 0; i < zmd->nr_meta_zones << 1; i++) {
2914                zone = dmz_get(zmd, zmd->sb[0].zone->id + i);
2915                if (!zone) {
2916                        dmz_zmd_err(zmd,
2917                                    "metadata zone %u not present", i);
2918                        ret = -ENXIO;
2919                        goto err;
2920                }
2921                if (!dmz_is_rnd(zone) && !dmz_is_cache(zone)) {
2922                        dmz_zmd_err(zmd,
2923                            "metadata zone %u is neither random nor cache", i);
2924                        ret = -ENXIO;
2925                        goto err;
2926                }
2927                set_bit(DMZ_META, &zone->flags);
2928        }
2929        /* Load mapping table */
2930        ret = dmz_load_mapping(zmd);
2931        if (ret)
2932                goto err;
2933
2934        /*
2935         * Cache size boundaries: allow at least 2 super blocks, the chunk map
2936         * blocks and enough blocks to be able to cache the bitmap blocks of
2937         * up to 16 zones when idle (min_nr_mblks). Otherwise, if busy, allow
2938         * the cache to add 512 more metadata blocks.
2939         */
2940        zmd->min_nr_mblks = 2 + zmd->nr_map_blocks + zmd->zone_nr_bitmap_blocks * 16;
2941        zmd->max_nr_mblks = zmd->min_nr_mblks + 512;
2942        zmd->mblk_shrinker.count_objects = dmz_mblock_shrinker_count;
2943        zmd->mblk_shrinker.scan_objects = dmz_mblock_shrinker_scan;
2944        zmd->mblk_shrinker.seeks = DEFAULT_SEEKS;
2945
2946        /* Metadata cache shrinker */
2947        ret = register_shrinker(&zmd->mblk_shrinker);
2948        if (ret) {
2949                dmz_zmd_err(zmd, "Register metadata cache shrinker failed");
2950                goto err;
2951        }
2952
2953        dmz_zmd_info(zmd, "DM-Zoned metadata version %d", zmd->sb_version);
2954        for (i = 0; i < zmd->nr_devs; i++)
2955                dmz_print_dev(zmd, i);
2956
2957        dmz_zmd_info(zmd, "  %u zones of %llu 512-byte logical sectors",
2958                     zmd->nr_zones, (u64)zmd->zone_nr_sectors);
2959        dmz_zmd_debug(zmd, "  %u metadata zones",
2960                      zmd->nr_meta_zones * 2);
2961        dmz_zmd_debug(zmd, "  %u data zones for %u chunks",
2962                      zmd->nr_data_zones, zmd->nr_chunks);
2963        dmz_zmd_debug(zmd, "    %u cache zones (%u unmapped)",
2964                      zmd->nr_cache, atomic_read(&zmd->unmap_nr_cache));
2965        for (i = 0; i < zmd->nr_devs; i++) {
2966                dmz_zmd_debug(zmd, "    %u random zones (%u unmapped)",
2967                              dmz_nr_rnd_zones(zmd, i),
2968                              dmz_nr_unmap_rnd_zones(zmd, i));
2969                dmz_zmd_debug(zmd, "    %u sequential zones (%u unmapped)",
2970                              dmz_nr_seq_zones(zmd, i),
2971                              dmz_nr_unmap_seq_zones(zmd, i));
2972        }
2973        dmz_zmd_debug(zmd, "  %u reserved sequential data zones",
2974                      zmd->nr_reserved_seq);
2975        dmz_zmd_debug(zmd, "Format:");
2976        dmz_zmd_debug(zmd, "%u metadata blocks per set (%u max cache)",
2977                      zmd->nr_meta_blocks, zmd->max_nr_mblks);
2978        dmz_zmd_debug(zmd, "  %u data zone mapping blocks",
2979                      zmd->nr_map_blocks);
2980        dmz_zmd_debug(zmd, "  %u bitmap blocks",
2981                      zmd->nr_bitmap_blocks);
2982
2983        *metadata = zmd;
2984
2985        return 0;
2986err:
2987        dmz_cleanup_metadata(zmd);
2988        kfree(zmd);
2989        *metadata = NULL;
2990
2991        return ret;
2992}
2993
2994/*
2995 * Cleanup the zoned metadata resources.
2996 */
2997void dmz_dtr_metadata(struct dmz_metadata *zmd)
2998{
2999        unregister_shrinker(&zmd->mblk_shrinker);
3000        dmz_cleanup_metadata(zmd);
3001        kfree(zmd);
3002}
3003
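/*
 * Hedged lifecycle sketch (mirroring the dm-zoned target): construction
 * and teardown pair up as, approximately,
 *
 *	ret = dmz_ctr_metadata(dev, nr_devs, &zmd, devname);
 *	if (ret)
 *		return ret;
 *	...
 *	dmz_dtr_metadata(zmd);
 *
 * with dmz_dtr_metadata() unregistering the shrinker before the cached
 * metadata blocks are released.
 */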
3004/*
3005 * Check zone information on resume.
3006 */
3007int dmz_resume_metadata(struct dmz_metadata *zmd)
3008{
3009        struct dm_zone *zone;
3010        sector_t wp_block;
3011        unsigned int i;
3012        int ret;
3013
3014        /* Check zones */
3015        for (i = 0; i < zmd->nr_zones; i++) {
3016                zone = dmz_get(zmd, i);
3017                if (!zone) {
3018                        dmz_zmd_err(zmd, "Unable to get zone %u", i);
3019                        return -EIO;
3020                }
3021                wp_block = zone->wp_block;
3022
3023                ret = dmz_update_zone(zmd, zone);
3024                if (ret) {
3025                        dmz_zmd_err(zmd, "Broken zone %u", i);
3026                        return ret;
3027                }
3028
3029                if (dmz_is_offline(zone)) {
3030                        dmz_zmd_warn(zmd, "Zone %u is offline", i);
3031                        continue;
3032                }
3033
3034                /* Check write pointer */
3035                if (!dmz_is_seq(zone))
3036                        zone->wp_block = 0;
3037                else if (zone->wp_block != wp_block) {
3038                        dmz_zmd_err(zmd, "Zone %u: Invalid wp (%llu / %llu)",
3039                                    i, (u64)zone->wp_block, (u64)wp_block);
3040                        zone->wp_block = wp_block;
3041                        dmz_invalidate_blocks(zmd, zone, zone->wp_block,
3042                                              zmd->zone_nr_blocks - zone->wp_block);
3043                }
3044        }
3045
3046        return 0;
3047}
3048