linux/drivers/md/dm-zoned-target.c
   1// SPDX-License-Identifier: GPL-2.0-only
   2/*
   3 * Copyright (C) 2017 Western Digital Corporation or its affiliates.
   4 *
   5 * This file is released under the GPL.
   6 */
   7
   8#include "dm-zoned.h"
   9
  10#include <linux/module.h>
  11
  12#define DM_MSG_PREFIX           "zoned"
  13
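     /*
      * Minimum number of BIOs reserved in the clone BIO set
      * initialized with bioset_init() in dmz_ctr().
      */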
  14#define DMZ_MIN_BIOS            8192
  15
  16/*
  17 * Zone BIO context.
  18 */
  19struct dmz_bioctx {
  20        struct dmz_target       *target;
  21        struct dm_zone          *zone;
  22        struct bio              *bio;
  23        refcount_t              ref;
  24};
  25
  26/*
  27 * Chunk work descriptor.
  28 */
  29struct dm_chunk_work {
  30        struct work_struct      work;
  31        refcount_t              refcount;
  32        struct dmz_target       *target;
  33        unsigned int            chunk;
  34        struct bio_list         bio_list;
  35};
  36
  37/*
  38 * Target descriptor.
  39 */
  40struct dmz_target {
  41        struct dm_dev           *ddev;
  42
  43        unsigned long           flags;
  44
  45        /* Zoned block device information */
  46        struct dmz_dev          *dev;
  47
  48        /* For metadata handling */
  49        struct dmz_metadata     *metadata;
  50
  51        /* For reclaim */
  52        struct dmz_reclaim      *reclaim;
  53
  54        /* For chunk work */
  55        struct radix_tree_root  chunk_rxtree;
  56        struct workqueue_struct *chunk_wq;
  57        struct mutex            chunk_lock;
  58
  59        /* For cloned BIOs to zones */
  60        struct bio_set          bio_set;
  61
  62        /* For flush */
  63        spinlock_t              flush_lock;
  64        struct bio_list         flush_list;
  65        struct delayed_work     flush_work;
  66        struct workqueue_struct *flush_wq;
  67};
  68
  69/*
   70 * Flush period (10 seconds, expressed in jiffies).
  71 */
  72#define DMZ_FLUSH_PERIOD        (10 * HZ)
  73
  74/*
  75 * Target BIO completion.
  76 */
  77static inline void dmz_bio_endio(struct bio *bio, blk_status_t status)
  78{
  79        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
  80
  81        if (status != BLK_STS_OK && bio->bi_status == BLK_STS_OK)
  82                bio->bi_status = status;
  83
  84        if (refcount_dec_and_test(&bioctx->ref)) {
  85                struct dm_zone *zone = bioctx->zone;
  86
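                     /*
                      * A failed write to a sequential zone may leave the actual
                      * zone write pointer out of sync with wp_block: flag the
                      * zone so its write pointer is checked before further use.
                      */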
  87                if (zone) {
  88                        if (bio->bi_status != BLK_STS_OK &&
  89                            bio_op(bio) == REQ_OP_WRITE &&
  90                            dmz_is_seq(zone))
  91                                set_bit(DMZ_SEQ_WRITE_ERR, &zone->flags);
  92                        dmz_deactivate_zone(zone);
  93                }
  94                bio_endio(bio);
  95        }
  96}
  97
  98/*
  99 * Completion callback for an internally cloned target BIO. This terminates the
 100 * target BIO when there are no more references to its context.
 101 */
 102static void dmz_clone_endio(struct bio *clone)
 103{
 104        struct dmz_bioctx *bioctx = clone->bi_private;
 105        blk_status_t status = clone->bi_status;
 106
 107        bio_put(clone);
 108        dmz_bio_endio(bioctx->bio, status);
 109}
 110
 111/*
 112 * Issue a clone of a target BIO. The clone may only partially process the
 113 * original target BIO.
 114 */
 115static int dmz_submit_bio(struct dmz_target *dmz, struct dm_zone *zone,
 116                          struct bio *bio, sector_t chunk_block,
 117                          unsigned int nr_blocks)
 118{
 119        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
 120        struct bio *clone;
 121
 122        clone = bio_clone_fast(bio, GFP_NOIO, &dmz->bio_set);
 123        if (!clone)
 124                return -ENOMEM;
 125
 126        bio_set_dev(clone, dmz->dev->bdev);
 127        clone->bi_iter.bi_sector =
 128                dmz_start_sect(dmz->metadata, zone) + dmz_blk2sect(chunk_block);
 129        clone->bi_iter.bi_size = dmz_blk2sect(nr_blocks) << SECTOR_SHIFT;
 130        clone->bi_end_io = dmz_clone_endio;
 131        clone->bi_private = bioctx;
 132
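             /*
              * Advance the target BIO past the blocks covered by this clone
              * so that any remaining blocks can be submitted with further clones.
              */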
 133        bio_advance(bio, clone->bi_iter.bi_size);
 134
 135        refcount_inc(&bioctx->ref);
 136        generic_make_request(clone);
 137        if (clone->bi_status == BLK_STS_IOERR)
 138                return -EIO;
 139
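             /* A write to a sequential zone advances its write pointer tracking */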
 140        if (bio_op(bio) == REQ_OP_WRITE && dmz_is_seq(zone))
 141                zone->wp_block += nr_blocks;
 142
 143        return 0;
 144}
 145
 146/*
  147 * Zero out the pages of invalid blocks accessed by a read BIO.
 148 */
 149static void dmz_handle_read_zero(struct dmz_target *dmz, struct bio *bio,
 150                                 sector_t chunk_block, unsigned int nr_blocks)
 151{
 152        unsigned int size = nr_blocks << DMZ_BLOCK_SHIFT;
 153
  154        /* Limit the BIO size to nr_blocks while zeroing, then restore it */
 155        swap(bio->bi_iter.bi_size, size);
 156        zero_fill_bio(bio);
 157        swap(bio->bi_iter.bi_size, size);
 158
 159        bio_advance(bio, size);
 160}
 161
 162/*
 163 * Process a read BIO.
 164 */
 165static int dmz_handle_read(struct dmz_target *dmz, struct dm_zone *zone,
 166                           struct bio *bio)
 167{
 168        sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
 169        unsigned int nr_blocks = dmz_bio_blocks(bio);
 170        sector_t end_block = chunk_block + nr_blocks;
 171        struct dm_zone *rzone, *bzone;
 172        int ret;
 173
  174        /* Reads of unmapped chunks only need to zero the BIO buffer */
 175        if (!zone) {
 176                zero_fill_bio(bio);
 177                return 0;
 178        }
 179
 180        dmz_dev_debug(dmz->dev, "READ chunk %llu -> %s zone %u, block %llu, %u blocks",
 181                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
 182                      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
 183                      dmz_id(dmz->metadata, zone),
 184                      (unsigned long long)chunk_block, nr_blocks);
 185
 186        /* Check block validity to determine the read location */
 187        bzone = zone->bzone;
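             /*
              * Walk the chunk block range: read each run of blocks from the
              * data zone if valid there, else from the buffer zone if valid
              * there, and zero-fill blocks that are valid in neither.
              */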
 188        while (chunk_block < end_block) {
 189                nr_blocks = 0;
 190                if (dmz_is_rnd(zone) || chunk_block < zone->wp_block) {
 191                        /* Test block validity in the data zone */
 192                        ret = dmz_block_valid(dmz->metadata, zone, chunk_block);
 193                        if (ret < 0)
 194                                return ret;
 195                        if (ret > 0) {
 196                                /* Read data zone blocks */
 197                                nr_blocks = ret;
 198                                rzone = zone;
 199                        }
 200                }
 201
 202                /*
 203                 * No valid blocks found in the data zone.
 204                 * Check the buffer zone, if there is one.
 205                 */
 206                if (!nr_blocks && bzone) {
 207                        ret = dmz_block_valid(dmz->metadata, bzone, chunk_block);
 208                        if (ret < 0)
 209                                return ret;
 210                        if (ret > 0) {
 211                                /* Read buffer zone blocks */
 212                                nr_blocks = ret;
 213                                rzone = bzone;
 214                        }
 215                }
 216
 217                if (nr_blocks) {
 218                        /* Valid blocks found: read them */
 219                        nr_blocks = min_t(unsigned int, nr_blocks, end_block - chunk_block);
 220                        ret = dmz_submit_bio(dmz, rzone, bio, chunk_block, nr_blocks);
 221                        if (ret)
 222                                return ret;
 223                        chunk_block += nr_blocks;
 224                } else {
  225                        /* No valid block: zero out the current BIO block */
 226                        dmz_handle_read_zero(dmz, bio, chunk_block, 1);
 227                        chunk_block++;
 228                }
 229        }
 230
 231        return 0;
 232}
 233
 234/*
 235 * Write blocks directly in a data zone, at the write pointer.
 236 * If a buffer zone is assigned, invalidate the blocks written
 237 * in place.
 238 */
 239static int dmz_handle_direct_write(struct dmz_target *dmz,
 240                                   struct dm_zone *zone, struct bio *bio,
 241                                   sector_t chunk_block,
 242                                   unsigned int nr_blocks)
 243{
 244        struct dmz_metadata *zmd = dmz->metadata;
 245        struct dm_zone *bzone = zone->bzone;
 246        int ret;
 247
 248        if (dmz_is_readonly(zone))
 249                return -EROFS;
 250
 251        /* Submit write */
 252        ret = dmz_submit_bio(dmz, zone, bio, chunk_block, nr_blocks);
 253        if (ret)
 254                return ret;
 255
 256        /*
 257         * Validate the blocks in the data zone and invalidate
 258         * in the buffer zone, if there is one.
 259         */
 260        ret = dmz_validate_blocks(zmd, zone, chunk_block, nr_blocks);
 261        if (ret == 0 && bzone)
 262                ret = dmz_invalidate_blocks(zmd, bzone, chunk_block, nr_blocks);
 263
 264        return ret;
 265}
 266
 267/*
 268 * Write blocks in the buffer zone of @zone.
 269 * If no buffer zone is assigned yet, get one.
 270 * Called with @zone write locked.
 271 */
 272static int dmz_handle_buffered_write(struct dmz_target *dmz,
 273                                     struct dm_zone *zone, struct bio *bio,
 274                                     sector_t chunk_block,
 275                                     unsigned int nr_blocks)
 276{
 277        struct dmz_metadata *zmd = dmz->metadata;
 278        struct dm_zone *bzone;
 279        int ret;
 280
 281        /* Get the buffer zone. One will be allocated if needed */
 282        bzone = dmz_get_chunk_buffer(zmd, zone);
 283        if (IS_ERR(bzone))
 284                return PTR_ERR(bzone);
 285
 286        if (dmz_is_readonly(bzone))
 287                return -EROFS;
 288
 289        /* Submit write */
 290        ret = dmz_submit_bio(dmz, bzone, bio, chunk_block, nr_blocks);
 291        if (ret)
 292                return ret;
 293
 294        /*
 295         * Validate the blocks in the buffer zone
 296         * and invalidate in the data zone.
 297         */
 298        ret = dmz_validate_blocks(zmd, bzone, chunk_block, nr_blocks);
 299        if (ret == 0 && chunk_block < zone->wp_block)
 300                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
 301
 302        return ret;
 303}
 304
 305/*
 306 * Process a write BIO.
 307 */
 308static int dmz_handle_write(struct dmz_target *dmz, struct dm_zone *zone,
 309                            struct bio *bio)
 310{
 311        sector_t chunk_block = dmz_chunk_block(dmz->dev, dmz_bio_block(bio));
 312        unsigned int nr_blocks = dmz_bio_blocks(bio);
 313
 314        if (!zone)
 315                return -ENOSPC;
 316
 317        dmz_dev_debug(dmz->dev, "WRITE chunk %llu -> %s zone %u, block %llu, %u blocks",
 318                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
 319                      (dmz_is_rnd(zone) ? "RND" : "SEQ"),
 320                      dmz_id(dmz->metadata, zone),
 321                      (unsigned long long)chunk_block, nr_blocks);
 322
 323        if (dmz_is_rnd(zone) || chunk_block == zone->wp_block) {
 324                /*
  325                 * The zone is a random zone, or it is a sequential zone
  326                 * and the BIO is aligned with the zone write pointer:
  327                 * write directly to the zone.
 328                 */
 329                return dmz_handle_direct_write(dmz, zone, bio, chunk_block, nr_blocks);
 330        }
 331
 332        /*
 333         * This is an unaligned write in a sequential zone:
 334         * use buffered write.
 335         */
 336        return dmz_handle_buffered_write(dmz, zone, bio, chunk_block, nr_blocks);
 337}
 338
 339/*
 340 * Process a discard BIO.
 341 */
 342static int dmz_handle_discard(struct dmz_target *dmz, struct dm_zone *zone,
 343                              struct bio *bio)
 344{
 345        struct dmz_metadata *zmd = dmz->metadata;
 346        sector_t block = dmz_bio_block(bio);
 347        unsigned int nr_blocks = dmz_bio_blocks(bio);
 348        sector_t chunk_block = dmz_chunk_block(dmz->dev, block);
 349        int ret = 0;
 350
 351        /* For unmapped chunks, there is nothing to do */
 352        if (!zone)
 353                return 0;
 354
 355        if (dmz_is_readonly(zone))
 356                return -EROFS;
 357
 358        dmz_dev_debug(dmz->dev, "DISCARD chunk %llu -> zone %u, block %llu, %u blocks",
 359                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
 360                      dmz_id(zmd, zone),
 361                      (unsigned long long)chunk_block, nr_blocks);
 362
 363        /*
 364         * Invalidate blocks in the data zone and its
 365         * buffer zone if one is mapped.
 366         */
 367        if (dmz_is_rnd(zone) || chunk_block < zone->wp_block)
 368                ret = dmz_invalidate_blocks(zmd, zone, chunk_block, nr_blocks);
 369        if (ret == 0 && zone->bzone)
 370                ret = dmz_invalidate_blocks(zmd, zone->bzone,
 371                                            chunk_block, nr_blocks);
 372        return ret;
 373}
 374
 375/*
 376 * Process a BIO.
 377 */
 378static void dmz_handle_bio(struct dmz_target *dmz, struct dm_chunk_work *cw,
 379                           struct bio *bio)
 380{
 381        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
 382        struct dmz_metadata *zmd = dmz->metadata;
 383        struct dm_zone *zone;
 384        int ret;
 385
 386        /*
 387         * Write may trigger a zone allocation. So make sure the
 388         * allocation can succeed.
 389         */
 390        if (bio_op(bio) == REQ_OP_WRITE)
 391                dmz_schedule_reclaim(dmz->reclaim);
 392
 393        dmz_lock_metadata(zmd);
 394
 395        if (dmz->dev->flags & DMZ_BDEV_DYING) {
 396                ret = -EIO;
 397                goto out;
 398        }
 399
 400        /*
 401         * Get the data zone mapping the chunk. There may be no
 402         * mapping for read and discard. If a mapping is obtained,
  403         * the zone returned will be set to active state.
 404         */
 405        zone = dmz_get_chunk_mapping(zmd, dmz_bio_chunk(dmz->dev, bio),
 406                                     bio_op(bio));
 407        if (IS_ERR(zone)) {
 408                ret = PTR_ERR(zone);
 409                goto out;
 410        }
 411
 412        /* Process the BIO */
 413        if (zone) {
 414                dmz_activate_zone(zone);
 415                bioctx->zone = zone;
 416        }
 417
 418        switch (bio_op(bio)) {
 419        case REQ_OP_READ:
 420                ret = dmz_handle_read(dmz, zone, bio);
 421                break;
 422        case REQ_OP_WRITE:
 423                ret = dmz_handle_write(dmz, zone, bio);
 424                break;
 425        case REQ_OP_DISCARD:
 426        case REQ_OP_WRITE_ZEROES:
 427                ret = dmz_handle_discard(dmz, zone, bio);
 428                break;
 429        default:
 430                dmz_dev_err(dmz->dev, "Unsupported BIO operation 0x%x",
 431                            bio_op(bio));
 432                ret = -EIO;
 433        }
 434
 435        /*
 436         * Release the chunk mapping. This will check that the mapping
 437         * is still valid, that is, that the zone used still has valid blocks.
 438         */
 439        if (zone)
 440                dmz_put_chunk_mapping(zmd, zone);
 441out:
 442        dmz_bio_endio(bio, errno_to_blk_status(ret));
 443
 444        dmz_unlock_metadata(zmd);
 445}
 446
 447/*
 448 * Increment a chunk reference counter.
 449 */
 450static inline void dmz_get_chunk_work(struct dm_chunk_work *cw)
 451{
 452        refcount_inc(&cw->refcount);
 453}
 454
 455/*
 456 * Decrement a chunk work reference count and
 457 * free it if it becomes 0.
 458 */
 459static void dmz_put_chunk_work(struct dm_chunk_work *cw)
 460{
 461        if (refcount_dec_and_test(&cw->refcount)) {
 462                WARN_ON(!bio_list_empty(&cw->bio_list));
 463                radix_tree_delete(&cw->target->chunk_rxtree, cw->chunk);
 464                kfree(cw);
 465        }
 466}
 467
 468/*
 469 * Chunk BIO work function.
 470 */
 471static void dmz_chunk_work(struct work_struct *work)
 472{
 473        struct dm_chunk_work *cw = container_of(work, struct dm_chunk_work, work);
 474        struct dmz_target *dmz = cw->target;
 475        struct bio *bio;
 476
 477        mutex_lock(&dmz->chunk_lock);
 478
 479        /* Process the chunk BIOs */
 480        while ((bio = bio_list_pop(&cw->bio_list))) {
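                     /*
                      * chunk_lock only protects the chunk radix tree and the
                      * per-chunk BIO lists: drop it while the BIO is processed.
                      */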
 481                mutex_unlock(&dmz->chunk_lock);
 482                dmz_handle_bio(dmz, cw, bio);
 483                mutex_lock(&dmz->chunk_lock);
 484                dmz_put_chunk_work(cw);
 485        }
 486
 487        /* Queueing the work incremented the work refcount */
 488        dmz_put_chunk_work(cw);
 489
 490        mutex_unlock(&dmz->chunk_lock);
 491}
 492
 493/*
 494 * Flush work.
 495 */
 496static void dmz_flush_work(struct work_struct *work)
 497{
 498        struct dmz_target *dmz = container_of(work, struct dmz_target, flush_work.work);
 499        struct bio *bio;
 500        int ret;
 501
 502        /* Flush dirty metadata blocks */
 503        ret = dmz_flush_metadata(dmz->metadata);
 504        if (ret)
 505                dmz_dev_debug(dmz->dev, "Metadata flush failed, rc=%d\n", ret);
 506
 507        /* Process queued flush requests */
 508        while (1) {
 509                spin_lock(&dmz->flush_lock);
 510                bio = bio_list_pop(&dmz->flush_list);
 511                spin_unlock(&dmz->flush_lock);
 512
 513                if (!bio)
 514                        break;
 515
 516                dmz_bio_endio(bio, errno_to_blk_status(ret));
 517        }
 518
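             /* Rearm the periodic metadata flush */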
 519        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
 520}
 521
 522/*
 523 * Get a chunk work and start it to process a new BIO.
 524 * If the BIO chunk has no work yet, create one.
 525 */
 526static int dmz_queue_chunk_work(struct dmz_target *dmz, struct bio *bio)
 527{
 528        unsigned int chunk = dmz_bio_chunk(dmz->dev, bio);
 529        struct dm_chunk_work *cw;
 530        int ret = 0;
 531
 532        mutex_lock(&dmz->chunk_lock);
 533
 534        /* Get the BIO chunk work. If one is not active yet, create one */
 535        cw = radix_tree_lookup(&dmz->chunk_rxtree, chunk);
 536        if (!cw) {
 537
 538                /* Create a new chunk work */
 539                cw = kmalloc(sizeof(struct dm_chunk_work), GFP_NOIO);
 540                if (unlikely(!cw)) {
 541                        ret = -ENOMEM;
 542                        goto out;
 543                }
 544
 545                INIT_WORK(&cw->work, dmz_chunk_work);
 546                refcount_set(&cw->refcount, 0);
 547                cw->target = dmz;
 548                cw->chunk = chunk;
 549                bio_list_init(&cw->bio_list);
 550
 551                ret = radix_tree_insert(&dmz->chunk_rxtree, chunk, cw);
 552                if (unlikely(ret)) {
 553                        kfree(cw);
 554                        goto out;
 555                }
 556        }
 557
 558        bio_list_add(&cw->bio_list, bio);
 559        dmz_get_chunk_work(cw);
 560
 561        dmz_reclaim_bio_acc(dmz->reclaim);
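             /*
              * queue_work() returns true only if the work was not already
              * queued: in that case take an extra reference, dropped by
              * dmz_chunk_work() once it has drained the BIO list.
              */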
 562        if (queue_work(dmz->chunk_wq, &cw->work))
 563                dmz_get_chunk_work(cw);
 564out:
 565        mutex_unlock(&dmz->chunk_lock);
 566        return ret;
 567}
 568
 569/*
 570 * Check the backing device availability. If it's on the way out,
 571 * start failing I/O. Reclaim and metadata components also call this
 572 * function to cleanly abort operation in the event of such failure.
 573 */
 574bool dmz_bdev_is_dying(struct dmz_dev *dmz_dev)
 575{
 576        struct gendisk *disk;
 577
 578        if (!(dmz_dev->flags & DMZ_BDEV_DYING)) {
 579                disk = dmz_dev->bdev->bd_disk;
 580                if (blk_queue_dying(bdev_get_queue(dmz_dev->bdev))) {
 581                        dmz_dev_warn(dmz_dev, "Backing device queue dying");
 582                        dmz_dev->flags |= DMZ_BDEV_DYING;
 583                } else if (disk->fops->check_events) {
 584                        if (disk->fops->check_events(disk, 0) &
 585                                        DISK_EVENT_MEDIA_CHANGE) {
 586                                dmz_dev_warn(dmz_dev, "Backing device offline");
 587                                dmz_dev->flags |= DMZ_BDEV_DYING;
 588                        }
 589                }
 590        }
 591
 592        return dmz_dev->flags & DMZ_BDEV_DYING;
 593}
 594
 595/*
 596 * Process a new BIO.
 597 */
 598static int dmz_map(struct dm_target *ti, struct bio *bio)
 599{
 600        struct dmz_target *dmz = ti->private;
 601        struct dmz_dev *dev = dmz->dev;
 602        struct dmz_bioctx *bioctx = dm_per_bio_data(bio, sizeof(struct dmz_bioctx));
 603        sector_t sector = bio->bi_iter.bi_sector;
 604        unsigned int nr_sectors = bio_sectors(bio);
 605        sector_t chunk_sector;
 606        int ret;
 607
 608        if (dmz_bdev_is_dying(dmz->dev))
 609                return DM_MAPIO_KILL;
 610
 611        dmz_dev_debug(dev, "BIO op %d sector %llu + %u => chunk %llu, block %llu, %u blocks",
 612                      bio_op(bio), (unsigned long long)sector, nr_sectors,
 613                      (unsigned long long)dmz_bio_chunk(dmz->dev, bio),
 614                      (unsigned long long)dmz_chunk_block(dmz->dev, dmz_bio_block(bio)),
 615                      (unsigned int)dmz_bio_blocks(bio));
 616
 617        bio_set_dev(bio, dev->bdev);
 618
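             /* Zero-length BIOs other than empty writes (flushes) are simply remapped */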
 619        if (!nr_sectors && bio_op(bio) != REQ_OP_WRITE)
 620                return DM_MAPIO_REMAPPED;
 621
 622        /* The BIO should be block aligned */
 623        if ((nr_sectors & DMZ_BLOCK_SECTORS_MASK) || (sector & DMZ_BLOCK_SECTORS_MASK))
 624                return DM_MAPIO_KILL;
 625
 626        /* Initialize the BIO context */
 627        bioctx->target = dmz;
 628        bioctx->zone = NULL;
 629        bioctx->bio = bio;
 630        refcount_set(&bioctx->ref, 1);
 631
  632        /* Empty write BIOs are flush requests: add them to the flush list */
 633        if (!nr_sectors && bio_op(bio) == REQ_OP_WRITE) {
 634                spin_lock(&dmz->flush_lock);
 635                bio_list_add(&dmz->flush_list, bio);
 636                spin_unlock(&dmz->flush_lock);
 637                mod_delayed_work(dmz->flush_wq, &dmz->flush_work, 0);
 638                return DM_MAPIO_SUBMITTED;
 639        }
 640
 641        /* Split zone BIOs to fit entirely into a zone */
 642        chunk_sector = sector & (dev->zone_nr_sectors - 1);
 643        if (chunk_sector + nr_sectors > dev->zone_nr_sectors)
 644                dm_accept_partial_bio(bio, dev->zone_nr_sectors - chunk_sector);
 645
 646        /* Now ready to handle this BIO */
 647        ret = dmz_queue_chunk_work(dmz, bio);
 648        if (ret) {
 649                dmz_dev_debug(dmz->dev,
 650                              "BIO op %d, can't process chunk %llu, err %i\n",
 651                              bio_op(bio), (u64)dmz_bio_chunk(dmz->dev, bio),
 652                              ret);
 653                return DM_MAPIO_REQUEUE;
 654        }
 655
 656        return DM_MAPIO_SUBMITTED;
 657}
 658
 659/*
 660 * Get zoned device information.
 661 */
 662static int dmz_get_zoned_device(struct dm_target *ti, char *path)
 663{
 664        struct dmz_target *dmz = ti->private;
 665        struct request_queue *q;
 666        struct dmz_dev *dev;
 667        sector_t aligned_capacity;
 668        int ret;
 669
 670        /* Get the target device */
 671        ret = dm_get_device(ti, path, dm_table_get_mode(ti->table), &dmz->ddev);
 672        if (ret) {
 673                ti->error = "Get target device failed";
 674                dmz->ddev = NULL;
 675                return ret;
 676        }
 677
 678        dev = kzalloc(sizeof(struct dmz_dev), GFP_KERNEL);
 679        if (!dev) {
 680                ret = -ENOMEM;
 681                goto err;
 682        }
 683
 684        dev->bdev = dmz->ddev->bdev;
 685        (void)bdevname(dev->bdev, dev->name);
 686
 687        if (bdev_zoned_model(dev->bdev) == BLK_ZONED_NONE) {
 688                ti->error = "Not a zoned block device";
 689                ret = -EINVAL;
 690                goto err;
 691        }
 692
 693        q = bdev_get_queue(dev->bdev);
 694        dev->capacity = i_size_read(dev->bdev->bd_inode) >> SECTOR_SHIFT;
 695        aligned_capacity = dev->capacity &
 696                                ~((sector_t)blk_queue_zone_sectors(q) - 1);
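             /*
              * The target must map the entire device: either its full capacity
              * or, if the last zone is smaller than the others (a runt zone),
              * the capacity rounded down to a zone boundary.
              */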
 697        if (ti->begin ||
 698            ((ti->len != dev->capacity) && (ti->len != aligned_capacity))) {
 699                ti->error = "Partial mapping not supported";
 700                ret = -EINVAL;
 701                goto err;
 702        }
 703
 704        dev->zone_nr_sectors = blk_queue_zone_sectors(q);
 705        dev->zone_nr_sectors_shift = ilog2(dev->zone_nr_sectors);
 706
 707        dev->zone_nr_blocks = dmz_sect2blk(dev->zone_nr_sectors);
 708        dev->zone_nr_blocks_shift = ilog2(dev->zone_nr_blocks);
 709
 710        dev->nr_zones = blkdev_nr_zones(dev->bdev);
 711
 712        dmz->dev = dev;
 713
 714        return 0;
 715err:
 716        dm_put_device(ti, dmz->ddev);
 717        kfree(dev);
 718
 719        return ret;
 720}
 721
 722/*
 723 * Cleanup zoned device information.
 724 */
 725static void dmz_put_zoned_device(struct dm_target *ti)
 726{
 727        struct dmz_target *dmz = ti->private;
 728
 729        dm_put_device(ti, dmz->ddev);
 730        kfree(dmz->dev);
 731        dmz->dev = NULL;
 732}
 733
 734/*
 735 * Setup target.
 736 */
 737static int dmz_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 738{
 739        struct dmz_target *dmz;
 740        struct dmz_dev *dev;
 741        int ret;
 742
 743        /* Check arguments */
 744        if (argc != 1) {
 745                ti->error = "Invalid argument count";
 746                return -EINVAL;
 747        }
 748
 749        /* Allocate and initialize the target descriptor */
 750        dmz = kzalloc(sizeof(struct dmz_target), GFP_KERNEL);
 751        if (!dmz) {
 752                ti->error = "Unable to allocate the zoned target descriptor";
 753                return -ENOMEM;
 754        }
 755        ti->private = dmz;
 756
 757        /* Get the target zoned block device */
 758        ret = dmz_get_zoned_device(ti, argv[0]);
 759        if (ret) {
 760                dmz->ddev = NULL;
 761                goto err;
 762        }
 763
 764        /* Initialize metadata */
 765        dev = dmz->dev;
 766        ret = dmz_ctr_metadata(dev, &dmz->metadata);
 767        if (ret) {
 768                ti->error = "Metadata initialization failed";
 769                goto err_dev;
 770        }
 771
 772        /* Set target (no write same support) */
  773        ti->max_io_len = dev->zone_nr_sectors; /* in 512B sectors */
 774        ti->num_flush_bios = 1;
 775        ti->num_discard_bios = 1;
 776        ti->num_write_zeroes_bios = 1;
 777        ti->per_io_data_size = sizeof(struct dmz_bioctx);
 778        ti->flush_supported = true;
 779        ti->discards_supported = true;
 780
  781        /* The exposed capacity is the number of mappable chunks times the zone size */
 782        ti->len = (sector_t)dmz_nr_chunks(dmz->metadata) << dev->zone_nr_sectors_shift;
 783
 784        /* Zone BIO */
 785        ret = bioset_init(&dmz->bio_set, DMZ_MIN_BIOS, 0, 0);
 786        if (ret) {
 787                ti->error = "Create BIO set failed";
 788                goto err_meta;
 789        }
 790
 791        /* Chunk BIO work */
 792        mutex_init(&dmz->chunk_lock);
 793        INIT_RADIX_TREE(&dmz->chunk_rxtree, GFP_NOIO);
 794        dmz->chunk_wq = alloc_workqueue("dmz_cwq_%s", WQ_MEM_RECLAIM | WQ_UNBOUND,
 795                                        0, dev->name);
 796        if (!dmz->chunk_wq) {
 797                ti->error = "Create chunk workqueue failed";
 798                ret = -ENOMEM;
 799                goto err_bio;
 800        }
 801
 802        /* Flush work */
 803        spin_lock_init(&dmz->flush_lock);
 804        bio_list_init(&dmz->flush_list);
 805        INIT_DELAYED_WORK(&dmz->flush_work, dmz_flush_work);
 806        dmz->flush_wq = alloc_ordered_workqueue("dmz_fwq_%s", WQ_MEM_RECLAIM,
 807                                                dev->name);
 808        if (!dmz->flush_wq) {
 809                ti->error = "Create flush workqueue failed";
 810                ret = -ENOMEM;
 811                goto err_cwq;
 812        }
 813        mod_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
 814
 815        /* Initialize reclaim */
 816        ret = dmz_ctr_reclaim(dev, dmz->metadata, &dmz->reclaim);
 817        if (ret) {
 818                ti->error = "Zone reclaim initialization failed";
 819                goto err_fwq;
 820        }
 821
 822        dmz_dev_info(dev, "Target device: %llu 512-byte logical sectors (%llu blocks)",
 823                     (unsigned long long)ti->len,
 824                     (unsigned long long)dmz_sect2blk(ti->len));
 825
 826        return 0;
 827err_fwq:
 828        destroy_workqueue(dmz->flush_wq);
 829err_cwq:
 830        destroy_workqueue(dmz->chunk_wq);
 831err_bio:
 832        mutex_destroy(&dmz->chunk_lock);
 833        bioset_exit(&dmz->bio_set);
 834err_meta:
 835        dmz_dtr_metadata(dmz->metadata);
 836err_dev:
 837        dmz_put_zoned_device(ti);
 838err:
 839        kfree(dmz);
 840
 841        return ret;
 842}
 843
 844/*
 845 * Cleanup target.
 846 */
 847static void dmz_dtr(struct dm_target *ti)
 848{
 849        struct dmz_target *dmz = ti->private;
 850
 851        flush_workqueue(dmz->chunk_wq);
 852        destroy_workqueue(dmz->chunk_wq);
 853
 854        dmz_dtr_reclaim(dmz->reclaim);
 855
 856        cancel_delayed_work_sync(&dmz->flush_work);
 857        destroy_workqueue(dmz->flush_wq);
 858
 859        (void) dmz_flush_metadata(dmz->metadata);
 860
 861        dmz_dtr_metadata(dmz->metadata);
 862
 863        bioset_exit(&dmz->bio_set);
 864
 865        dmz_put_zoned_device(ti);
 866
 867        mutex_destroy(&dmz->chunk_lock);
 868
 869        kfree(dmz);
 870}
 871
 872/*
 873 * Setup target request queue limits.
 874 */
 875static void dmz_io_hints(struct dm_target *ti, struct queue_limits *limits)
 876{
 877        struct dmz_target *dmz = ti->private;
 878        unsigned int chunk_sectors = dmz->dev->zone_nr_sectors;
 879
 880        limits->logical_block_size = DMZ_BLOCK_SIZE;
 881        limits->physical_block_size = DMZ_BLOCK_SIZE;
 882
 883        blk_limits_io_min(limits, DMZ_BLOCK_SIZE);
 884        blk_limits_io_opt(limits, DMZ_BLOCK_SIZE);
 885
 886        limits->discard_alignment = DMZ_BLOCK_SIZE;
 887        limits->discard_granularity = DMZ_BLOCK_SIZE;
 888        limits->max_discard_sectors = chunk_sectors;
 889        limits->max_hw_discard_sectors = chunk_sectors;
 890        limits->max_write_zeroes_sectors = chunk_sectors;
 891
 892        /* FS hint to try to align to the device zone size */
 893        limits->chunk_sectors = chunk_sectors;
 894        limits->max_sectors = chunk_sectors;
 895
 896        /* We are exposing a drive-managed zoned block device */
 897        limits->zoned = BLK_ZONED_NONE;
 898}
 899
 900/*
 901 * Pass on ioctl to the backend device.
 902 */
 903static int dmz_prepare_ioctl(struct dm_target *ti, struct block_device **bdev)
 904{
 905        struct dmz_target *dmz = ti->private;
 906
 907        if (dmz_bdev_is_dying(dmz->dev))
 908                return -ENODEV;
 909
 910        *bdev = dmz->dev->bdev;
 911
 912        return 0;
 913}
 914
 915/*
 916 * Stop works on suspend.
 917 */
 918static void dmz_suspend(struct dm_target *ti)
 919{
 920        struct dmz_target *dmz = ti->private;
 921
 922        flush_workqueue(dmz->chunk_wq);
 923        dmz_suspend_reclaim(dmz->reclaim);
 924        cancel_delayed_work_sync(&dmz->flush_work);
 925}
 926
 927/*
 928 * Restart works on resume or if suspend failed.
 929 */
 930static void dmz_resume(struct dm_target *ti)
 931{
 932        struct dmz_target *dmz = ti->private;
 933
 934        queue_delayed_work(dmz->flush_wq, &dmz->flush_work, DMZ_FLUSH_PERIOD);
 935        dmz_resume_reclaim(dmz->reclaim);
 936}
 937
 938static int dmz_iterate_devices(struct dm_target *ti,
 939                               iterate_devices_callout_fn fn, void *data)
 940{
 941        struct dmz_target *dmz = ti->private;
 942        struct dmz_dev *dev = dmz->dev;
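             /* Report only the zone-aligned portion of the device capacity */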
 943        sector_t capacity = dev->capacity & ~(dev->zone_nr_sectors - 1);
 944
 945        return fn(ti, dmz->ddev, 0, capacity, data);
 946}
 947
 948static struct target_type dmz_type = {
 949        .name            = "zoned",
 950        .version         = {1, 0, 0},
 951        .features        = DM_TARGET_SINGLETON | DM_TARGET_ZONED_HM,
 952        .module          = THIS_MODULE,
 953        .ctr             = dmz_ctr,
 954        .dtr             = dmz_dtr,
 955        .map             = dmz_map,
 956        .io_hints        = dmz_io_hints,
 957        .prepare_ioctl   = dmz_prepare_ioctl,
 958        .postsuspend     = dmz_suspend,
 959        .resume          = dmz_resume,
 960        .iterate_devices = dmz_iterate_devices,
 961};
 962
 963static int __init dmz_init(void)
 964{
 965        return dm_register_target(&dmz_type);
 966}
 967
 968static void __exit dmz_exit(void)
 969{
 970        dm_unregister_target(&dmz_type);
 971}
 972
 973module_init(dmz_init);
 974module_exit(dmz_exit);
 975
 976MODULE_DESCRIPTION(DM_NAME " target for zoned block devices");
 977MODULE_AUTHOR("Damien Le Moal <damien.lemoal@wdc.com>");
 978MODULE_LICENSE("GPL");
 979