linux/drivers/md/dm-zone.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2021 Western Digital Corporation or its affiliates.
 */

#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#include "dm-core.h"

#define DM_MSG_PREFIX "zone"

#define DM_ZONE_INVALID_WP_OFST         UINT_MAX

/*
 * For internal zone reports bypassing the top BIO submission path.
 */
static int dm_blk_do_report_zones(struct mapped_device *md, struct dm_table *t,
                                  sector_t sector, unsigned int nr_zones,
                                  report_zones_cb cb, void *data)
{
        struct gendisk *disk = md->disk;
        int ret;
        struct dm_report_zones_args args = {
                .next_sector = sector,
                .orig_data = data,
                .orig_cb = cb,
        };

        do {
                struct dm_target *tgt;

                tgt = dm_table_find_target(t, args.next_sector);
                if (WARN_ON_ONCE(!tgt->type->report_zones))
                        return -EIO;

                args.tgt = tgt;
                ret = tgt->type->report_zones(tgt, &args,
                                              nr_zones - args.zone_idx);
                if (ret < 0)
                        return ret;
        } while (args.zone_idx < nr_zones &&
                 args.next_sector < get_capacity(disk));

        return args.zone_idx;
}

/*
 * User facing block device report zones operation for a dm device. This calls
 * the report_zones operation of each target of the device table. This
 * operation is generally implemented by targets using dm_report_zones().
 */
int dm_blk_report_zones(struct gendisk *disk, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data)
{
        struct mapped_device *md = disk->private_data;
        struct dm_table *map;
        int srcu_idx, ret;

        if (dm_suspended_md(md))
                return -EAGAIN;

        map = dm_get_live_table(md, &srcu_idx);
        if (!map)
                return -EIO;

        ret = dm_blk_do_report_zones(md, map, sector, nr_zones, cb, data);

        dm_put_live_table(md, srcu_idx);

        return ret;
}

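/*
 * Zone report callback used by dm_report_zones(): remap the zone information
 * reported by the underlying device to the position of the target in the
 * mapped device before forwarding it to the original callback.
 */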
static int dm_report_zones_cb(struct blk_zone *zone, unsigned int idx,
                              void *data)
{
        struct dm_report_zones_args *args = data;
        sector_t sector_diff = args->tgt->begin - args->start;

        /*
         * Ignore zones beyond the target range.
         */
        if (zone->start >= args->start + args->tgt->len)
                return 0;

        /*
         * Remap the start sector and write pointer position of the zone
         * to match its position in the target range.
         */
        zone->start += sector_diff;
        if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL) {
                if (zone->cond == BLK_ZONE_COND_FULL)
                        zone->wp = zone->start + zone->len;
                else if (zone->cond == BLK_ZONE_COND_EMPTY)
                        zone->wp = zone->start;
                else
                        zone->wp += sector_diff;
        }

        args->next_sector = zone->start + zone->len;
        return args->orig_cb(zone, args->zone_idx++, args->orig_data);
}

/*
 * Helper for drivers of zoned targets to implement struct target_type
 * report_zones operation.
 */
int dm_report_zones(struct block_device *bdev, sector_t start, sector_t sector,
                    struct dm_report_zones_args *args, unsigned int nr_zones)
{
        /*
         * Set the target mapping start sector first so that
         * dm_report_zones_cb() can correctly remap zone information.
         */
        args->start = start;

        return blkdev_report_zones(bdev, sector, nr_zones,
                                   dm_report_zones_cb, args);
}
EXPORT_SYMBOL_GPL(dm_report_zones);

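/*
 * Check whether a BIO is a write operation to a zoned mapped device. Flush
 * BIOs and writes without payload are not considered zone writes.
 */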
bool dm_is_zone_write(struct mapped_device *md, struct bio *bio)
{
        struct request_queue *q = md->queue;

        if (!blk_queue_is_zoned(q))
                return false;

        switch (bio_op(bio)) {
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
                return !op_is_flush(bio->bi_opf) && bio_sectors(bio);
        default:
                return false;
        }
}

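/*
 * Free the zone resources of a mapped device (conventional zone bitmap,
 * sequential zone write lock bitmap and zone write pointer offset array).
 */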
void dm_cleanup_zoned_dev(struct mapped_device *md)
{
        struct request_queue *q = md->queue;

        if (q) {
                kfree(q->conv_zones_bitmap);
                q->conv_zones_bitmap = NULL;
                kfree(q->seq_zones_wlock);
                q->seq_zones_wlock = NULL;
        }

        kvfree(md->zwp_offset);
        md->zwp_offset = NULL;
        md->nr_zones = 0;
}

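/*
 * Return the write pointer offset of a zone, relative to the zone start
 * sector, from the zone condition reported by the device.
 */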
static unsigned int dm_get_zone_wp_offset(struct blk_zone *zone)
{
        switch (zone->cond) {
        case BLK_ZONE_COND_IMP_OPEN:
        case BLK_ZONE_COND_EXP_OPEN:
        case BLK_ZONE_COND_CLOSED:
                return zone->wp - zone->start;
        case BLK_ZONE_COND_FULL:
                return zone->len;
        case BLK_ZONE_COND_EMPTY:
        case BLK_ZONE_COND_NOT_WP:
        case BLK_ZONE_COND_OFFLINE:
        case BLK_ZONE_COND_READONLY:
        default:
                /*
                 * Conventional, offline and read-only zones do not have a valid
                 * write pointer. Use 0 as for an empty zone.
                 */
                return 0;
        }
}

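/*
 * Zone report callback used by dm_revalidate_zones() to allocate and
 * initialize the conventional zone bitmap, the sequential zone write lock
 * bitmap and the zone write pointer offset array used for zone append
 * emulation.
 */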
static int dm_zone_revalidate_cb(struct blk_zone *zone, unsigned int idx,
                                 void *data)
{
        struct mapped_device *md = data;
        struct request_queue *q = md->queue;

        switch (zone->type) {
        case BLK_ZONE_TYPE_CONVENTIONAL:
                if (!q->conv_zones_bitmap) {
                        q->conv_zones_bitmap =
                                kcalloc(BITS_TO_LONGS(q->nr_zones),
                                        sizeof(unsigned long), GFP_NOIO);
                        if (!q->conv_zones_bitmap)
                                return -ENOMEM;
                }
                set_bit(idx, q->conv_zones_bitmap);
                break;
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                if (!q->seq_zones_wlock) {
                        q->seq_zones_wlock =
                                kcalloc(BITS_TO_LONGS(q->nr_zones),
                                        sizeof(unsigned long), GFP_NOIO);
                        if (!q->seq_zones_wlock)
                                return -ENOMEM;
                }
                if (!md->zwp_offset) {
                        md->zwp_offset =
                                kvcalloc(q->nr_zones, sizeof(unsigned int),
                                         GFP_KERNEL);
                        if (!md->zwp_offset)
                                return -ENOMEM;
                }
                md->zwp_offset[idx] = dm_get_zone_wp_offset(zone);

                break;
        default:
                DMERR("Invalid zone type 0x%x at sectors %llu",
                      (int)zone->type, zone->start);
                return -ENODEV;
        }

        return 0;
}

/*
 * Revalidate the zones of a mapped device to initialize the resources
 * necessary for zone append emulation. Note that we cannot simply use the
 * block layer blk_revalidate_disk_zones() function here as the mapped device
 * is suspended (this is called from __bind() context).
 */
static int dm_revalidate_zones(struct mapped_device *md, struct dm_table *t)
{
        struct request_queue *q = md->queue;
        unsigned int noio_flag;
        int ret;

        /*
         * Check if something changed. If yes, cleanup the current resources
         * and reallocate everything.
         */
        if (!q->nr_zones || q->nr_zones != md->nr_zones)
                dm_cleanup_zoned_dev(md);
        if (md->nr_zones)
                return 0;

        /*
         * Scan all zones to initialize everything. Ensure that all vmalloc
         * operations in this context are done as if GFP_NOIO was specified.
         */
        noio_flag = memalloc_noio_save();
        ret = dm_blk_do_report_zones(md, t, 0, q->nr_zones,
                                     dm_zone_revalidate_cb, md);
        memalloc_noio_restore(noio_flag);
        if (ret < 0)
                goto err;
        if (ret != q->nr_zones) {
                ret = -EIO;
                goto err;
        }

        md->nr_zones = q->nr_zones;

        return 0;

err:
        DMERR("Revalidate zones failed %d", ret);
        dm_cleanup_zoned_dev(md);
        return ret;
}

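/*
 * Device iteration callback used by dm_table_supports_zone_append() to report
 * devices that cannot natively execute zone append operations, that is,
 * devices that are not zoned.
 */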
static int device_not_zone_append_capable(struct dm_target *ti,
                                          struct dm_dev *dev, sector_t start,
                                          sector_t len, void *data)
{
        return !blk_queue_is_zoned(bdev_get_queue(dev->bdev));
}

static bool dm_table_supports_zone_append(struct dm_table *t)
{
        struct dm_target *ti;
        unsigned int i;

        for (i = 0; i < dm_table_get_num_targets(t); i++) {
                ti = dm_table_get_target(t, i);

                if (ti->emulate_zone_append)
                        return false;

                if (!ti->type->iterate_devices ||
                    ti->type->iterate_devices(ti, device_not_zone_append_capable, NULL))
                        return false;
        }

        return true;
}

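/*
 * Set up zoned block device handling for a mapped device: update the queue
 * number of zones and, if the table targets do not all natively support zone
 * append, enable zone append emulation.
 */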
int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
{
        struct mapped_device *md = t->md;

        /*
         * For a zoned target, the number of zones should be updated for the
         * correct value to be exposed in sysfs queue/nr_zones.
         */
        WARN_ON_ONCE(queue_is_mq(q));
        q->nr_zones = blkdev_nr_zones(md->disk);

        /* Check if zone append is natively supported */
        if (dm_table_supports_zone_append(t)) {
                clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
                dm_cleanup_zoned_dev(md);
                return 0;
        }

        /*
         * Mark the mapped device as needing zone append emulation and
         * initialize the emulation resources once the capacity is set.
         */
        set_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
        if (!get_capacity(md->disk))
                return 0;

        return dm_revalidate_zones(md, t);
}

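/*
 * Zone report callback used by dm_update_zone_wp_offset() to retrieve the
 * write pointer offset of the reported zone.
 */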
static int dm_update_zone_wp_offset_cb(struct blk_zone *zone, unsigned int idx,
                                       void *data)
{
        unsigned int *wp_offset = data;

        *wp_offset = dm_get_zone_wp_offset(zone);

        return 0;
}

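/*
 * Get an up-to-date write pointer offset for a zone by issuing a zone report
 * for that zone through the live device table.
 */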
static int dm_update_zone_wp_offset(struct mapped_device *md, unsigned int zno,
                                    unsigned int *wp_ofst)
{
        sector_t sector = zno * blk_queue_zone_sectors(md->queue);
        unsigned int noio_flag;
        struct dm_table *t;
        int srcu_idx, ret;

        t = dm_get_live_table(md, &srcu_idx);
        if (!t)
                return -EIO;

        /*
         * Ensure that all memory allocations in this context are done as if
         * GFP_NOIO was specified.
         */
        noio_flag = memalloc_noio_save();
        ret = dm_blk_do_report_zones(md, t, sector, 1,
                                     dm_update_zone_wp_offset_cb, wp_ofst);
        memalloc_noio_restore(noio_flag);

        dm_put_live_table(md, srcu_idx);

        if (ret != 1)
                return -EIO;

        return 0;
}

/*
 * First phase of BIO mapping for targets with zone append emulation:
 * check all BIOs that change a zone write pointer and change zone
 * append operations into regular write operations.
 */
static bool dm_zone_map_bio_begin(struct mapped_device *md,
                                  struct bio *orig_bio, struct bio *clone)
{
        sector_t zsectors = blk_queue_zone_sectors(md->queue);
        unsigned int zno = bio_zone_no(orig_bio);
        unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

        /*
         * If the target zone is in an error state, recover by inspecting the
         * zone to get its current write pointer position. Note that since the
         * target zone is already locked, a BIO issuing context should never
         * see the zone write in the DM_ZONE_UPDATING_WP_OFST state.
         */
        if (zwp_offset == DM_ZONE_INVALID_WP_OFST) {
                if (dm_update_zone_wp_offset(md, zno, &zwp_offset))
                        return false;
                WRITE_ONCE(md->zwp_offset[zno], zwp_offset);
        }

        switch (bio_op(orig_bio)) {
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_FINISH:
                return true;
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
                /* Writes must be aligned to the zone write pointer */
                if ((clone->bi_iter.bi_sector & (zsectors - 1)) != zwp_offset)
                        return false;
                break;
        case REQ_OP_ZONE_APPEND:
                /*
                 * Change zone append operations into non-mergeable regular
                 * writes directed at the current write pointer position of the
                 * target zone.
                 */
                clone->bi_opf = REQ_OP_WRITE | REQ_NOMERGE |
                        (orig_bio->bi_opf & (~REQ_OP_MASK));
                clone->bi_iter.bi_sector =
                        orig_bio->bi_iter.bi_sector + zwp_offset;
                break;
        default:
                DMWARN_LIMIT("Invalid BIO operation");
                return false;
        }

        /* Cannot write to a full zone */
        if (zwp_offset >= zsectors)
                return false;

        return true;
}

/*
 * Second phase of BIO mapping for targets with zone append emulation:
 * update the zone write pointer offset array to account for the additional
 * data written to a zone. Note that at this point, the remapped clone BIO
 * may already have completed, so we do not touch it.
 */
static blk_status_t dm_zone_map_bio_end(struct mapped_device *md,
                                        struct bio *orig_bio,
                                        unsigned int nr_sectors)
{
        unsigned int zno = bio_zone_no(orig_bio);
        unsigned int zwp_offset = READ_ONCE(md->zwp_offset[zno]);

        /* The clone BIO may already have been completed and failed */
        if (zwp_offset == DM_ZONE_INVALID_WP_OFST)
                return BLK_STS_IOERR;

        /* Update the zone wp offset */
        switch (bio_op(orig_bio)) {
        case REQ_OP_ZONE_RESET:
                WRITE_ONCE(md->zwp_offset[zno], 0);
                return BLK_STS_OK;
        case REQ_OP_ZONE_FINISH:
                WRITE_ONCE(md->zwp_offset[zno],
                           blk_queue_zone_sectors(md->queue));
                return BLK_STS_OK;
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
                WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
                return BLK_STS_OK;
        case REQ_OP_ZONE_APPEND:
                /*
                 * Check that the target did not truncate the write operation
                 * emulating a zone append.
                 */
                if (nr_sectors != bio_sectors(orig_bio)) {
                        DMWARN_LIMIT("Truncated write for zone append");
                        return BLK_STS_IOERR;
                }
                WRITE_ONCE(md->zwp_offset[zno], zwp_offset + nr_sectors);
                return BLK_STS_OK;
        default:
                DMWARN_LIMIT("Invalid BIO operation");
                return BLK_STS_IOERR;
        }
}

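/*
 * Lock and unlock the write lock of the zone targeted by a clone BIO, using
 * the sequential zone write lock bitmap of the mapped device queue.
 */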
static inline void dm_zone_lock(struct request_queue *q,
                                unsigned int zno, struct bio *clone)
{
        if (WARN_ON_ONCE(bio_flagged(clone, BIO_ZONE_WRITE_LOCKED)))
                return;

        wait_on_bit_lock_io(q->seq_zones_wlock, zno, TASK_UNINTERRUPTIBLE);
        bio_set_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

static inline void dm_zone_unlock(struct request_queue *q,
                                  unsigned int zno, struct bio *clone)
{
        if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
                return;

        WARN_ON_ONCE(!test_bit(zno, q->seq_zones_wlock));
        clear_bit_unlock(zno, q->seq_zones_wlock);
        smp_mb__after_atomic();
        wake_up_bit(q->seq_zones_wlock, zno);

        bio_clear_flag(clone, BIO_ZONE_WRITE_LOCKED);
}

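/*
 * Check whether a BIO needs its target zone write pointer to be tracked, and
 * so the zone write lock to be taken, when the BIO is mapped.
 */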
static bool dm_need_zone_wp_tracking(struct bio *orig_bio)
{
        /*
         * Special processing is not needed for operations that do not need the
         * zone write lock, that is, all operations that target conventional
         * zones and all operations that do not directly modify a sequential
         * zone write pointer.
         */
        if (op_is_flush(orig_bio->bi_opf) && !bio_sectors(orig_bio))
                return false;
        switch (bio_op(orig_bio)) {
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
        case REQ_OP_ZONE_RESET:
        case REQ_OP_ZONE_FINISH:
        case REQ_OP_ZONE_APPEND:
                return bio_zone_is_seq(orig_bio);
        default:
                return false;
        }
}

/*
 * Special IO mapping for targets needing zone append emulation.
 */
int dm_zone_map_bio(struct dm_target_io *tio)
{
        struct dm_io *io = tio->io;
        struct dm_target *ti = tio->ti;
        struct mapped_device *md = io->md;
        struct request_queue *q = md->queue;
        struct bio *orig_bio = io->orig_bio;
        struct bio *clone = &tio->clone;
        unsigned int zno;
        blk_status_t sts;
        int r;

        /*
         * IOs that do not change a zone write pointer do not need
         * any additional special processing.
         */
        if (!dm_need_zone_wp_tracking(orig_bio))
                return ti->type->map(ti, clone);

        /* Lock the target zone */
        zno = bio_zone_no(orig_bio);
        dm_zone_lock(q, zno, clone);

        /*
         * Check that the bio and the target zone write pointer offset are
         * both valid, and if the bio is a zone append, remap it to a write.
         */
        if (!dm_zone_map_bio_begin(md, orig_bio, clone)) {
                dm_zone_unlock(q, zno, clone);
                return DM_MAPIO_KILL;
        }

        /*
         * The target map function may issue and complete the IO quickly.
         * Take an extra reference on the IO to make sure it does not
         * disappear until we run dm_zone_map_bio_end().
         */
        dm_io_inc_pending(io);

        /* Let the target do its work */
        r = ti->type->map(ti, clone);
        switch (r) {
        case DM_MAPIO_SUBMITTED:
                /*
                 * The target submitted the clone BIO. The target zone will
                 * be unlocked on completion of the clone.
                 */
                sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
                break;
        case DM_MAPIO_REMAPPED:
                /*
                 * The target only remapped the clone BIO. In case of error,
                 * unlock the target zone here as the clone will not be
                 * submitted.
                 */
                sts = dm_zone_map_bio_end(md, orig_bio, *tio->len_ptr);
                if (sts != BLK_STS_OK)
                        dm_zone_unlock(q, zno, clone);
                break;
        case DM_MAPIO_REQUEUE:
        case DM_MAPIO_KILL:
        default:
                dm_zone_unlock(q, zno, clone);
                sts = BLK_STS_IOERR;
                break;
        }

        /* Drop the extra reference on the IO */
        dm_io_dec_pending(io, sts);

        if (sts != BLK_STS_OK)
                return DM_MAPIO_KILL;

        return r;
}

/*
 * IO completion callback called from clone_endio().
 */
void dm_zone_endio(struct dm_io *io, struct bio *clone)
{
        struct mapped_device *md = io->md;
        struct request_queue *q = md->queue;
        struct bio *orig_bio = io->orig_bio;
        unsigned int zwp_offset;
        unsigned int zno;

        /*
         * For targets that do not emulate zone append, we only need to
         * handle native zone-append bios.
         */
        if (!dm_emulate_zone_append(md)) {
                /*
                 * Get the offset within the zone of the written sector
                 * and add that to the original bio sector position.
                 */
                if (clone->bi_status == BLK_STS_OK &&
                    bio_op(clone) == REQ_OP_ZONE_APPEND) {
                        sector_t mask = (sector_t)blk_queue_zone_sectors(q) - 1;

                        orig_bio->bi_iter.bi_sector +=
                                clone->bi_iter.bi_sector & mask;
                }

                return;
        }

        /*
         * For targets that do emulate zone append, if the clone BIO does not
         * own the target zone write lock, we have nothing to do.
         */
        if (!bio_flagged(clone, BIO_ZONE_WRITE_LOCKED))
                return;

        zno = bio_zone_no(orig_bio);

        if (clone->bi_status != BLK_STS_OK) {
                /*
                 * BIOs that modify a zone write pointer may leave the zone
                 * in an unknown state in case of failure (e.g. the write
                 * pointer was only partially advanced). In this case, set
                 * the target zone write pointer as invalid unless it is
                 * already being updated.
                 */
                WRITE_ONCE(md->zwp_offset[zno], DM_ZONE_INVALID_WP_OFST);
        } else if (bio_op(orig_bio) == REQ_OP_ZONE_APPEND) {
                /*
                 * Get the written sector for zone append operations that were
                 * emulated using regular write operations.
                 */
                zwp_offset = READ_ONCE(md->zwp_offset[zno]);
                if (WARN_ON_ONCE(zwp_offset < bio_sectors(orig_bio)))
                        WRITE_ONCE(md->zwp_offset[zno],
                                   DM_ZONE_INVALID_WP_OFST);
                else
                        orig_bio->bi_iter.bi_sector +=
                                zwp_offset - bio_sectors(orig_bio);
        }

        dm_zone_unlock(q, zno, clone);
}