linux/block/blk-zoned.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Zoned block device handling
 *
 * Copyright (c) 2015, Hannes Reinecke
 * Copyright (c) 2015, SUSE Linux GmbH
 *
 * Copyright (c) 2016, Damien Le Moal
 * Copyright (c) 2016, Western Digital
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/sched/mm.h>

#include "blk.h"

#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
static const char *const zone_cond_name[] = {
        ZONE_COND_NAME(NOT_WP),
        ZONE_COND_NAME(EMPTY),
        ZONE_COND_NAME(IMP_OPEN),
        ZONE_COND_NAME(EXP_OPEN),
        ZONE_COND_NAME(CLOSED),
        ZONE_COND_NAME(READONLY),
        ZONE_COND_NAME(FULL),
        ZONE_COND_NAME(OFFLINE),
};
#undef ZONE_COND_NAME
/**
 * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
 * @zone_cond: BLK_ZONE_COND_XXX.
 *
 * Description: Centralized block layer function to convert BLK_ZONE_COND_XXX
 * into string format. Useful for debugging and tracing zone conditions. For
 * an invalid BLK_ZONE_COND_XXX, the string "UNKNOWN" is returned.
 */
const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
{
        static const char *zone_cond_str = "UNKNOWN";

        if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
                zone_cond_str = zone_cond_name[zone_cond];

        return zone_cond_str;
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
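
/*
 * Example (illustrative sketch, not part of the original file): using
 * blk_zone_cond_str() to pretty-print a zone condition while debugging.
 * The helper name dbg_print_zone is hypothetical.
 */
#if 0
static void dbg_print_zone(struct gendisk *disk, struct blk_zone *zone)
{
        pr_debug("%s: zone at %llu, wp %llu, cond %s\n",
                 disk->disk_name, zone->start, zone->wp,
                 blk_zone_cond_str(zone->cond));
}
#endif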

/*
 * Return true if a request is a write request that needs zone write locking.
 */
bool blk_req_needs_zone_write_lock(struct request *rq)
{
        if (!rq->q->seq_zones_wlock)
                return false;

        if (blk_rq_is_passthrough(rq))
                return false;

        switch (req_op(rq)) {
        case REQ_OP_WRITE_ZEROES:
        case REQ_OP_WRITE_SAME:
        case REQ_OP_WRITE:
                return blk_rq_zone_is_seq(rq);
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);

bool blk_req_zone_write_trylock(struct request *rq)
{
        unsigned int zno = blk_rq_zone_no(rq);

        if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
                return false;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;

        return true;
}
EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);

void __blk_req_zone_write_lock(struct request *rq)
{
        if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
                                          rq->q->seq_zones_wlock)))
                return;

        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);

void __blk_req_zone_write_unlock(struct request *rq)
{
        rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
        if (rq->q->seq_zones_wlock)
                WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
                                                 rq->q->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
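
/*
 * Example (illustrative sketch, not part of the original file): how a blk-mq
 * driver dispatch path might serialize writes to a sequential zone with the
 * helpers above. The function name example_queue_rq is hypothetical; a real
 * driver would also release the lock from its completion handler with
 * blk_req_zone_write_unlock().
 */
#if 0
static blk_status_t example_queue_rq(struct request *rq)
{
        /* Only writes targeting sequential write required zones need the lock. */
        if (blk_req_needs_zone_write_lock(rq) &&
            !blk_req_zone_write_trylock(rq))
                /* Zone already locked: ask the block layer to retry later. */
                return BLK_STS_DEV_RESOURCE;

        /* ... issue the request to the hardware ... */
        return BLK_STS_OK;
}
#endif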

/**
 * blkdev_nr_zones - Get number of zones
 * @disk:       Target gendisk
 *
 * Return the total number of zones of a zoned block device.  For a block
 * device without zone capabilities, the number of zones is always 0.
 */
unsigned int blkdev_nr_zones(struct gendisk *disk)
{
        sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);

        if (!blk_queue_is_zoned(disk->queue))
                return 0;
        return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
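
/*
 * Worked example (added for illustration): with a zone size of 524288
 * sectors (256 MiB) and a disk capacity of 5505024 sectors (10 full zones
 * plus a half-sized last zone), the round-up above gives
 * (5505024 + 524287) >> ilog2(524288) = 6029311 >> 19 = 11 zones.
 */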

/**
 * blkdev_report_zones - Get zones information
 * @bdev:       Target block device
 * @sector:     Sector from which to report zones
 * @nr_zones:   Maximum number of zones to report
 * @cb:         Callback function called for each reported zone
 * @data:       Private data for the callback
 *
 * Description:
 *    Get zone information starting from the zone containing @sector for at most
 *    @nr_zones, and call @cb for each zone reported by the device.
 *    To report all zones in a device starting from @sector, the BLK_ALL_ZONES
 *    constant can be passed to @nr_zones.
 *    Returns the number of zones reported by the device, or a negative errno
 *    value in case of failure.
 *
 *    Note: The caller must use memalloc_noXX_save/restore() calls to control
 *    memory allocations done within this function.
 */
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
                        unsigned int nr_zones, report_zones_cb cb, void *data)
{
        struct gendisk *disk = bdev->bd_disk;
        sector_t capacity = get_capacity(disk);

        if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
            WARN_ON_ONCE(!disk->fops->report_zones))
                return -EOPNOTSUPP;

        if (!nr_zones || sector >= capacity)
                return 0;

        return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
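
/*
 * Example (illustrative sketch, not part of the original file): counting the
 * implicitly and explicitly open zones of a device with blkdev_report_zones().
 * The callback matches the report_zones_cb signature; count_open_zones_cb and
 * count_open_zones are hypothetical names.
 */
#if 0
static int count_open_zones_cb(struct blk_zone *zone, unsigned int idx,
                               void *data)
{
        unsigned int *nr_open = data;

        if (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
            zone->cond == BLK_ZONE_COND_EXP_OPEN)
                (*nr_open)++;
        return 0;
}

static int count_open_zones(struct block_device *bdev, unsigned int *nr_open)
{
        int ret;

        *nr_open = 0;
        ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
                                  count_open_zones_cb, nr_open);
        return ret < 0 ? ret : 0;
}
#endif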

static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
                                                sector_t sector,
                                                sector_t nr_sectors)
{
        if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
                return false;

        /*
         * REQ_OP_ZONE_RESET_ALL can be executed only if the zone range to
         * operate on covers the entire disk.
         */
        return !sector && nr_sectors == get_capacity(bdev->bd_disk);
}

/**
 * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
 * @bdev:       Target block device
 * @op:         Operation to be performed on the zones
 * @sector:     Start sector of the first zone to operate on
 * @nr_sectors: Number of sectors, should be at least the length of one zone and
 *              must be zone size aligned.
 * @gfp_mask:   Memory allocation flags (for bio_alloc)
 *
 * Description:
 *    Perform the specified operation on the range of zones specified by
 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 *    is valid, but the specified range should not contain conventional zones.
 *    The operation to execute on each zone can be a zone reset, open, close
 *    or finish request.
 */
int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
                     sector_t sector, sector_t nr_sectors,
                     gfp_t gfp_mask)
{
        struct request_queue *q = bdev_get_queue(bdev);
        sector_t zone_sectors = blk_queue_zone_sectors(q);
        sector_t capacity = get_capacity(bdev->bd_disk);
        sector_t end_sector = sector + nr_sectors;
        struct bio *bio = NULL;
        int ret;

        if (!blk_queue_is_zoned(q))
                return -EOPNOTSUPP;

        if (bdev_read_only(bdev))
                return -EPERM;

        if (!op_is_zone_mgmt(op))
                return -EOPNOTSUPP;

        if (end_sector <= sector || end_sector > capacity)
                /* Out of range */
                return -EINVAL;

        /* Check alignment (handle eventual smaller last zone) */
        if (sector & (zone_sectors - 1))
                return -EINVAL;

        if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
                return -EINVAL;

        while (sector < end_sector) {
                bio = blk_next_bio(bio, 0, gfp_mask);
                bio_set_dev(bio, bdev);

                /*
                 * Special case for the zone reset operation that resets all
                 * zones; this is useful for applications like mkfs.
                 */
                if (op == REQ_OP_ZONE_RESET &&
                    blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
                        bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC;
                        break;
                }

                bio->bi_opf = op | REQ_SYNC;
                bio->bi_iter.bi_sector = sector;
                sector += zone_sectors;

                /* This may take a while, so be nice to others */
                cond_resched();
        }

        ret = submit_bio_wait(bio);
        bio_put(bio);

        return ret;
}
EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
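
/*
 * Example (illustrative sketch, not part of the original file): resetting
 * every zone of a zoned block device, as an in-kernel caller such as a
 * filesystem formatting path might do, assuming the device has no
 * conventional zones. When the device supports REQ_OP_ZONE_RESET_ALL, the
 * loop above collapses into a single bio. The function name
 * example_reset_all_zones is hypothetical.
 */
#if 0
static int example_reset_all_zones(struct block_device *bdev)
{
        return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0,
                                get_capacity(bdev->bd_disk), GFP_KERNEL);
}
#endif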

struct zone_report_args {
        struct blk_zone __user *zones;
};

static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
                                    void *data)
{
        struct zone_report_args *args = data;

        if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
                return -EFAULT;
        return 0;
}

/*
 * BLKREPORTZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
                              unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct zone_report_args args;
        struct request_queue *q;
        struct blk_zone_report rep;
        int ret;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -ENOTTY;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
                return -EFAULT;

        if (!rep.nr_zones)
                return -EINVAL;

        args.zones = argp + sizeof(struct blk_zone_report);
        ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
                                  blkdev_copy_zone_to_user, &args);
        if (ret < 0)
                return ret;

        rep.nr_zones = ret;
        rep.flags = BLK_ZONE_REP_CAPACITY;
        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
                return -EFAULT;
        return 0;
}
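
/*
 * Example (illustrative sketch, not part of the original file): calling the
 * BLKREPORTZONE ioctl from user space. struct blk_zone_report must be
 * immediately followed in memory by the array of struct blk_zone entries
 * to fill; the function name report_first_zones is hypothetical.
 */
#if 0
/* Userspace C, compiled separately against <linux/blkzoned.h>. */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int report_first_zones(const char *dev, unsigned int nr_zones)
{
        struct blk_zone_report *rep;
        unsigned int i;
        int fd, ret;

        fd = open(dev, O_RDONLY);
        if (fd < 0)
                return -1;

        rep = calloc(1, sizeof(*rep) + nr_zones * sizeof(struct blk_zone));
        if (!rep) {
                close(fd);
                return -1;
        }
        rep->sector = 0;
        rep->nr_zones = nr_zones;

        /* On success, the kernel updates rep->nr_zones to the count reported. */
        ret = ioctl(fd, BLKREPORTZONE, rep);
        if (!ret)
                for (i = 0; i < rep->nr_zones; i++)
                        printf("zone %u: start %llu, len %llu, wp %llu\n",
                               i, rep->zones[i].start, rep->zones[i].len,
                               rep->zones[i].wp);

        free(rep);
        close(fd);
        return ret;
}
#endif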

static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode,
                                      const struct blk_zone_range *zrange)
{
        loff_t start, end;

        if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
            zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
                /* Out of range */
                return -EINVAL;

        start = zrange->sector << SECTOR_SHIFT;
        end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;

        return truncate_bdev_range(bdev, mode, start, end);
}

/*
 * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
 * Called from blkdev_ioctl.
 */
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
                           unsigned int cmd, unsigned long arg)
{
        void __user *argp = (void __user *)arg;
        struct request_queue *q;
        struct blk_zone_range zrange;
        enum req_opf op;
        int ret;

        if (!argp)
                return -EINVAL;

        q = bdev_get_queue(bdev);
        if (!q)
                return -ENXIO;

        if (!blk_queue_is_zoned(q))
                return -ENOTTY;

        if (!capable(CAP_SYS_ADMIN))
                return -EACCES;

        if (!(mode & FMODE_WRITE))
                return -EBADF;

        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
                return -EFAULT;

        switch (cmd) {
        case BLKRESETZONE:
                op = REQ_OP_ZONE_RESET;

                /* Invalidate the page cache, including dirty pages. */
                ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
                if (ret)
                        return ret;
                break;
        case BLKOPENZONE:
                op = REQ_OP_ZONE_OPEN;
                break;
        case BLKCLOSEZONE:
                op = REQ_OP_ZONE_CLOSE;
                break;
        case BLKFINISHZONE:
                op = REQ_OP_ZONE_FINISH;
                break;
        default:
                return -ENOTTY;
        }

        ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
                               GFP_KERNEL);

        /*
         * Invalidate the page cache again for zone reset: writes can only be
         * direct for zoned devices so concurrent writes would not add any page
         * to the page cache after/during reset. The page cache may be filled
         * again due to concurrent reads though and dropping the pages for
         * these is fine.
         */
        if (!ret && cmd == BLKRESETZONE)
                ret = blkdev_truncate_zone_range(bdev, mode, &zrange);

        return ret;
}
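
/*
 * Example (illustrative sketch, not part of the original file): resetting a
 * single zone from user space through the BLKRESETZONE ioctl handled above.
 * The device must be opened for writing, and sector/nr_sectors must be zone
 * size aligned; the function name reset_zone is hypothetical.
 */
#if 0
/* Userspace C, compiled separately against <linux/blkzoned.h>. */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/blkzoned.h>

int reset_zone(const char *dev, __u64 zone_start, __u64 zone_len)
{
        struct blk_zone_range zrange = {
                .sector = zone_start,
                .nr_sectors = zone_len,
        };
        int fd, ret;

        fd = open(dev, O_WRONLY);
        if (fd < 0)
                return -1;

        ret = ioctl(fd, BLKRESETZONE, &zrange);
        close(fd);
        return ret;
}
#endif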

static inline unsigned long *blk_alloc_zone_bitmap(int node,
                                                   unsigned int nr_zones)
{
        return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
                            GFP_NOIO, node);
}

void blk_queue_free_zone_bitmaps(struct request_queue *q)
{
        kfree(q->conv_zones_bitmap);
        q->conv_zones_bitmap = NULL;
        kfree(q->seq_zones_wlock);
        q->seq_zones_wlock = NULL;
}

struct blk_revalidate_zone_args {
        struct gendisk  *disk;
        unsigned long   *conv_zones_bitmap;
        unsigned long   *seq_zones_wlock;
        unsigned int    nr_zones;
        sector_t        zone_sectors;
        sector_t        sector;
};

/*
 * Helper function to check the validity of zones of a zoned block device.
 */
static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
                                  void *data)
{
        struct blk_revalidate_zone_args *args = data;
        struct gendisk *disk = args->disk;
        struct request_queue *q = disk->queue;
        sector_t capacity = get_capacity(disk);

        /*
         * All zones must have the same size, with the exception of an
         * eventual smaller last zone.
         */
        if (zone->start == 0) {
                if (zone->len == 0 || !is_power_of_2(zone->len)) {
                        pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
                                disk->disk_name, zone->len);
                        return -ENODEV;
                }

                args->zone_sectors = zone->len;
                args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
        } else if (zone->start + args->zone_sectors < capacity) {
                if (zone->len != args->zone_sectors) {
                        pr_warn("%s: Invalid zoned device with non constant zone size\n",
                                disk->disk_name);
                        return -ENODEV;
                }
        } else {
                if (zone->len > args->zone_sectors) {
                        pr_warn("%s: Invalid zoned device with larger last zone size\n",
                                disk->disk_name);
                        return -ENODEV;
                }
        }

        /* Check for holes in the zone report */
        if (zone->start != args->sector) {
                pr_warn("%s: Zone gap at sectors %llu..%llu\n",
                        disk->disk_name, args->sector, zone->start);
                return -ENODEV;
        }

        /* Check zone type */
        switch (zone->type) {
        case BLK_ZONE_TYPE_CONVENTIONAL:
                if (!args->conv_zones_bitmap) {
                        args->conv_zones_bitmap =
                                blk_alloc_zone_bitmap(q->node, args->nr_zones);
                        if (!args->conv_zones_bitmap)
                                return -ENOMEM;
                }
                set_bit(idx, args->conv_zones_bitmap);
                break;
        case BLK_ZONE_TYPE_SEQWRITE_REQ:
        case BLK_ZONE_TYPE_SEQWRITE_PREF:
                if (!args->seq_zones_wlock) {
                        args->seq_zones_wlock =
                                blk_alloc_zone_bitmap(q->node, args->nr_zones);
                        if (!args->seq_zones_wlock)
                                return -ENOMEM;
                }
                break;
        default:
                pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
                        disk->disk_name, (int)zone->type, zone->start);
                return -ENODEV;
        }

        args->sector += zone->len;
        return 0;
}

/**
 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 * @disk:       Target disk
 * @update_driver_data: Callback to update driver data on the frozen disk
 *
 * Helper function for low-level device drivers to (re)allocate and initialize
 * a disk request queue's zone bitmaps. This function should normally be called
 * within the disk ->revalidate method for blk-mq based drivers.  For BIO based
 * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
 * is correct.
 * If the @update_driver_data callback function is not NULL, the callback is
 * executed with the device request queue frozen after all zones have been
 * checked.
 */
int blk_revalidate_disk_zones(struct gendisk *disk,
                              void (*update_driver_data)(struct gendisk *disk))
{
        struct request_queue *q = disk->queue;
        struct blk_revalidate_zone_args args = {
                .disk           = disk,
        };
        unsigned int noio_flag;
        int ret;

        if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
                return -EIO;
        if (WARN_ON_ONCE(!queue_is_mq(q)))
                return -EIO;

        if (!get_capacity(disk))
                return -EIO;

        /*
         * Ensure that all memory allocations in this context are done as if
         * GFP_NOIO was specified.
         */
        noio_flag = memalloc_noio_save();
        ret = disk->fops->report_zones(disk, 0, UINT_MAX,
                                       blk_revalidate_zone_cb, &args);
        if (!ret) {
                pr_warn("%s: No zones reported\n", disk->disk_name);
                ret = -ENODEV;
        }
        memalloc_noio_restore(noio_flag);

        /*
         * If zones were reported, make sure that the entire disk capacity
         * has been checked.
         */
        if (ret > 0 && args.sector != get_capacity(disk)) {
                pr_warn("%s: Missing zones from sector %llu\n",
                        disk->disk_name, args.sector);
                ret = -ENODEV;
        }

        /*
         * Install the new bitmaps and update nr_zones only once the queue is
         * stopped and all I/Os are completed (i.e. a scheduler is not
         * referencing the bitmaps).
         */
        blk_mq_freeze_queue(q);
        if (ret > 0) {
                blk_queue_chunk_sectors(q, args.zone_sectors);
                q->nr_zones = args.nr_zones;
                swap(q->seq_zones_wlock, args.seq_zones_wlock);
                swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
                if (update_driver_data)
                        update_driver_data(disk);
                ret = 0;
        } else {
                pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
                blk_queue_free_zone_bitmaps(q);
        }
        blk_mq_unfreeze_queue(q);

        kfree(args.seq_zones_wlock);
        kfree(args.conv_zones_bitmap);
        return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
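
/*
 * Example (illustrative sketch, not part of the original file): how a blk-mq
 * driver might call blk_revalidate_disk_zones() after probing a zoned device,
 * using the callback to refresh its own zone bookkeeping while the queue is
 * frozen. example_update_driver_data and example_revalidate are hypothetical.
 */
#if 0
static void example_update_driver_data(struct gendisk *disk)
{
        /* Runs with the queue frozen: safe to swap driver zone state here. */
}

static int example_revalidate(struct gendisk *disk)
{
        return blk_revalidate_disk_zones(disk, example_update_driver_data);
}
#endif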

void blk_queue_clear_zone_settings(struct request_queue *q)
{
        blk_mq_freeze_queue(q);

        blk_queue_free_zone_bitmaps(q);
        blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
        q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
        q->nr_zones = 0;
        q->max_open_zones = 0;
        q->max_active_zones = 0;
        q->limits.chunk_sectors = 0;
        q->limits.zone_write_granularity = 0;
        q->limits.max_zone_append_sectors = 0;

        blk_mq_unfreeze_queue(q);
}