linux/block/blk-zoned.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Zoned block device handling
   4 *
   5 * Copyright (c) 2015, Hannes Reinecke
   6 * Copyright (c) 2015, SUSE Linux GmbH
   7 *
   8 * Copyright (c) 2016, Damien Le Moal
   9 * Copyright (c) 2016, Western Digital
  10 */
  11
  12#include <linux/kernel.h>
  13#include <linux/module.h>
  14#include <linux/rbtree.h>
  15#include <linux/blkdev.h>
  16#include <linux/blk-mq.h>
  17
  18#include "blk.h"
  19
  20static inline sector_t blk_zone_start(struct request_queue *q,
  21                                      sector_t sector)
  22{
  23        sector_t zone_mask = blk_queue_zone_sectors(q) - 1;
  24
  25        return sector & ~zone_mask;
  26}
  27
  28/*
  29 * Return true if a request is a write requests that needs zone write locking.
  30 */
  31bool blk_req_needs_zone_write_lock(struct request *rq)
  32{
  33        if (!rq->q->seq_zones_wlock)
  34                return false;
  35
  36        if (blk_rq_is_passthrough(rq))
  37                return false;
  38
  39        switch (req_op(rq)) {
  40        case REQ_OP_WRITE_ZEROES:
  41        case REQ_OP_WRITE_SAME:
  42        case REQ_OP_WRITE:
  43                return blk_rq_zone_is_seq(rq);
  44        default:
  45                return false;
  46        }
  47}
  48EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
  49
  50void __blk_req_zone_write_lock(struct request *rq)
  51{
  52        if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
  53                                          rq->q->seq_zones_wlock)))
  54                return;
  55
  56        WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
  57        rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
  58}
  59EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
  60
  61void __blk_req_zone_write_unlock(struct request *rq)
  62{
  63        rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
  64        if (rq->q->seq_zones_wlock)
  65                WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
  66                                                 rq->q->seq_zones_wlock));
  67}
  68EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
  69
  70static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
  71                                             sector_t nr_sectors)
  72{
  73        unsigned long zone_sectors = blk_queue_zone_sectors(q);
  74
  75        return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
  76}
  77
  78/**
  79 * blkdev_nr_zones - Get number of zones
  80 * @bdev:       Target block device
  81 *
  82 * Description:
  83 *    Return the total number of zones of a zoned block device.
  84 *    For a regular block device, the number of zones is always 0.
  85 */
  86unsigned int blkdev_nr_zones(struct block_device *bdev)
  87{
  88        struct request_queue *q = bdev_get_queue(bdev);
  89
  90        if (!blk_queue_is_zoned(q))
  91                return 0;
  92
  93        return __blkdev_nr_zones(q, bdev->bd_part->nr_sects);
  94}
  95EXPORT_SYMBOL_GPL(blkdev_nr_zones);
  96
  97/*
  98 * Check that a zone report belongs to this partition, and if yes, fix its start
  99 * sector and write pointer and return true. Return false otherwise.
 100 */
 101static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
 102{
 103        sector_t offset = get_start_sect(bdev);
 104
 105        if (rep->start < offset)
 106                return false;
 107
 108        rep->start -= offset;
 109        if (rep->start + rep->len > bdev->bd_part->nr_sects)
 110                return false;
 111
 112        if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
 113                rep->wp = rep->start + rep->len;
 114        else
 115                rep->wp -= offset;
 116        return true;
 117}
 118
 119static int blk_report_zones(struct gendisk *disk, sector_t sector,
 120                            struct blk_zone *zones, unsigned int *nr_zones,
 121                            gfp_t gfp_mask)
 122{
 123        struct request_queue *q = disk->queue;
 124        unsigned int z = 0, n, nrz = *nr_zones;
 125        sector_t capacity = get_capacity(disk);
 126        int ret;
 127
 128        while (z < nrz && sector < capacity) {
 129                n = nrz - z;
 130                ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
 131                                               gfp_mask);
 132                if (ret)
 133                        return ret;
 134                if (!n)
 135                        break;
 136                sector += blk_queue_zone_sectors(q) * n;
 137                z += n;
 138        }
 139
 140        WARN_ON(z > *nr_zones);
 141        *nr_zones = z;
 142
 143        return 0;
 144}
 145
 146/**
 147 * blkdev_report_zones - Get zones information
 148 * @bdev:       Target block device
 149 * @sector:     Sector from which to report zones
 150 * @zones:      Array of zone structures where to return the zones information
 151 * @nr_zones:   Number of zone structures in the zone array
 152 * @gfp_mask:   Memory allocation flags (for bio_alloc)
 153 *
 154 * Description:
 155 *    Get zone information starting from the zone containing @sector.
 156 *    The number of zone information reported may be less than the number
 157 *    requested by @nr_zones. The number of zones actually reported is
 158 *    returned in @nr_zones.
 159 */
 160int blkdev_report_zones(struct block_device *bdev, sector_t sector,
 161                        struct blk_zone *zones, unsigned int *nr_zones,
 162                        gfp_t gfp_mask)
 163{
 164        struct request_queue *q = bdev_get_queue(bdev);
 165        unsigned int i, nrz;
 166        int ret;
 167
 168        if (!blk_queue_is_zoned(q))
 169                return -EOPNOTSUPP;
 170
 171        /*
 172         * A block device that advertized itself as zoned must have a
 173         * report_zones method. If it does not have one defined, the device
 174         * driver has a bug. So warn about that.
 175         */
 176        if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones))
 177                return -EOPNOTSUPP;
 178
 179        if (!*nr_zones || sector >= bdev->bd_part->nr_sects) {
 180                *nr_zones = 0;
 181                return 0;
 182        }
 183
 184        nrz = min(*nr_zones,
 185                  __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
 186        ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
 187                               zones, &nrz, gfp_mask);
 188        if (ret)
 189                return ret;
 190
 191        for (i = 0; i < nrz; i++) {
 192                if (!blkdev_report_zone(bdev, zones))
 193                        break;
 194                zones++;
 195        }
 196
 197        *nr_zones = i;
 198
 199        return 0;
 200}
 201EXPORT_SYMBOL_GPL(blkdev_report_zones);
 202
 203/**
 204 * blkdev_reset_zones - Reset zones write pointer
 205 * @bdev:       Target block device
 206 * @sector:     Start sector of the first zone to reset
 207 * @nr_sectors: Number of sectors, at least the length of one zone
 208 * @gfp_mask:   Memory allocation flags (for bio_alloc)
 209 *
 210 * Description:
 211 *    Reset the write pointer of the zones contained in the range
 212 *    @sector..@sector+@nr_sectors. Specifying the entire disk sector range
 213 *    is valid, but the specified range should not contain conventional zones.
 214 */
 215int blkdev_reset_zones(struct block_device *bdev,
 216                       sector_t sector, sector_t nr_sectors,
 217                       gfp_t gfp_mask)
 218{
 219        struct request_queue *q = bdev_get_queue(bdev);
 220        sector_t zone_sectors;
 221        sector_t end_sector = sector + nr_sectors;
 222        struct bio *bio = NULL;
 223        struct blk_plug plug;
 224        int ret;
 225
 226        if (!blk_queue_is_zoned(q))
 227                return -EOPNOTSUPP;
 228
 229        if (bdev_read_only(bdev))
 230                return -EPERM;
 231
 232        if (!nr_sectors || end_sector > bdev->bd_part->nr_sects)
 233                /* Out of range */
 234                return -EINVAL;
 235
 236        /* Check alignment (handle eventual smaller last zone) */
 237        zone_sectors = blk_queue_zone_sectors(q);
 238        if (sector & (zone_sectors - 1))
 239                return -EINVAL;
 240
 241        if ((nr_sectors & (zone_sectors - 1)) &&
 242            end_sector != bdev->bd_part->nr_sects)
 243                return -EINVAL;
 244
 245        blk_start_plug(&plug);
 246        while (sector < end_sector) {
 247
 248                bio = blk_next_bio(bio, 0, gfp_mask);
 249                bio->bi_iter.bi_sector = sector;
 250                bio_set_dev(bio, bdev);
 251                bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
 252
 253                sector += zone_sectors;
 254
 255                /* This may take a while, so be nice to others */
 256                cond_resched();
 257
 258        }
 259
 260        ret = submit_bio_wait(bio);
 261        bio_put(bio);
 262
 263        blk_finish_plug(&plug);
 264
 265        return ret;
 266}
 267EXPORT_SYMBOL_GPL(blkdev_reset_zones);
 268
 269/*
 270 * BLKREPORTZONE ioctl processing.
 271 * Called from blkdev_ioctl.
 272 */
 273int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
 274                              unsigned int cmd, unsigned long arg)
 275{
 276        void __user *argp = (void __user *)arg;
 277        struct request_queue *q;
 278        struct blk_zone_report rep;
 279        struct blk_zone *zones;
 280        int ret;
 281
 282        if (!argp)
 283                return -EINVAL;
 284
 285        q = bdev_get_queue(bdev);
 286        if (!q)
 287                return -ENXIO;
 288
 289        if (!blk_queue_is_zoned(q))
 290                return -ENOTTY;
 291
 292        if (!capable(CAP_SYS_ADMIN))
 293                return -EACCES;
 294
 295        if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
 296                return -EFAULT;
 297
 298        if (!rep.nr_zones)
 299                return -EINVAL;
 300
 301        rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones);
 302
 303        zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
 304                               GFP_KERNEL | __GFP_ZERO);
 305        if (!zones)
 306                return -ENOMEM;
 307
 308        ret = blkdev_report_zones(bdev, rep.sector,
 309                                  zones, &rep.nr_zones,
 310                                  GFP_KERNEL);
 311        if (ret)
 312                goto out;
 313
 314        if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) {
 315                ret = -EFAULT;
 316                goto out;
 317        }
 318
 319        if (rep.nr_zones) {
 320                if (copy_to_user(argp + sizeof(struct blk_zone_report), zones,
 321                                 sizeof(struct blk_zone) * rep.nr_zones))
 322                        ret = -EFAULT;
 323        }
 324
 325 out:
 326        kvfree(zones);
 327
 328        return ret;
 329}
 330
 331/*
 332 * BLKRESETZONE ioctl processing.
 333 * Called from blkdev_ioctl.
 334 */
 335int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
 336                             unsigned int cmd, unsigned long arg)
 337{
 338        void __user *argp = (void __user *)arg;
 339        struct request_queue *q;
 340        struct blk_zone_range zrange;
 341
 342        if (!argp)
 343                return -EINVAL;
 344
 345        q = bdev_get_queue(bdev);
 346        if (!q)
 347                return -ENXIO;
 348
 349        if (!blk_queue_is_zoned(q))
 350                return -ENOTTY;
 351
 352        if (!capable(CAP_SYS_ADMIN))
 353                return -EACCES;
 354
 355        if (!(mode & FMODE_WRITE))
 356                return -EBADF;
 357
 358        if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
 359                return -EFAULT;
 360
 361        return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
 362                                  GFP_KERNEL);
 363}
 364
 365static inline unsigned long *blk_alloc_zone_bitmap(int node,
 366                                                   unsigned int nr_zones)
 367{
 368        return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
 369                            GFP_NOIO, node);
 370}
 371
 372/*
 373 * Allocate an array of struct blk_zone to get nr_zones zone information.
 374 * The allocated array may be smaller than nr_zones.
 375 */
 376static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
 377{
 378        size_t size = *nr_zones * sizeof(struct blk_zone);
 379        struct page *page;
 380        int order;
 381
 382        for (order = get_order(size); order >= 0; order--) {
 383                page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
 384                if (page) {
 385                        *nr_zones = min_t(unsigned int, *nr_zones,
 386                                (PAGE_SIZE << order) / sizeof(struct blk_zone));
 387                        return page_address(page);
 388                }
 389        }
 390
 391        return NULL;
 392}
 393
 394void blk_queue_free_zone_bitmaps(struct request_queue *q)
 395{
 396        kfree(q->seq_zones_bitmap);
 397        q->seq_zones_bitmap = NULL;
 398        kfree(q->seq_zones_wlock);
 399        q->seq_zones_wlock = NULL;
 400}
 401
 402/**
 403 * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
 404 * @disk:       Target disk
 405 *
 406 * Helper function for low-level device drivers to (re) allocate and initialize
 407 * a disk request queue zone bitmaps. This functions should normally be called
 408 * within the disk ->revalidate method. For BIO based queues, no zone bitmap
 409 * is allocated.
 410 */
 411int blk_revalidate_disk_zones(struct gendisk *disk)
 412{
 413        struct request_queue *q = disk->queue;
 414        unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
 415        unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
 416        unsigned int i, rep_nr_zones = 0, z = 0, nrz;
 417        struct blk_zone *zones = NULL;
 418        sector_t sector = 0;
 419        int ret = 0;
 420
 421        /*
 422         * BIO based queues do not use a scheduler so only q->nr_zones
 423         * needs to be updated so that the sysfs exposed value is correct.
 424         */
 425        if (!queue_is_mq(q)) {
 426                q->nr_zones = nr_zones;
 427                return 0;
 428        }
 429
 430        if (!blk_queue_is_zoned(q) || !nr_zones) {
 431                nr_zones = 0;
 432                goto update;
 433        }
 434
 435        /* Allocate bitmaps */
 436        ret = -ENOMEM;
 437        seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
 438        if (!seq_zones_wlock)
 439                goto out;
 440        seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
 441        if (!seq_zones_bitmap)
 442                goto out;
 443
 444        /* Get zone information and initialize seq_zones_bitmap */
 445        rep_nr_zones = nr_zones;
 446        zones = blk_alloc_zones(q->node, &rep_nr_zones);
 447        if (!zones)
 448                goto out;
 449
 450        while (z < nr_zones) {
 451                nrz = min(nr_zones - z, rep_nr_zones);
 452                ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
 453                if (ret)
 454                        goto out;
 455                if (!nrz)
 456                        break;
 457                for (i = 0; i < nrz; i++) {
 458                        if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL)
 459                                set_bit(z, seq_zones_bitmap);
 460                        z++;
 461                }
 462                sector += nrz * blk_queue_zone_sectors(q);
 463        }
 464
 465        if (WARN_ON(z != nr_zones)) {
 466                ret = -EIO;
 467                goto out;
 468        }
 469
 470update:
 471        /*
 472         * Install the new bitmaps, making sure the queue is stopped and
 473         * all I/Os are completed (i.e. a scheduler is not referencing the
 474         * bitmaps).
 475         */
 476        blk_mq_freeze_queue(q);
 477        q->nr_zones = nr_zones;
 478        swap(q->seq_zones_wlock, seq_zones_wlock);
 479        swap(q->seq_zones_bitmap, seq_zones_bitmap);
 480        blk_mq_unfreeze_queue(q);
 481
 482out:
 483        free_pages((unsigned long)zones,
 484                   get_order(rep_nr_zones * sizeof(struct blk_zone)));
 485        kfree(seq_zones_wlock);
 486        kfree(seq_zones_bitmap);
 487
 488        if (ret) {
 489                pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
 490                blk_mq_freeze_queue(q);
 491                blk_queue_free_zone_bitmaps(q);
 492                blk_mq_unfreeze_queue(q);
 493        }
 494
 495        return ret;
 496}
 497EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
 498
 499