linux/fs/btrfs/zoned.c
   1// SPDX-License-Identifier: GPL-2.0
   2
   3#include <linux/bitops.h>
   4#include <linux/slab.h>
   5#include <linux/blkdev.h>
   6#include <linux/sched/mm.h>
   7#include "ctree.h"
   8#include "volumes.h"
   9#include "zoned.h"
  10#include "rcu-string.h"
  11#include "disk-io.h"
  12#include "block-group.h"
  13#include "transaction.h"
  14#include "dev-replace.h"
  15#include "space-info.h"
  16
  17/* Maximum number of zones to report per blkdev_report_zones() call */
  18#define BTRFS_REPORT_NR_ZONES   4096
  19/* Invalid allocation pointer value for missing devices */
  20#define WP_MISSING_DEV ((u64)-1)
  21/* Pseudo write pointer value for conventional zone */
  22#define WP_CONVENTIONAL ((u64)-2)
  23
  24/*
  25 * Location of the first zone of superblock logging zone pairs.
  26 *
  27 * - primary superblock:    0B (zone 0)
  28 * - first copy:          512G (zone starting at that offset)
  29 * - second copy:           4T (zone starting at that offset)
  30 */
  31#define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
  32#define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
  33#define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)
  34
  35#define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
  36#define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
  37
  38/* Number of superblock log zones */
  39#define BTRFS_NR_SB_LOG_ZONES 2
  40
  41/*
  42 * Maximum supported zone size. Currently, SMR disks have a zone size of
  43 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
  44 * expect the zone size to become larger than 8GiB in the near future.
  45 */
  46#define BTRFS_MAX_ZONE_SIZE             SZ_8G
  47
  48static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
  49{
  50        struct blk_zone *zones = data;
  51
  52        memcpy(&zones[idx], zone, sizeof(*zone));
  53
  54        return 0;
  55}
  56
  57static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
  58                            u64 *wp_ret)
  59{
  60        bool empty[BTRFS_NR_SB_LOG_ZONES];
  61        bool full[BTRFS_NR_SB_LOG_ZONES];
  62        sector_t sector;
  63
  64        ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
  65               zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
  66
  67        empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
  68        empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
  69        full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
  70        full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
  71
  72        /*
  73         * Possible states of log buffer zones
  74         *
  75         *           Empty[0]  In use[0]  Full[0]
  76         * Empty[1]         *          x        0
  77         * In use[1]        0          x        0
  78         * Full[1]          1          1        C
  79         *
  80         * Log position:
  81         *   *: Special case, no superblock is written
  82         *   0: Use write pointer of zones[0]
  83         *   1: Use write pointer of zones[1]
  84         *   C: Compare super blocks from zones[0] and zones[1], use the latest
  85         *      one determined by generation
  86         *   x: Invalid state
  87         */
  88
  89        if (empty[0] && empty[1]) {
  90                /* Special case to distinguish no superblock to read */
  91                *wp_ret = zones[0].start << SECTOR_SHIFT;
  92                return -ENOENT;
  93        } else if (full[0] && full[1]) {
  94                /* Compare two super blocks */
  95                struct address_space *mapping = bdev->bd_inode->i_mapping;
  96                struct page *page[BTRFS_NR_SB_LOG_ZONES];
  97                struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
  98                int i;
  99
 100                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
 101                        u64 bytenr;
 102
 103                        bytenr = ((zones[i].start + zones[i].len)
 104                                   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
 105
 106                        page[i] = read_cache_page_gfp(mapping,
 107                                        bytenr >> PAGE_SHIFT, GFP_NOFS);
 108                        if (IS_ERR(page[i])) {
 109                                if (i == 1)
 110                                        btrfs_release_disk_super(super[0]);
 111                                return PTR_ERR(page[i]);
 112                        }
 113                        super[i] = page_address(page[i]);
 114                }
 115
 116                if (super[0]->generation > super[1]->generation)
 117                        sector = zones[1].start;
 118                else
 119                        sector = zones[0].start;
 120
 121                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
 122                        btrfs_release_disk_super(super[i]);
 123        } else if (!full[0] && (empty[1] || full[1])) {
 124                sector = zones[0].wp;
 125        } else if (full[0]) {
 126                sector = zones[1].wp;
 127        } else {
 128                return -EUCLEAN;
 129        }
 130        *wp_ret = sector << SECTOR_SHIFT;
 131        return 0;
 132}
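
     /*
      * Illustrative reading of the table above: if zone 0 is full and zone 1
      * is partially written, the next super block goes to zone 1's write
      * pointer. Once both zones are full there is no write pointer left, so
      * the zone whose super block carries the higher generation was written
      * last, and the other zone is the one to reset and reuse next.
      */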
 133
 134/*
 135 * Get the first zone number of the superblock mirror
 136 */
 137static inline u32 sb_zone_number(int shift, int mirror)
 138{
 139        u64 zone;
 140
 141        ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
 142        switch (mirror) {
 143        case 0: zone = 0; break;
 144        case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
 145        case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
 146        }
 147
 148        ASSERT(zone <= U32_MAX);
 149
 150        return (u32)zone;
 151}
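
     /*
      * Worked example with illustrative values: for a 256 MiB zone size the
      * zone size shift is 28, so mirror 1 maps to zone 1 << (39 - 28) = 2048
      * (512 GiB into the device) and mirror 2 maps to zone
      * 1 << (42 - 28) = 16384 (4 TiB into the device). Mirror 0 is always
      * zone 0.
      */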
 152
 153static inline sector_t zone_start_sector(u32 zone_number,
 154                                         struct block_device *bdev)
 155{
 156        return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
 157}
 158
 159static inline u64 zone_start_physical(u32 zone_number,
 160                                      struct btrfs_zoned_device_info *zone_info)
 161{
 162        return (u64)zone_number << zone_info->zone_size_shift;
 163}
 164
 165/*
 166 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
  167 * device into fixed-size chunks and fakes a conventional zone on each of
 168 * them.
 169 */
 170static int emulate_report_zones(struct btrfs_device *device, u64 pos,
 171                                struct blk_zone *zones, unsigned int nr_zones)
 172{
 173        const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
 174        sector_t bdev_size = bdev_nr_sectors(device->bdev);
 175        unsigned int i;
 176
 177        pos >>= SECTOR_SHIFT;
 178        for (i = 0; i < nr_zones; i++) {
 179                zones[i].start = i * zone_sectors + pos;
 180                zones[i].len = zone_sectors;
 181                zones[i].capacity = zone_sectors;
 182                zones[i].wp = zones[i].start + zone_sectors;
 183                zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
 184                zones[i].cond = BLK_ZONE_COND_NOT_WP;
 185
 186                if (zones[i].wp >= bdev_size) {
 187                        i++;
 188                        break;
 189                }
 190        }
 191
 192        return i;
 193}
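
     /*
      * For example (illustrative numbers only), a 500 GiB regular device with
      * an emulated zone size of 256 MiB is reported as 2000 conventional
      * zones, all in BLK_ZONE_COND_NOT_WP, which lets the rest of the zoned
      * code treat regular and zoned devices uniformly.
      */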
 194
 195static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
 196                               struct blk_zone *zones, unsigned int *nr_zones)
 197{
 198        int ret;
 199
 200        if (!*nr_zones)
 201                return 0;
 202
 203        if (!bdev_is_zoned(device->bdev)) {
 204                ret = emulate_report_zones(device, pos, zones, *nr_zones);
 205                *nr_zones = ret;
 206                return 0;
 207        }
 208
 209        ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
 210                                  copy_zone_info_cb, zones);
 211        if (ret < 0) {
 212                btrfs_err_in_rcu(device->fs_info,
 213                                 "zoned: failed to read zone %llu on %s (devid %llu)",
 214                                 pos, rcu_str_deref(device->name),
 215                                 device->devid);
 216                return ret;
 217        }
 218        *nr_zones = ret;
 219        if (!ret)
 220                return -EIO;
 221
 222        return 0;
 223}
 224
  225/* The emulated zone size is determined from the size of the first device extent */
 226static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
 227{
 228        struct btrfs_path *path;
 229        struct btrfs_root *root = fs_info->dev_root;
 230        struct btrfs_key key;
 231        struct extent_buffer *leaf;
 232        struct btrfs_dev_extent *dext;
 233        int ret = 0;
 234
 235        key.objectid = 1;
 236        key.type = BTRFS_DEV_EXTENT_KEY;
 237        key.offset = 0;
 238
 239        path = btrfs_alloc_path();
 240        if (!path)
 241                return -ENOMEM;
 242
 243        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 244        if (ret < 0)
 245                goto out;
 246
 247        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
 248                ret = btrfs_next_item(root, path);
 249                if (ret < 0)
 250                        goto out;
 251                /* No dev extents at all? Not good */
 252                if (ret > 0) {
 253                        ret = -EUCLEAN;
 254                        goto out;
 255                }
 256        }
 257
 258        leaf = path->nodes[0];
 259        dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
 260        fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
 261        ret = 0;
 262
 263out:
 264        btrfs_free_path(path);
 265
 266        return ret;
 267}
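
     /*
      * Note: this relies on every device extent of an emulated zoned
      * filesystem having been created with the same size, so the length of
      * the first device extent found in the device tree can stand in for the
      * zone size.
      */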
 268
 269int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
 270{
 271        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 272        struct btrfs_device *device;
 273        int ret = 0;
 274
  275        /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
 276        if (!btrfs_fs_incompat(fs_info, ZONED))
 277                return 0;
 278
 279        mutex_lock(&fs_devices->device_list_mutex);
 280        list_for_each_entry(device, &fs_devices->devices, dev_list) {
 281                /* We can skip reading of zone info for missing devices */
 282                if (!device->bdev)
 283                        continue;
 284
 285                ret = btrfs_get_dev_zone_info(device);
 286                if (ret)
 287                        break;
 288        }
 289        mutex_unlock(&fs_devices->device_list_mutex);
 290
 291        return ret;
 292}
 293
 294int btrfs_get_dev_zone_info(struct btrfs_device *device)
 295{
 296        struct btrfs_fs_info *fs_info = device->fs_info;
 297        struct btrfs_zoned_device_info *zone_info = NULL;
 298        struct block_device *bdev = device->bdev;
 299        struct request_queue *queue = bdev_get_queue(bdev);
 300        sector_t nr_sectors;
 301        sector_t sector = 0;
 302        struct blk_zone *zones = NULL;
 303        unsigned int i, nreported = 0, nr_zones;
 304        sector_t zone_sectors;
 305        char *model, *emulated;
 306        int ret;
 307
 308        /*
 309         * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
 310         * yet be set.
 311         */
 312        if (!btrfs_fs_incompat(fs_info, ZONED))
 313                return 0;
 314
 315        if (device->zone_info)
 316                return 0;
 317
 318        zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
 319        if (!zone_info)
 320                return -ENOMEM;
 321
 322        if (!bdev_is_zoned(bdev)) {
 323                if (!fs_info->zone_size) {
 324                        ret = calculate_emulated_zone_size(fs_info);
 325                        if (ret)
 326                                goto out;
 327                }
 328
 329                ASSERT(fs_info->zone_size);
 330                zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
 331        } else {
 332                zone_sectors = bdev_zone_sectors(bdev);
 333        }
 334
  335        /* Check if it's a power of 2 (see is_power_of_2) */
 336        ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
 337        zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
 338
  339        /* We reject devices with a zone size larger than 8GiB */
 340        if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
 341                btrfs_err_in_rcu(fs_info,
 342                "zoned: %s: zone size %llu larger than supported maximum %llu",
 343                                 rcu_str_deref(device->name),
 344                                 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
 345                ret = -EINVAL;
 346                goto out;
 347        }
 348
 349        nr_sectors = bdev_nr_sectors(bdev);
 350        zone_info->zone_size_shift = ilog2(zone_info->zone_size);
 351        zone_info->max_zone_append_size =
 352                (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
 353        zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
 354        if (!IS_ALIGNED(nr_sectors, zone_sectors))
 355                zone_info->nr_zones++;
 356
 357        if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
 358                btrfs_err(fs_info, "zoned: device %pg does not support zone append",
 359                          bdev);
 360                ret = -EINVAL;
 361                goto out;
 362        }
 363
 364        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 365        if (!zone_info->seq_zones) {
 366                ret = -ENOMEM;
 367                goto out;
 368        }
 369
 370        zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
 371        if (!zone_info->empty_zones) {
 372                ret = -ENOMEM;
 373                goto out;
 374        }
 375
 376        zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
 377        if (!zones) {
 378                ret = -ENOMEM;
 379                goto out;
 380        }
 381
  382        /* Get zone types */
 383        while (sector < nr_sectors) {
 384                nr_zones = BTRFS_REPORT_NR_ZONES;
 385                ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
 386                                          &nr_zones);
 387                if (ret)
 388                        goto out;
 389
 390                for (i = 0; i < nr_zones; i++) {
 391                        if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
 392                                __set_bit(nreported, zone_info->seq_zones);
 393                        if (zones[i].cond == BLK_ZONE_COND_EMPTY)
 394                                __set_bit(nreported, zone_info->empty_zones);
 395                        nreported++;
 396                }
 397                sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
 398        }
 399
 400        if (nreported != zone_info->nr_zones) {
 401                btrfs_err_in_rcu(device->fs_info,
 402                                 "inconsistent number of zones on %s (%u/%u)",
 403                                 rcu_str_deref(device->name), nreported,
 404                                 zone_info->nr_zones);
 405                ret = -EIO;
 406                goto out;
 407        }
 408
 409        /* Validate superblock log */
 410        nr_zones = BTRFS_NR_SB_LOG_ZONES;
 411        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 412                u32 sb_zone;
 413                u64 sb_wp;
 414                int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;
 415
 416                sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
 417                if (sb_zone + 1 >= zone_info->nr_zones)
 418                        continue;
 419
 420                ret = btrfs_get_dev_zones(device,
 421                                          zone_start_physical(sb_zone, zone_info),
 422                                          &zone_info->sb_zones[sb_pos],
 423                                          &nr_zones);
 424                if (ret)
 425                        goto out;
 426
 427                if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
 428                        btrfs_err_in_rcu(device->fs_info,
 429        "zoned: failed to read super block log zone info at devid %llu zone %u",
 430                                         device->devid, sb_zone);
 431                        ret = -EUCLEAN;
 432                        goto out;
 433                }
 434
 435                /*
 436                 * If zones[0] is conventional, always use the beginning of the
  437                 * zone to record the superblock. No need to validate in that case.
 438                 */
 439                if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
 440                    BLK_ZONE_TYPE_CONVENTIONAL)
 441                        continue;
 442
 443                ret = sb_write_pointer(device->bdev,
 444                                       &zone_info->sb_zones[sb_pos], &sb_wp);
 445                if (ret != -ENOENT && ret) {
 446                        btrfs_err_in_rcu(device->fs_info,
 447                        "zoned: super block log zone corrupted devid %llu zone %u",
 448                                         device->devid, sb_zone);
 449                        ret = -EUCLEAN;
 450                        goto out;
 451                }
 452        }
 453
 454
 455        kfree(zones);
 456
 457        device->zone_info = zone_info;
 458
 459        switch (bdev_zoned_model(bdev)) {
 460        case BLK_ZONED_HM:
 461                model = "host-managed zoned";
 462                emulated = "";
 463                break;
 464        case BLK_ZONED_HA:
 465                model = "host-aware zoned";
 466                emulated = "";
 467                break;
 468        case BLK_ZONED_NONE:
 469                model = "regular";
 470                emulated = "emulated ";
 471                break;
 472        default:
 473                /* Just in case */
 474                btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
 475                                 bdev_zoned_model(bdev),
 476                                 rcu_str_deref(device->name));
 477                ret = -EOPNOTSUPP;
 478                goto out_free_zone_info;
 479        }
 480
 481        btrfs_info_in_rcu(fs_info,
 482                "%s block device %s, %u %szones of %llu bytes",
 483                model, rcu_str_deref(device->name), zone_info->nr_zones,
 484                emulated, zone_info->zone_size);
 485
 486        return 0;
 487
 488out:
 489        kfree(zones);
 490out_free_zone_info:
 491        bitmap_free(zone_info->empty_zones);
 492        bitmap_free(zone_info->seq_zones);
 493        kfree(zone_info);
 494        device->zone_info = NULL;
 495
 496        return ret;
 497}
 498
 499void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
 500{
 501        struct btrfs_zoned_device_info *zone_info = device->zone_info;
 502
 503        if (!zone_info)
 504                return;
 505
 506        bitmap_free(zone_info->seq_zones);
 507        bitmap_free(zone_info->empty_zones);
 508        kfree(zone_info);
 509        device->zone_info = NULL;
 510}
 511
 512int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
 513                       struct blk_zone *zone)
 514{
 515        unsigned int nr_zones = 1;
 516        int ret;
 517
 518        ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
 519        if (ret != 0 || !nr_zones)
 520                return ret ? ret : -EIO;
 521
 522        return 0;
 523}
 524
 525int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
 526{
 527        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
 528        struct btrfs_device *device;
 529        u64 zoned_devices = 0;
 530        u64 nr_devices = 0;
 531        u64 zone_size = 0;
 532        u64 max_zone_append_size = 0;
 533        const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
 534        int ret = 0;
 535
 536        /* Count zoned devices */
 537        list_for_each_entry(device, &fs_devices->devices, dev_list) {
 538                enum blk_zoned_model model;
 539
 540                if (!device->bdev)
 541                        continue;
 542
 543                model = bdev_zoned_model(device->bdev);
 544                /*
 545                 * A Host-Managed zoned device must be used as a zoned device.
  546         * A Host-Aware zoned device and a non-zoned device can be
  547         * treated as a zoned device, if the ZONED flag is enabled in the
 548                 * superblock.
 549                 */
 550                if (model == BLK_ZONED_HM ||
 551                    (model == BLK_ZONED_HA && incompat_zoned) ||
 552                    (model == BLK_ZONED_NONE && incompat_zoned)) {
 553                        struct btrfs_zoned_device_info *zone_info =
 554                                device->zone_info;
 555
 557                        zoned_devices++;
 558                        if (!zone_size) {
 559                                zone_size = zone_info->zone_size;
 560                        } else if (zone_info->zone_size != zone_size) {
 561                                btrfs_err(fs_info,
 562                "zoned: unequal block device zone sizes: have %llu found %llu",
 563                                          device->zone_info->zone_size,
 564                                          zone_size);
 565                                ret = -EINVAL;
 566                                goto out;
 567                        }
 568                        if (!max_zone_append_size ||
 569                            (zone_info->max_zone_append_size &&
 570                             zone_info->max_zone_append_size < max_zone_append_size))
 571                                max_zone_append_size =
 572                                        zone_info->max_zone_append_size;
 573                }
 574                nr_devices++;
 575        }
 576
 577        if (!zoned_devices && !incompat_zoned)
 578                goto out;
 579
 580        if (!zoned_devices && incompat_zoned) {
 581                /* No zoned block device found on ZONED filesystem */
 582                btrfs_err(fs_info,
 583                          "zoned: no zoned devices found on a zoned filesystem");
 584                ret = -EINVAL;
 585                goto out;
 586        }
 587
 588        if (zoned_devices && !incompat_zoned) {
 589                btrfs_err(fs_info,
 590                          "zoned: mode not enabled but zoned device found");
 591                ret = -EINVAL;
 592                goto out;
 593        }
 594
 595        if (zoned_devices != nr_devices) {
 596                btrfs_err(fs_info,
 597                          "zoned: cannot mix zoned and regular devices");
 598                ret = -EINVAL;
 599                goto out;
 600        }
 601
 602        /*
 603         * stripe_size is always aligned to BTRFS_STRIPE_LEN in
 604         * __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
 605         * check the alignment here.
 606         */
 607        if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
 608                btrfs_err(fs_info,
 609                          "zoned: zone size %llu not aligned to stripe %u",
 610                          zone_size, BTRFS_STRIPE_LEN);
 611                ret = -EINVAL;
 612                goto out;
 613        }
 614
 615        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
 616                btrfs_err(fs_info, "zoned: mixed block groups not supported");
 617                ret = -EINVAL;
 618                goto out;
 619        }
 620
 621        fs_info->zone_size = zone_size;
 622        fs_info->max_zone_append_size = max_zone_append_size;
 623        fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
 624
 625        /*
 626         * Check mount options here, because we might change fs_info->zoned
 627         * from fs_info->zone_size.
 628         */
 629        ret = btrfs_check_mountopts_zoned(fs_info);
 630        if (ret)
 631                goto out;
 632
 633        btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
 634out:
 635        return ret;
 636}
 637
 638int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
 639{
 640        if (!btrfs_is_zoned(info))
 641                return 0;
 642
 643        /*
 644         * Space cache writing is not COWed. Disable that to avoid write errors
 645         * in sequential zones.
 646         */
 647        if (btrfs_test_opt(info, SPACE_CACHE)) {
 648                btrfs_err(info, "zoned: space cache v1 is not supported");
 649                return -EINVAL;
 650        }
 651
 652        if (btrfs_test_opt(info, NODATACOW)) {
 653                btrfs_err(info, "zoned: NODATACOW not supported");
 654                return -EINVAL;
 655        }
 656
 657        return 0;
 658}
 659
 660static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
 661                           int rw, u64 *bytenr_ret)
 662{
 663        u64 wp;
 664        int ret;
 665
 666        if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
 667                *bytenr_ret = zones[0].start << SECTOR_SHIFT;
 668                return 0;
 669        }
 670
 671        ret = sb_write_pointer(bdev, zones, &wp);
 672        if (ret != -ENOENT && ret < 0)
 673                return ret;
 674
 675        if (rw == WRITE) {
 676                struct blk_zone *reset = NULL;
 677
 678                if (wp == zones[0].start << SECTOR_SHIFT)
 679                        reset = &zones[0];
 680                else if (wp == zones[1].start << SECTOR_SHIFT)
 681                        reset = &zones[1];
 682
 683                if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
 684                        ASSERT(reset->cond == BLK_ZONE_COND_FULL);
 685
 686                        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 687                                               reset->start, reset->len,
 688                                               GFP_NOFS);
 689                        if (ret)
 690                                return ret;
 691
 692                        reset->cond = BLK_ZONE_COND_EMPTY;
 693                        reset->wp = reset->start;
 694                }
 695        } else if (ret != -ENOENT) {
  696                /* For READ, we want the previous one */
 697                if (wp == zones[0].start << SECTOR_SHIFT)
 698                        wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
 699                wp -= BTRFS_SUPER_INFO_SIZE;
 700        }
 701
 702        *bytenr_ret = wp;
 703        return 0;
 705}
 706
 707int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
 708                               u64 *bytenr_ret)
 709{
 710        struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
 711        sector_t zone_sectors;
 712        u32 sb_zone;
 713        int ret;
 714        u8 zone_sectors_shift;
 715        sector_t nr_sectors;
 716        u32 nr_zones;
 717
 718        if (!bdev_is_zoned(bdev)) {
 719                *bytenr_ret = btrfs_sb_offset(mirror);
 720                return 0;
 721        }
 722
 723        ASSERT(rw == READ || rw == WRITE);
 724
 725        zone_sectors = bdev_zone_sectors(bdev);
 726        if (!is_power_of_2(zone_sectors))
 727                return -EINVAL;
 728        zone_sectors_shift = ilog2(zone_sectors);
 729        nr_sectors = bdev_nr_sectors(bdev);
 730        nr_zones = nr_sectors >> zone_sectors_shift;
 731
 732        sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 733        if (sb_zone + 1 >= nr_zones)
 734                return -ENOENT;
 735
 736        ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
 737                                  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
 738                                  zones);
 739        if (ret < 0)
 740                return ret;
 741        if (ret != BTRFS_NR_SB_LOG_ZONES)
 742                return -EIO;
 743
 744        return sb_log_location(bdev, zones, rw, bytenr_ret);
 745}
 746
 747int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
 748                          u64 *bytenr_ret)
 749{
 750        struct btrfs_zoned_device_info *zinfo = device->zone_info;
 751        u32 zone_num;
 752
 753        /*
 754         * For a zoned filesystem on a non-zoned block device, use the same
 755         * super block locations as regular filesystem. Doing so, the super
 756         * block can always be retrieved and the zoned flag of the volume
 757         * detected from the super block information.
 758         */
 759        if (!bdev_is_zoned(device->bdev)) {
 760                *bytenr_ret = btrfs_sb_offset(mirror);
 761                return 0;
 762        }
 763
 764        zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 765        if (zone_num + 1 >= zinfo->nr_zones)
 766                return -ENOENT;
 767
 768        return sb_log_location(device->bdev,
 769                               &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
 770                               rw, bytenr_ret);
 771}
 772
 773static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
 774                                  int mirror)
 775{
 776        u32 zone_num;
 777
 778        if (!zinfo)
 779                return false;
 780
 781        zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
 782        if (zone_num + 1 >= zinfo->nr_zones)
 783                return false;
 784
 785        if (!test_bit(zone_num, zinfo->seq_zones))
 786                return false;
 787
 788        return true;
 789}
 790
 791void btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
 792{
 793        struct btrfs_zoned_device_info *zinfo = device->zone_info;
 794        struct blk_zone *zone;
 795
 796        if (!is_sb_log_zone(zinfo, mirror))
 797                return;
 798
 799        zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
 800        if (zone->cond != BLK_ZONE_COND_FULL) {
 801                if (zone->cond == BLK_ZONE_COND_EMPTY)
 802                        zone->cond = BLK_ZONE_COND_IMP_OPEN;
 803
 804                zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 805
 806                if (zone->wp == zone->start + zone->len)
 807                        zone->cond = BLK_ZONE_COND_FULL;
 808
 809                return;
 810        }
 811
 812        zone++;
 813        ASSERT(zone->cond != BLK_ZONE_COND_FULL);
 814        if (zone->cond == BLK_ZONE_COND_EMPTY)
 815                zone->cond = BLK_ZONE_COND_IMP_OPEN;
 816
 817        zone->wp += (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
 818
 819        if (zone->wp == zone->start + zone->len)
 820                zone->cond = BLK_ZONE_COND_FULL;
 821}
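
     /*
      * The above only advances the cached zone state; the on-device write
      * pointer already moved when the super block bio was submitted. Once the
      * first zone of the pair fills up, logging continues in the second zone,
      * and when both are full the WRITE path of sb_log_location() resets the
      * zone holding the older super block before reusing it.
      */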
 822
 823int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
 824{
 825        sector_t zone_sectors;
 826        sector_t nr_sectors;
 827        u8 zone_sectors_shift;
 828        u32 sb_zone;
 829        u32 nr_zones;
 830
 831        zone_sectors = bdev_zone_sectors(bdev);
 832        zone_sectors_shift = ilog2(zone_sectors);
 833        nr_sectors = bdev_nr_sectors(bdev);
 834        nr_zones = nr_sectors >> zone_sectors_shift;
 835
 836        sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
 837        if (sb_zone + 1 >= nr_zones)
 838                return -ENOENT;
 839
 840        return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
 841                                zone_start_sector(sb_zone, bdev),
 842                                zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
 843}
 844
 845/**
 846 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 847 *
 848 * @device:     the device to allocate a region on
  849 * @hole_start: the position of the hole to allocate a region from
  850 * @hole_end:   the end of the hole
  851 * @num_bytes:  size of the wanted region
  852 * @return:     position of the allocatable region
  853 *
  854 * The allocatable region must not contain any superblock locations.
 855 */
 856u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
 857                                 u64 hole_end, u64 num_bytes)
 858{
 859        struct btrfs_zoned_device_info *zinfo = device->zone_info;
 860        const u8 shift = zinfo->zone_size_shift;
 861        u64 nzones = num_bytes >> shift;
 862        u64 pos = hole_start;
 863        u64 begin, end;
 864        bool have_sb;
 865        int i;
 866
 867        ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
 868        ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
 869
 870        while (pos < hole_end) {
 871                begin = pos >> shift;
 872                end = begin + nzones;
 873
 874                if (end > zinfo->nr_zones)
 875                        return hole_end;
 876
 877                /* Check if zones in the region are all empty */
 878                if (btrfs_dev_is_sequential(device, pos) &&
 879                    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
 880                        pos += zinfo->zone_size;
 881                        continue;
 882                }
 883
 884                have_sb = false;
 885                for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 886                        u32 sb_zone;
 887                        u64 sb_pos;
 888
 889                        sb_zone = sb_zone_number(shift, i);
 890                        if (!(end <= sb_zone ||
 891                              sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
 892                                have_sb = true;
 893                                pos = zone_start_physical(
 894                                        sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
 895                                break;
 896                        }
 897
 898                        /* We also need to exclude regular superblock positions */
 899                        sb_pos = btrfs_sb_offset(i);
 900                        if (!(pos + num_bytes <= sb_pos ||
 901                              sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
 902                                have_sb = true;
 903                                pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
 904                                            zinfo->zone_size);
 905                                break;
 906                        }
 907                }
 908                if (!have_sb)
 909                        break;
 910        }
 911
 912        return pos;
 913}
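
     /*
      * Example (illustrative): with 256 MiB zones, a candidate region that
      * overlaps zones 2048 and 2049 collides with the first super block log
      * pair, so the search position is pushed to the start of zone 2050 and
      * the scan continues from there.
      */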
 914
 915int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
 916                            u64 length, u64 *bytes)
 917{
 918        int ret;
 919
 920        *bytes = 0;
 921        ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
 922                               physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
 923                               GFP_NOFS);
 924        if (ret)
 925                return ret;
 926
 927        *bytes = length;
 928        while (length) {
 929                btrfs_dev_set_zone_empty(device, physical);
 930                physical += device->zone_info->zone_size;
 931                length -= device->zone_info->zone_size;
 932        }
 933
 934        return 0;
 935}
 936
 937int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
 938{
 939        struct btrfs_zoned_device_info *zinfo = device->zone_info;
 940        const u8 shift = zinfo->zone_size_shift;
 941        unsigned long begin = start >> shift;
 942        unsigned long end = (start + size) >> shift;
 943        u64 pos;
 944        int ret;
 945
 946        ASSERT(IS_ALIGNED(start, zinfo->zone_size));
 947        ASSERT(IS_ALIGNED(size, zinfo->zone_size));
 948
 949        if (end > zinfo->nr_zones)
 950                return -ERANGE;
 951
 952        /* All the zones are conventional */
  953        if (find_next_bit(zinfo->seq_zones, end, begin) == end)
 954                return 0;
 955
 956        /* All the zones are sequential and empty */
  957        if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
  958            find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
 959                return 0;
 960
 961        for (pos = start; pos < start + size; pos += zinfo->zone_size) {
 962                u64 reset_bytes;
 963
 964                if (!btrfs_dev_is_sequential(device, pos) ||
 965                    btrfs_dev_is_empty_zone(device, pos))
 966                        continue;
 967
 968                /* Free regions should be empty */
 969                btrfs_warn_in_rcu(
 970                        device->fs_info,
 971                "zoned: resetting device %s (devid %llu) zone %llu for allocation",
 972                        rcu_str_deref(device->name), device->devid, pos >> shift);
 973                WARN_ON_ONCE(1);
 974
 975                ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
 976                                              &reset_bytes);
 977                if (ret)
 978                        return ret;
 979        }
 980
 981        return 0;
 982}
 983
 984/*
  985 * Calculate an allocation pointer from the extent allocation information
  986 * for a block group consisting of conventional zones. It points to the
  987 * end of the highest addressed extent in the block group, expressed as an
  988 * allocation offset.
 989 */
 990static int calculate_alloc_pointer(struct btrfs_block_group *cache,
 991                                   u64 *offset_ret)
 992{
 993        struct btrfs_fs_info *fs_info = cache->fs_info;
 994        struct btrfs_root *root = fs_info->extent_root;
 995        struct btrfs_path *path;
 996        struct btrfs_key key;
 997        struct btrfs_key found_key;
 998        int ret;
 999        u64 length;
1000
1001        path = btrfs_alloc_path();
1002        if (!path)
1003                return -ENOMEM;
1004
1005        key.objectid = cache->start + cache->length;
1006        key.type = 0;
1007        key.offset = 0;
1008
1009        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1010        /* We should not find the exact match */
1011        if (!ret)
1012                ret = -EUCLEAN;
1013        if (ret < 0)
1014                goto out;
1015
1016        ret = btrfs_previous_extent_item(root, path, cache->start);
1017        if (ret) {
1018                if (ret == 1) {
1019                        ret = 0;
1020                        *offset_ret = 0;
1021                }
1022                goto out;
1023        }
1024
1025        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
1026
1027        if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
1028                length = found_key.offset;
1029        else
1030                length = fs_info->nodesize;
1031
1032        if (!(found_key.objectid >= cache->start &&
1033               found_key.objectid + length <= cache->start + cache->length)) {
1034                ret = -EUCLEAN;
1035                goto out;
1036        }
1037        *offset_ret = found_key.objectid + length - cache->start;
1038        ret = 0;
1039
1040out:
1041        btrfs_free_path(path);
1042        return ret;
1043}
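
     /*
      * Worked example with illustrative values: for a block group starting at
      * 1 GiB with a length of 256 MiB, whose highest addressed extent starts
      * at 1 GiB + 96 MiB and is 16 MiB long, the returned allocation offset
      * is 96 MiB + 16 MiB = 112 MiB from the start of the block group.
      */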
1044
1045int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
1046{
1047        struct btrfs_fs_info *fs_info = cache->fs_info;
1048        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
1049        struct extent_map *em;
1050        struct map_lookup *map;
1051        struct btrfs_device *device;
1052        u64 logical = cache->start;
1053        u64 length = cache->length;
1054        u64 physical = 0;
1055        int ret;
1056        int i;
1057        unsigned int nofs_flag;
1058        u64 *alloc_offsets = NULL;
1059        u64 last_alloc = 0;
1060        u32 num_sequential = 0, num_conventional = 0;
1061
1062        if (!btrfs_is_zoned(fs_info))
1063                return 0;
1064
1065        /* Sanity check */
1066        if (!IS_ALIGNED(length, fs_info->zone_size)) {
1067                btrfs_err(fs_info,
1068                "zoned: block group %llu len %llu unaligned to zone size %llu",
1069                          logical, length, fs_info->zone_size);
1070                return -EIO;
1071        }
1072
1073        /* Get the chunk mapping */
1074        read_lock(&em_tree->lock);
1075        em = lookup_extent_mapping(em_tree, logical, length);
1076        read_unlock(&em_tree->lock);
1077
1078        if (!em)
1079                return -EINVAL;
1080
1081        map = em->map_lookup;
1082
1083        alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
1084        if (!alloc_offsets) {
1085                free_extent_map(em);
1086                return -ENOMEM;
1087        }
1088
1089        for (i = 0; i < map->num_stripes; i++) {
1090                bool is_sequential;
1091                struct blk_zone zone;
1092                struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1093                int dev_replace_is_ongoing = 0;
1094
1095                device = map->stripes[i].dev;
1096                physical = map->stripes[i].physical;
1097
1098                if (device->bdev == NULL) {
1099                        alloc_offsets[i] = WP_MISSING_DEV;
1100                        continue;
1101                }
1102
1103                is_sequential = btrfs_dev_is_sequential(device, physical);
1104                if (is_sequential)
1105                        num_sequential++;
1106                else
1107                        num_conventional++;
1108
1109                if (!is_sequential) {
1110                        alloc_offsets[i] = WP_CONVENTIONAL;
1111                        continue;
1112                }
1113
1114                /*
1115                 * This zone will be used for allocation, so mark this zone
1116                 * non-empty.
1117                 */
1118                btrfs_dev_clear_zone_empty(device, physical);
1119
1120                down_read(&dev_replace->rwsem);
1121                dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
1122                if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
1123                        btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
1124                up_read(&dev_replace->rwsem);
1125
1126                /*
1127                 * The group is mapped to a sequential zone. Get the zone write
1128                 * pointer to determine the allocation offset within the zone.
1129                 */
1130                WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
1131                nofs_flag = memalloc_nofs_save();
1132                ret = btrfs_get_dev_zone(device, physical, &zone);
1133                memalloc_nofs_restore(nofs_flag);
1134                if (ret == -EIO || ret == -EOPNOTSUPP) {
1135                        ret = 0;
1136                        alloc_offsets[i] = WP_MISSING_DEV;
1137                        continue;
1138                } else if (ret) {
1139                        goto out;
1140                }
1141
1142                if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
1143                        btrfs_err_in_rcu(fs_info,
1144        "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
1145                                zone.start << SECTOR_SHIFT,
1146                                rcu_str_deref(device->name), device->devid);
1147                        ret = -EIO;
1148                        goto out;
1149                }
1150
1151                switch (zone.cond) {
1152                case BLK_ZONE_COND_OFFLINE:
1153                case BLK_ZONE_COND_READONLY:
1154                        btrfs_err(fs_info,
1155                "zoned: offline/readonly zone %llu on device %s (devid %llu)",
1156                                  physical >> device->zone_info->zone_size_shift,
1157                                  rcu_str_deref(device->name), device->devid);
1158                        alloc_offsets[i] = WP_MISSING_DEV;
1159                        break;
1160                case BLK_ZONE_COND_EMPTY:
1161                        alloc_offsets[i] = 0;
1162                        break;
1163                case BLK_ZONE_COND_FULL:
1164                        alloc_offsets[i] = fs_info->zone_size;
1165                        break;
1166                default:
1167                        /* Partially used zone */
1168                        alloc_offsets[i] =
1169                                        ((zone.wp - zone.start) << SECTOR_SHIFT);
1170                        break;
1171                }
1172        }
1173
1174        if (num_sequential > 0)
1175                cache->seq_zone = true;
1176
1177        if (num_conventional > 0) {
1178                /*
 1179                 * Avoid calling calculate_alloc_pointer() for a new block
 1180                 * group; the allocation offset of a new block group is always 0.
 1181                 *
 1182                 * Also, we have a lock chain of extent buffer lock ->
 1183                 * chunk mutex.  For a new block group, this function is called
 1184                 * from btrfs_make_block_group() which is already taking the
 1185                 * chunk mutex. Thus, we cannot call calculate_alloc_pointer(),
 1186                 * which takes extent buffer locks, without risking a
 1187                 * deadlock.
1188                 */
1189                if (new) {
1190                        cache->alloc_offset = 0;
1191                        goto out;
1192                }
1193                ret = calculate_alloc_pointer(cache, &last_alloc);
1194                if (ret || map->num_stripes == num_conventional) {
1195                        if (!ret)
1196                                cache->alloc_offset = last_alloc;
1197                        else
1198                                btrfs_err(fs_info,
1199                        "zoned: failed to determine allocation offset of bg %llu",
1200                                          cache->start);
1201                        goto out;
1202                }
1203        }
1204
1205        switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
1206        case 0: /* single */
1207                if (alloc_offsets[0] == WP_MISSING_DEV) {
1208                        btrfs_err(fs_info,
1209                        "zoned: cannot recover write pointer for zone %llu",
1210                                physical);
1211                        ret = -EIO;
1212                        goto out;
1213                }
1214                cache->alloc_offset = alloc_offsets[0];
1215                break;
1216        case BTRFS_BLOCK_GROUP_DUP:
1217        case BTRFS_BLOCK_GROUP_RAID1:
1218        case BTRFS_BLOCK_GROUP_RAID0:
1219        case BTRFS_BLOCK_GROUP_RAID10:
1220        case BTRFS_BLOCK_GROUP_RAID5:
1221        case BTRFS_BLOCK_GROUP_RAID6:
1222                /* non-single profiles are not supported yet */
1223        default:
1224                btrfs_err(fs_info, "zoned: profile %s not yet supported",
1225                          btrfs_bg_type_to_raid_name(map->type));
1226                ret = -EINVAL;
1227                goto out;
1228        }
1229
1230out:
1231        if (cache->alloc_offset > fs_info->zone_size) {
1232                btrfs_err(fs_info,
1233                        "zoned: invalid write pointer %llu in block group %llu",
1234                        cache->alloc_offset, cache->start);
1235                ret = -EIO;
1236        }
1237
1238        /* An extent is allocated after the write pointer */
1239        if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
1240                btrfs_err(fs_info,
1241                          "zoned: got wrong write pointer in BG %llu: %llu > %llu",
1242                          logical, last_alloc, cache->alloc_offset);
1243                ret = -EIO;
1244        }
1245
1246        if (!ret)
1247                cache->meta_write_pointer = cache->alloc_offset + cache->start;
1248
1249        kfree(alloc_offsets);
1250        free_extent_map(em);
1251
1252        return ret;
1253}
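
     /*
      * The "wrong write pointer" check above covers mixed block groups: if the
      * extent tree records an allocation beyond the offset derived from the
      * device write pointers, extents would appear to live past the point the
      * zone was actually written, which cannot be trusted.
      */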
1254
1255void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
1256{
1257        u64 unusable, free;
1258
1259        if (!btrfs_is_zoned(cache->fs_info))
1260                return;
1261
1262        WARN_ON(cache->bytes_super != 0);
1263        unusable = cache->alloc_offset - cache->used;
1264        free = cache->length - cache->alloc_offset;
1265
1266        /* We only need ->free_space in ALLOC_SEQ block groups */
1267        cache->last_byte_to_unpin = (u64)-1;
1268        cache->cached = BTRFS_CACHE_FINISHED;
1269        cache->free_space_ctl->free_space = free;
1270        cache->zone_unusable = unusable;
1271
1272        /* Should not have any excluded extents. Just in case, though */
1273        btrfs_free_excluded_extents(cache);
1274}
1275
1276void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1277                            struct extent_buffer *eb)
1278{
1279        struct btrfs_fs_info *fs_info = eb->fs_info;
1280
1281        if (!btrfs_is_zoned(fs_info) ||
1282            btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
1283            !list_empty(&eb->release_list))
1284                return;
1285
1286        set_extent_buffer_dirty(eb);
1287        set_extent_bits_nowait(&trans->dirty_pages, eb->start,
1288                               eb->start + eb->len - 1, EXTENT_DIRTY);
1289        memzero_extent_buffer(eb, 0, eb->len);
1290        set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1291
1292        spin_lock(&trans->releasing_ebs_lock);
1293        list_add_tail(&eb->release_list, &trans->releasing_ebs);
1294        spin_unlock(&trans->releasing_ebs_lock);
1295        atomic_inc(&eb->refs);
1296}
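
     /*
      * Rough idea: on a sequential zone a freed tree block cannot simply be
      * skipped, because everything up to the zone's write pointer still has to
      * be written in order. The buffer is therefore zeroed, marked
      * EXTENT_BUFFER_NO_CHECK and kept dirty so it goes out as a dummy write,
      * and it is pinned on the transaction's releasing_ebs list until the
      * transaction completes.
      */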
1297
1298void btrfs_free_redirty_list(struct btrfs_transaction *trans)
1299{
1300        spin_lock(&trans->releasing_ebs_lock);
1301        while (!list_empty(&trans->releasing_ebs)) {
1302                struct extent_buffer *eb;
1303
1304                eb = list_first_entry(&trans->releasing_ebs,
1305                                      struct extent_buffer, release_list);
1306                list_del_init(&eb->release_list);
1307                free_extent_buffer(eb);
1308        }
1309        spin_unlock(&trans->releasing_ebs_lock);
1310}
1311
1312bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
1313{
1314        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1315        struct btrfs_block_group *cache;
1316        bool ret = false;
1317
1318        if (!btrfs_is_zoned(fs_info))
1319                return false;
1320
1321        if (!fs_info->max_zone_append_size)
1322                return false;
1323
1324        if (!is_data_inode(&inode->vfs_inode))
1325                return false;
1326
1327        cache = btrfs_lookup_block_group(fs_info, start);
1328        ASSERT(cache);
1329        if (!cache)
1330                return false;
1331
1332        ret = cache->seq_zone;
1333        btrfs_put_block_group(cache);
1334
1335        return ret;
1336}
1337
1338void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
1339                                 struct bio *bio)
1340{
1341        struct btrfs_ordered_extent *ordered;
1342        const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
1343
1344        if (bio_op(bio) != REQ_OP_ZONE_APPEND)
1345                return;
1346
1347        ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
1348        if (WARN_ON(!ordered))
1349                return;
1350
1351        ordered->physical = physical;
1352        ordered->bdev = bio->bi_bdev;
1353
1354        btrfs_put_ordered_extent(ordered);
1355}
1356
1357void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
1358{
1359        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1360        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1361        struct extent_map_tree *em_tree;
1362        struct extent_map *em;
1363        struct btrfs_ordered_sum *sum;
1364        u64 orig_logical = ordered->disk_bytenr;
1365        u64 *logical = NULL;
1366        int nr, stripe_len;
1367
 1368        /* Zoned block devices should not be partitioned, so the partition offset is assumed to be 0 */
1369        ASSERT(!bdev_is_partition(ordered->bdev));
1370        if (WARN_ON(!ordered->bdev))
1371                return;
1372
1373        if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
1374                                     ordered->physical, &logical, &nr,
1375                                     &stripe_len)))
1376                goto out;
1377
1378        WARN_ON(nr != 1);
1379
1380        if (orig_logical == *logical)
1381                goto out;
1382
1383        ordered->disk_bytenr = *logical;
1384
1385        em_tree = &inode->extent_tree;
1386        write_lock(&em_tree->lock);
1387        em = search_extent_mapping(em_tree, ordered->file_offset,
1388                                   ordered->num_bytes);
1389        em->block_start = *logical;
1390        free_extent_map(em);
1391        write_unlock(&em_tree->lock);
1392
1393        list_for_each_entry(sum, &ordered->list, list) {
1394                if (*logical < orig_logical)
1395                        sum->bytenr -= orig_logical - *logical;
1396                else
1397                        sum->bytenr += *logical - orig_logical;
1398        }
1399
1400out:
1401        kfree(logical);
1402}
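
     /*
      * Background: with REQ_OP_ZONE_APPEND the device chooses the final
      * location inside the zone, so the physical address recorded at submit
      * time may differ from where the data actually landed. The ordered
      * extent's logical address, the cached extent map and the checksum
      * entries are rewritten here to match the real location.
      */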
1403
1404bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1405                                    struct extent_buffer *eb,
1406                                    struct btrfs_block_group **cache_ret)
1407{
1408        struct btrfs_block_group *cache;
1409        bool ret = true;
1410
1411        if (!btrfs_is_zoned(fs_info))
1412                return true;
1413
1414        cache = *cache_ret;
1415
1416        if (cache && (eb->start < cache->start ||
1417                      cache->start + cache->length <= eb->start)) {
1418                btrfs_put_block_group(cache);
1419                cache = NULL;
1420                *cache_ret = NULL;
1421        }
1422
1423        if (!cache)
1424                cache = btrfs_lookup_block_group(fs_info, eb->start);
1425
1426        if (cache) {
1427                if (cache->meta_write_pointer != eb->start) {
1428                        btrfs_put_block_group(cache);
1429                        cache = NULL;
1430                        ret = false;
1431                } else {
1432                        cache->meta_write_pointer = eb->start + eb->len;
1433                }
1434
1435                *cache_ret = cache;
1436        }
1437
1438        return ret;
1439}
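
     /*
      * In other words: metadata in a sequential zone has to be submitted in
      * strictly increasing address order, so an extent buffer may only be
      * written out when it sits exactly at its block group's
      * meta_write_pointer. Returning false asks the caller to hold the buffer
      * back and retry later; on success the pointer is advanced past the
      * buffer.
      */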
1440
1441void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1442                                     struct extent_buffer *eb)
1443{
1444        if (!btrfs_is_zoned(eb->fs_info) || !cache)
1445                return;
1446
1447        ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1448        cache->meta_write_pointer = eb->start;
1449}
1450
1451int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1452{
1453        if (!btrfs_dev_is_sequential(device, physical))
1454                return -EOPNOTSUPP;
1455
1456        return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1457                                    length >> SECTOR_SHIFT, GFP_NOFS, 0);
1458}
1459
1460static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1461                          struct blk_zone *zone)
1462{
1463        struct btrfs_bio *bbio = NULL;
1464        u64 mapped_length = PAGE_SIZE;
1465        unsigned int nofs_flag;
1466        int nmirrors;
1467        int i, ret;
1468
1469        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1470                               &mapped_length, &bbio);
1471        if (ret || !bbio || mapped_length < PAGE_SIZE) {
1472                btrfs_put_bbio(bbio);
1473                return -EIO;
1474        }
1475
 1476        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
 1477                ret = -EINVAL;
                     goto out_put_bbio;
             }
1478
1479        nofs_flag = memalloc_nofs_save();
1480        nmirrors = (int)bbio->num_stripes;
1481        for (i = 0; i < nmirrors; i++) {
1482                u64 physical = bbio->stripes[i].physical;
1483                struct btrfs_device *dev = bbio->stripes[i].dev;
1484
1485                /* Missing device */
1486                if (!dev->bdev)
1487                        continue;
1488
1489                ret = btrfs_get_dev_zone(dev, physical, zone);
1490                /* Failing device */
1491                if (ret == -EIO || ret == -EOPNOTSUPP)
1492                        continue;
1493                break;
1494        }
 1495        memalloc_nofs_restore(nofs_flag);
 1496
     out_put_bbio:
             btrfs_put_bbio(bbio);
 1497        return ret;
1498}
1499
1500/*
 1501 * Synchronize the write pointer in the zone at @physical_start on @tgt_dev
 1502 * by filling zeros from @physical_pos up to the write pointer of the
 1503 * dev-replace source device.
1504 */
1505int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1506                                    u64 physical_start, u64 physical_pos)
1507{
1508        struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1509        struct blk_zone zone;
1510        u64 length;
1511        u64 wp;
1512        int ret;
1513
1514        if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1515                return 0;
1516
1517        ret = read_zone_info(fs_info, logical, &zone);
1518        if (ret)
1519                return ret;
1520
1521        wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1522
1523        if (physical_pos == wp)
1524                return 0;
1525
1526        if (physical_pos > wp)
1527                return -EUCLEAN;
1528
1529        length = wp - physical_pos;
1530        return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1531}
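
     /*
      * During dev-replace the target zone cannot be left with a write pointer
      * behind the source's, since later copies have to land at matching
      * offsets in a sequential zone. Zeroing the gap advances the target's
      * write pointer to match the source.
      */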
1532
1533struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
1534                                            u64 logical, u64 length)
1535{
1536        struct btrfs_device *device;
1537        struct extent_map *em;
1538        struct map_lookup *map;
1539
1540        em = btrfs_get_chunk_map(fs_info, logical, length);
1541        if (IS_ERR(em))
1542                return ERR_CAST(em);
1543
1544        map = em->map_lookup;
1545        /* We only support single profile for now */
1546        ASSERT(map->num_stripes == 1);
1547        device = map->stripes[0].dev;
1548
1549        free_extent_map(em);
1550
1551        return device;
1552}
1553