linux/fs/btrfs/zoned.c
// SPDX-License-Identifier: GPL-2.0

#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/sched/mm.h>
#include <linux/atomic.h>
#include "ctree.h"
#include "volumes.h"
#include "zoned.h"
#include "rcu-string.h"
#include "disk-io.h"
#include "block-group.h"
#include "transaction.h"
#include "dev-replace.h"
#include "space-info.h"

/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES   4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)

/*
 * Location of the first zone of superblock logging zone pairs.
 *
 * - primary superblock:    0B (zone 0)
 * - first copy:          512G (zone starting at that offset)
 * - second copy:           4T (zone starting at that offset)
 */
#define BTRFS_SB_LOG_PRIMARY_OFFSET     (0ULL)
#define BTRFS_SB_LOG_FIRST_OFFSET       (512ULL * SZ_1G)
#define BTRFS_SB_LOG_SECOND_OFFSET      (4096ULL * SZ_1G)

#define BTRFS_SB_LOG_FIRST_SHIFT        const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
#define BTRFS_SB_LOG_SECOND_SHIFT       const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)

/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2

/*
 * Minimum number of active zones we need:
 *
 * - BTRFS_SUPER_MIRROR_MAX zones for superblock mirrors
 * - 3 zones to ensure at least one zone per SYSTEM, META and DATA block group
 * - 1 zone for tree-log dedicated block group
 * - 1 zone for relocation
 */
#define BTRFS_MIN_ACTIVE_ZONES          (BTRFS_SUPER_MIRROR_MAX + 5)

/*
 * Maximum supported zone size. Currently, SMR disks have a zone size of
 * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
 * expect the zone size to become larger than 8GiB in the near future.
 */
#define BTRFS_MAX_ZONE_SIZE             SZ_8G

#define SUPER_INFO_SECTORS      ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)

static inline bool sb_zone_is_full(const struct blk_zone *zone)
{
        return (zone->cond == BLK_ZONE_COND_FULL) ||
                (zone->wp + SUPER_INFO_SECTORS > zone->start + zone->capacity);
}
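
/*
 * Example: with a 4KiB superblock (BTRFS_SUPER_INFO_SIZE) and 512-byte
 * sectors, SUPER_INFO_SECTORS is 8, so sb_zone_is_full() also reports a
 * zone as full once fewer than 8 writable sectors remain before its
 * capacity, because another superblock copy would no longer fit there.
 */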

static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
        struct blk_zone *zones = data;

        memcpy(&zones[idx], zone, sizeof(*zone));

        return 0;
}

static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
                            u64 *wp_ret)
{
        bool empty[BTRFS_NR_SB_LOG_ZONES];
        bool full[BTRFS_NR_SB_LOG_ZONES];
        sector_t sector;
        int i;

        for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                ASSERT(zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL);
                empty[i] = (zones[i].cond == BLK_ZONE_COND_EMPTY);
                full[i] = sb_zone_is_full(&zones[i]);
        }

        /*
         * Possible states of log buffer zones
         *
         *           Empty[0]  In use[0]  Full[0]
         * Empty[1]         *          x        0
         * In use[1]        0          x        0
         * Full[1]          1          1        C
         *
         * Log position:
         *   *: Special case, no superblock is written
         *   0: Use write pointer of zones[0]
         *   1: Use write pointer of zones[1]
         *   C: Compare super blocks from zones[0] and zones[1], use the latest
         *      one determined by generation
         *   x: Invalid state
         */

        if (empty[0] && empty[1]) {
                /* Special case to distinguish no superblock to read */
                *wp_ret = zones[0].start << SECTOR_SHIFT;
                return -ENOENT;
        } else if (full[0] && full[1]) {
                /* Compare two super blocks */
                struct address_space *mapping = bdev->bd_inode->i_mapping;
                struct page *page[BTRFS_NR_SB_LOG_ZONES];
                struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
                int i;

                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                        u64 bytenr;

                        bytenr = ((zones[i].start + zones[i].len)
                                   << SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;

                        page[i] = read_cache_page_gfp(mapping,
                                        bytenr >> PAGE_SHIFT, GFP_NOFS);
                        if (IS_ERR(page[i])) {
                                if (i == 1)
                                        btrfs_release_disk_super(super[0]);
                                return PTR_ERR(page[i]);
                        }
                        super[i] = page_address(page[i]);
                }

                if (super[0]->generation > super[1]->generation)
                        sector = zones[1].start;
                else
                        sector = zones[0].start;

                for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++)
                        btrfs_release_disk_super(super[i]);
        } else if (!full[0] && (empty[1] || full[1])) {
                sector = zones[0].wp;
        } else if (full[0]) {
                sector = zones[1].wp;
        } else {
                return -EUCLEAN;
        }
        *wp_ret = sector << SECTOR_SHIFT;
        return 0;
}
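
/*
 * Example of the state table above: on a freshly formatted pair of log
 * zones, both zones are empty, so sb_write_pointer() returns -ENOENT and
 * no superblock is read. Once both zones are full, the two most recently
 * written superblocks are loaded and their generations compared; the
 * zone holding the older copy is the one whose start is returned, i.e.
 * the one that will be reset and rewritten next.
 */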

/*
 * Get the first zone number of the superblock mirror
 */
static inline u32 sb_zone_number(int shift, int mirror)
{
        u64 zone;

        ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
        switch (mirror) {
        case 0: zone = 0; break;
        case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
        case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
        }

        ASSERT(zone <= U32_MAX);

        return (u32)zone;
}
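
/*
 * Worked example, assuming a typical 256MiB SMR zone size: the zone size
 * shift is 28, BTRFS_SB_LOG_FIRST_SHIFT is 39 (512GiB) and
 * BTRFS_SB_LOG_SECOND_SHIFT is 42 (4TiB), so the superblock log zone
 * pairs start at zones 0, 1ULL << (39 - 28) = 2048 and
 * 1ULL << (42 - 28) = 16384.
 */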

static inline sector_t zone_start_sector(u32 zone_number,
                                         struct block_device *bdev)
{
        return (sector_t)zone_number << ilog2(bdev_zone_sectors(bdev));
}

static inline u64 zone_start_physical(u32 zone_number,
                                      struct btrfs_zoned_device_info *zone_info)
{
        return (u64)zone_number << zone_info->zone_size_shift;
}

/*
 * Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
 * device into fixed-size chunks and fakes a conventional zone on each of
 * them.
 */
static int emulate_report_zones(struct btrfs_device *device, u64 pos,
                                struct blk_zone *zones, unsigned int nr_zones)
{
        const sector_t zone_sectors = device->fs_info->zone_size >> SECTOR_SHIFT;
        sector_t bdev_size = bdev_nr_sectors(device->bdev);
        unsigned int i;

        pos >>= SECTOR_SHIFT;
        for (i = 0; i < nr_zones; i++) {
                zones[i].start = i * zone_sectors + pos;
                zones[i].len = zone_sectors;
                zones[i].capacity = zone_sectors;
                zones[i].wp = zones[i].start + zone_sectors;
                zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
                zones[i].cond = BLK_ZONE_COND_NOT_WP;

                if (zones[i].wp >= bdev_size) {
                        i++;
                        break;
                }
        }

        return i;
}
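
/*
 * Example: on a regular 1GiB device with an emulated zone size of
 * 256MiB, a report starting at pos 0 fills four conventional zones and
 * then stops, because the fourth zone's write pointer reaches the device
 * size; the return value is the number of zones actually filled in.
 */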

static int btrfs_get_dev_zones(struct btrfs_device *device, u64 pos,
                               struct blk_zone *zones, unsigned int *nr_zones)
{
        int ret;

        if (!*nr_zones)
                return 0;

        if (!bdev_is_zoned(device->bdev)) {
                ret = emulate_report_zones(device, pos, zones, *nr_zones);
                *nr_zones = ret;
                return 0;
        }

        ret = blkdev_report_zones(device->bdev, pos >> SECTOR_SHIFT, *nr_zones,
                                  copy_zone_info_cb, zones);
        if (ret < 0) {
                btrfs_err_in_rcu(device->fs_info,
                                 "zoned: failed to read zone %llu on %s (devid %llu)",
                                 pos, rcu_str_deref(device->name),
                                 device->devid);
                return ret;
        }
        *nr_zones = ret;
        if (!ret)
                return -EIO;

        return 0;
}

/* The emulated zone size is determined from the size of the first device extent */
static int calculate_emulated_zone_size(struct btrfs_fs_info *fs_info)
{
        struct btrfs_path *path;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct extent_buffer *leaf;
        struct btrfs_dev_extent *dext;
        int ret = 0;

        key.objectid = 1;
        key.type = BTRFS_DEV_EXTENT_KEY;
        key.offset = 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;

        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
                ret = btrfs_next_leaf(root, path);
                if (ret < 0)
                        goto out;
                /* No dev extents at all? Not good */
                if (ret > 0) {
                        ret = -EUCLEAN;
                        goto out;
                }
        }

        leaf = path->nodes[0];
        dext = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
        fs_info->zone_size = btrfs_dev_extent_length(leaf, dext);
        ret = 0;

out:
        btrfs_free_path(path);

        return ret;
}
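
/*
 * This relies on zoned filesystems allocating device extents at a
 * zone-size granularity, so the length of the first device extent found
 * above matches the zone size the filesystem was created with.
 */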

int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        int ret = 0;

        /* fs_info->zone_size might not be set yet. Use the incompat flag here. */
        if (!btrfs_fs_incompat(fs_info, ZONED))
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* We can skip reading of zone info for missing devices */
                if (!device->bdev)
                        continue;

                ret = btrfs_get_dev_zone_info(device);
                if (ret)
                        break;
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        return ret;
}

int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_zoned_device_info *zone_info = NULL;
        struct block_device *bdev = device->bdev;
        struct request_queue *queue = bdev_get_queue(bdev);
        unsigned int max_active_zones;
        unsigned int nactive;
        sector_t nr_sectors;
        sector_t sector = 0;
        struct blk_zone *zones = NULL;
        unsigned int i, nreported = 0, nr_zones;
        sector_t zone_sectors;
        char *model, *emulated;
        int ret;

        /*
         * Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
         * yet be set.
         */
        if (!btrfs_fs_incompat(fs_info, ZONED))
                return 0;

        if (device->zone_info)
                return 0;

        zone_info = kzalloc(sizeof(*zone_info), GFP_KERNEL);
        if (!zone_info)
                return -ENOMEM;

        if (!bdev_is_zoned(bdev)) {
                if (!fs_info->zone_size) {
                        ret = calculate_emulated_zone_size(fs_info);
                        if (ret)
                                goto out;
                }

                ASSERT(fs_info->zone_size);
                zone_sectors = fs_info->zone_size >> SECTOR_SHIFT;
        } else {
                zone_sectors = bdev_zone_sectors(bdev);
        }

        /* Check if it's power of 2 (see is_power_of_2) */
        ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
        zone_info->zone_size = zone_sectors << SECTOR_SHIFT;

        /* We reject devices with a zone size larger than 8GiB */
        if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
                btrfs_err_in_rcu(fs_info,
                "zoned: %s: zone size %llu larger than supported maximum %llu",
                                 rcu_str_deref(device->name),
                                 zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
                ret = -EINVAL;
                goto out;
        }

        nr_sectors = bdev_nr_sectors(bdev);
        zone_info->zone_size_shift = ilog2(zone_info->zone_size);
        zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
        if (!IS_ALIGNED(nr_sectors, zone_sectors))
                zone_info->nr_zones++;

        max_active_zones = queue_max_active_zones(queue);
        if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
                btrfs_err_in_rcu(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
                                 rcu_str_deref(device->name), max_active_zones,
                                 BTRFS_MIN_ACTIVE_ZONES);
                ret = -EINVAL;
                goto out;
        }
        zone_info->max_active_zones = max_active_zones;

        zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->seq_zones) {
                ret = -ENOMEM;
                goto out;
        }

        zone_info->empty_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->empty_zones) {
                ret = -ENOMEM;
                goto out;
        }

        zone_info->active_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
        if (!zone_info->active_zones) {
                ret = -ENOMEM;
                goto out;
        }

        zones = kcalloc(BTRFS_REPORT_NR_ZONES, sizeof(struct blk_zone), GFP_KERNEL);
        if (!zones) {
                ret = -ENOMEM;
                goto out;
        }

        /* Get zones type */
        nactive = 0;
        while (sector < nr_sectors) {
                nr_zones = BTRFS_REPORT_NR_ZONES;
                ret = btrfs_get_dev_zones(device, sector << SECTOR_SHIFT, zones,
                                          &nr_zones);
                if (ret)
                        goto out;

                for (i = 0; i < nr_zones; i++) {
                        if (zones[i].type == BLK_ZONE_TYPE_SEQWRITE_REQ)
                                __set_bit(nreported, zone_info->seq_zones);
                        switch (zones[i].cond) {
                        case BLK_ZONE_COND_EMPTY:
                                __set_bit(nreported, zone_info->empty_zones);
                                break;
                        case BLK_ZONE_COND_IMP_OPEN:
                        case BLK_ZONE_COND_EXP_OPEN:
                        case BLK_ZONE_COND_CLOSED:
                                __set_bit(nreported, zone_info->active_zones);
                                nactive++;
                                break;
                        }
                        nreported++;
                }
                sector = zones[nr_zones - 1].start + zones[nr_zones - 1].len;
        }

        if (nreported != zone_info->nr_zones) {
                btrfs_err_in_rcu(device->fs_info,
                                 "inconsistent number of zones on %s (%u/%u)",
                                 rcu_str_deref(device->name), nreported,
                                 zone_info->nr_zones);
                ret = -EIO;
                goto out;
        }

        if (max_active_zones) {
                if (nactive > max_active_zones) {
                        btrfs_err_in_rcu(device->fs_info,
                        "zoned: %u active zones on %s exceeds max_active_zones %u",
                                         nactive, rcu_str_deref(device->name),
                                         max_active_zones);
                        ret = -EIO;
                        goto out;
                }
                atomic_set(&zone_info->active_zones_left,
                           max_active_zones - nactive);
        }

        /* Validate superblock log */
        nr_zones = BTRFS_NR_SB_LOG_ZONES;
        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                u32 sb_zone;
                u64 sb_wp;
                int sb_pos = BTRFS_NR_SB_LOG_ZONES * i;

                sb_zone = sb_zone_number(zone_info->zone_size_shift, i);
                if (sb_zone + 1 >= zone_info->nr_zones)
                        continue;

                ret = btrfs_get_dev_zones(device,
                                          zone_start_physical(sb_zone, zone_info),
                                          &zone_info->sb_zones[sb_pos],
                                          &nr_zones);
                if (ret)
                        goto out;

                if (nr_zones != BTRFS_NR_SB_LOG_ZONES) {
                        btrfs_err_in_rcu(device->fs_info,
        "zoned: failed to read super block log zone info at devid %llu zone %u",
                                         device->devid, sb_zone);
                        ret = -EUCLEAN;
                        goto out;
                }

                /*
                 * If zones[0] is conventional, always use the beginning of the
                 * zone to record superblock. No need to validate in that case.
                 */
                if (zone_info->sb_zones[BTRFS_NR_SB_LOG_ZONES * i].type ==
                    BLK_ZONE_TYPE_CONVENTIONAL)
                        continue;

                ret = sb_write_pointer(device->bdev,
                                       &zone_info->sb_zones[sb_pos], &sb_wp);
                if (ret != -ENOENT && ret) {
                        btrfs_err_in_rcu(device->fs_info,
                        "zoned: super block log zone corrupted devid %llu zone %u",
                                         device->devid, sb_zone);
                        ret = -EUCLEAN;
                        goto out;
                }
        }

        kfree(zones);

        device->zone_info = zone_info;

        switch (bdev_zoned_model(bdev)) {
        case BLK_ZONED_HM:
                model = "host-managed zoned";
                emulated = "";
                break;
        case BLK_ZONED_HA:
                model = "host-aware zoned";
                emulated = "";
                break;
        case BLK_ZONED_NONE:
                model = "regular";
                emulated = "emulated ";
                break;
        default:
                /* Just in case */
                btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
                                 bdev_zoned_model(bdev),
                                 rcu_str_deref(device->name));
                ret = -EOPNOTSUPP;
                goto out_free_zone_info;
        }

        btrfs_info_in_rcu(fs_info,
                "%s block device %s, %u %szones of %llu bytes",
                model, rcu_str_deref(device->name), zone_info->nr_zones,
                emulated, zone_info->zone_size);

        return 0;

out:
        kfree(zones);
out_free_zone_info:
        bitmap_free(zone_info->active_zones);
        bitmap_free(zone_info->empty_zones);
        bitmap_free(zone_info->seq_zones);
        kfree(zone_info);
        device->zone_info = NULL;

        return ret;
}

void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
        struct btrfs_zoned_device_info *zone_info = device->zone_info;

        if (!zone_info)
                return;

        bitmap_free(zone_info->active_zones);
        bitmap_free(zone_info->seq_zones);
        bitmap_free(zone_info->empty_zones);
        kfree(zone_info);
        device->zone_info = NULL;
}

int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
                       struct blk_zone *zone)
{
        unsigned int nr_zones = 1;
        int ret;

        ret = btrfs_get_dev_zones(device, pos, zone, &nr_zones);
        if (ret != 0 || !nr_zones)
                return ret ? ret : -EIO;

        return 0;
}

int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
        struct btrfs_device *device;
        u64 zoned_devices = 0;
        u64 nr_devices = 0;
        u64 zone_size = 0;
        const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
        int ret = 0;

        /* Count zoned devices */
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                enum blk_zoned_model model;

                if (!device->bdev)
                        continue;

                model = bdev_zoned_model(device->bdev);
                /*
                 * A Host-Managed zoned device must be used as a zoned device.
                 * Host-Aware zoned devices and non-zoned devices can be
                 * treated as zoned devices, if the ZONED flag is enabled in
                 * the superblock.
                 */
                if (model == BLK_ZONED_HM ||
                    (model == BLK_ZONED_HA && incompat_zoned) ||
                    (model == BLK_ZONED_NONE && incompat_zoned)) {
                        struct btrfs_zoned_device_info *zone_info =
                                device->zone_info;

                        zoned_devices++;
                        if (!zone_size) {
                                zone_size = zone_info->zone_size;
                        } else if (zone_info->zone_size != zone_size) {
                                btrfs_err(fs_info,
                "zoned: unequal block device zone sizes: have %llu found %llu",
                                          device->zone_info->zone_size,
                                          zone_size);
                                ret = -EINVAL;
                                goto out;
                        }
                }
                nr_devices++;
        }

        if (!zoned_devices && !incompat_zoned)
                goto out;

        if (!zoned_devices && incompat_zoned) {
                /* No zoned block device found on ZONED filesystem */
                btrfs_err(fs_info,
                          "zoned: no zoned devices found on a zoned filesystem");
                ret = -EINVAL;
                goto out;
        }

        if (zoned_devices && !incompat_zoned) {
                btrfs_err(fs_info,
                          "zoned: mode not enabled but zoned device found");
                ret = -EINVAL;
                goto out;
        }

        if (zoned_devices != nr_devices) {
                btrfs_err(fs_info,
                          "zoned: cannot mix zoned and regular devices");
                ret = -EINVAL;
                goto out;
        }

        /*
         * stripe_size is always aligned to BTRFS_STRIPE_LEN in
         * btrfs_create_chunk(). Since we want stripe_len == zone_size,
         * check the alignment here.
         */
        if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
                btrfs_err(fs_info,
                          "zoned: zone size %llu not aligned to stripe %u",
                          zone_size, BTRFS_STRIPE_LEN);
                ret = -EINVAL;
                goto out;
        }

        if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
                btrfs_err(fs_info, "zoned: mixed block groups not supported");
                ret = -EINVAL;
                goto out;
        }

        fs_info->zone_size = zone_size;
        fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;

        /*
         * Check mount options here, because we might change fs_info->zoned
         * from fs_info->zone_size.
         */
        ret = btrfs_check_mountopts_zoned(fs_info);
        if (ret)
                goto out;

        btrfs_info(fs_info, "zoned mode enabled with zone size %llu", zone_size);
out:
        return ret;
}

int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
        if (!btrfs_is_zoned(info))
                return 0;

        /*
         * Space cache writing is not COWed. Disable that to avoid write errors
         * in sequential zones.
         */
        if (btrfs_test_opt(info, SPACE_CACHE)) {
                btrfs_err(info, "zoned: space cache v1 is not supported");
                return -EINVAL;
        }

        if (btrfs_test_opt(info, NODATACOW)) {
                btrfs_err(info, "zoned: NODATACOW not supported");
                return -EINVAL;
        }

        return 0;
}

static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
                           int rw, u64 *bytenr_ret)
{
        u64 wp;
        int ret;

        if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
                *bytenr_ret = zones[0].start << SECTOR_SHIFT;
                return 0;
        }

        ret = sb_write_pointer(bdev, zones, &wp);
        if (ret != -ENOENT && ret < 0)
                return ret;

        if (rw == WRITE) {
                struct blk_zone *reset = NULL;

                if (wp == zones[0].start << SECTOR_SHIFT)
                        reset = &zones[0];
                else if (wp == zones[1].start << SECTOR_SHIFT)
                        reset = &zones[1];

                if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
                        ASSERT(sb_zone_is_full(reset));

                        ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
                                               reset->start, reset->len,
                                               GFP_NOFS);
                        if (ret)
                                return ret;

                        reset->cond = BLK_ZONE_COND_EMPTY;
                        reset->wp = reset->start;
                }
        } else if (ret != -ENOENT) {
                /*
                 * For READ, we want the previous one. Move write pointer to
                 * the end of a zone, if it is at the head of a zone.
                 */
                u64 zone_end = 0;

                if (wp == zones[0].start << SECTOR_SHIFT)
                        zone_end = zones[1].start + zones[1].capacity;
                else if (wp == zones[1].start << SECTOR_SHIFT)
                        zone_end = zones[0].start + zones[0].capacity;
                if (zone_end)
                        wp = ALIGN_DOWN(zone_end << SECTOR_SHIFT,
                                        BTRFS_SUPER_INFO_SIZE);

                wp -= BTRFS_SUPER_INFO_SIZE;
        }

        *bytenr_ret = wp;
        return 0;
}
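
/*
 * Note on the READ case above: sb_write_pointer() returns where the
 * *next* superblock would be written, so for reads the copy one
 * BTRFS_SUPER_INFO_SIZE before it is wanted. If the write pointer sits
 * at the head of a zone, the previous copy lives at the tail of the
 * other zone, which is why the pointer is first moved to that zone's
 * (aligned) capacity end before stepping back.
 */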

int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
                               u64 *bytenr_ret)
{
        struct blk_zone zones[BTRFS_NR_SB_LOG_ZONES];
        sector_t zone_sectors;
        u32 sb_zone;
        int ret;
        u8 zone_sectors_shift;
        sector_t nr_sectors;
        u32 nr_zones;

        if (!bdev_is_zoned(bdev)) {
                *bytenr_ret = btrfs_sb_offset(mirror);
                return 0;
        }

        ASSERT(rw == READ || rw == WRITE);

        zone_sectors = bdev_zone_sectors(bdev);
        if (!is_power_of_2(zone_sectors))
                return -EINVAL;
        zone_sectors_shift = ilog2(zone_sectors);
        nr_sectors = bdev_nr_sectors(bdev);
        nr_zones = nr_sectors >> zone_sectors_shift;

        sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
        if (sb_zone + 1 >= nr_zones)
                return -ENOENT;

        ret = blkdev_report_zones(bdev, zone_start_sector(sb_zone, bdev),
                                  BTRFS_NR_SB_LOG_ZONES, copy_zone_info_cb,
                                  zones);
        if (ret < 0)
                return ret;
        if (ret != BTRFS_NR_SB_LOG_ZONES)
                return -EIO;

        return sb_log_location(bdev, zones, rw, bytenr_ret);
}

int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
                          u64 *bytenr_ret)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        u32 zone_num;

        /*
         * For a zoned filesystem on a non-zoned block device, use the same
         * super block locations as a regular filesystem. Doing so, the super
         * block can always be retrieved and the zoned flag of the volume
         * detected from the super block information.
         */
        if (!bdev_is_zoned(device->bdev)) {
                *bytenr_ret = btrfs_sb_offset(mirror);
                return 0;
        }

        zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
        if (zone_num + 1 >= zinfo->nr_zones)
                return -ENOENT;

        return sb_log_location(device->bdev,
                               &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror],
                               rw, bytenr_ret);
}

static inline bool is_sb_log_zone(struct btrfs_zoned_device_info *zinfo,
                                  int mirror)
{
        u32 zone_num;

        if (!zinfo)
                return false;

        zone_num = sb_zone_number(zinfo->zone_size_shift, mirror);
        if (zone_num + 1 >= zinfo->nr_zones)
                return false;

        if (!test_bit(zone_num, zinfo->seq_zones))
                return false;

        return true;
}

int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        struct blk_zone *zone;
        int i;

        if (!is_sb_log_zone(zinfo, mirror))
                return 0;

        zone = &zinfo->sb_zones[BTRFS_NR_SB_LOG_ZONES * mirror];
        for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
                /* Advance to the next zone */
                if (zone->cond == BLK_ZONE_COND_FULL) {
                        zone++;
                        continue;
                }

                if (zone->cond == BLK_ZONE_COND_EMPTY)
                        zone->cond = BLK_ZONE_COND_IMP_OPEN;

                zone->wp += SUPER_INFO_SECTORS;

                if (sb_zone_is_full(zone)) {
                        /*
                         * No room left to write new superblock. Since
                         * superblock is written with REQ_SYNC, it is safe to
                         * finish the zone now.
                         *
                         * If the write pointer is exactly at the capacity,
                         * explicit ZONE_FINISH is not necessary.
                         */
                        if (zone->wp != zone->start + zone->capacity) {
                                int ret;

                                ret = blkdev_zone_mgmt(device->bdev,
                                                REQ_OP_ZONE_FINISH, zone->start,
                                                zone->len, GFP_NOFS);
                                if (ret)
                                        return ret;
                        }

                        zone->wp = zone->start + zone->len;
                        zone->cond = BLK_ZONE_COND_FULL;
                }
                return 0;
        }

        /* All the zones are FULL. Should not reach here. */
        ASSERT(0);
        return -EIO;
}
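
/*
 * Example flow: while zone 0 of the pair has room, each superblock write
 * advances zone->wp by SUPER_INFO_SECTORS. Once fewer than
 * SUPER_INFO_SECTORS remain, the zone is finished (explicitly, unless
 * the write pointer already equals the capacity) and marked FULL, and
 * subsequent writes advance zone 1. Only when both zones are FULL is the
 * -EIO path reached, which callers should never trigger.
 */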

int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
        sector_t zone_sectors;
        sector_t nr_sectors;
        u8 zone_sectors_shift;
        u32 sb_zone;
        u32 nr_zones;

        zone_sectors = bdev_zone_sectors(bdev);
        zone_sectors_shift = ilog2(zone_sectors);
        nr_sectors = bdev_nr_sectors(bdev);
        nr_zones = nr_sectors >> zone_sectors_shift;

        sb_zone = sb_zone_number(zone_sectors_shift + SECTOR_SHIFT, mirror);
        if (sb_zone + 1 >= nr_zones)
                return -ENOENT;

        return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
                                zone_start_sector(sb_zone, bdev),
                                zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
}

/**
 * btrfs_find_allocatable_zones - find allocatable zones within a given region
 *
 * @device:     the device to allocate a region on
 * @hole_start: the position of the hole to allocate the region
 * @hole_end:   the end of the hole
 * @num_bytes:  size of wanted region
 * @return:     position of allocatable zones
 *
 * Allocatable region should not contain any superblock locations.
 */
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
                                 u64 hole_end, u64 num_bytes)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        const u8 shift = zinfo->zone_size_shift;
        u64 nzones = num_bytes >> shift;
        u64 pos = hole_start;
        u64 begin, end;
        bool have_sb;
        int i;

        ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
        ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));

        while (pos < hole_end) {
                begin = pos >> shift;
                end = begin + nzones;

                if (end > zinfo->nr_zones)
                        return hole_end;

                /* Check if zones in the region are all empty */
                if (btrfs_dev_is_sequential(device, pos) &&
                    find_next_zero_bit(zinfo->empty_zones, end, begin) != end) {
                        pos += zinfo->zone_size;
                        continue;
                }

                have_sb = false;
                for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
                        u32 sb_zone;
                        u64 sb_pos;

                        sb_zone = sb_zone_number(shift, i);
                        if (!(end <= sb_zone ||
                              sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
                                have_sb = true;
                                pos = zone_start_physical(
                                        sb_zone + BTRFS_NR_SB_LOG_ZONES, zinfo);
                                break;
                        }

                        /* We also need to exclude regular superblock positions */
                        sb_pos = btrfs_sb_offset(i);
                        if (!(pos + num_bytes <= sb_pos ||
                              sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
                                have_sb = true;
                                pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
                                            zinfo->zone_size);
                                break;
                        }
                }
                if (!have_sb)
                        break;
        }

        return pos;
}
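
/*
 * Worked example, assuming 256MiB zones: a candidate region overlapping
 * the superblock log pair at zone 2048 (the 512GiB mirror) is pushed
 * forward to zone 2050, the first zone past the two log zones, and the
 * scan continues from there.
 */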

static bool btrfs_dev_set_active_zone(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zone_info = device->zone_info;
        unsigned int zno = (pos >> zone_info->zone_size_shift);

        /* We can use any number of zones */
        if (zone_info->max_active_zones == 0)
                return true;

        if (!test_bit(zno, zone_info->active_zones)) {
                /* Active zone left? */
                if (atomic_dec_if_positive(&zone_info->active_zones_left) < 0)
                        return false;
                if (test_and_set_bit(zno, zone_info->active_zones)) {
                        /* Someone already set the bit */
                        atomic_inc(&zone_info->active_zones_left);
                }
        }

        return true;
}
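
/*
 * Concurrency note: the counter is decremented before the bit is set, so
 * two racing callers cannot both consume the last slot. If the bit turns
 * out to be already set, the speculatively taken slot is handed back via
 * atomic_inc(). The function only fails when the device really has no
 * active-zone budget left.
 */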

static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
{
        struct btrfs_zoned_device_info *zone_info = device->zone_info;
        unsigned int zno = (pos >> zone_info->zone_size_shift);

        /* We can use any number of zones */
        if (zone_info->max_active_zones == 0)
                return;

        if (test_and_clear_bit(zno, zone_info->active_zones))
                atomic_inc(&zone_info->active_zones_left);
}

int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
                            u64 length, u64 *bytes)
{
        int ret;

        *bytes = 0;
        ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
                               physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
                               GFP_NOFS);
        if (ret)
                return ret;

        *bytes = length;
        while (length) {
                btrfs_dev_set_zone_empty(device, physical);
                btrfs_dev_clear_active_zone(device, physical);
                physical += device->zone_info->zone_size;
                length -= device->zone_info->zone_size;
        }

        return 0;
}

int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
        struct btrfs_zoned_device_info *zinfo = device->zone_info;
        const u8 shift = zinfo->zone_size_shift;
        unsigned long begin = start >> shift;
        unsigned long end = (start + size) >> shift;
        u64 pos;
        int ret;

        ASSERT(IS_ALIGNED(start, zinfo->zone_size));
        ASSERT(IS_ALIGNED(size, zinfo->zone_size));

        if (end > zinfo->nr_zones)
                return -ERANGE;

        /* All the zones are conventional */
        if (find_next_bit(zinfo->seq_zones, end, begin) == end)
                return 0;

        /* All the zones are sequential and empty */
        if (find_next_zero_bit(zinfo->seq_zones, end, begin) == end &&
            find_next_zero_bit(zinfo->empty_zones, end, begin) == end)
                return 0;

        for (pos = start; pos < start + size; pos += zinfo->zone_size) {
                u64 reset_bytes;

                if (!btrfs_dev_is_sequential(device, pos) ||
                    btrfs_dev_is_empty_zone(device, pos))
                        continue;

                /* Free regions should be empty */
                btrfs_warn_in_rcu(
                        device->fs_info,
                "zoned: resetting device %s (devid %llu) zone %llu for allocation",
                        rcu_str_deref(device->name), device->devid, pos >> shift);
                WARN_ON_ONCE(1);

                ret = btrfs_reset_device_zone(device, pos, zinfo->zone_size,
                                              &reset_bytes);
                if (ret)
                        return ret;
        }

        return 0;
}

/*
 * Calculate an allocation pointer from the extent allocation information
 * for a block group consisting of conventional zones. It points to the end
 * of the highest-addressed extent in the block group as the allocation
 * offset.
 */
static int calculate_alloc_pointer(struct btrfs_block_group *cache,
                                   u64 *offset_ret)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct btrfs_root *root = fs_info->extent_root;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_key found_key;
        int ret;
        u64 length;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = cache->start + cache->length;
        key.type = 0;
        key.offset = 0;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        /* We should not find the exact match */
        if (!ret)
                ret = -EUCLEAN;
        if (ret < 0)
                goto out;

        ret = btrfs_previous_extent_item(root, path, cache->start);
        if (ret) {
                if (ret == 1) {
                        ret = 0;
                        *offset_ret = 0;
                }
                goto out;
        }

        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);

        if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
                length = found_key.offset;
        else
                length = fs_info->nodesize;

        if (!(found_key.objectid >= cache->start &&
               found_key.objectid + length <= cache->start + cache->length)) {
                ret = -EUCLEAN;
                goto out;
        }
        *offset_ret = found_key.objectid + length - cache->start;
        ret = 0;

out:
        btrfs_free_path(path);
        return ret;
}

int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
        struct btrfs_fs_info *fs_info = cache->fs_info;
        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
        struct extent_map *em;
        struct map_lookup *map;
        struct btrfs_device *device;
        u64 logical = cache->start;
        u64 length = cache->length;
        u64 physical = 0;
        int ret;
        int i;
        unsigned int nofs_flag;
        u64 *alloc_offsets = NULL;
        u64 *caps = NULL;
        unsigned long *active = NULL;
        u64 last_alloc = 0;
        u32 num_sequential = 0, num_conventional = 0;

        if (!btrfs_is_zoned(fs_info))
                return 0;

        /* Sanity check */
        if (!IS_ALIGNED(length, fs_info->zone_size)) {
                btrfs_err(fs_info,
                "zoned: block group %llu len %llu unaligned to zone size %llu",
                          logical, length, fs_info->zone_size);
                return -EIO;
        }

        /* Get the chunk mapping */
        read_lock(&em_tree->lock);
        em = lookup_extent_mapping(em_tree, logical, length);
        read_unlock(&em_tree->lock);

        if (!em)
                return -EINVAL;

        map = em->map_lookup;

        cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
        if (!cache->physical_map) {
                ret = -ENOMEM;
                goto out;
        }

        alloc_offsets = kcalloc(map->num_stripes, sizeof(*alloc_offsets), GFP_NOFS);
        if (!alloc_offsets) {
                ret = -ENOMEM;
                goto out;
        }

        caps = kcalloc(map->num_stripes, sizeof(*caps), GFP_NOFS);
        if (!caps) {
                ret = -ENOMEM;
                goto out;
        }

        active = bitmap_zalloc(map->num_stripes, GFP_NOFS);
        if (!active) {
                ret = -ENOMEM;
                goto out;
        }

        for (i = 0; i < map->num_stripes; i++) {
                bool is_sequential;
                struct blk_zone zone;
                struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
                int dev_replace_is_ongoing = 0;

                device = map->stripes[i].dev;
                physical = map->stripes[i].physical;

                if (device->bdev == NULL) {
                        alloc_offsets[i] = WP_MISSING_DEV;
                        continue;
                }

                is_sequential = btrfs_dev_is_sequential(device, physical);
                if (is_sequential)
                        num_sequential++;
                else
                        num_conventional++;

                if (!is_sequential) {
                        alloc_offsets[i] = WP_CONVENTIONAL;
                        continue;
                }

                /*
                 * This zone will be used for allocation, so mark this zone
                 * non-empty.
                 */
                btrfs_dev_clear_zone_empty(device, physical);

                down_read(&dev_replace->rwsem);
                dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
                if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
                        btrfs_dev_clear_zone_empty(dev_replace->tgtdev, physical);
                up_read(&dev_replace->rwsem);

                /*
                 * The group is mapped to a sequential zone. Get the zone write
                 * pointer to determine the allocation offset within the zone.
                 */
                WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
                nofs_flag = memalloc_nofs_save();
                ret = btrfs_get_dev_zone(device, physical, &zone);
                memalloc_nofs_restore(nofs_flag);
                if (ret == -EIO || ret == -EOPNOTSUPP) {
                        ret = 0;
                        alloc_offsets[i] = WP_MISSING_DEV;
                        continue;
                } else if (ret) {
                        goto out;
                }

                if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
                        btrfs_err_in_rcu(fs_info,
        "zoned: unexpected conventional zone %llu on device %s (devid %llu)",
                                zone.start << SECTOR_SHIFT,
                                rcu_str_deref(device->name), device->devid);
                        ret = -EIO;
                        goto out;
                }

                caps[i] = (zone.capacity << SECTOR_SHIFT);

                switch (zone.cond) {
                case BLK_ZONE_COND_OFFLINE:
                case BLK_ZONE_COND_READONLY:
                        btrfs_err(fs_info,
                "zoned: offline/readonly zone %llu on device %s (devid %llu)",
                                  physical >> device->zone_info->zone_size_shift,
                                  rcu_str_deref(device->name), device->devid);
                        alloc_offsets[i] = WP_MISSING_DEV;
                        break;
                case BLK_ZONE_COND_EMPTY:
                        alloc_offsets[i] = 0;
                        break;
                case BLK_ZONE_COND_FULL:
                        alloc_offsets[i] = caps[i];
                        break;
                default:
                        /* Partially used zone */
                        alloc_offsets[i] =
                                        ((zone.wp - zone.start) << SECTOR_SHIFT);
                        __set_bit(i, active);
                        break;
                }

                /*
                 * Consider a zone as active if we can allow any number of
                 * active zones.
                 */
                if (!device->zone_info->max_active_zones)
                        __set_bit(i, active);
        }

        if (num_sequential > 0)
                cache->seq_zone = true;

        if (num_conventional > 0) {
                /*
                 * Avoid calling calculate_alloc_pointer() for a new block
                 * group: the allocation offset of a new block group is
                 * always 0.
                 *
                 * Also, we have a lock chain of extent buffer lock ->
                 * chunk mutex. For a new block group, this function is
                 * called from btrfs_make_block_group() which is already
                 * taking the chunk mutex. Thus, we cannot call
                 * calculate_alloc_pointer(), which takes extent buffer
                 * locks, without risking a deadlock.
                 */

                /* Zone capacity is always zone size in emulation */
                cache->zone_capacity = cache->length;
                if (new) {
                        cache->alloc_offset = 0;
                        goto out;
                }
                ret = calculate_alloc_pointer(cache, &last_alloc);
                if (ret || map->num_stripes == num_conventional) {
                        if (!ret)
                                cache->alloc_offset = last_alloc;
                        else
                                btrfs_err(fs_info,
                        "zoned: failed to determine allocation offset of bg %llu",
                                          cache->start);
                        goto out;
                }
        }

        switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
        case 0: /* single */
                if (alloc_offsets[0] == WP_MISSING_DEV) {
                        btrfs_err(fs_info,
                        "zoned: cannot recover write pointer for zone %llu",
                                physical);
                        ret = -EIO;
                        goto out;
                }
                cache->alloc_offset = alloc_offsets[0];
                cache->zone_capacity = caps[0];
                cache->zone_is_active = test_bit(0, active);
                break;
        case BTRFS_BLOCK_GROUP_DUP:
        case BTRFS_BLOCK_GROUP_RAID1:
        case BTRFS_BLOCK_GROUP_RAID0:
        case BTRFS_BLOCK_GROUP_RAID10:
        case BTRFS_BLOCK_GROUP_RAID5:
        case BTRFS_BLOCK_GROUP_RAID6:
                /* non-single profiles are not supported yet */
        default:
                btrfs_err(fs_info, "zoned: profile %s not yet supported",
                          btrfs_bg_type_to_raid_name(map->type));
                ret = -EINVAL;
                goto out;
        }

        if (cache->zone_is_active) {
                btrfs_get_block_group(cache);
                spin_lock(&fs_info->zone_active_bgs_lock);
                list_add_tail(&cache->active_bg_list, &fs_info->zone_active_bgs);
                spin_unlock(&fs_info->zone_active_bgs_lock);
        }

out:
        if (cache->alloc_offset > fs_info->zone_size) {
                btrfs_err(fs_info,
                        "zoned: invalid write pointer %llu in block group %llu",
                        cache->alloc_offset, cache->start);
                ret = -EIO;
        }

        if (cache->alloc_offset > cache->zone_capacity) {
                btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
                          cache->alloc_offset, cache->zone_capacity,
                          cache->start);
                ret = -EIO;
        }

        /* An extent is allocated after the write pointer */
        if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
                btrfs_err(fs_info,
                          "zoned: got wrong write pointer in BG %llu: %llu > %llu",
                          logical, last_alloc, cache->alloc_offset);
                ret = -EIO;
        }

        if (!ret)
                cache->meta_write_pointer = cache->alloc_offset + cache->start;

        if (ret) {
                kfree(cache->physical_map);
                cache->physical_map = NULL;
        }
        bitmap_free(active);
        kfree(caps);
        kfree(alloc_offsets);
        free_extent_map(em);

        return ret;
}

void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
        u64 unusable, free;

        if (!btrfs_is_zoned(cache->fs_info))
                return;

        WARN_ON(cache->bytes_super != 0);
        unusable = (cache->alloc_offset - cache->used) +
                   (cache->length - cache->zone_capacity);
        free = cache->zone_capacity - cache->alloc_offset;

        /* We only need ->free_space in ALLOC_SEQ block groups */
        cache->last_byte_to_unpin = (u64)-1;
        cache->cached = BTRFS_CACHE_FINISHED;
        cache->free_space_ctl->free_space = free;
        cache->zone_unusable = unusable;
}
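
/*
 * Numeric example with made-up values: for a block group with length
 * 256MiB, zone_capacity 192MiB, alloc_offset 128MiB and used 96MiB,
 * unusable is (128 - 96) + (256 - 192) = 96MiB and free is
 * 192 - 128 = 64MiB. Space between used and alloc_offset can only be
 * reclaimed by resetting the zone, hence it counts as unusable.
 */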
1442
1443void btrfs_redirty_list_add(struct btrfs_transaction *trans,
1444                            struct extent_buffer *eb)
1445{
1446        struct btrfs_fs_info *fs_info = eb->fs_info;
1447
1448        if (!btrfs_is_zoned(fs_info) ||
1449            btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN) ||
1450            !list_empty(&eb->release_list))
1451                return;
1452
1453        set_extent_buffer_dirty(eb);
1454        set_extent_bits_nowait(&trans->dirty_pages, eb->start,
1455                               eb->start + eb->len - 1, EXTENT_DIRTY);
1456        memzero_extent_buffer(eb, 0, eb->len);
1457        set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
1458
1459        spin_lock(&trans->releasing_ebs_lock);
1460        list_add_tail(&eb->release_list, &trans->releasing_ebs);
1461        spin_unlock(&trans->releasing_ebs_lock);
1462        atomic_inc(&eb->refs);
1463}
1464
1465void btrfs_free_redirty_list(struct btrfs_transaction *trans)
1466{
1467        spin_lock(&trans->releasing_ebs_lock);
1468        while (!list_empty(&trans->releasing_ebs)) {
1469                struct extent_buffer *eb;
1470
1471                eb = list_first_entry(&trans->releasing_ebs,
1472                                      struct extent_buffer, release_list);
1473                list_del_init(&eb->release_list);
1474                free_extent_buffer(eb);
1475        }
1476        spin_unlock(&trans->releasing_ebs_lock);
1477}
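
/*
 * Usage sketch (illustrative): a zone's write pointer only moves forward, so
 * a freed tree block cannot simply be skipped during writeback. Callers that
 * free a metadata buffer re-dirty it as zeroes so the write pointer still
 * advances over its range, and the extra reference taken above is dropped
 * when the transaction commits:
 *
 *	btrfs_redirty_list_add(trans->transaction, eb);
 *	...
 *	btrfs_free_redirty_list(cur_trans);	// at commit time
 */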
1478
1479bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
1480{
1481        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1482        struct btrfs_block_group *cache;
1483        bool ret = false;
1484
1485        if (!btrfs_is_zoned(fs_info))
1486                return false;
1487
1488        if (!is_data_inode(&inode->vfs_inode))
1489                return false;
1490
1491        /*
1492         * Using REQ_OP_ZONE_APPEND for relocation can break the assumptions
1493         * the relocation code makes about the extent layout.
1494         * Furthermore, we have set aside our own block group from which only
1495         * the relocation "process" can allocate, and we make sure only one
1496         * process at a time can add pages to an extent that gets relocated,
1497         * so it's safe to use regular REQ_OP_WRITE for this special case.
1498         */
1499        if (btrfs_is_data_reloc_root(inode->root))
1500                return false;
1501
1502        cache = btrfs_lookup_block_group(fs_info, start);
1503        ASSERT(cache);
1504        if (!cache)
1505                return false;
1506
1507        ret = cache->seq_zone;
1508        btrfs_put_block_group(cache);
1509
1510        return ret;
1511}
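
/*
 * Caller sketch (illustrative; example_pick_bio_op() is a hypothetical
 * helper, not part of btrfs): the result of btrfs_use_zone_append() selects
 * the bio operation for a data write, letting the device pick the actual
 * location within the zone when zone append is usable:
 */
static inline unsigned int example_pick_bio_op(struct btrfs_inode *inode,
					       u64 logical)
{
	/* Zone append results must be remapped on completion, see below */
	return btrfs_use_zone_append(inode, logical) ?
		REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
}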
1512
1513void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
1514                                 struct bio *bio)
1515{
1516        struct btrfs_ordered_extent *ordered;
1517        const u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
1518
1519        if (bio_op(bio) != REQ_OP_ZONE_APPEND)
1520                return;
1521
1522        ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), file_offset);
1523        if (WARN_ON(!ordered))
1524                return;
1525
1526        ordered->physical = physical;
1527        ordered->bdev = bio->bi_bdev;
1528
1529        btrfs_put_ordered_extent(ordered);
1530}
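
/*
 * Note (illustrative): for a REQ_OP_ZONE_APPEND bio, the block layer hands
 * back the sector the device actually wrote in bio->bi_iter.bi_sector at
 * completion time, so the physical address recorded here is the real
 * on-disk location rather than the position originally requested.
 */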
1531
1532void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
1533{
1534        struct btrfs_inode *inode = BTRFS_I(ordered->inode);
1535        struct btrfs_fs_info *fs_info = inode->root->fs_info;
1536        struct extent_map_tree *em_tree;
1537        struct extent_map *em;
1538        struct btrfs_ordered_sum *sum;
1539        u64 orig_logical = ordered->disk_bytenr;
1540        u64 *logical = NULL;
1541        int nr, stripe_len;
1542
1543        if (WARN_ON(!ordered->bdev))
1544                return;
1545        /* Zoned devices should not have partitions, so the partition offset is 0 */
1546        ASSERT(!bdev_is_partition(ordered->bdev));
1547
1548        if (WARN_ON(btrfs_rmap_block(fs_info, orig_logical, ordered->bdev,
1549                                     ordered->physical, &logical, &nr,
1550                                     &stripe_len)))
1551                goto out;
1552
1553        WARN_ON(nr != 1);
1554
1555        if (orig_logical == *logical)
1556                goto out;
1557
1558        ordered->disk_bytenr = *logical;
1559
1560        em_tree = &inode->extent_tree;
1561        write_lock(&em_tree->lock);
1562        em = search_extent_mapping(em_tree, ordered->file_offset,
1563                                   ordered->num_bytes);
1564        em->block_start = *logical;
1565        free_extent_map(em);
1566        write_unlock(&em_tree->lock);
1567
1568        list_for_each_entry(sum, &ordered->list, list) {
1569                if (*logical < orig_logical)
1570                        sum->bytenr -= orig_logical - *logical;
1571                else
1572                        sum->bytenr += *logical - orig_logical;
1573        }
1574
1575out:
1576        kfree(logical);
1577}
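
/*
 * Worked example (illustrative numbers): if an ordered extent was created at
 * logical 1000MiB but the device placed the zone append such that the
 * physical location maps back to logical 1004MiB, then disk_bytenr becomes
 * 1004MiB, the cached extent map gets block_start = 1004MiB, and every
 * checksum record is shifted by +4MiB so csum lookups match what is actually
 * on disk.
 */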
1578
1579bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
1580                                    struct extent_buffer *eb,
1581                                    struct btrfs_block_group **cache_ret)
1582{
1583        struct btrfs_block_group *cache;
1584        bool ret = true;
1585
1586        if (!btrfs_is_zoned(fs_info))
1587                return true;
1588
1589        cache = *cache_ret;
1590
1591        if (cache && (eb->start < cache->start ||
1592                      cache->start + cache->length <= eb->start)) {
1593                btrfs_put_block_group(cache);
1594                cache = NULL;
1595                *cache_ret = NULL;
1596        }
1597
1598        if (!cache)
1599                cache = btrfs_lookup_block_group(fs_info, eb->start);
1600
1601        if (cache) {
1602                if (cache->meta_write_pointer != eb->start) {
1603                        btrfs_put_block_group(cache);
1604                        cache = NULL;
1605                        ret = false;
1606                } else {
1607                        cache->meta_write_pointer = eb->start + eb->len;
1608                }
1609
1610                *cache_ret = cache;
1611        }
1612
1613        return ret;
1614}
1615
1616void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
1617                                     struct extent_buffer *eb)
1618{
1619        if (!btrfs_is_zoned(eb->fs_info) || !cache)
1620                return;
1621
1622        ASSERT(cache->meta_write_pointer == eb->start + eb->len);
1623        cache->meta_write_pointer = eb->start;
1624}
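
/*
 * Usage sketch (illustrative pseudo-code): metadata writeback must submit
 * extent buffers strictly in write-pointer order. A buffer whose start does
 * not match meta_write_pointer is deferred, and a failed submission rolls
 * the pointer back:
 *
 *	if (!btrfs_check_meta_write_pointer(fs_info, eb, &cache)) {
 *		// not this buffer's turn yet, retry it later
 *	} else if (write_fails) {
 *		btrfs_revert_meta_write_pointer(cache, eb);
 *	}
 */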
1625
1626int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length)
1627{
1628        if (!btrfs_dev_is_sequential(device, physical))
1629                return -EOPNOTSUPP;
1630
1631        return blkdev_issue_zeroout(device->bdev, physical >> SECTOR_SHIFT,
1632                                    length >> SECTOR_SHIFT, GFP_NOFS, 0);
1633}
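
/*
 * Usage sketch (illustrative; example_zero_or_skip() is a hypothetical
 * wrapper, not part of btrfs): zeroing is only meaningful on sequential
 * zones, where the zeros also advance the write pointer; a conventional
 * zone reports -EOPNOTSUPP and the caller can write in place instead:
 */
static int example_zero_or_skip(struct btrfs_device *device, u64 physical,
				u64 length)
{
	int ret = btrfs_zoned_issue_zeroout(device, physical, length);

	/* Conventional zone: nothing to pad, the caller writes in place */
	return ret == -EOPNOTSUPP ? 0 : ret;
}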
1634
1635static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical,
1636                          struct blk_zone *zone)
1637{
1638        struct btrfs_io_context *bioc = NULL;
1639        u64 mapped_length = PAGE_SIZE;
1640        unsigned int nofs_flag;
1641        int nmirrors;
1642        int i, ret;
1643
1644        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
1645                               &mapped_length, &bioc);
1646        if (ret || !bioc || mapped_length < PAGE_SIZE) {
1647                btrfs_put_bioc(bioc);
1648                return -EIO;
1649        }
1650        if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1651                btrfs_put_bioc(bioc);
1652                return -EINVAL;
1653        }
1654        nofs_flag = memalloc_nofs_save();
1655        nmirrors = (int)bioc->num_stripes;
1656        for (i = 0; i < nmirrors; i++) {
1657                u64 physical = bioc->stripes[i].physical;
1658                struct btrfs_device *dev = bioc->stripes[i].dev;
1659
1660                /* Missing device */
1661                if (!dev->bdev)
1662                        continue;
1663
1664                ret = btrfs_get_dev_zone(dev, physical, zone);
1665                /* Failing device */
1666                if (ret == -EIO || ret == -EOPNOTSUPP)
1667                        continue;
1668                break;
1669        }
1670        memalloc_nofs_restore(nofs_flag);
1671        btrfs_put_bioc(bioc);
1672        return ret;
1673}
1674
1675/*
1676 * Synchronize the write pointer in the zone at @physical_start on @tgt_dev
1677 * by writing zeros from @physical_pos up to the write pointer of the
1678 * dev-replace source device.
1679 */
1680int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
1681                                    u64 physical_start, u64 physical_pos)
1682{
1683        struct btrfs_fs_info *fs_info = tgt_dev->fs_info;
1684        struct blk_zone zone;
1685        u64 length;
1686        u64 wp;
1687        int ret;
1688
1689        if (!btrfs_dev_is_sequential(tgt_dev, physical_pos))
1690                return 0;
1691
1692        ret = read_zone_info(fs_info, logical, &zone);
1693        if (ret)
1694                return ret;
1695
1696        wp = physical_start + ((zone.wp - zone.start) << SECTOR_SHIFT);
1697
1698        if (physical_pos == wp)
1699                return 0;
1700
1701        if (physical_pos > wp)
1702                return -EUCLEAN;
1703
1704        length = wp - physical_pos;
1705        return btrfs_zoned_issue_zeroout(tgt_dev, physical_pos, length);
1706}
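
/*
 * Worked example (illustrative numbers): if the source zone reports
 * zone.wp - zone.start == 2048 sectors (1MiB) and the target copy of the
 * zone starts at physical_start = 2GiB, the target write pointer must reach
 * 2GiB + 1MiB. With physical_pos = 2GiB + 512KiB, the remaining 512KiB are
 * zero-filled so both zones end up at the same relative write pointer.
 */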
1707
1708struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info,
1709                                            u64 logical, u64 length)
1710{
1711        struct btrfs_device *device;
1712        struct extent_map *em;
1713        struct map_lookup *map;
1714
1715        em = btrfs_get_chunk_map(fs_info, logical, length);
1716        if (IS_ERR(em))
1717                return ERR_CAST(em);
1718
1719        map = em->map_lookup;
1720        /* We only support single profile for now */
1721        ASSERT(map->num_stripes == 1);
1722        device = map->stripes[0].dev;
1723
1724        free_extent_map(em);
1725
1726        return device;
1727}
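
/*
 * Usage sketch (illustrative): the return value is ERR_PTR-encoded on
 * failure, so callers must check it before dereferencing:
 *
 *	device = btrfs_zoned_get_device(fs_info, logical, length);
 *	if (IS_ERR(device))
 *		return PTR_ERR(device);
 */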
1728
1729/**
1730 * btrfs_zone_activate - activate a block group and its underlying device zones
1731 *
1732 * @block_group: the block group to activate
1733 *
1734 * Return: true on success, false otherwise
1735 */
1736bool btrfs_zone_activate(struct btrfs_block_group *block_group)
1737{
1738        struct btrfs_fs_info *fs_info = block_group->fs_info;
1739        struct map_lookup *map;
1740        struct btrfs_device *device;
1741        u64 physical;
1742        bool ret;
1743
1744        if (!btrfs_is_zoned(block_group->fs_info))
1745                return true;
1746
1747        map = block_group->physical_map;
1748        /* Currently support SINGLE profile only */
1749        ASSERT(map->num_stripes == 1);
1750        device = map->stripes[0].dev;
1751        physical = map->stripes[0].physical;
1752
1753        if (device->zone_info->max_active_zones == 0)
1754                return true;
1755
1756        spin_lock(&block_group->lock);
1757
1758        if (block_group->zone_is_active) {
1759                ret = true;
1760                goto out_unlock;
1761        }
1762
1763        /* No space left */
1764        if (block_group->alloc_offset == block_group->zone_capacity) {
1765                ret = false;
1766                goto out_unlock;
1767        }
1768
1769        if (!btrfs_dev_set_active_zone(device, physical)) {
1770                /* Cannot activate the zone */
1771                ret = false;
1772                goto out_unlock;
1773        }
1774
1775        /* Successfully activated all the zones */
1776        block_group->zone_is_active = 1;
1777
1778        spin_unlock(&block_group->lock);
1779
1780        /* For the active block group list */
1781        btrfs_get_block_group(block_group);
1782
1783        spin_lock(&fs_info->zone_active_bgs_lock);
1784        ASSERT(list_empty(&block_group->active_bg_list));
1785        list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
1786        spin_unlock(&fs_info->zone_active_bgs_lock);
1787
1788        return true;
1789
1790out_unlock:
1791        spin_unlock(&block_group->lock);
1792        return ret;
1793}
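
/*
 * Note (illustrative): btrfs_dev_set_active_zone() consumes one of the
 * device's limited active-zone slots (bounded by max_active_zones), so a
 * false return here is an expected condition rather than an error: the
 * allocator reacts by using or creating a different block group.
 */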
1794
1795int btrfs_zone_finish(struct btrfs_block_group *block_group)
1796{
1797        struct btrfs_fs_info *fs_info = block_group->fs_info;
1798        struct map_lookup *map;
1799        struct btrfs_device *device;
1800        u64 physical;
1801        int ret = 0;
1802
1803        if (!btrfs_is_zoned(fs_info))
1804                return 0;
1805
1806        map = block_group->physical_map;
1807        /* Currently support SINGLE profile only */
1808        ASSERT(map->num_stripes == 1);
1809
1810        device = map->stripes[0].dev;
1811        physical = map->stripes[0].physical;
1812
1813        if (device->zone_info->max_active_zones == 0)
1814                return 0;
1815
1816        spin_lock(&block_group->lock);
1817        if (!block_group->zone_is_active) {
1818                spin_unlock(&block_group->lock);
1819                return 0;
1820        }
1821
1822        /* Check if we have unwritten allocated space */
1823        if ((block_group->flags &
1824             (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
1825            block_group->alloc_offset > block_group->meta_write_pointer) {
1826                spin_unlock(&block_group->lock);
1827                return -EAGAIN;
1828        }
1829        spin_unlock(&block_group->lock);
1830
1831        ret = btrfs_inc_block_group_ro(block_group, false);
1832        if (ret)
1833                return ret;
1834
1835        /* Ensure all writes in this block group finish */
1836        btrfs_wait_block_group_reservations(block_group);
1837        /* No need to wait for NOCOW writers. Zoned mode does not allow that. */
1838        btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
1839                                 block_group->length);
1840
1841        spin_lock(&block_group->lock);
1842
1843        /*
1844         * Bail out if someone already deactivated the block group, or if
1845         * reserved space is still left in the block group.
1846         */
1847        if (!block_group->zone_is_active) {
1848                spin_unlock(&block_group->lock);
1849                btrfs_dec_block_group_ro(block_group);
1850                return 0;
1851        }
1852
1853        if (block_group->reserved) {
1854                spin_unlock(&block_group->lock);
1855                btrfs_dec_block_group_ro(block_group);
1856                return -EAGAIN;
1857        }
1858
1859        block_group->zone_is_active = 0;
1860        block_group->alloc_offset = block_group->zone_capacity;
1861        block_group->free_space_ctl->free_space = 0;
1862        btrfs_clear_treelog_bg(block_group);
1863        btrfs_clear_data_reloc_bg(block_group);
1864        spin_unlock(&block_group->lock);
1865
1866        ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
1867                               physical >> SECTOR_SHIFT,
1868                               device->zone_info->zone_size >> SECTOR_SHIFT,
1869                               GFP_NOFS);
1870        btrfs_dec_block_group_ro(block_group);
1871
1872        if (!ret) {
1873                btrfs_dev_clear_active_zone(device, physical);
1874
1875                spin_lock(&fs_info->zone_active_bgs_lock);
1876                ASSERT(!list_empty(&block_group->active_bg_list));
1877                list_del_init(&block_group->active_bg_list);
1878                spin_unlock(&fs_info->zone_active_bgs_lock);
1879
1880                /* For active_bg_list */
1881                btrfs_put_block_group(block_group);
1882        }
1883
1884        return ret;
1885}
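
/*
 * Note (illustrative): REQ_OP_ZONE_FINISH moves the device zone to the FULL
 * state and releases its active-zone resource even if the zone was not
 * completely written. Setting alloc_offset to zone_capacity above mirrors
 * that in the filesystem: no further allocation can happen in this block
 * group until its zone is reset.
 */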
1886
1887bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, int raid_index)
1888{
1889        struct btrfs_device *device;
1890        bool ret = false;
1891
1892        if (!btrfs_is_zoned(fs_devices->fs_info))
1893                return true;
1894
1895        /* Non-single profiles are not supported yet */
1896        if (raid_index != BTRFS_RAID_SINGLE)
1897                return false;
1898
1899        /* Check if there is a device with active zones left */
1900        mutex_lock(&fs_devices->device_list_mutex);
1901        list_for_each_entry(device, &fs_devices->devices, dev_list) {
1902                struct btrfs_zoned_device_info *zinfo = device->zone_info;
1903
1904                if (!device->bdev)
1905                        continue;
1906
1907                if (!zinfo->max_active_zones ||
1908                    atomic_read(&zinfo->active_zones_left)) {
1909                        ret = true;
1910                        break;
1911                }
1912        }
1913        mutex_unlock(&fs_devices->device_list_mutex);
1914
1915        return ret;
1916}
1917
1918void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
1919{
1920        struct btrfs_block_group *block_group;
1921        struct map_lookup *map;
1922        struct btrfs_device *device;
1923        u64 physical;
1924
1925        if (!btrfs_is_zoned(fs_info))
1926                return;
1927
1928        block_group = btrfs_lookup_block_group(fs_info, logical);
1929        ASSERT(block_group);
1930
1931        if (logical + length < block_group->start + block_group->zone_capacity)
1932                goto out;
1933
1934        spin_lock(&block_group->lock);
1935
1936        if (!block_group->zone_is_active) {
1937                spin_unlock(&block_group->lock);
1938                goto out;
1939        }
1940
1941        block_group->zone_is_active = 0;
1942        /* We should have consumed all the free space */
1943        ASSERT(block_group->alloc_offset == block_group->zone_capacity);
1944        ASSERT(block_group->free_space_ctl->free_space == 0);
1945        btrfs_clear_treelog_bg(block_group);
1946        btrfs_clear_data_reloc_bg(block_group);
1947        spin_unlock(&block_group->lock);
1948
1949        map = block_group->physical_map;
1950        device = map->stripes[0].dev;
1951        physical = map->stripes[0].physical;
1952
1953        if (!device->zone_info->max_active_zones)
1954                goto out;
1955
1956        btrfs_dev_clear_active_zone(device, physical);
1957
1958        spin_lock(&fs_info->zone_active_bgs_lock);
1959        ASSERT(!list_empty(&block_group->active_bg_list));
1960        list_del_init(&block_group->active_bg_list);
1961        spin_unlock(&fs_info->zone_active_bgs_lock);
1962
1963        btrfs_put_block_group(block_group);
1964
1965out:
1966        btrfs_put_block_group(block_group);
1967}
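
/*
 * Note (illustrative): two puts happen on the full-deactivation path above:
 * one drops the reference held for active_bg_list membership, and the one
 * at "out" balances the btrfs_lookup_block_group() at the top.
 */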
1968
1969void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
1970{
1971        struct btrfs_fs_info *fs_info = bg->fs_info;
1972
1973        spin_lock(&fs_info->relocation_bg_lock);
1974        if (fs_info->data_reloc_bg == bg->start)
1975                fs_info->data_reloc_bg = 0;
1976        spin_unlock(&fs_info->relocation_bg_lock);
1977}
1978