linux/fs/btrfs/volumes.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/sched/mm.h>
   8#include <linux/bio.h>
   9#include <linux/slab.h>
  10#include <linux/blkdev.h>
  11#include <linux/ratelimit.h>
  12#include <linux/kthread.h>
  13#include <linux/raid/pq.h>
  14#include <linux/semaphore.h>
  15#include <linux/uuid.h>
  16#include <linux/list_sort.h>
  17#include "misc.h"
  18#include "ctree.h"
  19#include "extent_map.h"
  20#include "disk-io.h"
  21#include "transaction.h"
  22#include "print-tree.h"
  23#include "volumes.h"
  24#include "raid56.h"
  25#include "async-thread.h"
  26#include "check-integrity.h"
  27#include "rcu-string.h"
  28#include "dev-replace.h"
  29#include "sysfs.h"
  30#include "tree-checker.h"
  31#include "space-info.h"
  32#include "block-group.h"
  33#include "discard.h"
  34#include "zoned.h"
  35
  36const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  37        [BTRFS_RAID_RAID10] = {
  38                .sub_stripes    = 2,
  39                .dev_stripes    = 1,
  40                .devs_max       = 0,    /* 0 == as many as possible */
  41                .devs_min       = 2,
  42                .tolerated_failures = 1,
  43                .devs_increment = 2,
  44                .ncopies        = 2,
  45                .nparity        = 0,
  46                .raid_name      = "raid10",
  47                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  48                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  49        },
  50        [BTRFS_RAID_RAID1] = {
  51                .sub_stripes    = 1,
  52                .dev_stripes    = 1,
  53                .devs_max       = 2,
  54                .devs_min       = 2,
  55                .tolerated_failures = 1,
  56                .devs_increment = 2,
  57                .ncopies        = 2,
  58                .nparity        = 0,
  59                .raid_name      = "raid1",
  60                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  61                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  62        },
  63        [BTRFS_RAID_RAID1C3] = {
  64                .sub_stripes    = 1,
  65                .dev_stripes    = 1,
  66                .devs_max       = 3,
  67                .devs_min       = 3,
  68                .tolerated_failures = 2,
  69                .devs_increment = 3,
  70                .ncopies        = 3,
  71                .nparity        = 0,
  72                .raid_name      = "raid1c3",
  73                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  74                .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  75        },
  76        [BTRFS_RAID_RAID1C4] = {
  77                .sub_stripes    = 1,
  78                .dev_stripes    = 1,
  79                .devs_max       = 4,
  80                .devs_min       = 4,
  81                .tolerated_failures = 3,
  82                .devs_increment = 4,
  83                .ncopies        = 4,
  84                .nparity        = 0,
  85                .raid_name      = "raid1c4",
  86                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  87                .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  88        },
  89        [BTRFS_RAID_DUP] = {
  90                .sub_stripes    = 1,
  91                .dev_stripes    = 2,
  92                .devs_max       = 1,
  93                .devs_min       = 1,
  94                .tolerated_failures = 0,
  95                .devs_increment = 1,
  96                .ncopies        = 2,
  97                .nparity        = 0,
  98                .raid_name      = "dup",
  99                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
 100                .mindev_error   = 0,
 101        },
 102        [BTRFS_RAID_RAID0] = {
 103                .sub_stripes    = 1,
 104                .dev_stripes    = 1,
 105                .devs_max       = 0,
 106                .devs_min       = 1,
 107                .tolerated_failures = 0,
 108                .devs_increment = 1,
 109                .ncopies        = 1,
 110                .nparity        = 0,
 111                .raid_name      = "raid0",
 112                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 113                .mindev_error   = 0,
 114        },
 115        [BTRFS_RAID_SINGLE] = {
 116                .sub_stripes    = 1,
 117                .dev_stripes    = 1,
 118                .devs_max       = 1,
 119                .devs_min       = 1,
 120                .tolerated_failures = 0,
 121                .devs_increment = 1,
 122                .ncopies        = 1,
 123                .nparity        = 0,
 124                .raid_name      = "single",
 125                .bg_flag        = 0,
 126                .mindev_error   = 0,
 127        },
 128        [BTRFS_RAID_RAID5] = {
 129                .sub_stripes    = 1,
 130                .dev_stripes    = 1,
 131                .devs_max       = 0,
 132                .devs_min       = 2,
 133                .tolerated_failures = 1,
 134                .devs_increment = 1,
 135                .ncopies        = 1,
 136                .nparity        = 1,
 137                .raid_name      = "raid5",
 138                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 139                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 140        },
 141        [BTRFS_RAID_RAID6] = {
 142                .sub_stripes    = 1,
 143                .dev_stripes    = 1,
 144                .devs_max       = 0,
 145                .devs_min       = 3,
 146                .tolerated_failures = 2,
 147                .devs_increment = 1,
 148                .ncopies        = 1,
 149                .nparity        = 2,
 150                .raid_name      = "raid6",
 151                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 152                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 153        },
 154};
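
/*
 * For illustration: the table above is indexed by enum btrfs_raid_types, so a
 * profile's parameters can be read directly, e.g. for RAID1:
 *
 *	const struct btrfs_raid_attr *ra = &btrfs_raid_array[BTRFS_RAID_RAID1];
 *
 *	ra->ncopies            == 2	(each block is stored on two devices)
 *	ra->tolerated_failures == 1	(one device may be lost)
 *	ra->devs_min           == 2	(at least two devices are required)
 */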
 155
 156/*
 157 * Convert block group flags (BTRFS_BLOCK_GROUP_*) to btrfs_raid_types, which
 158 * can be used as index to access btrfs_raid_array[].
 159 */
 160enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
 161{
 162        if (flags & BTRFS_BLOCK_GROUP_RAID10)
 163                return BTRFS_RAID_RAID10;
 164        else if (flags & BTRFS_BLOCK_GROUP_RAID1)
 165                return BTRFS_RAID_RAID1;
 166        else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
 167                return BTRFS_RAID_RAID1C3;
 168        else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
 169                return BTRFS_RAID_RAID1C4;
 170        else if (flags & BTRFS_BLOCK_GROUP_DUP)
 171                return BTRFS_RAID_DUP;
 172        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
 173                return BTRFS_RAID_RAID0;
 174        else if (flags & BTRFS_BLOCK_GROUP_RAID5)
 175                return BTRFS_RAID_RAID5;
 176        else if (flags & BTRFS_BLOCK_GROUP_RAID6)
 177                return BTRFS_RAID_RAID6;
 178
 179        return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 180}
 181
 182const char *btrfs_bg_type_to_raid_name(u64 flags)
 183{
 184        const int index = btrfs_bg_flags_to_raid_index(flags);
 185
 186        if (index >= BTRFS_NR_RAID_TYPES)
 187                return NULL;
 188
 189        return btrfs_raid_array[index].raid_name;
 190}
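
/*
 * A couple of sample mappings based on the helpers above (illustrative):
 *
 *	btrfs_bg_type_to_raid_name(BTRFS_BLOCK_GROUP_RAID10) returns "raid10"
 *	btrfs_bg_type_to_raid_name(0) returns "single" (no profile bits set)
 */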
 191
 192/*
 193 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 194 * bytes including terminating null byte.
 195 */
 196void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 197{
 198        int i;
 199        int ret;
 200        char *bp = buf;
 201        u64 flags = bg_flags;
 202        u32 size_bp = size_buf;
 203
 204        if (!flags) {
 205                strcpy(bp, "NONE");
 206                return;
 207        }
 208
 209#define DESCRIBE_FLAG(flag, desc)                                               \
 210        do {                                                            \
 211                if (flags & (flag)) {                                   \
 212                        ret = snprintf(bp, size_bp, "%s|", (desc));     \
 213                        if (ret < 0 || ret >= size_bp)                  \
 214                                goto out_overflow;                      \
 215                        size_bp -= ret;                                 \
 216                        bp += ret;                                      \
 217                        flags &= ~(flag);                               \
 218                }                                                       \
 219        } while (0)
 220
 221        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 222        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 223        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 224
 225        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 226        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 227                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 228                              btrfs_raid_array[i].raid_name);
 229#undef DESCRIBE_FLAG
 230
 231        if (flags) {
 232                ret = snprintf(bp, size_bp, "0x%llx|", flags);
 233                size_bp -= ret;
 234        }
 235
 236        if (size_bp < size_buf)
 237                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 238
 239        /*
 240         * The text is trimmed, it's up to the caller to provide a
 241         * sufficiently large buffer.
 242         */
 243out_overflow:;
 244}
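
/*
 * Usage sketch (illustrative): with a sufficiently large buffer,
 *
 *	char desc[64];
 *
 *	btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_METADATA |
 *				    BTRFS_BLOCK_GROUP_RAID1, desc, sizeof(desc));
 *
 * fills desc with "metadata|raid1", while a zero bg_flags yields "NONE".
 */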
 245
 246static int init_first_rw_device(struct btrfs_trans_handle *trans);
 247static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 248static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 249static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 250static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 251                             enum btrfs_map_op op,
 252                             u64 logical, u64 *length,
 253                             struct btrfs_bio **bbio_ret,
 254                             int mirror_num, int need_raid_map);
 255
 256/*
 257 * Device locking
 258 * ==============
 259 *
 260 * There are several mutexes that protect manipulation of devices and low-level
 261 * structures like chunks but not block groups, extents or files
 262 *
 263 * uuid_mutex (global lock)
 264 * ------------------------
 265 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 266 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 267 * device) or requested by the device= mount option
 268 *
 269 * the mutex can be very coarse and can cover long-running operations
 270 *
 271 * protects: updates to fs_devices counters like missing devices, rw devices,
 272 * seeding, structure cloning, opening/closing devices at mount/umount time
 273 *
 274 * global::fs_devs - add, remove, updates to the global list
 275 *
 276 * does not protect: manipulation of the fs_devices::devices list in general
 277 * but in mount context it could be used to exclude list modifications by e.g.
 278 * the scan ioctl
 279 *
 280 * btrfs_device::name - renames (write side), read is RCU
 281 *
 282 * fs_devices::device_list_mutex (per-fs, with RCU)
 283 * ------------------------------------------------
 284 * protects updates to fs_devices::devices, ie. adding and deleting
 285 *
 286 * simple list traversal with read-only actions can be done with RCU protection
 287 *
 288 * may be used to exclude some operations from running concurrently without any
 289 * modifications to the list (see write_all_supers)
 290 *
 291 * Is not required at mount and close times, because our device list is
 292 * protected by the uuid_mutex at that point.
 293 *
 294 * balance_mutex
 295 * -------------
 296 * protects balance structures (status, state) and context accessed from
 297 * several places (internally, ioctl)
 298 *
 299 * chunk_mutex
 300 * -----------
 301 * protects chunks, adding or removing during allocation, trim or when a new
 302 * device is added/removed. Additionally it also protects post_commit_list of
 303 * individual devices, since they can be added to the transaction's
 304 * post_commit_list only with chunk_mutex held.
 305 *
 306 * cleaner_mutex
 307 * -------------
 308 * a big lock that is held by the cleaner thread and prevents running subvolume
 309 * cleaning together with relocation or delayed iputs
 310 *
 311 *
 312 * Lock nesting
 313 * ============
 314 *
 315 * uuid_mutex
 316 *   device_list_mutex
 317 *     chunk_mutex
 318 *   balance_mutex
 319 *
 320 *
 321 * Exclusive operations
 322 * ====================
 323 *
 324 * Maintains the exclusivity of the following operations that apply to the
 325 * whole filesystem and cannot run in parallel.
 326 *
 327 * - Balance (*)
 328 * - Device add
 329 * - Device remove
 330 * - Device replace (*)
 331 * - Resize
 332 *
 333 * The device operations (as above) can be in one of the following states:
 334 *
 335 * - Running state
 336 * - Paused state
 337 * - Completed state
 338 *
 339 * Only device operations marked with (*) can go into the Paused state for the
 340 * following reasons:
 341 *
 342 * - ioctl (only Balance can be Paused through ioctl)
 343 * - filesystem remounted as read-only
 344 * - filesystem unmounted and mounted as read-only
 345 * - system power-cycle and filesystem mounted as read-only
 346 * - filesystem or device errors leading to forced read-only
 347 *
 348 * The status of exclusive operation is set and cleared atomically.
 349 * During the course of Paused state, fs_info::exclusive_operation remains set.
 350 * A device operation in Paused or Running state can be canceled or resumed
 351 * either by ioctl (Balance only) or when remounted as read-write.
 352 * The exclusive status is cleared when the device operation is canceled or
 353 * completed.
 354 */
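
/*
 * A minimal sketch of the nesting order documented above (hypothetical call
 * site, for illustration only):
 *
 *	mutex_lock(&uuid_mutex);
 *	mutex_lock(&fs_devices->device_list_mutex);
 *	mutex_lock(&fs_info->chunk_mutex);
 *	...
 *	mutex_unlock(&fs_info->chunk_mutex);
 *	mutex_unlock(&fs_devices->device_list_mutex);
 *	mutex_unlock(&uuid_mutex);
 *
 * balance_mutex likewise nests directly under uuid_mutex.
 */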
 355
 356DEFINE_MUTEX(uuid_mutex);
 357static LIST_HEAD(fs_uuids);
 358struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 359{
 360        return &fs_uuids;
 361}
 362
 363/*
 364 * alloc_fs_devices - allocate struct btrfs_fs_devices
 365 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 366 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 367 *
 368 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 369 * The returned struct is not linked onto any lists and can be destroyed with
 370 * kfree() right away.
 371 */
 372static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 373                                                 const u8 *metadata_fsid)
 374{
 375        struct btrfs_fs_devices *fs_devs;
 376
 377        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 378        if (!fs_devs)
 379                return ERR_PTR(-ENOMEM);
 380
 381        mutex_init(&fs_devs->device_list_mutex);
 382
 383        INIT_LIST_HEAD(&fs_devs->devices);
 384        INIT_LIST_HEAD(&fs_devs->alloc_list);
 385        INIT_LIST_HEAD(&fs_devs->fs_list);
 386        INIT_LIST_HEAD(&fs_devs->seed_list);
 387        if (fsid)
 388                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 389
 390        if (metadata_fsid)
 391                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 392        else if (fsid)
 393                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 394
 395        return fs_devs;
 396}
 397
 398void btrfs_free_device(struct btrfs_device *device)
 399{
 400        WARN_ON(!list_empty(&device->post_commit_list));
 401        rcu_string_free(device->name);
 402        extent_io_tree_release(&device->alloc_state);
 403        bio_put(device->flush_bio);
 404        btrfs_destroy_dev_zone_info(device);
 405        kfree(device);
 406}
 407
 408static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 409{
 410        struct btrfs_device *device;
 411        WARN_ON(fs_devices->opened);
 412        while (!list_empty(&fs_devices->devices)) {
 413                device = list_entry(fs_devices->devices.next,
 414                                    struct btrfs_device, dev_list);
 415                list_del(&device->dev_list);
 416                btrfs_free_device(device);
 417        }
 418        kfree(fs_devices);
 419}
 420
 421void __exit btrfs_cleanup_fs_uuids(void)
 422{
 423        struct btrfs_fs_devices *fs_devices;
 424
 425        while (!list_empty(&fs_uuids)) {
 426                fs_devices = list_entry(fs_uuids.next,
 427                                        struct btrfs_fs_devices, fs_list);
 428                list_del(&fs_devices->fs_list);
 429                free_fs_devices(fs_devices);
 430        }
 431}
 432
 433static noinline struct btrfs_fs_devices *find_fsid(
 434                const u8 *fsid, const u8 *metadata_fsid)
 435{
 436        struct btrfs_fs_devices *fs_devices;
 437
 438        ASSERT(fsid);
 439
 440        /* Handle non-split brain cases */
 441        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 442                if (metadata_fsid) {
 443                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 444                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 445                                      BTRFS_FSID_SIZE) == 0)
 446                                return fs_devices;
 447                } else {
 448                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 449                                return fs_devices;
 450                }
 451        }
 452        return NULL;
 453}
 454
 455static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 456                                struct btrfs_super_block *disk_super)
 457{
 458
 459        struct btrfs_fs_devices *fs_devices;
 460
 461        /*
 462         * Handle scanned device having completed its fsid change but
 463         * belonging to a fs_devices that was created by first scanning
 464         * a device which didn't have its fsid/metadata_uuid changed
 465         * at all but had the CHANGING_FSID_V2 flag set.
 466         */
 467        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 468                if (fs_devices->fsid_change &&
 469                    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 470                           BTRFS_FSID_SIZE) == 0 &&
 471                    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 472                           BTRFS_FSID_SIZE) == 0) {
 473                        return fs_devices;
 474                }
 475        }
 476        /*
 477         * Handle scanned device having completed its fsid change but
 478         * belonging to a fs_devices that was created by a device that
 479         * has an outdated pair of fsid/metadata_uuid and
 480         * CHANGING_FSID_V2 flag set.
 481         */
 482        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 483                if (fs_devices->fsid_change &&
 484                    memcmp(fs_devices->metadata_uuid,
 485                           fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 486                    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 487                           BTRFS_FSID_SIZE) == 0) {
 488                        return fs_devices;
 489                }
 490        }
 491
 492        return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 493}
 494
 495
 496static int
 497btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 498                      int flush, struct block_device **bdev,
 499                      struct btrfs_super_block **disk_super)
 500{
 501        int ret;
 502
 503        *bdev = blkdev_get_by_path(device_path, flags, holder);
 504
 505        if (IS_ERR(*bdev)) {
 506                ret = PTR_ERR(*bdev);
 507                goto error;
 508        }
 509
 510        if (flush)
 511                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 512        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 513        if (ret) {
 514                blkdev_put(*bdev, flags);
 515                goto error;
 516        }
 517        invalidate_bdev(*bdev);
 518        *disk_super = btrfs_read_dev_super(*bdev);
 519        if (IS_ERR(*disk_super)) {
 520                ret = PTR_ERR(*disk_super);
 521                blkdev_put(*bdev, flags);
 522                goto error;
 523        }
 524
 525        return 0;
 526
 527error:
 528        *bdev = NULL;
 529        return ret;
 530}
 531
 532static bool device_path_matched(const char *path, struct btrfs_device *device)
 533{
 534        int found;
 535
 536        rcu_read_lock();
 537        found = strcmp(rcu_str_deref(device->name), path);
 538        rcu_read_unlock();
 539
 540        return found == 0;
 541}
 542
 543/*
 544 *  Search and remove all stale devices (devices which are not mounted).
 545 *  When both inputs are NULL, it will search and release all stale devices.
 546 *  path:       Optional. When provided, it will release all unmounted devices
 547 *              matching this path only.
 548 *  skip_dev:   Optional. Will skip this device when searching for the stale
 549 *              devices.
 550 *  Return:     0 for success or if @path is NULL.
 551 *              -EBUSY if @path is a mounted device.
 552 *              -ENOENT if @path does not match any device in the list.
 553 */
 554static int btrfs_free_stale_devices(const char *path,
 555                                     struct btrfs_device *skip_device)
 556{
 557        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 558        struct btrfs_device *device, *tmp_device;
 559        int ret = 0;
 560
 561        lockdep_assert_held(&uuid_mutex);
 562
 563        if (path)
 564                ret = -ENOENT;
 565
 566        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 567
 568                mutex_lock(&fs_devices->device_list_mutex);
 569                list_for_each_entry_safe(device, tmp_device,
 570                                         &fs_devices->devices, dev_list) {
 571                        if (skip_device && skip_device == device)
 572                                continue;
 573                        if (path && !device->name)
 574                                continue;
 575                        if (path && !device_path_matched(path, device))
 576                                continue;
 577                        if (fs_devices->opened) {
 578                                /* for an already deleted device return 0 */
 579                                if (path && ret != 0)
 580                                        ret = -EBUSY;
 581                                break;
 582                        }
 583
 584                        /* delete the stale device */
 585                        fs_devices->num_devices--;
 586                        list_del(&device->dev_list);
 587                        btrfs_free_device(device);
 588
 589                        ret = 0;
 590                }
 591                mutex_unlock(&fs_devices->device_list_mutex);
 592
 593                if (fs_devices->num_devices == 0) {
 594                        btrfs_sysfs_remove_fsid(fs_devices);
 595                        list_del(&fs_devices->fs_list);
 596                        free_fs_devices(fs_devices);
 597                }
 598        }
 599
 600        return ret;
 601}
 602
 603/*
 604 * This is only used on mount, and we are protected from competing things
 605 * messing with our fs_devices by the uuid_mutex, thus we do not need the
 606 * fs_devices->device_list_mutex here.
 607 */
 608static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 609                        struct btrfs_device *device, fmode_t flags,
 610                        void *holder)
 611{
 612        struct request_queue *q;
 613        struct block_device *bdev;
 614        struct btrfs_super_block *disk_super;
 615        u64 devid;
 616        int ret;
 617
 618        if (device->bdev)
 619                return -EINVAL;
 620        if (!device->name)
 621                return -EINVAL;
 622
 623        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 624                                    &bdev, &disk_super);
 625        if (ret)
 626                return ret;
 627
 628        devid = btrfs_stack_device_id(&disk_super->dev_item);
 629        if (devid != device->devid)
 630                goto error_free_page;
 631
 632        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 633                goto error_free_page;
 634
 635        device->generation = btrfs_super_generation(disk_super);
 636
 637        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 638                if (btrfs_super_incompat_flags(disk_super) &
 639                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 640                        pr_err(
 641                "BTRFS: Invalid seeding and uuid-changed device detected\n");
 642                        goto error_free_page;
 643                }
 644
 645                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 646                fs_devices->seeding = true;
 647        } else {
 648                if (bdev_read_only(bdev))
 649                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 650                else
 651                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 652        }
 653
 654        q = bdev_get_queue(bdev);
 655        if (!blk_queue_nonrot(q))
 656                fs_devices->rotating = true;
 657
 658        device->bdev = bdev;
 659        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 660        device->mode = flags;
 661
 662        fs_devices->open_devices++;
 663        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 664            device->devid != BTRFS_DEV_REPLACE_DEVID) {
 665                fs_devices->rw_devices++;
 666                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 667        }
 668        btrfs_release_disk_super(disk_super);
 669
 670        return 0;
 671
 672error_free_page:
 673        btrfs_release_disk_super(disk_super);
 674        blkdev_put(bdev, flags);
 675
 676        return -EINVAL;
 677}
 678
 679/*
 680 * Handle a scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 681 * being created with a disk that has already completed its fsid change. Such
 682 * a disk can belong to an fs which has its FSID changed or to one which doesn't.
 683 * Handle both cases here.
 684 */
 685static struct btrfs_fs_devices *find_fsid_inprogress(
 686                                        struct btrfs_super_block *disk_super)
 687{
 688        struct btrfs_fs_devices *fs_devices;
 689
 690        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 691                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 692                           BTRFS_FSID_SIZE) != 0 &&
 693                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 694                           BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 695                        return fs_devices;
 696                }
 697        }
 698
 699        return find_fsid(disk_super->fsid, NULL);
 700}
 701
 702
 703static struct btrfs_fs_devices *find_fsid_changed(
 704                                        struct btrfs_super_block *disk_super)
 705{
 706        struct btrfs_fs_devices *fs_devices;
 707
 708        /*
 709         * Handles the case where the scanned device is part of an fs that had
 710         * multiple successful changes of FSID but the device currently being
 711         * scanned didn't observe them, meaning our fsid will be different from
 712         * theirs. We need to handle two subcases:
 713         *  1 - The fs still continues to have different METADATA/FSID uuids.
 714         *  2 - The fs is switched back to its original FSID (METADATA/FSID
 715         *  are equal).
 716         */
 717        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 718                /* Changed UUIDs */
 719                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 720                           BTRFS_FSID_SIZE) != 0 &&
 721                    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 722                           BTRFS_FSID_SIZE) == 0 &&
 723                    memcmp(fs_devices->fsid, disk_super->fsid,
 724                           BTRFS_FSID_SIZE) != 0)
 725                        return fs_devices;
 726
 727                /* Unchanged UUIDs */
 728                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 729                           BTRFS_FSID_SIZE) == 0 &&
 730                    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 731                           BTRFS_FSID_SIZE) == 0)
 732                        return fs_devices;
 733        }
 734
 735        return NULL;
 736}
 737
 738static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 739                                struct btrfs_super_block *disk_super)
 740{
 741        struct btrfs_fs_devices *fs_devices;
 742
 743        /*
 744         * Handle the case where the scanned device is part of an fs whose last
 745         * metadata UUID change reverted it to the original FSID. At the same
 746 * time the fs_devices was first created by another constituent device
 747 * which didn't fully observe the operation. This results in a
 748 * btrfs_fs_devices created with metadata/fsid different AND
 749         * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 750         * fs_devices equal to the FSID of the disk.
 751         */
 752        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 753                if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 754                           BTRFS_FSID_SIZE) != 0 &&
 755                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 756                           BTRFS_FSID_SIZE) == 0 &&
 757                    fs_devices->fsid_change)
 758                        return fs_devices;
 759        }
 760
 761        return NULL;
 762}
 763/*
 764 * Add new device to list of registered devices
 765 *
 766 * Returns:
 767 * device pointer which was just added or updated when successful
 768 * error pointer when failed
 769 */
 770static noinline struct btrfs_device *device_list_add(const char *path,
 771                           struct btrfs_super_block *disk_super,
 772                           bool *new_device_added)
 773{
 774        struct btrfs_device *device;
 775        struct btrfs_fs_devices *fs_devices = NULL;
 776        struct rcu_string *name;
 777        u64 found_transid = btrfs_super_generation(disk_super);
 778        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 779        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 780                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 781        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 782                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 783
 784        if (fsid_change_in_progress) {
 785                if (!has_metadata_uuid)
 786                        fs_devices = find_fsid_inprogress(disk_super);
 787                else
 788                        fs_devices = find_fsid_changed(disk_super);
 789        } else if (has_metadata_uuid) {
 790                fs_devices = find_fsid_with_metadata_uuid(disk_super);
 791        } else {
 792                fs_devices = find_fsid_reverted_metadata(disk_super);
 793                if (!fs_devices)
 794                        fs_devices = find_fsid(disk_super->fsid, NULL);
 795        }
 796
 797
 798        if (!fs_devices) {
 799                if (has_metadata_uuid)
 800                        fs_devices = alloc_fs_devices(disk_super->fsid,
 801                                                      disk_super->metadata_uuid);
 802                else
 803                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 804
 805                if (IS_ERR(fs_devices))
 806                        return ERR_CAST(fs_devices);
 807
 808                fs_devices->fsid_change = fsid_change_in_progress;
 809
 810                mutex_lock(&fs_devices->device_list_mutex);
 811                list_add(&fs_devices->fs_list, &fs_uuids);
 812
 813                device = NULL;
 814        } else {
 815                mutex_lock(&fs_devices->device_list_mutex);
 816                device = btrfs_find_device(fs_devices, devid,
 817                                disk_super->dev_item.uuid, NULL);
 818
 819                /*
 820                 * If this disk has been pulled into a fs_devices created by
 821                 * a device which had the CHANGING_FSID_V2 flag then replace the
 822                 * metadata_uuid/fsid values of the fs_devices.
 823                 */
 824                if (fs_devices->fsid_change &&
 825                    found_transid > fs_devices->latest_generation) {
 826                        memcpy(fs_devices->fsid, disk_super->fsid,
 827                                        BTRFS_FSID_SIZE);
 828
 829                        if (has_metadata_uuid)
 830                                memcpy(fs_devices->metadata_uuid,
 831                                       disk_super->metadata_uuid,
 832                                       BTRFS_FSID_SIZE);
 833                        else
 834                                memcpy(fs_devices->metadata_uuid,
 835                                       disk_super->fsid, BTRFS_FSID_SIZE);
 836
 837                        fs_devices->fsid_change = false;
 838                }
 839        }
 840
 841        if (!device) {
 842                if (fs_devices->opened) {
 843                        mutex_unlock(&fs_devices->device_list_mutex);
 844                        return ERR_PTR(-EBUSY);
 845                }
 846
 847                device = btrfs_alloc_device(NULL, &devid,
 848                                            disk_super->dev_item.uuid);
 849                if (IS_ERR(device)) {
 850                        mutex_unlock(&fs_devices->device_list_mutex);
 851                        /* we can safely leave the fs_devices entry around */
 852                        return device;
 853                }
 854
 855                name = rcu_string_strdup(path, GFP_NOFS);
 856                if (!name) {
 857                        btrfs_free_device(device);
 858                        mutex_unlock(&fs_devices->device_list_mutex);
 859                        return ERR_PTR(-ENOMEM);
 860                }
 861                rcu_assign_pointer(device->name, name);
 862
 863                list_add_rcu(&device->dev_list, &fs_devices->devices);
 864                fs_devices->num_devices++;
 865
 866                device->fs_devices = fs_devices;
 867                *new_device_added = true;
 868
 869                if (disk_super->label[0])
 870                        pr_info(
 871        "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 872                                disk_super->label, devid, found_transid, path,
 873                                current->comm, task_pid_nr(current));
 874                else
 875                        pr_info(
 876        "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 877                                disk_super->fsid, devid, found_transid, path,
 878                                current->comm, task_pid_nr(current));
 879
 880        } else if (!device->name || strcmp(device->name->str, path)) {
 881                /*
 882                 * When the FS is already mounted.
 883                 * 1. If you are here and if the device->name is NULL that
 884                 *    means this device was missing at the time of FS mount.
 885                 * 2. If you are here and if the device->name is different
 886                 *    from 'path' that means either
 887                 *      a. The same device disappeared and reappeared with a
 888                 *         different name, or
 889                 *      b. The missing disk which was replaced has now
 890                 *         reappeared.
 891                 *
 892                 * We must allow 1 and 2a above, but 2b would be spurious
 893                 * and unintentional.
 894                 *
 895                 * Further, in cases 1 and 2a above, the disk at 'path'
 896                 * would have missed some transactions when it was away and
 897                 * in case of 2a the stale bdev has to be updated as well.
 898                 * 2b must not be allowed at any time.
 899                 */
 900
 901                /*
 902                 * For now, we do allow updates to btrfs_fs_device through the
 903                 * btrfs dev scan cli after the FS has been mounted.  We're still
 904                 * tracking a problem where systems fail mount by subvolume id
 905                 * when we reject replacement on a mounted FS.
 906                 */
 907                if (!fs_devices->opened && found_transid < device->generation) {
 908                        /*
 909                         * That is if the FS is _not_ mounted and if you
 910                         * are here, that means there is more than one
 911                         * disk with the same uuid and devid. We keep the one
 912                         * with the larger generation number or the last-in if
 913                         * generations are equal.
 914                         */
 915                        mutex_unlock(&fs_devices->device_list_mutex);
 916                        return ERR_PTR(-EEXIST);
 917                }
 918
 919                /*
 920                 * We are going to replace the device path for a given devid,
 921                 * make sure it's the same device if the device is mounted
 922                 */
 923                if (device->bdev) {
 924                        int error;
 925                        dev_t path_dev;
 926
 927                        error = lookup_bdev(path, &path_dev);
 928                        if (error) {
 929                                mutex_unlock(&fs_devices->device_list_mutex);
 930                                return ERR_PTR(error);
 931                        }
 932
 933                        if (device->bdev->bd_dev != path_dev) {
 934                                mutex_unlock(&fs_devices->device_list_mutex);
 935                                /*
 936                                 * device->fs_info may not be reliable here, so
 937                                 * pass in a NULL instead. This avoids a
 938                                 * possible use-after-free when the fs_info and
 939                                 * fs_info->sb are already torn down.
 940                                 */
 941                                btrfs_warn_in_rcu(NULL,
 942        "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 943                                                  path, devid, found_transid,
 944                                                  current->comm,
 945                                                  task_pid_nr(current));
 946                                return ERR_PTR(-EEXIST);
 947                        }
 948                        btrfs_info_in_rcu(device->fs_info,
 949        "devid %llu device path %s changed to %s scanned by %s (%d)",
 950                                          devid, rcu_str_deref(device->name),
 951                                          path, current->comm,
 952                                          task_pid_nr(current));
 953                }
 954
 955                name = rcu_string_strdup(path, GFP_NOFS);
 956                if (!name) {
 957                        mutex_unlock(&fs_devices->device_list_mutex);
 958                        return ERR_PTR(-ENOMEM);
 959                }
 960                rcu_string_free(device->name);
 961                rcu_assign_pointer(device->name, name);
 962                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 963                        fs_devices->missing_devices--;
 964                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 965                }
 966        }
 967
 968        /*
 969         * Unmount does not free the btrfs_device struct but would zero
 970         * the generation along with most of the other members. So just update
 971         * it back. We need it to pick the disk with the largest generation
 972         * (as above).
 973         */
 974        if (!fs_devices->opened) {
 975                device->generation = found_transid;
 976                fs_devices->latest_generation = max_t(u64, found_transid,
 977                                                fs_devices->latest_generation);
 978        }
 979
 980        fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 981
 982        mutex_unlock(&fs_devices->device_list_mutex);
 983        return device;
 984}
 985
 986static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 987{
 988        struct btrfs_fs_devices *fs_devices;
 989        struct btrfs_device *device;
 990        struct btrfs_device *orig_dev;
 991        int ret = 0;
 992
 993        lockdep_assert_held(&uuid_mutex);
 994
 995        fs_devices = alloc_fs_devices(orig->fsid, NULL);
 996        if (IS_ERR(fs_devices))
 997                return fs_devices;
 998
 999        fs_devices->total_devices = orig->total_devices;
1000
1001        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1002                struct rcu_string *name;
1003
1004                device = btrfs_alloc_device(NULL, &orig_dev->devid,
1005                                            orig_dev->uuid);
1006                if (IS_ERR(device)) {
1007                        ret = PTR_ERR(device);
1008                        goto error;
1009                }
1010
1011                /*
1012                 * This is ok to do without the RCU read lock held because we hold
1013                 * the uuid mutex, so nothing we touch in here is going to disappear.
1014                 */
1015                if (orig_dev->name) {
1016                        name = rcu_string_strdup(orig_dev->name->str,
1017                                        GFP_KERNEL);
1018                        if (!name) {
1019                                btrfs_free_device(device);
1020                                ret = -ENOMEM;
1021                                goto error;
1022                        }
1023                        rcu_assign_pointer(device->name, name);
1024                }
1025
1026                list_add(&device->dev_list, &fs_devices->devices);
1027                device->fs_devices = fs_devices;
1028                fs_devices->num_devices++;
1029        }
1030        return fs_devices;
1031error:
1032        free_fs_devices(fs_devices);
1033        return ERR_PTR(ret);
1034}
1035
1036static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1037                                      struct btrfs_device **latest_dev)
1038{
1039        struct btrfs_device *device, *next;
1040
1041        /* This is the initialized path, it is safe to release the devices. */
1042        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1043                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1044                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1045                                      &device->dev_state) &&
1046                            !test_bit(BTRFS_DEV_STATE_MISSING,
1047                                      &device->dev_state) &&
1048                            (!*latest_dev ||
1049                             device->generation > (*latest_dev)->generation)) {
1050                                *latest_dev = device;
1051                        }
1052                        continue;
1053                }
1054
1055                /*
1056                 * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID
1057                 * in btrfs_init_dev_replace(), so just continue.
1058                 */
1059                if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1060                        continue;
1061
1062                if (device->bdev) {
1063                        blkdev_put(device->bdev, device->mode);
1064                        device->bdev = NULL;
1065                        fs_devices->open_devices--;
1066                }
1067                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1068                        list_del_init(&device->dev_alloc_list);
1069                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1070                        fs_devices->rw_devices--;
1071                }
1072                list_del_init(&device->dev_list);
1073                fs_devices->num_devices--;
1074                btrfs_free_device(device);
1075        }
1076
1077}
1078
1079/*
1080 * After we have read the system tree and know the devids belonging to this
1081 * filesystem, remove any device which does not belong there.
1082 */
1083void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices)
1084{
1085        struct btrfs_device *latest_dev = NULL;
1086        struct btrfs_fs_devices *seed_dev;
1087
1088        mutex_lock(&uuid_mutex);
1089        __btrfs_free_extra_devids(fs_devices, &latest_dev);
1090
1091        list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1092                __btrfs_free_extra_devids(seed_dev, &latest_dev);
1093
1094        fs_devices->latest_bdev = latest_dev->bdev;
1095
1096        mutex_unlock(&uuid_mutex);
1097}
1098
1099static void btrfs_close_bdev(struct btrfs_device *device)
1100{
1101        if (!device->bdev)
1102                return;
1103
1104        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1105                sync_blockdev(device->bdev);
1106                invalidate_bdev(device->bdev);
1107        }
1108
1109        blkdev_put(device->bdev, device->mode);
1110}
1111
1112static void btrfs_close_one_device(struct btrfs_device *device)
1113{
1114        struct btrfs_fs_devices *fs_devices = device->fs_devices;
1115
1116        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1117            device->devid != BTRFS_DEV_REPLACE_DEVID) {
1118                list_del_init(&device->dev_alloc_list);
1119                fs_devices->rw_devices--;
1120        }
1121
1122        if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1123                clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1124
1125        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1126                fs_devices->missing_devices--;
1127
1128        btrfs_close_bdev(device);
1129        if (device->bdev) {
1130                fs_devices->open_devices--;
1131                device->bdev = NULL;
1132        }
1133        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1134        btrfs_destroy_dev_zone_info(device);
1135
1136        device->fs_info = NULL;
1137        atomic_set(&device->dev_stats_ccnt, 0);
1138        extent_io_tree_release(&device->alloc_state);
1139
1140        /*
1141         * Reset the flush error record. We might have a transient flush error
1142         * in this mount, and if so we aborted the current transaction and set
1143         * the fs to an error state, guaranteeing no super blocks can be further
1144         * committed. However that error might be transient and if we unmount the
1145         * filesystem and mount it again, we should allow the mount to succeed
1146         * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1147         * filesystem again we still get flush errors, then we will again abort
1148         * any transaction and set the error state, guaranteeing no commits of
1149         * unsafe super blocks.
1150         */
1151        device->last_flush_error = 0;
1152
1153        /* Verify the device is back in a pristine state  */
1154        ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1155        ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1156        ASSERT(list_empty(&device->dev_alloc_list));
1157        ASSERT(list_empty(&device->post_commit_list));
1158        ASSERT(atomic_read(&device->reada_in_flight) == 0);
1159}
1160
1161static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1162{
1163        struct btrfs_device *device, *tmp;
1164
1165        lockdep_assert_held(&uuid_mutex);
1166
1167        if (--fs_devices->opened > 0)
1168                return;
1169
1170        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1171                btrfs_close_one_device(device);
1172
1173        WARN_ON(fs_devices->open_devices);
1174        WARN_ON(fs_devices->rw_devices);
1175        fs_devices->opened = 0;
1176        fs_devices->seeding = false;
1177        fs_devices->fs_info = NULL;
1178}
1179
1180void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1181{
1182        LIST_HEAD(list);
1183        struct btrfs_fs_devices *tmp;
1184
1185        mutex_lock(&uuid_mutex);
1186        close_fs_devices(fs_devices);
1187        if (!fs_devices->opened)
1188                list_splice_init(&fs_devices->seed_list, &list);
1189
1190        list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1191                close_fs_devices(fs_devices);
1192                list_del(&fs_devices->seed_list);
1193                free_fs_devices(fs_devices);
1194        }
1195        mutex_unlock(&uuid_mutex);
1196}
1197
1198static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1199                                fmode_t flags, void *holder)
1200{
1201        struct btrfs_device *device;
1202        struct btrfs_device *latest_dev = NULL;
1203        struct btrfs_device *tmp_device;
1204
1205        flags |= FMODE_EXCL;
1206
1207        list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1208                                 dev_list) {
1209                int ret;
1210
1211                ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1212                if (ret == 0 &&
1213                    (!latest_dev || device->generation > latest_dev->generation)) {
1214                        latest_dev = device;
1215                } else if (ret == -ENODATA) {
1216                        fs_devices->num_devices--;
1217                        list_del(&device->dev_list);
1218                        btrfs_free_device(device);
1219                }
1220        }
1221        if (fs_devices->open_devices == 0)
1222                return -EINVAL;
1223
1224        fs_devices->opened = 1;
1225        fs_devices->latest_bdev = latest_dev->bdev;
1226        fs_devices->total_rw_bytes = 0;
1227        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1228        fs_devices->read_policy = BTRFS_READ_POLICY_PID;
1229
1230        return 0;
1231}
1232
1233static int devid_cmp(void *priv, const struct list_head *a,
1234                     const struct list_head *b)
1235{
1236        const struct btrfs_device *dev1, *dev2;
1237
1238        dev1 = list_entry(a, struct btrfs_device, dev_list);
1239        dev2 = list_entry(b, struct btrfs_device, dev_list);
1240
1241        if (dev1->devid < dev2->devid)
1242                return -1;
1243        else if (dev1->devid > dev2->devid)
1244                return 1;
1245        return 0;
1246}
1247
1248int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1249                       fmode_t flags, void *holder)
1250{
1251        int ret;
1252
1253        lockdep_assert_held(&uuid_mutex);
1254        /*
1255         * The device_list_mutex cannot be taken here in case opening the
1256         * underlying device takes further locks like open_mutex.
1257         *
1258         * We also don't need the lock here as this is called during mount and
1259         * exclusion is provided by uuid_mutex
1260         */
1261
1262        if (fs_devices->opened) {
1263                fs_devices->opened++;
1264                ret = 0;
1265        } else {
1266                list_sort(NULL, &fs_devices->devices, devid_cmp);
1267                ret = open_fs_devices(fs_devices, flags, holder);
1268        }
1269
1270        return ret;
1271}
1272
1273void btrfs_release_disk_super(struct btrfs_super_block *super)
1274{
1275        struct page *page = virt_to_page(super);
1276
1277        put_page(page);
1278}
1279
1280static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1281                                                       u64 bytenr, u64 bytenr_orig)
1282{
1283        struct btrfs_super_block *disk_super;
1284        struct page *page;
1285        void *p;
1286        pgoff_t index;
1287
1288        /* make sure our super fits in the device */
1289        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1290                return ERR_PTR(-EINVAL);
1291
1292        /* make sure our super fits in the page */
1293        if (sizeof(*disk_super) > PAGE_SIZE)
1294                return ERR_PTR(-EINVAL);
1295
1296        /* make sure our super doesn't straddle pages on disk */
1297        index = bytenr >> PAGE_SHIFT;
1298        if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1299                return ERR_PTR(-EINVAL);
1300
1301        /* pull in the page with our super */
1302        page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1303
1304        if (IS_ERR(page))
1305                return ERR_CAST(page);
1306
1307        p = page_address(page);
1308
1309        /* align our pointer to the offset of the super block */
1310        disk_super = p + offset_in_page(bytenr);
1311
1312        if (btrfs_super_bytenr(disk_super) != bytenr_orig ||
1313            btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1314                btrfs_release_disk_super(p);
1315                return ERR_PTR(-EINVAL);
1316        }
1317
1318        if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1319                disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1320
1321        return disk_super;
1322}
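
/*
 * Worked example (illustrative, assuming 4 KiB pages): for the primary copy
 * the superblock bytenr is 65536, so index = 65536 >> PAGE_SHIFT = 16 and
 * offset_in_page(bytenr) = 0; the superblock therefore starts at the
 * beginning of page 16 and the "straddle pages" check above passes.
 */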
1323
1324int btrfs_forget_devices(const char *path)
1325{
1326        int ret;
1327
1328        mutex_lock(&uuid_mutex);
1329        ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1330        mutex_unlock(&uuid_mutex);
1331
1332        return ret;
1333}
1334
1335/*
1336 * Look for a btrfs signature on a device. This may be called out of the mount path
1337 * and we are not allowed to call set_blocksize during the scan. The superblock
1338 * is read via the pagecache.
1339 */
1340struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1341                                           void *holder)
1342{
1343        struct btrfs_super_block *disk_super;
1344        bool new_device_added = false;
1345        struct btrfs_device *device = NULL;
1346        struct block_device *bdev;
1347        u64 bytenr, bytenr_orig;
1348        int ret;
1349
1350        lockdep_assert_held(&uuid_mutex);
1351
1352        /*
1353         * we would like to check all the supers, but that would make
1354         * a btrfs mount succeed after a mkfs from a different FS.
1355         * So, we need to add a special mount option to scan for
1356         * later supers, using BTRFS_SUPER_MIRROR_MAX instead.
1357         */
1358        flags |= FMODE_EXCL;
1359
1360        bdev = blkdev_get_by_path(path, flags, holder);
1361        if (IS_ERR(bdev))
1362                return ERR_CAST(bdev);
1363
1364        bytenr_orig = btrfs_sb_offset(0);
1365        ret = btrfs_sb_log_location_bdev(bdev, 0, READ, &bytenr);
1366        if (ret)
1367                return ERR_PTR(ret);
1368
1369        disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr_orig);
1370        if (IS_ERR(disk_super)) {
1371                device = ERR_CAST(disk_super);
1372                goto error_bdev_put;
1373        }
1374
1375        device = device_list_add(path, disk_super, &new_device_added);
1376        if (!IS_ERR(device)) {
1377                if (new_device_added)
1378                        btrfs_free_stale_devices(path, device);
1379        }
1380
1381        btrfs_release_disk_super(disk_super);
1382
1383error_bdev_put:
1384        blkdev_put(bdev, flags);
1385
1386        return device;
1387}
1388
1389/*
1390 * Try to find a chunk that intersects [start, start + len] range and when one
1391 * such is found, record the end of it in *start
1392 */
1393static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1394                                    u64 len)
1395{
1396        u64 physical_start, physical_end;
1397
1398        lockdep_assert_held(&device->fs_info->chunk_mutex);
1399
1400        if (!find_first_extent_bit(&device->alloc_state, *start,
1401                                   &physical_start, &physical_end,
1402                                   CHUNK_ALLOCATED, NULL)) {
1403
1404                if (in_range(physical_start, *start, len) ||
1405                    in_range(*start, physical_start,
1406                             physical_end - physical_start)) {
1407                        *start = physical_end + 1;
1408                        return true;
1409                }
1410        }
1411        return false;
1412}
1413
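/*
 * Adjust @start to the lowest offset at which a new dev extent may be
 * placed, according to the chunk allocation policy of the device.
 */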
1414static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1415{
1416        switch (device->fs_devices->chunk_alloc_policy) {
1417        case BTRFS_CHUNK_ALLOC_REGULAR:
1418                /*
1419                 * We don't want to overwrite the superblock on the drive nor
1420                 * any area used by the boot loader (grub for example), so we
1421                 * make sure to start at an offset of at least 1MB.
1422                 */
1423                return max_t(u64, start, SZ_1M);
1424        case BTRFS_CHUNK_ALLOC_ZONED:
1425                /*
1426                 * We don't care about the starting region like the regular
1427                 * allocator, because we use/reserve the first two zones for
1428                 * superblock logging anyway.
1429                 */
1430                return ALIGN(start, device->zone_info->zone_size);
1431        default:
1432                BUG();
1433        }
1434}
1435
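/*
 * Shrink or advance the hole described by @hole_start/@hole_size so that it
 * starts at a position with @num_bytes worth of allocatable (empty or
 * resettable) zones. Returns true if the hole was modified.
 */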
1436static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
1437                                        u64 *hole_start, u64 *hole_size,
1438                                        u64 num_bytes)
1439{
1440        u64 zone_size = device->zone_info->zone_size;
1441        u64 pos;
1442        int ret;
1443        bool changed = false;
1444
1445        ASSERT(IS_ALIGNED(*hole_start, zone_size));
1446
1447        while (*hole_size > 0) {
1448                pos = btrfs_find_allocatable_zones(device, *hole_start,
1449                                                   *hole_start + *hole_size,
1450                                                   num_bytes);
1451                if (pos != *hole_start) {
1452                        *hole_size = *hole_start + *hole_size - pos;
1453                        *hole_start = pos;
1454                        changed = true;
1455                        if (*hole_size < num_bytes)
1456                                break;
1457                }
1458
1459                ret = btrfs_ensure_empty_zones(device, pos, num_bytes);
1460
1461                /* Range is ensured to be empty */
1462                if (!ret)
1463                        return changed;
1464
1465                /* Given hole range was invalid (outside of device) */
1466                if (ret == -ERANGE) {
1467                        *hole_start += *hole_size;
1468                        *hole_size = 0;
1469                        return true;
1470                }
1471
1472                *hole_start += zone_size;
1473                *hole_size -= zone_size;
1474                changed = true;
1475        }
1476
1477        return changed;
1478}
1479
1480/**
1481 * dev_extent_hole_check - check if specified hole is suitable for allocation
1482 * @device:     the device which contains the hole
1483 * @hole_start: starting position of the hole
1484 * @hole_size:  the size of the hole
1485 * @num_bytes:  the size of the free space that we need
1486 *
1487 * This function may modify @hole_start and @hole_size to reflect the suitable
1488 * position for allocation. Returns true if hole position is updated, false otherwise.
1489 */
1490static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1491                                  u64 *hole_size, u64 num_bytes)
1492{
1493        bool changed = false;
1494        u64 hole_end = *hole_start + *hole_size;
1495
1496        for (;;) {
1497                /*
1498                 * Check before we set max_hole_start, otherwise we could end up
1499                 * sending back this offset anyway.
1500                 */
1501                if (contains_pending_extent(device, hole_start, *hole_size)) {
1502                        if (hole_end >= *hole_start)
1503                                *hole_size = hole_end - *hole_start;
1504                        else
1505                                *hole_size = 0;
1506                        changed = true;
1507                }
1508
1509                switch (device->fs_devices->chunk_alloc_policy) {
1510                case BTRFS_CHUNK_ALLOC_REGULAR:
1511                        /* No extra check */
1512                        break;
1513                case BTRFS_CHUNK_ALLOC_ZONED:
1514                        if (dev_extent_hole_check_zoned(device, hole_start,
1515                                                        hole_size, num_bytes)) {
1516                                changed = true;
1517                                /*
1518                                 * The changed hole can contain a pending extent.
1519                                 * Loop again to check that.
1520                                 */
1521                                continue;
1522                        }
1523                        break;
1524                default:
1525                        BUG();
1526                }
1527
1528                break;
1529        }
1530
1531        return changed;
1532}
1533
1534/*
1535 * find_free_dev_extent_start - find free space in the specified device
1536 * @device:       the device in which we search for free space
1537 * @num_bytes:    the size of the free space that we need
1538 * @search_start: the position from which to begin the search
1539 * @start:        store the start of the free space
1540 * @len:          the size of the free space that we find, or the size
1541 *                of the max free space if we don't find suitable free space
1542 *
1543 * This uses a pretty simple search, the expectation is that it is
1544 * called very infrequently and that a given device has a small number
1545 * of extents.
1546 *
1547 * @start is used to store the start of the free space if we find one. But if
1548 * we don't find suitable free space, it will be used to store the start
1549 * position of the max free space.
1550 *
1551 * @len is used to store the size of the free space that we find.
1552 * But if we don't find suitable free space, it is used to store the size of
1553 * the max free space.
1554 *
1555 * NOTE: This function will search the *commit* root of the device tree, and
1556 * does an extra check to ensure dev extents are not double allocated.
1557 * This makes the function safe to allocate dev extents but may not report
1558 * correct usable device space, as a device extent freed in the current
1559 * transaction is not reported as available.
1560 */
1561static int find_free_dev_extent_start(struct btrfs_device *device,
1562                                u64 num_bytes, u64 search_start, u64 *start,
1563                                u64 *len)
1564{
1565        struct btrfs_fs_info *fs_info = device->fs_info;
1566        struct btrfs_root *root = fs_info->dev_root;
1567        struct btrfs_key key;
1568        struct btrfs_dev_extent *dev_extent;
1569        struct btrfs_path *path;
1570        u64 hole_size;
1571        u64 max_hole_start;
1572        u64 max_hole_size;
1573        u64 extent_end;
1574        u64 search_end = device->total_bytes;
1575        int ret;
1576        int slot;
1577        struct extent_buffer *l;
1578
1579        search_start = dev_extent_search_start(device, search_start);
1580
1581        WARN_ON(device->zone_info &&
1582                !IS_ALIGNED(num_bytes, device->zone_info->zone_size));
1583
1584        path = btrfs_alloc_path();
1585        if (!path)
1586                return -ENOMEM;
1587
1588        max_hole_start = search_start;
1589        max_hole_size = 0;
1590
1591again:
1592        if (search_start >= search_end ||
1593                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1594                ret = -ENOSPC;
1595                goto out;
1596        }
1597
1598        path->reada = READA_FORWARD;
1599        path->search_commit_root = 1;
1600        path->skip_locking = 1;
1601
1602        key.objectid = device->devid;
1603        key.offset = search_start;
1604        key.type = BTRFS_DEV_EXTENT_KEY;
1605
1606        ret = btrfs_search_backwards(root, &key, path);
1607        if (ret < 0)
1608                goto out;
1609
1610        while (1) {
1611                l = path->nodes[0];
1612                slot = path->slots[0];
1613                if (slot >= btrfs_header_nritems(l)) {
1614                        ret = btrfs_next_leaf(root, path);
1615                        if (ret == 0)
1616                                continue;
1617                        if (ret < 0)
1618                                goto out;
1619
1620                        break;
1621                }
1622                btrfs_item_key_to_cpu(l, &key, slot);
1623
1624                if (key.objectid < device->devid)
1625                        goto next;
1626
1627                if (key.objectid > device->devid)
1628                        break;
1629
1630                if (key.type != BTRFS_DEV_EXTENT_KEY)
1631                        goto next;
1632
1633                if (key.offset > search_start) {
1634                        hole_size = key.offset - search_start;
1635                        dev_extent_hole_check(device, &search_start, &hole_size,
1636                                              num_bytes);
1637
1638                        if (hole_size > max_hole_size) {
1639                                max_hole_start = search_start;
1640                                max_hole_size = hole_size;
1641                        }
1642
1643                        /*
1644                         * If this free space is greater than what we need,
1645                         * it must be the max free space that we have found
1646                         * until now, so max_hole_start must point to the start
1647                         * of this free space and the length of this free space
1648                         * is stored in max_hole_size. Thus, we return
1649                         * max_hole_start and max_hole_size and go back to the
1650                         * caller.
1651                         */
1652                        if (hole_size >= num_bytes) {
1653                                ret = 0;
1654                                goto out;
1655                        }
1656                }
1657
1658                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1659                extent_end = key.offset + btrfs_dev_extent_length(l,
1660                                                                  dev_extent);
1661                if (extent_end > search_start)
1662                        search_start = extent_end;
1663next:
1664                path->slots[0]++;
1665                cond_resched();
1666        }
1667
1668        /*
1669         * At this point, search_start should be the end of
1670         * allocated dev extents, and when shrinking the device,
1671         * search_end may be smaller than search_start.
1672         */
1673        if (search_end > search_start) {
1674                hole_size = search_end - search_start;
1675                if (dev_extent_hole_check(device, &search_start, &hole_size,
1676                                          num_bytes)) {
1677                        btrfs_release_path(path);
1678                        goto again;
1679                }
1680
1681                if (hole_size > max_hole_size) {
1682                        max_hole_start = search_start;
1683                        max_hole_size = hole_size;
1684                }
1685        }
1686
1687        /* See above. */
1688        if (max_hole_size < num_bytes)
1689                ret = -ENOSPC;
1690        else
1691                ret = 0;
1692
1693out:
1694        btrfs_free_path(path);
1695        *start = max_hole_start;
1696        if (len)
1697                *len = max_hole_size;
1698        return ret;
1699}
1700
1701int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1702                         u64 *start, u64 *len)
1703{
1704        /* FIXME use last free of some kind */
1705        return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1706}
1707
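/*
 * Delete the dev extent item in the device tree that covers @start on
 * @device and return its length in @dev_extent_len.
 */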
1708static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1709                          struct btrfs_device *device,
1710                          u64 start, u64 *dev_extent_len)
1711{
1712        struct btrfs_fs_info *fs_info = device->fs_info;
1713        struct btrfs_root *root = fs_info->dev_root;
1714        int ret;
1715        struct btrfs_path *path;
1716        struct btrfs_key key;
1717        struct btrfs_key found_key;
1718        struct extent_buffer *leaf = NULL;
1719        struct btrfs_dev_extent *extent = NULL;
1720
1721        path = btrfs_alloc_path();
1722        if (!path)
1723                return -ENOMEM;
1724
1725        key.objectid = device->devid;
1726        key.offset = start;
1727        key.type = BTRFS_DEV_EXTENT_KEY;
1728again:
1729        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1730        if (ret > 0) {
1731                ret = btrfs_previous_item(root, path, key.objectid,
1732                                          BTRFS_DEV_EXTENT_KEY);
1733                if (ret)
1734                        goto out;
1735                leaf = path->nodes[0];
1736                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1737                extent = btrfs_item_ptr(leaf, path->slots[0],
1738                                        struct btrfs_dev_extent);
1739                BUG_ON(found_key.offset > start || found_key.offset +
1740                       btrfs_dev_extent_length(leaf, extent) < start);
1741                key = found_key;
1742                btrfs_release_path(path);
1743                goto again;
1744        } else if (ret == 0) {
1745                leaf = path->nodes[0];
1746                extent = btrfs_item_ptr(leaf, path->slots[0],
1747                                        struct btrfs_dev_extent);
1748        } else {
1749                goto out;
1750        }
1751
1752        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1753
1754        ret = btrfs_del_item(trans, root, path);
1755        if (ret == 0)
1756                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1757out:
1758        btrfs_free_path(path);
1759        return ret;
1760}
1761
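/*
 * Return the logical address right after the last mapped chunk, i.e. the
 * start offset to use for the next chunk, based on the extent mapping tree.
 */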
1762static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1763{
1764        struct extent_map_tree *em_tree;
1765        struct extent_map *em;
1766        struct rb_node *n;
1767        u64 ret = 0;
1768
1769        em_tree = &fs_info->mapping_tree;
1770        read_lock(&em_tree->lock);
1771        n = rb_last(&em_tree->map.rb_root);
1772        if (n) {
1773                em = rb_entry(n, struct extent_map, rb_node);
1774                ret = em->start + em->len;
1775        }
1776        read_unlock(&em_tree->lock);
1777
1778        return ret;
1779}
1780
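/*
 * Find the next available devid by looking up the highest existing
 * BTRFS_DEV_ITEM_KEY in the chunk tree and returning that devid + 1.
 */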
1781static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1782                                    u64 *devid_ret)
1783{
1784        int ret;
1785        struct btrfs_key key;
1786        struct btrfs_key found_key;
1787        struct btrfs_path *path;
1788
1789        path = btrfs_alloc_path();
1790        if (!path)
1791                return -ENOMEM;
1792
1793        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1794        key.type = BTRFS_DEV_ITEM_KEY;
1795        key.offset = (u64)-1;
1796
1797        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1798        if (ret < 0)
1799                goto error;
1800
1801        if (ret == 0) {
1802                /* Corruption */
1803                btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1804                ret = -EUCLEAN;
1805                goto error;
1806        }
1807
1808        ret = btrfs_previous_item(fs_info->chunk_root, path,
1809                                  BTRFS_DEV_ITEMS_OBJECTID,
1810                                  BTRFS_DEV_ITEM_KEY);
1811        if (ret) {
1812                *devid_ret = 1;
1813        } else {
1814                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1815                                      path->slots[0]);
1816                *devid_ret = found_key.offset + 1;
1817        }
1818        ret = 0;
1819error:
1820        btrfs_free_path(path);
1821        return ret;
1822}
1823
1824/*
1825 * The device information is stored in the chunk root.
1826 * The btrfs_device struct should be fully filled in.
1827 */
1828static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1829                            struct btrfs_device *device)
1830{
1831        int ret;
1832        struct btrfs_path *path;
1833        struct btrfs_dev_item *dev_item;
1834        struct extent_buffer *leaf;
1835        struct btrfs_key key;
1836        unsigned long ptr;
1837
1838        path = btrfs_alloc_path();
1839        if (!path)
1840                return -ENOMEM;
1841
1842        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1843        key.type = BTRFS_DEV_ITEM_KEY;
1844        key.offset = device->devid;
1845
1846        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1847                                      &key, sizeof(*dev_item));
1848        if (ret)
1849                goto out;
1850
1851        leaf = path->nodes[0];
1852        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1853
1854        btrfs_set_device_id(leaf, dev_item, device->devid);
1855        btrfs_set_device_generation(leaf, dev_item, 0);
1856        btrfs_set_device_type(leaf, dev_item, device->type);
1857        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1858        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1859        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1860        btrfs_set_device_total_bytes(leaf, dev_item,
1861                                     btrfs_device_get_disk_total_bytes(device));
1862        btrfs_set_device_bytes_used(leaf, dev_item,
1863                                    btrfs_device_get_bytes_used(device));
1864        btrfs_set_device_group(leaf, dev_item, 0);
1865        btrfs_set_device_seek_speed(leaf, dev_item, 0);
1866        btrfs_set_device_bandwidth(leaf, dev_item, 0);
1867        btrfs_set_device_start_offset(leaf, dev_item, 0);
1868
1869        ptr = btrfs_device_uuid(dev_item);
1870        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1871        ptr = btrfs_device_fsid(dev_item);
1872        write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1873                            ptr, BTRFS_FSID_SIZE);
1874        btrfs_mark_buffer_dirty(leaf);
1875
1876        ret = 0;
1877out:
1878        btrfs_free_path(path);
1879        return ret;
1880}
1881
1882/*
1883 * Function to update ctime/mtime for a given device path.
1884 * Mainly used for ctime/mtime based probe like libblkid.
1885 */
1886static void update_dev_time(struct block_device *bdev)
1887{
1888        struct inode *inode = bdev->bd_inode;
1889        struct timespec64 now;
1890
1891        /* Shouldn't happen but just in case. */
1892        if (!inode)
1893                return;
1894
1895        now = current_time(inode);
1896        generic_update_time(inode, &now, S_MTIME | S_CTIME);
1897}
1898
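/*
 * Remove the dev item for @device from the chunk tree in its own
 * transaction, committing it on success.
 */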
1899static int btrfs_rm_dev_item(struct btrfs_device *device)
1900{
1901        struct btrfs_root *root = device->fs_info->chunk_root;
1902        int ret;
1903        struct btrfs_path *path;
1904        struct btrfs_key key;
1905        struct btrfs_trans_handle *trans;
1906
1907        path = btrfs_alloc_path();
1908        if (!path)
1909                return -ENOMEM;
1910
1911        trans = btrfs_start_transaction(root, 0);
1912        if (IS_ERR(trans)) {
1913                btrfs_free_path(path);
1914                return PTR_ERR(trans);
1915        }
1916        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1917        key.type = BTRFS_DEV_ITEM_KEY;
1918        key.offset = device->devid;
1919
1920        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1921        if (ret) {
1922                if (ret > 0)
1923                        ret = -ENOENT;
1924                btrfs_abort_transaction(trans, ret);
1925                btrfs_end_transaction(trans);
1926                goto out;
1927        }
1928
1929        ret = btrfs_del_item(trans, root, path);
1930        if (ret) {
1931                btrfs_abort_transaction(trans, ret);
1932                btrfs_end_transaction(trans);
1933        }
1934
1935out:
1936        btrfs_free_path(path);
1937        if (!ret)
1938                ret = btrfs_commit_transaction(trans);
1939        return ret;
1940}
1941
1942/*
1943 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1944 * filesystem. It's up to the caller to adjust that number regarding eg. device
1945 * replace.
1946 */
1947static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1948                u64 num_devices)
1949{
1950        u64 all_avail;
1951        unsigned seq;
1952        int i;
1953
1954        do {
1955                seq = read_seqbegin(&fs_info->profiles_lock);
1956
1957                all_avail = fs_info->avail_data_alloc_bits |
1958                            fs_info->avail_system_alloc_bits |
1959                            fs_info->avail_metadata_alloc_bits;
1960        } while (read_seqretry(&fs_info->profiles_lock, seq));
1961
1962        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1963                if (!(all_avail & btrfs_raid_array[i].bg_flag))
1964                        continue;
1965
1966                if (num_devices < btrfs_raid_array[i].devs_min)
1967                        return btrfs_raid_array[i].mindev_error;
1968        }
1969
1970        return 0;
1971}
1972
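/*
 * Return any other device in @fs_devs that is not missing and has an open
 * bdev, or NULL if there is none besides @device.
 */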
1973static struct btrfs_device * btrfs_find_next_active_device(
1974                struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1975{
1976        struct btrfs_device *next_device;
1977
1978        list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1979                if (next_device != device &&
1980                    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1981                    && next_device->bdev)
1982                        return next_device;
1983        }
1984
1985        return NULL;
1986}
1987
1988/*
1989 * Helper function to check if the given device is part of s_bdev / latest_bdev
1990 * and replace it with the provided or the next active device. In the context
1991 * where this function is called, there should always be another device (or
1992 * next_device) which is active.
1993 */
1994void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1995                                            struct btrfs_device *next_device)
1996{
1997        struct btrfs_fs_info *fs_info = device->fs_info;
1998
1999        if (!next_device)
2000                next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2001                                                            device);
2002        ASSERT(next_device);
2003
2004        if (fs_info->sb->s_bdev &&
2005                        (fs_info->sb->s_bdev == device->bdev))
2006                fs_info->sb->s_bdev = next_device->bdev;
2007
2008        if (fs_info->fs_devices->latest_bdev == device->bdev)
2009                fs_info->fs_devices->latest_bdev = next_device->bdev;
2010}
2011
2012/*
2013 * Return btrfs_fs_devices::num_devices excluding the device that's being
2014 * currently replaced.
2015 */
2016static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2017{
2018        u64 num_devices = fs_info->fs_devices->num_devices;
2019
2020        down_read(&fs_info->dev_replace.rwsem);
2021        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2022                ASSERT(num_devices > 1);
2023                num_devices--;
2024        }
2025        up_read(&fs_info->dev_replace.rwsem);
2026
2027        return num_devices;
2028}
2029
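/*
 * Wipe the magic from all superblock copies on @bdev (or reset the
 * superblock log zones on zoned devices) so the device is no longer
 * recognized as a btrfs member, then notify udev and update the device
 * node timestamps for libblkid.
 */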
2030void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2031                               struct block_device *bdev,
2032                               const char *device_path)
2033{
2034        struct btrfs_super_block *disk_super;
2035        int copy_num;
2036
2037        if (!bdev)
2038                return;
2039
2040        for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2041                struct page *page;
2042                int ret;
2043
2044                disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2045                if (IS_ERR(disk_super))
2046                        continue;
2047
2048                if (bdev_is_zoned(bdev)) {
2049                        btrfs_reset_sb_log_zones(bdev, copy_num);
2050                        continue;
2051                }
2052
2053                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2054
2055                page = virt_to_page(disk_super);
2056                set_page_dirty(page);
2057                lock_page(page);
2058                /* write_one_page() unlocks the page */
2059                ret = write_one_page(page);
2060                if (ret)
2061                        btrfs_warn(fs_info,
2062                                "error clearing superblock number %d (%d)",
2063                                copy_num, ret);
2064                btrfs_release_disk_super(disk_super);
2065
2066        }
2067
2068        /* Notify udev that device has changed */
2069        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2070
2071        /* Update ctime/mtime for device path for libblkid */
2072        update_dev_time(bdev);
2073}
2074
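/*
 * Remove a device from the filesystem: shrink it to zero, delete its dev
 * item, drop it from the device lists and wipe its superblocks. The caller
 * is responsible for the final blkdev_put() using the bdev and mode
 * returned in @bdev and @mode.
 */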
2075int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2076                    u64 devid, struct block_device **bdev, fmode_t *mode)
2077{
2078        struct btrfs_device *device;
2079        struct btrfs_fs_devices *cur_devices;
2080        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2081        u64 num_devices;
2082        int ret = 0;
2083
2084        mutex_lock(&uuid_mutex);
2085
2086        num_devices = btrfs_num_devices(fs_info);
2087
2088        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2089        if (ret)
2090                goto out;
2091
2092        device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2093
2094        if (IS_ERR(device)) {
2095                if (PTR_ERR(device) == -ENOENT &&
2096                    device_path && strcmp(device_path, "missing") == 0)
2097                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2098                else
2099                        ret = PTR_ERR(device);
2100                goto out;
2101        }
2102
2103        if (btrfs_pinned_by_swapfile(fs_info, device)) {
2104                btrfs_warn_in_rcu(fs_info,
2105                  "cannot remove device %s (devid %llu) due to active swapfile",
2106                                  rcu_str_deref(device->name), device->devid);
2107                ret = -ETXTBSY;
2108                goto out;
2109        }
2110
2111        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2112                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2113                goto out;
2114        }
2115
2116        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2117            fs_info->fs_devices->rw_devices == 1) {
2118                ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2119                goto out;
2120        }
2121
2122        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2123                mutex_lock(&fs_info->chunk_mutex);
2124                list_del_init(&device->dev_alloc_list);
2125                device->fs_devices->rw_devices--;
2126                mutex_unlock(&fs_info->chunk_mutex);
2127        }
2128
2129        mutex_unlock(&uuid_mutex);
2130        ret = btrfs_shrink_device(device, 0);
2131        if (!ret)
2132                btrfs_reada_remove_dev(device);
2133        mutex_lock(&uuid_mutex);
2134        if (ret)
2135                goto error_undo;
2136
2137        /*
2138         * TODO: the superblock still includes this device in its num_devices
2139         * counter although write_all_supers() is not locked out. This
2140         * could give a filesystem state which requires a degraded mount.
2141         */
2142        ret = btrfs_rm_dev_item(device);
2143        if (ret)
2144                goto error_undo;
2145
2146        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2147        btrfs_scrub_cancel_dev(device);
2148
2149        /*
2150         * the device list mutex makes sure that we don't change
2151         * the device list while someone else is writing out all
2152         * the device supers. Whoever is writing all supers should
2153         * lock the device list mutex before getting the number of
2154         * devices in the super block (super_copy). Conversely,
2155         * whoever updates the number of devices in the super block
2156         * (super_copy) should hold the device list mutex.
2157         */
2158
2159        /*
2160         * In normal cases cur_devices == fs_devices. But in case of
2161         * deleting a seed device, cur_devices should point to its own
2162         * fs_devices listed under fs_devices->seed_list.
2163         */
2164        cur_devices = device->fs_devices;
2165        mutex_lock(&fs_devices->device_list_mutex);
2166        list_del_rcu(&device->dev_list);
2167
2168        cur_devices->num_devices--;
2169        cur_devices->total_devices--;
2170        /* Update total_devices of the parent fs_devices if it's seed */
2171        if (cur_devices != fs_devices)
2172                fs_devices->total_devices--;
2173
2174        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2175                cur_devices->missing_devices--;
2176
2177        btrfs_assign_next_active_device(device, NULL);
2178
2179        if (device->bdev) {
2180                cur_devices->open_devices--;
2181                /* remove sysfs entry */
2182                btrfs_sysfs_remove_device(device);
2183        }
2184
2185        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2186        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2187        mutex_unlock(&fs_devices->device_list_mutex);
2188
2189        /*
2190         * At this point, the device is zero sized and detached from the
2191         * devices list.  All that's left is to zero out the old supers and
2192         * free the device.
2193         *
2194         * We cannot call btrfs_close_bdev() here because we're holding the sb
2195         * write lock, and blkdev_put() will pull in the ->open_mutex on the
2196         * block device and its dependencies.  Instead just flush the device
2197         * and let the caller do the final blkdev_put.
2198         */
2199        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2200                btrfs_scratch_superblocks(fs_info, device->bdev,
2201                                          device->name->str);
2202                if (device->bdev) {
2203                        sync_blockdev(device->bdev);
2204                        invalidate_bdev(device->bdev);
2205                }
2206        }
2207
2208        *bdev = device->bdev;
2209        *mode = device->mode;
2210        synchronize_rcu();
2211        btrfs_free_device(device);
2212
2213        if (cur_devices->open_devices == 0) {
2214                list_del_init(&cur_devices->seed_list);
2215                close_fs_devices(cur_devices);
2216                free_fs_devices(cur_devices);
2217        }
2218
2219out:
2220        mutex_unlock(&uuid_mutex);
2221        return ret;
2222
2223error_undo:
2224        btrfs_reada_undo_remove_dev(device);
2225        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2226                mutex_lock(&fs_info->chunk_mutex);
2227                list_add(&device->dev_alloc_list,
2228                         &fs_devices->alloc_list);
2229                device->fs_devices->rw_devices++;
2230                mutex_unlock(&fs_info->chunk_mutex);
2231        }
2232        goto out;
2233}
2234
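/*
 * Unlink the replace source device from its fs_devices lists and adjust the
 * device counters. Must be called with the device_list_mutex held.
 */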
2235void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2236{
2237        struct btrfs_fs_devices *fs_devices;
2238
2239        lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2240
2241        /*
2242         * In case of an fs with no seed, srcdev->fs_devices will point to
2243         * the fs_devices of fs_info. However, when the dev being replaced is
2244         * a seed dev it will point to the seed's local fs_devices. In short,
2245         * srcdev will have its correct fs_devices in both cases.
2246         */
2247        fs_devices = srcdev->fs_devices;
2248
2249        list_del_rcu(&srcdev->dev_list);
2250        list_del(&srcdev->dev_alloc_list);
2251        fs_devices->num_devices--;
2252        if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2253                fs_devices->missing_devices--;
2254
2255        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2256                fs_devices->rw_devices--;
2257
2258        if (srcdev->bdev)
2259                fs_devices->open_devices--;
2260}
2261
2262void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2263{
2264        struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2265
2266        mutex_lock(&uuid_mutex);
2267
2268        btrfs_close_bdev(srcdev);
2269        synchronize_rcu();
2270        btrfs_free_device(srcdev);
2271
2272        /* If there are no devices left, delete the fs_devices */
2273        if (!fs_devices->num_devices) {
2274                /*
2275                 * On a mounted FS, num_devices can't be zero unless it's a
2276                 * seed. In case of a seed device being replaced, the replace
2277                 * target is added to the sprout FS, so there will be no device
2278                 * left under the seed FS.
2279                 */
2280                ASSERT(fs_devices->seeding);
2281
2282                list_del_init(&fs_devices->seed_list);
2283                close_fs_devices(fs_devices);
2284                free_fs_devices(fs_devices);
2285        }
2286        mutex_unlock(&uuid_mutex);
2287}
2288
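/*
 * Tear down a replace target device: drop it from the device lists, scratch
 * its superblocks, close its bdev and free the btrfs_device.
 */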
2289void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2290{
2291        struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2292
2293        mutex_lock(&fs_devices->device_list_mutex);
2294
2295        btrfs_sysfs_remove_device(tgtdev);
2296
2297        if (tgtdev->bdev)
2298                fs_devices->open_devices--;
2299
2300        fs_devices->num_devices--;
2301
2302        btrfs_assign_next_active_device(tgtdev, NULL);
2303
2304        list_del_rcu(&tgtdev->dev_list);
2305
2306        mutex_unlock(&fs_devices->device_list_mutex);
2307
2308        /*
2309         * The update_dev_time() within btrfs_scratch_superblocks()
2310         * may lead to a call to btrfs_show_devname() which will try
2311         * to hold device_list_mutex. And here this device
2312         * is already out of the device list, so we don't have to hold
2313         * the device_list_mutex lock.
2314         */
2315        btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2316                                  tgtdev->name->str);
2317
2318        btrfs_close_bdev(tgtdev);
2319        synchronize_rcu();
2320        btrfs_free_device(tgtdev);
2321}
2322
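/*
 * Read the superblock at @device_path and look up the matching btrfs_device
 * in the filesystem's device list by devid and uuid.
 */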
2323static struct btrfs_device *btrfs_find_device_by_path(
2324                struct btrfs_fs_info *fs_info, const char *device_path)
2325{
2326        int ret = 0;
2327        struct btrfs_super_block *disk_super;
2328        u64 devid;
2329        u8 *dev_uuid;
2330        struct block_device *bdev;
2331        struct btrfs_device *device;
2332
2333        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2334                                    fs_info->bdev_holder, 0, &bdev, &disk_super);
2335        if (ret)
2336                return ERR_PTR(ret);
2337
2338        devid = btrfs_stack_device_id(&disk_super->dev_item);
2339        dev_uuid = disk_super->dev_item.uuid;
2340        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2341                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2342                                           disk_super->metadata_uuid);
2343        else
2344                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2345                                           disk_super->fsid);
2346
2347        btrfs_release_disk_super(disk_super);
2348        if (!device)
2349                device = ERR_PTR(-ENOENT);
2350        blkdev_put(bdev, FMODE_READ);
2351        return device;
2352}
2353
2354/*
2355 * Lookup a device given by device id, or the path if the id is 0.
2356 */
2357struct btrfs_device *btrfs_find_device_by_devspec(
2358                struct btrfs_fs_info *fs_info, u64 devid,
2359                const char *device_path)
2360{
2361        struct btrfs_device *device;
2362
2363        if (devid) {
2364                device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2365                                           NULL);
2366                if (!device)
2367                        return ERR_PTR(-ENOENT);
2368                return device;
2369        }
2370
2371        if (!device_path || !device_path[0])
2372                return ERR_PTR(-EINVAL);
2373
2374        if (strcmp(device_path, "missing") == 0) {
2375                /* Find first missing device */
2376                list_for_each_entry(device, &fs_info->fs_devices->devices,
2377                                    dev_list) {
2378                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2379                                     &device->dev_state) && !device->bdev)
2380                                return device;
2381                }
2382                return ERR_PTR(-ENOENT);
2383        }
2384
2385        return btrfs_find_device_by_path(fs_info, device_path);
2386}
2387
2388/*
2389 * Does all the dirty work required for changing the filesystem's UUID.
2390 */
2391static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2392{
2393        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2394        struct btrfs_fs_devices *old_devices;
2395        struct btrfs_fs_devices *seed_devices;
2396        struct btrfs_super_block *disk_super = fs_info->super_copy;
2397        struct btrfs_device *device;
2398        u64 super_flags;
2399
2400        lockdep_assert_held(&uuid_mutex);
2401        if (!fs_devices->seeding)
2402                return -EINVAL;
2403
2404        /*
2405         * Private copy of the seed devices, anchored at
2406         * fs_info->fs_devices->seed_list
2407         */
2408        seed_devices = alloc_fs_devices(NULL, NULL);
2409        if (IS_ERR(seed_devices))
2410                return PTR_ERR(seed_devices);
2411
2412        /*
2413         * It's necessary to retain a copy of the original seed fs_devices in
2414         * fs_uuids so that filesystems which have been seeded can successfully
2415         * reference the seed device from open_seed_devices. This also supports
2416 * multiple seed filesystems.
2417         */
2418        old_devices = clone_fs_devices(fs_devices);
2419        if (IS_ERR(old_devices)) {
2420                kfree(seed_devices);
2421                return PTR_ERR(old_devices);
2422        }
2423
2424        list_add(&old_devices->fs_list, &fs_uuids);
2425
2426        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2427        seed_devices->opened = 1;
2428        INIT_LIST_HEAD(&seed_devices->devices);
2429        INIT_LIST_HEAD(&seed_devices->alloc_list);
2430        mutex_init(&seed_devices->device_list_mutex);
2431
2432        mutex_lock(&fs_devices->device_list_mutex);
2433        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2434                              synchronize_rcu);
2435        list_for_each_entry(device, &seed_devices->devices, dev_list)
2436                device->fs_devices = seed_devices;
2437
2438        fs_devices->seeding = false;
2439        fs_devices->num_devices = 0;
2440        fs_devices->open_devices = 0;
2441        fs_devices->missing_devices = 0;
2442        fs_devices->rotating = false;
2443        list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2444
2445        generate_random_uuid(fs_devices->fsid);
2446        memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2447        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2448        mutex_unlock(&fs_devices->device_list_mutex);
2449
2450        super_flags = btrfs_super_flags(disk_super) &
2451                      ~BTRFS_SUPER_FLAG_SEEDING;
2452        btrfs_set_super_flags(disk_super, super_flags);
2453
2454        return 0;
2455}
2456
2457/*
2458 * Store the expected generation for seed devices in device items.
2459 */
2460static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2461{
2462        struct btrfs_fs_info *fs_info = trans->fs_info;
2463        struct btrfs_root *root = fs_info->chunk_root;
2464        struct btrfs_path *path;
2465        struct extent_buffer *leaf;
2466        struct btrfs_dev_item *dev_item;
2467        struct btrfs_device *device;
2468        struct btrfs_key key;
2469        u8 fs_uuid[BTRFS_FSID_SIZE];
2470        u8 dev_uuid[BTRFS_UUID_SIZE];
2471        u64 devid;
2472        int ret;
2473
2474        path = btrfs_alloc_path();
2475        if (!path)
2476                return -ENOMEM;
2477
2478        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2479        key.offset = 0;
2480        key.type = BTRFS_DEV_ITEM_KEY;
2481
2482        while (1) {
2483                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2484                if (ret < 0)
2485                        goto error;
2486
2487                leaf = path->nodes[0];
2488next_slot:
2489                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2490                        ret = btrfs_next_leaf(root, path);
2491                        if (ret > 0)
2492                                break;
2493                        if (ret < 0)
2494                                goto error;
2495                        leaf = path->nodes[0];
2496                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2497                        btrfs_release_path(path);
2498                        continue;
2499                }
2500
2501                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2502                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2503                    key.type != BTRFS_DEV_ITEM_KEY)
2504                        break;
2505
2506                dev_item = btrfs_item_ptr(leaf, path->slots[0],
2507                                          struct btrfs_dev_item);
2508                devid = btrfs_device_id(leaf, dev_item);
2509                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2510                                   BTRFS_UUID_SIZE);
2511                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2512                                   BTRFS_FSID_SIZE);
2513                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2514                                           fs_uuid);
2515                BUG_ON(!device); /* Logic error */
2516
2517                if (device->fs_devices->seeding) {
2518                        btrfs_set_device_generation(leaf, dev_item,
2519                                                    device->generation);
2520                        btrfs_mark_buffer_dirty(leaf);
2521                }
2522
2523                path->slots[0]++;
2524                goto next_slot;
2525        }
2526        ret = 0;
2527error:
2528        btrfs_free_path(path);
2529        return ret;
2530}
2531
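/*
 * Add a new device at @device_path to the mounted filesystem: allocate and
 * initialize the btrfs_device, add it to the device lists and the chunk
 * tree, and handle sprouting when the filesystem is currently a seed.
 */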
2532int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2533{
2534        struct btrfs_root *root = fs_info->dev_root;
2535        struct request_queue *q;
2536        struct btrfs_trans_handle *trans;
2537        struct btrfs_device *device;
2538        struct block_device *bdev;
2539        struct super_block *sb = fs_info->sb;
2540        struct rcu_string *name;
2541        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2542        u64 orig_super_total_bytes;
2543        u64 orig_super_num_devices;
2544        int seeding_dev = 0;
2545        int ret = 0;
2546        bool locked = false;
2547
2548        if (sb_rdonly(sb) && !fs_devices->seeding)
2549                return -EROFS;
2550
2551        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2552                                  fs_info->bdev_holder);
2553        if (IS_ERR(bdev))
2554                return PTR_ERR(bdev);
2555
2556        if (!btrfs_check_device_zone_type(fs_info, bdev)) {
2557                ret = -EINVAL;
2558                goto error;
2559        }
2560
2561        if (fs_devices->seeding) {
2562                seeding_dev = 1;
2563                down_write(&sb->s_umount);
2564                mutex_lock(&uuid_mutex);
2565                locked = true;
2566        }
2567
2568        sync_blockdev(bdev);
2569
2570        rcu_read_lock();
2571        list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2572                if (device->bdev == bdev) {
2573                        ret = -EEXIST;
2574                        rcu_read_unlock();
2575                        goto error;
2576                }
2577        }
2578        rcu_read_unlock();
2579
2580        device = btrfs_alloc_device(fs_info, NULL, NULL);
2581        if (IS_ERR(device)) {
2582                /* we can safely leave the fs_devices entry around */
2583                ret = PTR_ERR(device);
2584                goto error;
2585        }
2586
2587        name = rcu_string_strdup(device_path, GFP_KERNEL);
2588        if (!name) {
2589                ret = -ENOMEM;
2590                goto error_free_device;
2591        }
2592        rcu_assign_pointer(device->name, name);
2593
2594        device->fs_info = fs_info;
2595        device->bdev = bdev;
2596
2597        ret = btrfs_get_dev_zone_info(device);
2598        if (ret)
2599                goto error_free_device;
2600
2601        trans = btrfs_start_transaction(root, 0);
2602        if (IS_ERR(trans)) {
2603                ret = PTR_ERR(trans);
2604                goto error_free_zone;
2605        }
2606
2607        q = bdev_get_queue(bdev);
2608        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2609        device->generation = trans->transid;
2610        device->io_width = fs_info->sectorsize;
2611        device->io_align = fs_info->sectorsize;
2612        device->sector_size = fs_info->sectorsize;
2613        device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2614                                         fs_info->sectorsize);
2615        device->disk_total_bytes = device->total_bytes;
2616        device->commit_total_bytes = device->total_bytes;
2617        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2618        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2619        device->mode = FMODE_EXCL;
2620        device->dev_stats_valid = 1;
2621        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2622
2623        if (seeding_dev) {
2624                btrfs_clear_sb_rdonly(sb);
2625                ret = btrfs_prepare_sprout(fs_info);
2626                if (ret) {
2627                        btrfs_abort_transaction(trans, ret);
2628                        goto error_trans;
2629                }
2630        }
2631
2632        device->fs_devices = fs_devices;
2633
2634        mutex_lock(&fs_devices->device_list_mutex);
2635        mutex_lock(&fs_info->chunk_mutex);
2636        list_add_rcu(&device->dev_list, &fs_devices->devices);
2637        list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2638        fs_devices->num_devices++;
2639        fs_devices->open_devices++;
2640        fs_devices->rw_devices++;
2641        fs_devices->total_devices++;
2642        fs_devices->total_rw_bytes += device->total_bytes;
2643
2644        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2645
2646        if (!blk_queue_nonrot(q))
2647                fs_devices->rotating = true;
2648
2649        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2650        btrfs_set_super_total_bytes(fs_info->super_copy,
2651                round_down(orig_super_total_bytes + device->total_bytes,
2652                           fs_info->sectorsize));
2653
2654        orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2655        btrfs_set_super_num_devices(fs_info->super_copy,
2656                                    orig_super_num_devices + 1);
2657
2658        /*
2659         * we've got more storage, clear any full flags on the space
2660         * infos
2661         */
2662        btrfs_clear_space_info_full(fs_info);
2663
2664        mutex_unlock(&fs_info->chunk_mutex);
2665
2666        /* Add sysfs device entry */
2667        btrfs_sysfs_add_device(device);
2668
2669        mutex_unlock(&fs_devices->device_list_mutex);
2670
2671        if (seeding_dev) {
2672                mutex_lock(&fs_info->chunk_mutex);
2673                ret = init_first_rw_device(trans);
2674                mutex_unlock(&fs_info->chunk_mutex);
2675                if (ret) {
2676                        btrfs_abort_transaction(trans, ret);
2677                        goto error_sysfs;
2678                }
2679        }
2680
2681        ret = btrfs_add_dev_item(trans, device);
2682        if (ret) {
2683                btrfs_abort_transaction(trans, ret);
2684                goto error_sysfs;
2685        }
2686
2687        if (seeding_dev) {
2688                ret = btrfs_finish_sprout(trans);
2689                if (ret) {
2690                        btrfs_abort_transaction(trans, ret);
2691                        goto error_sysfs;
2692                }
2693
2694                /*
2695                 * fs_devices now represents the newly sprouted filesystem and
2696                 * its fsid has been changed by btrfs_prepare_sprout
2697                 */
2698                btrfs_sysfs_update_sprout_fsid(fs_devices);
2699        }
2700
2701        ret = btrfs_commit_transaction(trans);
2702
2703        if (seeding_dev) {
2704                mutex_unlock(&uuid_mutex);
2705                up_write(&sb->s_umount);
2706                locked = false;
2707
2708                if (ret) /* transaction commit */
2709                        return ret;
2710
2711                ret = btrfs_relocate_sys_chunks(fs_info);
2712                if (ret < 0)
2713                        btrfs_handle_fs_error(fs_info, ret,
2714                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2715                trans = btrfs_attach_transaction(root);
2716                if (IS_ERR(trans)) {
2717                        if (PTR_ERR(trans) == -ENOENT)
2718                                return 0;
2719                        ret = PTR_ERR(trans);
2720                        trans = NULL;
2721                        goto error_sysfs;
2722                }
2723                ret = btrfs_commit_transaction(trans);
2724        }
2725
2726        /*
2727         * Now that we have written a new super block to this device, check all
2728         * other fs_devices lists to see whether device_path alienated any other
2729         * scanned device.
2730         * We can ignore the return value as it typically returns -EINVAL and
2731         * only succeeds if the device was an alien.
2732         */
2733        btrfs_forget_devices(device_path);
2734
2735        /* Update ctime/mtime for blkid or udev */
2736        update_dev_time(bdev);
2737
2738        return ret;
2739
2740error_sysfs:
2741        btrfs_sysfs_remove_device(device);
2742        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2743        mutex_lock(&fs_info->chunk_mutex);
2744        list_del_rcu(&device->dev_list);
2745        list_del(&device->dev_alloc_list);
2746        fs_info->fs_devices->num_devices--;
2747        fs_info->fs_devices->open_devices--;
2748        fs_info->fs_devices->rw_devices--;
2749        fs_info->fs_devices->total_devices--;
2750        fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2751        atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2752        btrfs_set_super_total_bytes(fs_info->super_copy,
2753                                    orig_super_total_bytes);
2754        btrfs_set_super_num_devices(fs_info->super_copy,
2755                                    orig_super_num_devices);
2756        mutex_unlock(&fs_info->chunk_mutex);
2757        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2758error_trans:
2759        if (seeding_dev)
2760                btrfs_set_sb_rdonly(sb);
2761        if (trans)
2762                btrfs_end_transaction(trans);
2763error_free_zone:
2764        btrfs_destroy_dev_zone_info(device);
2765error_free_device:
2766        btrfs_free_device(device);
2767error:
2768        blkdev_put(bdev, FMODE_EXCL);
2769        if (locked) {
2770                mutex_unlock(&uuid_mutex);
2771                up_write(&sb->s_umount);
2772        }
2773        return ret;
2774}
2775
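/*
 * Write the in-memory device fields (sizes, io parameters) back into the
 * device's dev item in the chunk tree.
 */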
2776static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2777                                        struct btrfs_device *device)
2778{
2779        int ret;
2780        struct btrfs_path *path;
2781        struct btrfs_root *root = device->fs_info->chunk_root;
2782        struct btrfs_dev_item *dev_item;
2783        struct extent_buffer *leaf;
2784        struct btrfs_key key;
2785
2786        path = btrfs_alloc_path();
2787        if (!path)
2788                return -ENOMEM;
2789
2790        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2791        key.type = BTRFS_DEV_ITEM_KEY;
2792        key.offset = device->devid;
2793
2794        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2795        if (ret < 0)
2796                goto out;
2797
2798        if (ret > 0) {
2799                ret = -ENOENT;
2800                goto out;
2801        }
2802
2803        leaf = path->nodes[0];
2804        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2805
2806        btrfs_set_device_id(leaf, dev_item, device->devid);
2807        btrfs_set_device_type(leaf, dev_item, device->type);
2808        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2809        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2810        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2811        btrfs_set_device_total_bytes(leaf, dev_item,
2812                                     btrfs_device_get_disk_total_bytes(device));
2813        btrfs_set_device_bytes_used(leaf, dev_item,
2814                                    btrfs_device_get_bytes_used(device));
2815        btrfs_mark_buffer_dirty(leaf);
2816
2817out:
2818        btrfs_free_path(path);
2819        return ret;
2820}
2821
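/*
 * Grow @device to @new_size (rounded down to the sector size), updating the
 * superblock total_bytes and the dev item in the chunk tree.
 */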
2822int btrfs_grow_device(struct btrfs_trans_handle *trans,
2823                      struct btrfs_device *device, u64 new_size)
2824{
2825        struct btrfs_fs_info *fs_info = device->fs_info;
2826        struct btrfs_super_block *super_copy = fs_info->super_copy;
2827        u64 old_total;
2828        u64 diff;
2829
2830        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2831                return -EACCES;
2832
2833        new_size = round_down(new_size, fs_info->sectorsize);
2834
2835        mutex_lock(&fs_info->chunk_mutex);
2836        old_total = btrfs_super_total_bytes(super_copy);
2837        diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2838
2839        if (new_size <= device->total_bytes ||
2840            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2841                mutex_unlock(&fs_info->chunk_mutex);
2842                return -EINVAL;
2843        }
2844
2845        btrfs_set_super_total_bytes(super_copy,
2846                        round_down(old_total + diff, fs_info->sectorsize));
2847        device->fs_devices->total_rw_bytes += diff;
2848
2849        btrfs_device_set_total_bytes(device, new_size);
2850        btrfs_device_set_disk_total_bytes(device, new_size);
2851        btrfs_clear_space_info_full(device->fs_info);
2852        if (list_empty(&device->post_commit_list))
2853                list_add_tail(&device->post_commit_list,
2854                              &trans->transaction->dev_update_list);
2855        mutex_unlock(&fs_info->chunk_mutex);
2856
2857        return btrfs_update_device(trans, device);
2858}
2859
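    /*
     * Delete the chunk item for @chunk_offset from the chunk tree.  A missing
     * item is treated as a logic error or corruption and reported through
     * btrfs_handle_fs_error().
     */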
2860static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2861{
2862        struct btrfs_fs_info *fs_info = trans->fs_info;
2863        struct btrfs_root *root = fs_info->chunk_root;
2864        int ret;
2865        struct btrfs_path *path;
2866        struct btrfs_key key;
2867
2868        path = btrfs_alloc_path();
2869        if (!path)
2870                return -ENOMEM;
2871
2872        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2873        key.offset = chunk_offset;
2874        key.type = BTRFS_CHUNK_ITEM_KEY;
2875
2876        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2877        if (ret < 0)
2878                goto out;
2879        else if (ret > 0) { /* Logic error or corruption */
2880                btrfs_handle_fs_error(fs_info, -ENOENT,
2881                                      "Failed lookup while freeing chunk.");
2882                ret = -ENOENT;
2883                goto out;
2884        }
2885
2886        ret = btrfs_del_item(trans, root, path);
2887        if (ret < 0)
2888                btrfs_handle_fs_error(fs_info, ret,
2889                                      "Failed to delete chunk item.");
2890out:
2891        btrfs_free_path(path);
2892        return ret;
2893}
2894
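    /*
     * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array,
     * which is a packed sequence of (struct btrfs_disk_key, struct btrfs_chunk
     * including its stripes) pairs.  The remaining entries are shifted down
     * with memmove() and the array size in the super block is updated.  Must
     * be called with chunk_mutex held (asserted via lockdep).
     */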
2895static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2896{
2897        struct btrfs_super_block *super_copy = fs_info->super_copy;
2898        struct btrfs_disk_key *disk_key;
2899        struct btrfs_chunk *chunk;
2900        u8 *ptr;
2901        int ret = 0;
2902        u32 num_stripes;
2903        u32 array_size;
2904        u32 len = 0;
2905        u32 cur;
2906        struct btrfs_key key;
2907
2908        lockdep_assert_held(&fs_info->chunk_mutex);
2909        array_size = btrfs_super_sys_array_size(super_copy);
2910
2911        ptr = super_copy->sys_chunk_array;
2912        cur = 0;
2913
2914        while (cur < array_size) {
2915                disk_key = (struct btrfs_disk_key *)ptr;
2916                btrfs_disk_key_to_cpu(&key, disk_key);
2917
2918                len = sizeof(*disk_key);
2919
2920                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2921                        chunk = (struct btrfs_chunk *)(ptr + len);
2922                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2923                        len += btrfs_chunk_item_size(num_stripes);
2924                } else {
2925                        ret = -EIO;
2926                        break;
2927                }
2928                if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2929                    key.offset == chunk_offset) {
2930                        memmove(ptr, ptr + len, array_size - (cur + len));
2931                        array_size -= len;
2932                        btrfs_set_super_sys_array_size(super_copy, array_size);
2933                } else {
2934                        ptr += len;
2935                        cur += len;
2936                }
2937        }
2938        return ret;
2939}
2940
2941/*
2942 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2943 * @logical: Logical block offset in bytes.
2944 * @length: Length of extent in bytes.
2945 *
2946 * Return: Chunk mapping or ERR_PTR.
2947 */
2948struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2949                                       u64 logical, u64 length)
2950{
2951        struct extent_map_tree *em_tree;
2952        struct extent_map *em;
2953
2954        em_tree = &fs_info->mapping_tree;
2955        read_lock(&em_tree->lock);
2956        em = lookup_extent_mapping(em_tree, logical, length);
2957        read_unlock(&em_tree->lock);
2958
2959        if (!em) {
2960                btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2961                           logical, length);
2962                return ERR_PTR(-EINVAL);
2963        }
2964
2965        if (em->start > logical || em->start + em->len < logical) {
2966                btrfs_crit(fs_info,
2967                           "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2968                           logical, length, em->start, em->start + em->len);
2969                free_extent_map(em);
2970                return ERR_PTR(-EINVAL);
2971        }
2972
2973        /* callers are responsible for dropping em's ref. */
2974        return em;
2975}
2976
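    /*
     * Update the device items for all stripes of @map and then delete the
     * chunk item at @chunk_offset from the chunk tree.
     */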
2977static int remove_chunk_item(struct btrfs_trans_handle *trans,
2978                             struct map_lookup *map, u64 chunk_offset)
2979{
2980        int i;
2981
2982        /*
2983         * Removing chunk items and updating the device items in the chunks btree
2984         * requires holding the chunk_mutex.
2985         * See the comment at btrfs_chunk_alloc() for the details.
2986         */
2987        lockdep_assert_held(&trans->fs_info->chunk_mutex);
2988
2989        for (i = 0; i < map->num_stripes; i++) {
2990                int ret;
2991
2992                ret = btrfs_update_device(trans, map->stripes[i].dev);
2993                if (ret)
2994                        return ret;
2995        }
2996
2997        return btrfs_free_chunk(trans, chunk_offset);
2998}
2999
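    /*
     * Remove the chunk at @chunk_offset: free its device extents, delete its
     * chunk item (and, for SYSTEM chunks, its sys_chunk_array entry), and
     * finally remove the corresponding block group.  See the comments in the
     * body for the locking rules around device_list_mutex and chunk_mutex.
     */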
3000int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3001{
3002        struct btrfs_fs_info *fs_info = trans->fs_info;
3003        struct extent_map *em;
3004        struct map_lookup *map;
3005        u64 dev_extent_len = 0;
3006        int i, ret = 0;
3007        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3008
3009        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3010        if (IS_ERR(em)) {
3011                /*
3012                 * This is a logic error, but we don't want to just rely on the
3013                 * user having built with ASSERT enabled, so if ASSERT doesn't
3014                 * do anything we still error out.
3015                 */
3016                ASSERT(0);
3017                return PTR_ERR(em);
3018        }
3019        map = em->map_lookup;
3020
3021        /*
3022         * First delete the device extent items from the devices btree.
3023         * We take the device_list_mutex to avoid racing with the finishing phase
3024         * of a device replace operation. See the comment below before acquiring
3025         * fs_info->chunk_mutex. Note that here we do not acquire the chunk_mutex
3026         * because that can result in a deadlock when deleting the device extent
3027         * items from the devices btree - COWing an extent buffer from the btree
3028         * may result in allocating a new metadata chunk, which would attempt to
3029         * lock again fs_info->chunk_mutex.
3030         */
3031        mutex_lock(&fs_devices->device_list_mutex);
3032        for (i = 0; i < map->num_stripes; i++) {
3033                struct btrfs_device *device = map->stripes[i].dev;
3034                ret = btrfs_free_dev_extent(trans, device,
3035                                            map->stripes[i].physical,
3036                                            &dev_extent_len);
3037                if (ret) {
3038                        mutex_unlock(&fs_devices->device_list_mutex);
3039                        btrfs_abort_transaction(trans, ret);
3040                        goto out;
3041                }
3042
3043                if (device->bytes_used > 0) {
3044                        mutex_lock(&fs_info->chunk_mutex);
3045                        btrfs_device_set_bytes_used(device,
3046                                        device->bytes_used - dev_extent_len);
3047                        atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3048                        btrfs_clear_space_info_full(fs_info);
3049                        mutex_unlock(&fs_info->chunk_mutex);
3050                }
3051        }
3052        mutex_unlock(&fs_devices->device_list_mutex);
3053
3054        /*
3055         * We acquire fs_info->chunk_mutex for 2 reasons:
3056         *
3057         * 1) Just like with the first phase of the chunk allocation, we must
3058         *    reserve system space, do all chunk btree updates and deletions, and
3059         *    update the system chunk array in the superblock while holding this
3060         *    mutex. This is for similar reasons as explained in the comment at
3061         *    the top of btrfs_chunk_alloc();
3062         *
3063         * 2) Prevent races with the final phase of a device replace operation
3064         *    that replaces the device object associated with the map's stripes,
3065         *    because the device object's id can change at any time during that
3066         *    final phase of the device replace operation
3067         *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
3068         *    replaced device and then see it with an ID of
3069         *    BTRFS_DEV_REPLACE_DEVID, which would cause a failure when updating
3070         *    the device item, which does not exist in the chunk btree.
3071         *    The finishing phase of device replace acquires both the
3072         *    device_list_mutex and the chunk_mutex, in that order, so we are
3073         *    safe by just acquiring the chunk_mutex.
3074         */
3075        trans->removing_chunk = true;
3076        mutex_lock(&fs_info->chunk_mutex);
3077
3078        check_system_chunk(trans, map->type);
3079
3080        ret = remove_chunk_item(trans, map, chunk_offset);
3081        /*
3082         * Normally we should not get -ENOSPC since we reserved space before
3083         * through the call to check_system_chunk().
3084         *
3085         * Despite our system space_info having enough free space, we may not
3086         * be able to allocate extents from its block groups, because they all
3087         * have an incompatible profile, which forces us to allocate a new
3088         * system block group with the right profile.  Alternatively, right
3089         * after we called check_system_chunk() above, a scrub may have turned
3090         * the only system block group with enough free space into RO mode.
3091         * This is explained with more detail at do_chunk_alloc().
3092         *
3093         * So if we get -ENOSPC, allocate a new system chunk and retry once.
3094         */
3095        if (ret == -ENOSPC) {
3096                const u64 sys_flags = btrfs_system_alloc_profile(fs_info);
3097                struct btrfs_block_group *sys_bg;
3098
3099                sys_bg = btrfs_alloc_chunk(trans, sys_flags);
3100                if (IS_ERR(sys_bg)) {
3101                        ret = PTR_ERR(sys_bg);
3102                        btrfs_abort_transaction(trans, ret);
3103                        goto out;
3104                }
3105
3106                ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
3107                if (ret) {
3108                        btrfs_abort_transaction(trans, ret);
3109                        goto out;
3110                }
3111
3112                ret = remove_chunk_item(trans, map, chunk_offset);
3113                if (ret) {
3114                        btrfs_abort_transaction(trans, ret);
3115                        goto out;
3116                }
3117        } else if (ret) {
3118                btrfs_abort_transaction(trans, ret);
3119                goto out;
3120        }
3121
3122        trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3123
3124        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3125                ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3126                if (ret) {
3127                        btrfs_abort_transaction(trans, ret);
3128                        goto out;
3129                }
3130        }
3131
3132        mutex_unlock(&fs_info->chunk_mutex);
3133        trans->removing_chunk = false;
3134
3135        /*
3136         * We are done with chunk btree updates and deletions, so release the
3137         * system space we previously reserved (with check_system_chunk()).
3138         */
3139        btrfs_trans_release_chunk_metadata(trans);
3140
3141        ret = btrfs_remove_block_group(trans, chunk_offset, em);
3142        if (ret) {
3143                btrfs_abort_transaction(trans, ret);
3144                goto out;
3145        }
3146
3147out:
3148        if (trans->removing_chunk) {
3149                mutex_unlock(&fs_info->chunk_mutex);
3150                trans->removing_chunk = false;
3151        }
3152        /* once for us */
3153        free_extent_map(em);
3154        return ret;
3155}
3156
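    /*
     * Relocate all extents of the chunk at @chunk_offset and then remove the
     * now empty chunk.  Callers must hold fs_info->reclaim_bgs_lock; see the
     * comment in the body about racing with automatic block group removal.
     */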
3157int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3158{
3159        struct btrfs_root *root = fs_info->chunk_root;
3160        struct btrfs_trans_handle *trans;
3161        struct btrfs_block_group *block_group;
3162        u64 length;
3163        int ret;
3164
3165        /*
3166         * Prevent races with automatic removal of unused block groups.
3167         * After we relocate and before we remove the chunk with offset
3168         * chunk_offset, automatic removal of the block group can kick in,
3169         * resulting in a failure when calling btrfs_remove_chunk() below.
3170         *
3171         * Make sure to acquire this mutex before doing a tree search (dev
3172         * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3173         * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3174         * we release the path used to search the chunk/dev tree and before
3175         * the current task acquires this mutex and calls us.
3176         */
3177        lockdep_assert_held(&fs_info->reclaim_bgs_lock);
3178
3179        /* step one, relocate all the extents inside this chunk */
3180        btrfs_scrub_pause(fs_info);
3181        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3182        btrfs_scrub_continue(fs_info);
3183        if (ret)
3184                return ret;
3185
3186        block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3187        if (!block_group)
3188                return -ENOENT;
3189        btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3190        length = block_group->length;
3191        btrfs_put_block_group(block_group);
3192
3193        /*
3194         * On a zoned file system, discard the whole block group, this will
3195         * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
3196         * resetting the zone fails, don't treat it as a fatal problem from the
3197         * filesystem's point of view.
3198         */
3199        if (btrfs_is_zoned(fs_info)) {
3200                ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
3201                if (ret)
3202                        btrfs_info(fs_info,
3203                                "failed to reset zone %llu after relocation",
3204                                chunk_offset);
3205        }
3206
3207        trans = btrfs_start_trans_remove_block_group(root->fs_info,
3208                                                     chunk_offset);
3209        if (IS_ERR(trans)) {
3210                ret = PTR_ERR(trans);
3211                btrfs_handle_fs_error(root->fs_info, ret, NULL);
3212                return ret;
3213        }
3214
3215        /*
3216         * step two, delete the device extents and the
3217         * chunk tree entries
3218         */
3219        ret = btrfs_remove_chunk(trans, chunk_offset);
3220        btrfs_end_transaction(trans);
3221        return ret;
3222}
3223
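    /*
     * Walk the chunk tree backwards and relocate every SYSTEM chunk.  -ENOSPC
     * failures are counted and the whole scan is retried once before giving
     * up with -ENOSPC.
     */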
3224static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3225{
3226        struct btrfs_root *chunk_root = fs_info->chunk_root;
3227        struct btrfs_path *path;
3228        struct extent_buffer *leaf;
3229        struct btrfs_chunk *chunk;
3230        struct btrfs_key key;
3231        struct btrfs_key found_key;
3232        u64 chunk_type;
3233        bool retried = false;
3234        int failed = 0;
3235        int ret;
3236
3237        path = btrfs_alloc_path();
3238        if (!path)
3239                return -ENOMEM;
3240
3241again:
3242        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3243        key.offset = (u64)-1;
3244        key.type = BTRFS_CHUNK_ITEM_KEY;
3245
3246        while (1) {
3247                mutex_lock(&fs_info->reclaim_bgs_lock);
3248                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3249                if (ret < 0) {
3250                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3251                        goto error;
3252                }
3253                BUG_ON(ret == 0); /* Corruption */
3254
3255                ret = btrfs_previous_item(chunk_root, path, key.objectid,
3256                                          key.type);
3257                if (ret)
3258                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3259                if (ret < 0)
3260                        goto error;
3261                if (ret > 0)
3262                        break;
3263
3264                leaf = path->nodes[0];
3265                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3266
3267                chunk = btrfs_item_ptr(leaf, path->slots[0],
3268                                       struct btrfs_chunk);
3269                chunk_type = btrfs_chunk_type(leaf, chunk);
3270                btrfs_release_path(path);
3271
3272                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3273                        ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3274                        if (ret == -ENOSPC)
3275                                failed++;
3276                        else
3277                                BUG_ON(ret);
3278                }
3279                mutex_unlock(&fs_info->reclaim_bgs_lock);
3280
3281                if (found_key.offset == 0)
3282                        break;
3283                key.offset = found_key.offset - 1;
3284        }
3285        ret = 0;
3286        if (failed && !retried) {
3287                failed = 0;
3288                retried = true;
3289                goto again;
3290        } else if (WARN_ON(failed && retried)) {
3291                ret = -ENOSPC;
3292        }
3293error:
3294        btrfs_free_path(path);
3295        return ret;
3296}
3297
3298/*
3299 * Return 1 : a data chunk was allocated successfully,
3300 * return <0: an error occurred while allocating a data chunk,
3301 * return 0 : no data chunk needs to be allocated.
3302 */
3303static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3304                                      u64 chunk_offset)
3305{
3306        struct btrfs_block_group *cache;
3307        u64 bytes_used;
3308        u64 chunk_type;
3309
3310        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3311        ASSERT(cache);
3312        chunk_type = cache->flags;
3313        btrfs_put_block_group(cache);
3314
3315        if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3316                return 0;
3317
3318        spin_lock(&fs_info->data_sinfo->lock);
3319        bytes_used = fs_info->data_sinfo->bytes_used;
3320        spin_unlock(&fs_info->data_sinfo->lock);
3321
3322        if (!bytes_used) {
3323                struct btrfs_trans_handle *trans;
3324                int ret;
3325
3326                trans = btrfs_join_transaction(fs_info->tree_root);
3327                if (IS_ERR(trans))
3328                        return PTR_ERR(trans);
3329
3330                ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3331                btrfs_end_transaction(trans);
3332                if (ret < 0)
3333                        return ret;
3334                return 1;
3335        }
3336
3337        return 0;
3338}
3339
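    /*
     * Persist the balance state: insert the balance item into the tree root,
     * fill it with the data/metadata/system balance args and flags, and commit
     * the transaction.
     */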
3340static int insert_balance_item(struct btrfs_fs_info *fs_info,
3341                               struct btrfs_balance_control *bctl)
3342{
3343        struct btrfs_root *root = fs_info->tree_root;
3344        struct btrfs_trans_handle *trans;
3345        struct btrfs_balance_item *item;
3346        struct btrfs_disk_balance_args disk_bargs;
3347        struct btrfs_path *path;
3348        struct extent_buffer *leaf;
3349        struct btrfs_key key;
3350        int ret, err;
3351
3352        path = btrfs_alloc_path();
3353        if (!path)
3354                return -ENOMEM;
3355
3356        trans = btrfs_start_transaction(root, 0);
3357        if (IS_ERR(trans)) {
3358                btrfs_free_path(path);
3359                return PTR_ERR(trans);
3360        }
3361
3362        key.objectid = BTRFS_BALANCE_OBJECTID;
3363        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3364        key.offset = 0;
3365
3366        ret = btrfs_insert_empty_item(trans, root, path, &key,
3367                                      sizeof(*item));
3368        if (ret)
3369                goto out;
3370
3371        leaf = path->nodes[0];
3372        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3373
3374        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3375
3376        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3377        btrfs_set_balance_data(leaf, item, &disk_bargs);
3378        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3379        btrfs_set_balance_meta(leaf, item, &disk_bargs);
3380        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3381        btrfs_set_balance_sys(leaf, item, &disk_bargs);
3382
3383        btrfs_set_balance_flags(leaf, item, bctl->flags);
3384
3385        btrfs_mark_buffer_dirty(leaf);
3386out:
3387        btrfs_free_path(path);
3388        err = btrfs_commit_transaction(trans);
3389        if (err && !ret)
3390                ret = err;
3391        return ret;
3392}
3393
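    /*
     * Delete the on-disk balance item from the tree root and commit the
     * transaction.  Returns -ENOENT if the item does not exist.
     */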
3394static int del_balance_item(struct btrfs_fs_info *fs_info)
3395{
3396        struct btrfs_root *root = fs_info->tree_root;
3397        struct btrfs_trans_handle *trans;
3398        struct btrfs_path *path;
3399        struct btrfs_key key;
3400        int ret, err;
3401
3402        path = btrfs_alloc_path();
3403        if (!path)
3404                return -ENOMEM;
3405
3406        trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3407        if (IS_ERR(trans)) {
3408                btrfs_free_path(path);
3409                return PTR_ERR(trans);
3410        }
3411
3412        key.objectid = BTRFS_BALANCE_OBJECTID;
3413        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3414        key.offset = 0;
3415
3416        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3417        if (ret < 0)
3418                goto out;
3419        if (ret > 0) {
3420                ret = -ENOENT;
3421                goto out;
3422        }
3423
3424        ret = btrfs_del_item(trans, root, path);
3425out:
3426        btrfs_free_path(path);
3427        err = btrfs_commit_transaction(trans);
3428        if (err && !ret)
3429                ret = err;
3430        return ret;
3431}
3432
3433/*
3434 * This is a heuristic used to reduce the number of chunks balanced on
3435 * resume after balance was interrupted.
3436 */
3437static void update_balance_args(struct btrfs_balance_control *bctl)
3438{
3439        /*
3440         * Turn on soft mode for chunk types that were being converted.
3441         */
3442        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3443                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3444        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3445                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3446        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3447                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3448
3449        /*
3450         * Turn on the usage filter if it is not already in use.  The idea is
3451         * that chunks that we have already balanced should be
3452         * reasonably full.  Don't do it for chunks that are being
3453         * converted - that will keep us from relocating unconverted
3454         * (albeit full) chunks.
3455         */
3456        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3457            !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3458            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3459                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3460                bctl->data.usage = 90;
3461        }
3462        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3463            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3464            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3465                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3466                bctl->sys.usage = 90;
3467        }
3468        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3469            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3470            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3471                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3472                bctl->meta.usage = 90;
3473        }
3474}
3475
3476/*
3477 * Clear the balance status in fs_info and delete the balance item from disk.
3478 */
3479static void reset_balance_state(struct btrfs_fs_info *fs_info)
3480{
3481        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3482        int ret;
3483
3484        BUG_ON(!fs_info->balance_ctl);
3485
3486        spin_lock(&fs_info->balance_lock);
3487        fs_info->balance_ctl = NULL;
3488        spin_unlock(&fs_info->balance_lock);
3489
3490        kfree(bctl);
3491        ret = del_balance_item(fs_info);
3492        if (ret)
3493                btrfs_handle_fs_error(fs_info, ret, NULL);
3494}
3495
3496/*
3497 * Balance filters.  Return 1 if chunk should be filtered out
3498 * (should not be balanced).
3499 */
3500static int chunk_profiles_filter(u64 chunk_type,
3501                                 struct btrfs_balance_args *bargs)
3502{
3503        chunk_type = chunk_to_extended(chunk_type) &
3504                                BTRFS_EXTENDED_PROFILE_MASK;
3505
3506        if (bargs->profiles & chunk_type)
3507                return 0;
3508
3509        return 1;
3510}
3511
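    /*
     * Usage range filter: return 0 (do balance) when the chunk's used bytes
     * fall inside [usage_min%, usage_max%) of its length, 1 to filter it out.
     */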
3512static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3513                              struct btrfs_balance_args *bargs)
3514{
3515        struct btrfs_block_group *cache;
3516        u64 chunk_used;
3517        u64 user_thresh_min;
3518        u64 user_thresh_max;
3519        int ret = 1;
3520
3521        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3522        chunk_used = cache->used;
3523
3524        if (bargs->usage_min == 0)
3525                user_thresh_min = 0;
3526        else
3527                user_thresh_min = div_factor_fine(cache->length,
3528                                                  bargs->usage_min);
3529
3530        if (bargs->usage_max == 0)
3531                user_thresh_max = 1;
3532        else if (bargs->usage_max > 100)
3533                user_thresh_max = cache->length;
3534        else
3535                user_thresh_max = div_factor_fine(cache->length,
3536                                                  bargs->usage_max);
3537
3538        if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3539                ret = 0;
3540
3541        btrfs_put_block_group(cache);
3542        return ret;
3543}
3544
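    /*
     * Single value usage filter: return 0 (do balance) when the chunk is used
     * less than bargs->usage percent of its length, 1 to filter it out.
     */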
3545static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3546                u64 chunk_offset, struct btrfs_balance_args *bargs)
3547{
3548        struct btrfs_block_group *cache;
3549        u64 chunk_used, user_thresh;
3550        int ret = 1;
3551
3552        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3553        chunk_used = cache->used;
3554
3555        if (bargs->usage_min == 0)
3556                user_thresh = 1;
3557        else if (bargs->usage > 100)
3558                user_thresh = cache->length;
3559        else
3560                user_thresh = div_factor_fine(cache->length, bargs->usage);
3561
3562        if (chunk_used < user_thresh)
3563                ret = 0;
3564
3565        btrfs_put_block_group(cache);
3566        return ret;
3567}
3568
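    /*
     * Devid filter: return 0 (do balance) if any stripe of the chunk lives on
     * the device with bargs->devid, 1 to filter the chunk out.
     */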
3569static int chunk_devid_filter(struct extent_buffer *leaf,
3570                              struct btrfs_chunk *chunk,
3571                              struct btrfs_balance_args *bargs)
3572{
3573        struct btrfs_stripe *stripe;
3574        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3575        int i;
3576
3577        for (i = 0; i < num_stripes; i++) {
3578                stripe = btrfs_stripe_nr(chunk, i);
3579                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3580                        return 0;
3581        }
3582
3583        return 1;
3584}
3585
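    /*
     * Number of stripes that carry data for a chunk of the given @type:
     * parity stripes are subtracted and the remainder is divided by the copy
     * count.  For example, with ncopies = 2 and nparity = 0 (RAID10), a
     * 4-stripe chunk has (4 - 0) / 2 = 2 data stripes.
     */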
3586static u64 calc_data_stripes(u64 type, int num_stripes)
3587{
3588        const int index = btrfs_bg_flags_to_raid_index(type);
3589        const int ncopies = btrfs_raid_array[index].ncopies;
3590        const int nparity = btrfs_raid_array[index].nparity;
3591
3592        return (num_stripes - nparity) / ncopies;
3593}
3594
3595/* [pstart, pend) */
3596static int chunk_drange_filter(struct extent_buffer *leaf,
3597                               struct btrfs_chunk *chunk,
3598                               struct btrfs_balance_args *bargs)
3599{
3600        struct btrfs_stripe *stripe;
3601        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3602        u64 stripe_offset;
3603        u64 stripe_length;
3604        u64 type;
3605        int factor;
3606        int i;
3607
3608        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3609                return 0;
3610
3611        type = btrfs_chunk_type(leaf, chunk);
3612        factor = calc_data_stripes(type, num_stripes);
3613
3614        for (i = 0; i < num_stripes; i++) {
3615                stripe = btrfs_stripe_nr(chunk, i);
3616                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3617                        continue;
3618
3619                stripe_offset = btrfs_stripe_offset(leaf, stripe);
3620                stripe_length = btrfs_chunk_length(leaf, chunk);
3621                stripe_length = div_u64(stripe_length, factor);
3622
3623                if (stripe_offset < bargs->pend &&
3624                    stripe_offset + stripe_length > bargs->pstart)
3625                        return 0;
3626        }
3627
3628        return 1;
3629}
3630
3631/* [vstart, vend) */
3632static int chunk_vrange_filter(struct extent_buffer *leaf,
3633                               struct btrfs_chunk *chunk,
3634                               u64 chunk_offset,
3635                               struct btrfs_balance_args *bargs)
3636{
3637        if (chunk_offset < bargs->vend &&
3638            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3639                /* at least part of the chunk is inside this vrange */
3640                return 0;
3641
3642        return 1;
3643}
3644
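    /*
     * Stripes range filter: balance only chunks whose stripe count lies within
     * [stripes_min, stripes_max].
     */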
3645static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3646                               struct btrfs_chunk *chunk,
3647                               struct btrfs_balance_args *bargs)
3648{
3649        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3650
3651        if (bargs->stripes_min <= num_stripes
3652                        && num_stripes <= bargs->stripes_max)
3653                return 0;
3654
3655        return 1;
3656}
3657
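    /*
     * Soft convert filter: when converting, filter out (return 1 for) chunks
     * that already have the target profile.
     */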
3658static int chunk_soft_convert_filter(u64 chunk_type,
3659                                     struct btrfs_balance_args *bargs)
3660{
3661        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3662                return 0;
3663
3664        chunk_type = chunk_to_extended(chunk_type) &
3665                                BTRFS_EXTENDED_PROFILE_MASK;
3666
3667        if (bargs->target == chunk_type)
3668                return 1;
3669
3670        return 0;
3671}
3672
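    /*
     * Apply all configured balance filters (profiles, usage, devid, drange,
     * vrange, stripes, soft convert and limit) to one chunk.  Return 1 if the
     * chunk should be relocated, 0 if any filter rejects it.  The limit
     * filters decrement their counters as a side effect.
     */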
3673static int should_balance_chunk(struct extent_buffer *leaf,
3674                                struct btrfs_chunk *chunk, u64 chunk_offset)
3675{
3676        struct btrfs_fs_info *fs_info = leaf->fs_info;
3677        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3678        struct btrfs_balance_args *bargs = NULL;
3679        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3680
3681        /* type filter */
3682        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3683              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3684                return 0;
3685        }
3686
3687        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3688                bargs = &bctl->data;
3689        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3690                bargs = &bctl->sys;
3691        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3692                bargs = &bctl->meta;
3693
3694        /* profiles filter */
3695        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3696            chunk_profiles_filter(chunk_type, bargs)) {
3697                return 0;
3698        }
3699
3700        /* usage filter */
3701        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3702            chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3703                return 0;
3704        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3705            chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3706                return 0;
3707        }
3708
3709        /* devid filter */
3710        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3711            chunk_devid_filter(leaf, chunk, bargs)) {
3712                return 0;
3713        }
3714
3715        /* drange filter, makes sense only with devid filter */
3716        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3717            chunk_drange_filter(leaf, chunk, bargs)) {
3718                return 0;
3719        }
3720
3721        /* vrange filter */
3722        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3723            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3724                return 0;
3725        }
3726
3727        /* stripes filter */
3728        if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3729            chunk_stripes_range_filter(leaf, chunk, bargs)) {
3730                return 0;
3731        }
3732
3733        /* soft profile changing mode */
3734        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3735            chunk_soft_convert_filter(chunk_type, bargs)) {
3736                return 0;
3737        }
3738
3739        /*
3740         * limited by count, must be the last filter
3741         */
3742        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3743                if (bargs->limit == 0)
3744                        return 0;
3745                else
3746                        bargs->limit--;
3747        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3748                /*
3749                 * Same logic as the 'limit' filter; the minimum cannot be
3750                 * determined here because we do not have the global information
3751                 * about the count of all chunks that satisfy the filters.
3752                 */
3753                if (bargs->limit_max == 0)
3754                        return 0;
3755                else
3756                        bargs->limit_max--;
3757        }
3758
3759        return 1;
3760}
3761
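    /*
     * Core balance loop.  The chunk tree is walked twice, from the highest
     * offset down: a first "counting" pass only updates the expected chunk
     * statistics, and the second pass relocates every chunk that passes the
     * filters, allocating a data chunk up front if needed so the data raid
     * profile is not lost.
     */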
3762static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3763{
3764        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3765        struct btrfs_root *chunk_root = fs_info->chunk_root;
3766        u64 chunk_type;
3767        struct btrfs_chunk *chunk;
3768        struct btrfs_path *path = NULL;
3769        struct btrfs_key key;
3770        struct btrfs_key found_key;
3771        struct extent_buffer *leaf;
3772        int slot;
3773        int ret;
3774        int enospc_errors = 0;
3775        bool counting = true;
3776        /* The single value limit and min/max limits use the same bytes in the args */
3777        u64 limit_data = bctl->data.limit;
3778        u64 limit_meta = bctl->meta.limit;
3779        u64 limit_sys = bctl->sys.limit;
3780        u32 count_data = 0;
3781        u32 count_meta = 0;
3782        u32 count_sys = 0;
3783        int chunk_reserved = 0;
3784
3785        path = btrfs_alloc_path();
3786        if (!path) {
3787                ret = -ENOMEM;
3788                goto error;
3789        }
3790
3791        /* zero out stat counters */
3792        spin_lock(&fs_info->balance_lock);
3793        memset(&bctl->stat, 0, sizeof(bctl->stat));
3794        spin_unlock(&fs_info->balance_lock);
3795again:
3796        if (!counting) {
3797                /*
3798                 * The single value limit and min/max limits use the same bytes
3799                 * in the args, so restore the saved values before the real pass.
3800                 */
3801                bctl->data.limit = limit_data;
3802                bctl->meta.limit = limit_meta;
3803                bctl->sys.limit = limit_sys;
3804        }
3805        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3806        key.offset = (u64)-1;
3807        key.type = BTRFS_CHUNK_ITEM_KEY;
3808
3809        while (1) {
3810                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3811                    atomic_read(&fs_info->balance_cancel_req)) {
3812                        ret = -ECANCELED;
3813                        goto error;
3814                }
3815
3816                mutex_lock(&fs_info->reclaim_bgs_lock);
3817                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3818                if (ret < 0) {
3819                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3820                        goto error;
3821                }
3822
3823                /*
3824                 * This shouldn't happen; it means the last relocation
3825                 * failed.
3826                 */
3827                if (ret == 0)
3828                        BUG(); /* FIXME break ? */
3829
3830                ret = btrfs_previous_item(chunk_root, path, 0,
3831                                          BTRFS_CHUNK_ITEM_KEY);
3832                if (ret) {
3833                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3834                        ret = 0;
3835                        break;
3836                }
3837
3838                leaf = path->nodes[0];
3839                slot = path->slots[0];
3840                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3841
3842                if (found_key.objectid != key.objectid) {
3843                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3844                        break;
3845                }
3846
3847                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3848                chunk_type = btrfs_chunk_type(leaf, chunk);
3849
3850                if (!counting) {
3851                        spin_lock(&fs_info->balance_lock);
3852                        bctl->stat.considered++;
3853                        spin_unlock(&fs_info->balance_lock);
3854                }
3855
3856                ret = should_balance_chunk(leaf, chunk, found_key.offset);
3857
3858                btrfs_release_path(path);
3859                if (!ret) {
3860                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3861                        goto loop;
3862                }
3863
3864                if (counting) {
3865                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3866                        spin_lock(&fs_info->balance_lock);
3867                        bctl->stat.expected++;
3868                        spin_unlock(&fs_info->balance_lock);
3869
3870                        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3871                                count_data++;
3872                        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3873                                count_sys++;
3874                        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3875                                count_meta++;
3876
3877                        goto loop;
3878                }
3879
3880                /*
3881                 * Apply limit_min filter, no need to check if the LIMITS
3882                 * filter is used, limit_min is 0 by default
3883                 */
3884                if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3885                                        count_data < bctl->data.limit_min)
3886                                || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3887                                        count_meta < bctl->meta.limit_min)
3888                                || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3889                                        count_sys < bctl->sys.limit_min)) {
3890                        mutex_unlock(&fs_info->reclaim_bgs_lock);
3891                        goto loop;
3892                }
3893
3894                if (!chunk_reserved) {
3895                        /*
3896                         * We may be relocating the only data chunk we have,
3897                         * which could potentially end up losing the data
3898                         * raid profile, so let's allocate an empty one in
3899                         * advance.
3900                         */
3901                        ret = btrfs_may_alloc_data_chunk(fs_info,
3902                                                         found_key.offset);
3903                        if (ret < 0) {
3904                                mutex_unlock(&fs_info->reclaim_bgs_lock);
3905                                goto error;
3906                        } else if (ret == 1) {
3907                                chunk_reserved = 1;
3908                        }
3909                }
3910
3911                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3912                mutex_unlock(&fs_info->reclaim_bgs_lock);
3913                if (ret == -ENOSPC) {
3914                        enospc_errors++;
3915                } else if (ret == -ETXTBSY) {
3916                        btrfs_info(fs_info,
3917           "skipping relocation of block group %llu due to active swapfile",
3918                                   found_key.offset);
3919                        ret = 0;
3920                } else if (ret) {
3921                        goto error;
3922                } else {
3923                        spin_lock(&fs_info->balance_lock);
3924                        bctl->stat.completed++;
3925                        spin_unlock(&fs_info->balance_lock);
3926                }
3927loop:
3928                if (found_key.offset == 0)
3929                        break;
3930                key.offset = found_key.offset - 1;
3931        }
3932
3933        if (counting) {
3934                btrfs_release_path(path);
3935                counting = false;
3936                goto again;
3937        }
3938error:
3939        btrfs_free_path(path);
3940        if (enospc_errors) {
3941                btrfs_info(fs_info, "%d enospc errors during balance",
3942                           enospc_errors);
3943                if (!ret)
3944                        ret = -ENOSPC;
3945        }
3946
3947        return ret;
3948}
3949
3950/**
3951 * alloc_profile_is_valid - see if a given profile is valid and reduced
3952 * @flags: profile to validate
3953 * @extended: if true @flags is treated as an extended profile
3954 */
3955static int alloc_profile_is_valid(u64 flags, int extended)
3956{
3957        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3958                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
3959
3960        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3961
3962        /* 1) check that all other bits are zeroed */
3963        if (flags & ~mask)
3964                return 0;
3965
3966        /* 2) see if profile is reduced */
3967        if (flags == 0)
3968                return !extended; /* "0" is valid for usual profiles */
3969
3970        return has_single_bit_set(flags);
3971}
3972
3973static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3974{
3975        /* cancel requested || normal exit path */
3976        return atomic_read(&fs_info->balance_cancel_req) ||
3977                (atomic_read(&fs_info->balance_pause_req) == 0 &&
3978                 atomic_read(&fs_info->balance_cancel_req) == 0);
3979}
3980
3981/*
3982 * Validate target profile against allowed profiles and return true if it's OK.
3983 * Otherwise print the error message and return false.
3984 */
3985static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3986                const struct btrfs_balance_args *bargs,
3987                u64 allowed, const char *type)
3988{
3989        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3990                return true;
3991
3992        if (fs_info->sectorsize < PAGE_SIZE &&
3993                bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3994                btrfs_err(fs_info,
3995                "RAID56 is not yet supported for sectorsize %u with page size %lu",
3996                          fs_info->sectorsize, PAGE_SIZE);
3997                return false;
3998        }
3999        /* Profile is valid and does not have bits outside of the allowed set */
4000        if (alloc_profile_is_valid(bargs->target, 1) &&
4001            (bargs->target & ~allowed) == 0)
4002                return true;
4003
4004        btrfs_err(fs_info, "balance: invalid convert %s profile %s",
4005                        type, btrfs_bg_type_to_raid_name(bargs->target));
4006        return false;
4007}
4008
4009/*
4010 * Fill @buf with textual description of balance filter flags @bargs, up to
4011 * @size_buf including the terminating null. The output may be trimmed if it
4012 * does not fit into the provided buffer.
4013 */
4014static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
4015                                 u32 size_buf)
4016{
4017        int ret;
4018        u32 size_bp = size_buf;
4019        char *bp = buf;
4020        u64 flags = bargs->flags;
4021        char tmp_buf[128] = {'\0'};
4022
4023        if (!flags)
4024                return;
4025
4026#define CHECK_APPEND_NOARG(a)                                           \
4027        do {                                                            \
4028                ret = snprintf(bp, size_bp, (a));                       \
4029                if (ret < 0 || ret >= size_bp)                          \
4030                        goto out_overflow;                              \
4031                size_bp -= ret;                                         \
4032                bp += ret;                                              \
4033        } while (0)
4034
4035#define CHECK_APPEND_1ARG(a, v1)                                        \
4036        do {                                                            \
4037                ret = snprintf(bp, size_bp, (a), (v1));                 \
4038                if (ret < 0 || ret >= size_bp)                          \
4039                        goto out_overflow;                              \
4040                size_bp -= ret;                                         \
4041                bp += ret;                                              \
4042        } while (0)
4043
4044#define CHECK_APPEND_2ARG(a, v1, v2)                                    \
4045        do {                                                            \
4046                ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
4047                if (ret < 0 || ret >= size_bp)                          \
4048                        goto out_overflow;                              \
4049                size_bp -= ret;                                         \
4050                bp += ret;                                              \
4051        } while (0)
4052
4053        if (flags & BTRFS_BALANCE_ARGS_CONVERT)
4054                CHECK_APPEND_1ARG("convert=%s,",
4055                                  btrfs_bg_type_to_raid_name(bargs->target));
4056
4057        if (flags & BTRFS_BALANCE_ARGS_SOFT)
4058                CHECK_APPEND_NOARG("soft,");
4059
4060        if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
4061                btrfs_describe_block_groups(bargs->profiles, tmp_buf,
4062                                            sizeof(tmp_buf));
4063                CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
4064        }
4065
4066        if (flags & BTRFS_BALANCE_ARGS_USAGE)
4067                CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4068
4069        if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4070                CHECK_APPEND_2ARG("usage=%u..%u,",
4071                                  bargs->usage_min, bargs->usage_max);
4072
4073        if (flags & BTRFS_BALANCE_ARGS_DEVID)
4074                CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4075
4076        if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4077                CHECK_APPEND_2ARG("drange=%llu..%llu,",
4078                                  bargs->pstart, bargs->pend);
4079
4080        if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4081                CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4082                                  bargs->vstart, bargs->vend);
4083
4084        if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4085                CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4086
4087        if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4088                CHECK_APPEND_2ARG("limit=%u..%u,",
4089                                bargs->limit_min, bargs->limit_max);
4090
4091        if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4092                CHECK_APPEND_2ARG("stripes=%u..%u,",
4093                                  bargs->stripes_min, bargs->stripes_max);
4094
4095#undef CHECK_APPEND_2ARG
4096#undef CHECK_APPEND_1ARG
4097#undef CHECK_APPEND_NOARG
4098
4099out_overflow:
4100
4101        if (size_bp < size_buf)
4102                buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4103        else
4104                buf[0] = '\0';
4105}
4106
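    /*
     * Log a "balance: start/resume" message, reconstructing the equivalent
     * command line options (-f, -d..., -m..., -s...) from the current balance
     * control.
     */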
4107static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4108{
4109        u32 size_buf = 1024;
4110        char tmp_buf[192] = {'\0'};
4111        char *buf;
4112        char *bp;
4113        u32 size_bp = size_buf;
4114        int ret;
4115        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4116
4117        buf = kzalloc(size_buf, GFP_KERNEL);
4118        if (!buf)
4119                return;
4120
4121        bp = buf;
4122
4123#define CHECK_APPEND_1ARG(a, v1)                                        \
4124        do {                                                            \
4125                ret = snprintf(bp, size_bp, (a), (v1));                 \
4126                if (ret < 0 || ret >= size_bp)                          \
4127                        goto out_overflow;                              \
4128                size_bp -= ret;                                         \
4129                bp += ret;                                              \
4130        } while (0)
4131
4132        if (bctl->flags & BTRFS_BALANCE_FORCE)
4133                CHECK_APPEND_1ARG("%s", "-f ");
4134
4135        if (bctl->flags & BTRFS_BALANCE_DATA) {
4136                describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4137                CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4138        }
4139
4140        if (bctl->flags & BTRFS_BALANCE_METADATA) {
4141                describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4142                CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4143        }
4144
4145        if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4146                describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4147                CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4148        }
4149
4150#undef CHECK_APPEND_1ARG
4151
4152out_overflow:
4153
4154        if (size_bp < size_buf)
4155                buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4156        btrfs_info(fs_info, "balance: %s %s",
4157                   (bctl->flags & BTRFS_BALANCE_RESUME) ?
4158                   "resume" : "start", buf);
4159
4160        kfree(buf);
4161}
4162
4163/*
4164 * Should be called with the balance mutex held.
4165 */
4166int btrfs_balance(struct btrfs_fs_info *fs_info,
4167                  struct btrfs_balance_control *bctl,
4168                  struct btrfs_ioctl_balance_args *bargs)
4169{
4170        u64 meta_target, data_target;
4171        u64 allowed;
4172        int mixed = 0;
4173        int ret;
4174        u64 num_devices;
4175        unsigned seq;
4176        bool reducing_redundancy;
4177        int i;
4178
4179        if (btrfs_fs_closing(fs_info) ||
4180            atomic_read(&fs_info->balance_pause_req) ||
4181            btrfs_should_cancel_balance(fs_info)) {
4182                ret = -EINVAL;
4183                goto out;
4184        }
4185
4186        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4187        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4188                mixed = 1;
4189
4190        /*
4191         * In case of mixed groups both data and meta should be picked,
4192         * and identical options should be given for both of them.
4193         */
4194        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4195        if (mixed && (bctl->flags & allowed)) {
4196                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4197                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4198                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4199                        btrfs_err(fs_info,
4200          "balance: mixed groups data and metadata options must be the same");
4201                        ret = -EINVAL;
4202                        goto out;
4203                }
4204        }
4205
4206        /*
4207         * rw_devices will not change at the moment, as device add/delete/replace
4208         * operations are exclusive with balance
4209         */
4210        num_devices = fs_info->fs_devices->rw_devices;
4211
4212        /*
4213         * SINGLE profile on-disk has no profile bit, but in-memory we have a
4214         * special bit for it, to make it easier to distinguish.  Thus we need
4215         * to set it manually, or balance would refuse the profile.
4216         */
4217        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4218        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4219                if (num_devices >= btrfs_raid_array[i].devs_min)
4220                        allowed |= btrfs_raid_array[i].bg_flag;
4221
4222        if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4223            !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4224            !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4225                ret = -EINVAL;
4226                goto out;
4227        }
4228
4229        /*
4230         * Allow reducing metadata or system integrity only if force is set for
4231         * profiles with redundancy (copies, parity)
4232         */
4233        allowed = 0;
4234        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4235                if (btrfs_raid_array[i].ncopies >= 2 ||
4236                    btrfs_raid_array[i].tolerated_failures >= 1)
4237                        allowed |= btrfs_raid_array[i].bg_flag;
4238        }
4239        do {
4240                seq = read_seqbegin(&fs_info->profiles_lock);
4241
4242                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4243                     (fs_info->avail_system_alloc_bits & allowed) &&
4244                     !(bctl->sys.target & allowed)) ||
4245                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4246                     (fs_info->avail_metadata_alloc_bits & allowed) &&
4247                     !(bctl->meta.target & allowed)))
4248                        reducing_redundancy = true;
4249                else
4250                        reducing_redundancy = false;
4251
4252                /* if we're not converting, the target field is uninitialized */
4253                meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4254                        bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4255                data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4256                        bctl->data.target : fs_info->avail_data_alloc_bits;
4257        } while (read_seqretry(&fs_info->profiles_lock, seq));
4258
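        /*
         * The seqlock loop above retries until it sees a consistent snapshot
         * of the avail_*_alloc_bits, so reducing_redundancy and the meta/data
         * targets are derived from the same profile state. Reducing redundancy
         * here means converting metadata or system chunks from a profile in
         * 'allowed' (i.e. one with copies or parity) to a target outside it.
         */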
4259        if (reducing_redundancy) {
4260                if (bctl->flags & BTRFS_BALANCE_FORCE) {
4261                        btrfs_info(fs_info,
4262                           "balance: force reducing metadata redundancy");
4263                } else {
4264                        btrfs_err(fs_info,
4265        "balance: reduces metadata redundancy, use --force if you want this");
4266                        ret = -EINVAL;
4267                        goto out;
4268                }
4269        }
4270
4271        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4272                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4273                btrfs_warn(fs_info,
4274        "balance: metadata profile %s has lower redundancy than data profile %s",
4275                                btrfs_bg_type_to_raid_name(meta_target),
4276                                btrfs_bg_type_to_raid_name(data_target));
4277        }
4278
4279        ret = insert_balance_item(fs_info, bctl);
4280        if (ret && ret != -EEXIST)
4281                goto out;
4282
4283        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4284                BUG_ON(ret == -EEXIST);
4285                BUG_ON(fs_info->balance_ctl);
4286                spin_lock(&fs_info->balance_lock);
4287                fs_info->balance_ctl = bctl;
4288                spin_unlock(&fs_info->balance_lock);
4289        } else {
4290                BUG_ON(ret != -EEXIST);
4291                spin_lock(&fs_info->balance_lock);
4292                update_balance_args(bctl);
4293                spin_unlock(&fs_info->balance_lock);
4294        }
4295
4296        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4297        set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4298        describe_balance_start_or_resume(fs_info);
4299        mutex_unlock(&fs_info->balance_mutex);
4300
4301        ret = __btrfs_balance(fs_info);
4302
4303        mutex_lock(&fs_info->balance_mutex);
4304        if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4305                btrfs_info(fs_info, "balance: paused");
4306        /*
4307         * Balance can be canceled by:
4308         *
4309         * - Regular cancel request
4310         *   Then ret == -ECANCELED and balance_cancel_req > 0
4311         *
4312         * - Fatal signal to "btrfs" process
4313         *   Either the signal is caught by wait_reserve_ticket() and the
4314         *   callers get -EINTR, or it is caught by
4315         *   btrfs_should_cancel_balance() and they get -ECANCELED.
4316         *   Either way, in this case balance_cancel_req = 0, and
4317         *   ret == -EINTR or ret == -ECANCELED.
4318         *
4319         * So here we only check the return value to catch canceled balance.
4320         */
4321        else if (ret == -ECANCELED || ret == -EINTR)
4322                btrfs_info(fs_info, "balance: canceled");
4323        else
4324                btrfs_info(fs_info, "balance: ended with status: %d", ret);
4325
4326        clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4327
4328        if (bargs) {
4329                memset(bargs, 0, sizeof(*bargs));
4330                btrfs_update_ioctl_balance_args(fs_info, bargs);
4331        }
4332
4333        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4334            balance_need_close(fs_info)) {
4335                reset_balance_state(fs_info);
4336                btrfs_exclop_finish(fs_info);
4337        }
4338
4339        wake_up(&fs_info->balance_wait_q);
4340
4341        return ret;
4342out:
4343        if (bctl->flags & BTRFS_BALANCE_RESUME)
4344                reset_balance_state(fs_info);
4345        else
4346                kfree(bctl);
4347        btrfs_exclop_finish(fs_info);
4348
4349        return ret;
4350}
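/*
 * Note on the calling convention above: btrfs_balance() is entered with
 * fs_info->balance_mutex held and with the BTRFS_EXCLOP_BALANCE exclusive
 * operation already started by the caller. It takes ownership of bctl,
 * either installing it as fs_info->balance_ctl or freeing it (and finishing
 * the exclusive op) on the error paths.
 */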
4351
4352static int balance_kthread(void *data)
4353{
4354        struct btrfs_fs_info *fs_info = data;
4355        int ret = 0;
4356
4357        mutex_lock(&fs_info->balance_mutex);
4358        if (fs_info->balance_ctl)
4359                ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4360        mutex_unlock(&fs_info->balance_mutex);
4361
4362        return ret;
4363}
4364
4365int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4366{
4367        struct task_struct *tsk;
4368
4369        mutex_lock(&fs_info->balance_mutex);
4370        if (!fs_info->balance_ctl) {
4371                mutex_unlock(&fs_info->balance_mutex);
4372                return 0;
4373        }
4374        mutex_unlock(&fs_info->balance_mutex);
4375
4376        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4377                btrfs_info(fs_info, "balance: resume skipped");
4378                return 0;
4379        }
4380
4381        /*
4382         * A ro->rw remount sequence should continue with the paused balance
4383         * regardless of who paused it (the system or the user), so set the
4384         * resume flag.
4385         */
4386        spin_lock(&fs_info->balance_lock);
4387        fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4388        spin_unlock(&fs_info->balance_lock);
4389
4390        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4391        return PTR_ERR_OR_ZERO(tsk);
4392}
4393
4394int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4395{
4396        struct btrfs_balance_control *bctl;
4397        struct btrfs_balance_item *item;
4398        struct btrfs_disk_balance_args disk_bargs;
4399        struct btrfs_path *path;
4400        struct extent_buffer *leaf;
4401        struct btrfs_key key;
4402        int ret;
4403
4404        path = btrfs_alloc_path();
4405        if (!path)
4406                return -ENOMEM;
4407
4408        key.objectid = BTRFS_BALANCE_OBJECTID;
4409        key.type = BTRFS_TEMPORARY_ITEM_KEY;
4410        key.offset = 0;
4411
4412        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4413        if (ret < 0)
4414                goto out;
4415        if (ret > 0) { /* ret = -ENOENT; */
4416                ret = 0;
4417                goto out;
4418        }
4419
4420        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4421        if (!bctl) {
4422                ret = -ENOMEM;
4423                goto out;
4424        }
4425
4426        leaf = path->nodes[0];
4427        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4428
4429        bctl->flags = btrfs_balance_flags(leaf, item);
4430        bctl->flags |= BTRFS_BALANCE_RESUME;
4431
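        /*
         * The on-disk balance item stores the per-type args in little-endian
         * disk format; convert each of them to the in-memory CPU-order form
         * used by the balance code.
         */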
4432        btrfs_balance_data(leaf, item, &disk_bargs);
4433        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4434        btrfs_balance_meta(leaf, item, &disk_bargs);
4435        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4436        btrfs_balance_sys(leaf, item, &disk_bargs);
4437        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4438
4439        /*
4440         * This should never happen, as the paused balance state is recovered
4441         * during mount without any chance for other exclusive ops to collide.
4442         *
4443         * This gives the exclusive op status to balance and keeps it in a
4444         * paused state until user intervention (cancel or umount). If the
4445         * ownership cannot be assigned, show a message but do not fail. The
4446         * balance is in a paused state and must have fs_info::balance_ctl
4447         * properly set up.
4448         */
4449        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4450                btrfs_warn(fs_info,
4451        "balance: cannot set exclusive op status, resume manually");
4452
4453        btrfs_release_path(path);
4454
4455        mutex_lock(&fs_info->balance_mutex);
4456        BUG_ON(fs_info->balance_ctl);
4457        spin_lock(&fs_info->balance_lock);
4458        fs_info->balance_ctl = bctl;
4459        spin_unlock(&fs_info->balance_lock);
4460        mutex_unlock(&fs_info->balance_mutex);
4461out:
4462        btrfs_free_path(path);
4463        return ret;
4464}
4465
4466int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4467{
4468        int ret = 0;
4469
4470        mutex_lock(&fs_info->balance_mutex);
4471        if (!fs_info->balance_ctl) {
4472                mutex_unlock(&fs_info->balance_mutex);
4473                return -ENOTCONN;
4474        }
4475
4476        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4477                atomic_inc(&fs_info->balance_pause_req);
4478                mutex_unlock(&fs_info->balance_mutex);
4479
4480                wait_event(fs_info->balance_wait_q,
4481                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4482
4483                mutex_lock(&fs_info->balance_mutex);
4484                /* we are good with balance_ctl ripped off from under us */
4485                BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4486                atomic_dec(&fs_info->balance_pause_req);
4487        } else {
4488                ret = -ENOTCONN;
4489        }
4490
4491        mutex_unlock(&fs_info->balance_mutex);
4492        return ret;
4493}
4494
4495int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4496{
4497        mutex_lock(&fs_info->balance_mutex);
4498        if (!fs_info->balance_ctl) {
4499                mutex_unlock(&fs_info->balance_mutex);
4500                return -ENOTCONN;
4501        }
4502
4503        /*
4504         * A paused balance with the item stored on disk can be resumed at
4505         * mount time if the mount is read-write. Otherwise it's still paused
4506         * and we must not allow cancelling as it deletes the item.
4507         */
4508        if (sb_rdonly(fs_info->sb)) {
4509                mutex_unlock(&fs_info->balance_mutex);
4510                return -EROFS;
4511        }
4512
4513        atomic_inc(&fs_info->balance_cancel_req);
4514        /*
4515         * If balance is currently running, just wait for it to finish and
4516         * return; the balance item is deleted in btrfs_balance() in that case.
4517         */
4518        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4519                mutex_unlock(&fs_info->balance_mutex);
4520                wait_event(fs_info->balance_wait_q,
4521                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4522                mutex_lock(&fs_info->balance_mutex);
4523        } else {
4524                mutex_unlock(&fs_info->balance_mutex);
4525                /*
4526                 * The lock was released to allow other waiters to continue;
4527                 * reexamine the status after reacquiring it.
4528                 */
4529                mutex_lock(&fs_info->balance_mutex);
4530
4531                if (fs_info->balance_ctl) {
4532                        reset_balance_state(fs_info);
4533                        btrfs_exclop_finish(fs_info);
4534                        btrfs_info(fs_info, "balance: canceled");
4535                }
4536        }
4537
4538        BUG_ON(fs_info->balance_ctl ||
4539                test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4540        atomic_dec(&fs_info->balance_cancel_req);
4541        mutex_unlock(&fs_info->balance_mutex);
4542        return 0;
4543}
4544
4545int btrfs_uuid_scan_kthread(void *data)
4546{
4547        struct btrfs_fs_info *fs_info = data;
4548        struct btrfs_root *root = fs_info->tree_root;
4549        struct btrfs_key key;
4550        struct btrfs_path *path = NULL;
4551        int ret = 0;
4552        struct extent_buffer *eb;
4553        int slot;
4554        struct btrfs_root_item root_item;
4555        u32 item_size;
4556        struct btrfs_trans_handle *trans = NULL;
4557        bool closing = false;
4558
4559        path = btrfs_alloc_path();
4560        if (!path) {
4561                ret = -ENOMEM;
4562                goto out;
4563        }
4564
4565        key.objectid = 0;
4566        key.type = BTRFS_ROOT_ITEM_KEY;
4567        key.offset = 0;
4568
4569        while (1) {
4570                if (btrfs_fs_closing(fs_info)) {
4571                        closing = true;
4572                        break;
4573                }
4574                ret = btrfs_search_forward(root, &key, path,
4575                                BTRFS_OLDEST_GENERATION);
4576                if (ret) {
4577                        if (ret > 0)
4578                                ret = 0;
4579                        break;
4580                }
4581
4582                if (key.type != BTRFS_ROOT_ITEM_KEY ||
4583                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4584                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4585                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4586                        goto skip;
4587
4588                eb = path->nodes[0];
4589                slot = path->slots[0];
4590                item_size = btrfs_item_size_nr(eb, slot);
4591                if (item_size < sizeof(root_item))
4592                        goto skip;
4593
4594                read_extent_buffer(eb, &root_item,
4595                                   btrfs_item_ptr_offset(eb, slot),
4596                                   (int)sizeof(root_item));
4597                if (btrfs_root_refs(&root_item) == 0)
4598                        goto skip;
4599
4600                if (!btrfs_is_empty_uuid(root_item.uuid) ||
4601                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
4602                        if (trans)
4603                                goto update_tree;
4604
4605                        btrfs_release_path(path);
4606                        /*
4607                         * 1 - subvol uuid item
4608                         * 1 - received_subvol uuid item
4609                         */
4610                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4611                        if (IS_ERR(trans)) {
4612                                ret = PTR_ERR(trans);
4613                                break;
4614                        }
4615                        continue;
4616                } else {
4617                        goto skip;
4618                }
4619update_tree:
4620                btrfs_release_path(path);
4621                if (!btrfs_is_empty_uuid(root_item.uuid)) {
4622                        ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4623                                                  BTRFS_UUID_KEY_SUBVOL,
4624                                                  key.objectid);
4625                        if (ret < 0) {
4626                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4627                                        ret);
4628                                break;
4629                        }
4630                }
4631
4632                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4633                        ret = btrfs_uuid_tree_add(trans,
4634                                                  root_item.received_uuid,
4635                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4636                                                  key.objectid);
4637                        if (ret < 0) {
4638                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4639                                        ret);
4640                                break;
4641                        }
4642                }
4643
4644skip:
4645                btrfs_release_path(path);
4646                if (trans) {
4647                        ret = btrfs_end_transaction(trans);
4648                        trans = NULL;
4649                        if (ret)
4650                                break;
4651                }
4652
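                /*
                 * Advance to the next possible key for the forward search.
                 * Keys sort by (objectid, type, offset): bump the offset
                 * first, then fall back to resetting the lower members and
                 * moving the type up to BTRFS_ROOT_ITEM_KEY or incrementing
                 * the objectid. Once the objectid can no longer grow, the
                 * whole tree has been scanned.
                 */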
4653                if (key.offset < (u64)-1) {
4654                        key.offset++;
4655                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4656                        key.offset = 0;
4657                        key.type = BTRFS_ROOT_ITEM_KEY;
4658                } else if (key.objectid < (u64)-1) {
4659                        key.offset = 0;
4660                        key.type = BTRFS_ROOT_ITEM_KEY;
4661                        key.objectid++;
4662                } else {
4663                        break;
4664                }
4665                cond_resched();
4666        }
4667
4668out:
4669        btrfs_free_path(path);
4670        if (trans && !IS_ERR(trans))
4671                btrfs_end_transaction(trans);
4672        if (ret)
4673                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4674        else if (!closing)
4675                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4676        up(&fs_info->uuid_tree_rescan_sem);
4677        return 0;
4678}
4679
4680int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4681{
4682        struct btrfs_trans_handle *trans;
4683        struct btrfs_root *tree_root = fs_info->tree_root;
4684        struct btrfs_root *uuid_root;
4685        struct task_struct *task;
4686        int ret;
4687
4688        /*
4689         * 1 - root node
4690         * 1 - root item
4691         */
4692        trans = btrfs_start_transaction(tree_root, 2);
4693        if (IS_ERR(trans))
4694                return PTR_ERR(trans);
4695
4696        uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4697        if (IS_ERR(uuid_root)) {
4698                ret = PTR_ERR(uuid_root);
4699                btrfs_abort_transaction(trans, ret);
4700                btrfs_end_transaction(trans);
4701                return ret;
4702        }
4703
4704        fs_info->uuid_root = uuid_root;
4705
4706        ret = btrfs_commit_transaction(trans);
4707        if (ret)
4708                return ret;
4709
4710        down(&fs_info->uuid_tree_rescan_sem);
4711        task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4712        if (IS_ERR(task)) {
4713                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4714                btrfs_warn(fs_info, "failed to start uuid_scan task");
4715                up(&fs_info->uuid_tree_rescan_sem);
4716                return PTR_ERR(task);
4717        }
4718
4719        return 0;
4720}
4721
4722/*
4723 * Shrinking a device means finding all of the device extents past
4724 * the new size, and then following the back refs to the chunks.
4725 * The chunk relocation code actually frees the device extents.
4726 */
4727int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4728{
4729        struct btrfs_fs_info *fs_info = device->fs_info;
4730        struct btrfs_root *root = fs_info->dev_root;
4731        struct btrfs_trans_handle *trans;
4732        struct btrfs_dev_extent *dev_extent = NULL;
4733        struct btrfs_path *path;
4734        u64 length;
4735        u64 chunk_offset;
4736        int ret;
4737        int slot;
4738        int failed = 0;
4739        bool retried = false;
4740        struct extent_buffer *l;
4741        struct btrfs_key key;
4742        struct btrfs_super_block *super_copy = fs_info->super_copy;
4743        u64 old_total = btrfs_super_total_bytes(super_copy);
4744        u64 old_size = btrfs_device_get_total_bytes(device);
4745        u64 diff;
4746        u64 start;
4747
4748        new_size = round_down(new_size, fs_info->sectorsize);
4749        start = new_size;
4750        diff = round_down(old_size - new_size, fs_info->sectorsize);
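        /*
         * For example, shrinking a device from 100 GiB to 60 GiB on a 4 KiB
         * sector size filesystem gives start = 60 GiB and diff = 40 GiB;
         * every device extent reaching past 'start' is relocated below.
         */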
4751
4752        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4753                return -EINVAL;
4754
4755        path = btrfs_alloc_path();
4756        if (!path)
4757                return -ENOMEM;
4758
4759        path->reada = READA_BACK;
4760
4761        trans = btrfs_start_transaction(root, 0);
4762        if (IS_ERR(trans)) {
4763                btrfs_free_path(path);
4764                return PTR_ERR(trans);
4765        }
4766
4767        mutex_lock(&fs_info->chunk_mutex);
4768
4769        btrfs_device_set_total_bytes(device, new_size);
4770        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4771                device->fs_devices->total_rw_bytes -= diff;
4772                atomic64_sub(diff, &fs_info->free_chunk_space);
4773        }
4774
4775        /*
4776         * Once the device's size has been set to the new size, ensure all
4777         * in-memory chunks are synced to disk so that the loop below sees them
4778         * and relocates them accordingly.
4779         */
4780        if (contains_pending_extent(device, &start, diff)) {
4781                mutex_unlock(&fs_info->chunk_mutex);
4782                ret = btrfs_commit_transaction(trans);
4783                if (ret)
4784                        goto done;
4785        } else {
4786                mutex_unlock(&fs_info->chunk_mutex);
4787                btrfs_end_transaction(trans);
4788        }
4789
4790again:
4791        key.objectid = device->devid;
4792        key.offset = (u64)-1;
4793        key.type = BTRFS_DEV_EXTENT_KEY;
4794
4795        do {
4796                mutex_lock(&fs_info->reclaim_bgs_lock);
4797                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4798                if (ret < 0) {
4799                        mutex_unlock(&fs_info->reclaim_bgs_lock);
4800                        goto done;
4801                }
4802
4803                ret = btrfs_previous_item(root, path, 0, key.type);
4804                if (ret) {
4805                        mutex_unlock(&fs_info->reclaim_bgs_lock);
4806                        if (ret < 0)
4807                                goto done;
4808                        ret = 0;
4809                        btrfs_release_path(path);
4810                        break;
4811                }
4812
4813                l = path->nodes[0];
4814                slot = path->slots[0];
4815                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4816
4817                if (key.objectid != device->devid) {
4818                        mutex_unlock(&fs_info->reclaim_bgs_lock);
4819                        btrfs_release_path(path);
4820                        break;
4821                }
4822
4823                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4824                length = btrfs_dev_extent_length(l, dev_extent);
4825
4826                if (key.offset + length <= new_size) {
4827                        mutex_unlock(&fs_info->reclaim_bgs_lock);
4828                        btrfs_release_path(path);
4829                        break;
4830                }
4831
4832                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4833                btrfs_release_path(path);
4834
4835                /*
4836                 * We may be relocating the only data chunk we have,
4837                 * which could potentially end up losing the data's
4838                 * raid profile, so let's allocate an empty one in
4839                 * advance.
4840                 */
4841                ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4842                if (ret < 0) {
4843                        mutex_unlock(&fs_info->reclaim_bgs_lock);
4844                        goto done;
4845                }
4846
4847                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4848                mutex_unlock(&fs_info->reclaim_bgs_lock);
4849                if (ret == -ENOSPC) {
4850                        failed++;
4851                } else if (ret) {
4852                        if (ret == -ETXTBSY) {
4853                                btrfs_warn(fs_info,
4854                   "could not shrink block group %llu due to active swapfile",
4855                                           chunk_offset);
4856                        }
4857                        goto done;
4858                }
4859        } while (key.offset-- > 0);
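        /*
         * The loop above walked the device's extent items backwards, starting
         * from offset (u64)-1, and relocated every chunk whose device extent
         * still reaches past the new size. Chunks that failed with -ENOSPC
         * are counted and retried one more time below, since the relocations
         * that did succeed may have freed up space for them.
         */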
4860
4861        if (failed && !retried) {
4862                failed = 0;
4863                retried = true;
4864                goto again;
4865        } else if (failed && retried) {
4866                ret = -ENOSPC;
4867                goto done;
4868        }
4869
4870        /* Shrinking succeeded, else we would be at "done". */
4871        trans = btrfs_start_transaction(root, 0);
4872        if (IS_ERR(trans)) {
4873                ret = PTR_ERR(trans);
4874                goto done;
4875        }
4876
4877        mutex_lock(&fs_info->chunk_mutex);
4878        /* Clear all state bits beyond the shrunk device size */
4879        clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4880                          CHUNK_STATE_MASK);
4881
4882        btrfs_device_set_disk_total_bytes(device, new_size);
4883        if (list_empty(&device->post_commit_list))
4884                list_add_tail(&device->post_commit_list,
4885                              &trans->transaction->dev_update_list);
4886
4887        WARN_ON(diff > old_total);
4888        btrfs_set_super_total_bytes(super_copy,
4889                        round_down(old_total - diff, fs_info->sectorsize));
4890        mutex_unlock(&fs_info->chunk_mutex);
4891
4892        /* Now btrfs_update_device() will change the on-disk size. */
4893        ret = btrfs_update_device(trans, device);
4894        if (ret < 0) {
4895                btrfs_abort_transaction(trans, ret);
4896                btrfs_end_transaction(trans);
4897        } else {
4898                ret = btrfs_commit_transaction(trans);
4899        }
4900done:
4901        btrfs_free_path(path);
4902        if (ret) {
4903                mutex_lock(&fs_info->chunk_mutex);
4904                btrfs_device_set_total_bytes(device, old_size);
4905                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4906                        device->fs_devices->total_rw_bytes += diff;
4907                atomic64_add(diff, &fs_info->free_chunk_space);
4908                mutex_unlock(&fs_info->chunk_mutex);
4909        }
4910        return ret;
4911}
4912
4913static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4914                           struct btrfs_key *key,
4915                           struct btrfs_chunk *chunk, int item_size)
4916{
4917        struct btrfs_super_block *super_copy = fs_info->super_copy;
4918        struct btrfs_disk_key disk_key;
4919        u32 array_size;
4920        u8 *ptr;
4921
4922        lockdep_assert_held(&fs_info->chunk_mutex);
4923
4924        array_size = btrfs_super_sys_array_size(super_copy);
4925        if (array_size + item_size + sizeof(disk_key)
4926                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
4927                return -EFBIG;
4928
4929        ptr = super_copy->sys_chunk_array + array_size;
4930        btrfs_cpu_key_to_disk(&disk_key, key);
4931        memcpy(ptr, &disk_key, sizeof(disk_key));
4932        ptr += sizeof(disk_key);
4933        memcpy(ptr, chunk, item_size);
4934        item_size += sizeof(disk_key);
4935        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4936
4937        return 0;
4938}
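/*
 * Illustrative sketch, not part of the original source: the superblock's
 * sys_chunk_array filled by btrfs_add_system_chunk() above is a packed
 * sequence of (struct btrfs_disk_key, struct btrfs_chunk + stripes) pairs.
 * A hypothetical helper walking an already-validated array could look
 * roughly like this:
 */
static void __maybe_unused walk_sys_chunk_array_example(
                                        struct btrfs_super_block *sb)
{
        u32 array_size = btrfs_super_sys_array_size(sb);
        u8 *ptr = sb->sys_chunk_array;
        u8 *end = ptr + array_size;

        while (ptr < end) {
                struct btrfs_chunk *chunk;

                /* Each entry starts with the chunk's key in disk format. */
                ptr += sizeof(struct btrfs_disk_key);
                chunk = (struct btrfs_chunk *)ptr;
                /* The chunk item size depends on its stripe count. */
                ptr += btrfs_chunk_item_size(
                                btrfs_stack_chunk_num_stripes(chunk));
        }
}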
4939
4940/*
4941 * sort the devices in descending order by max_avail, total_avail
4942 */
4943static int btrfs_cmp_device_info(const void *a, const void *b)
4944{
4945        const struct btrfs_device_info *di_a = a;
4946        const struct btrfs_device_info *di_b = b;
4947
4948        if (di_a->max_avail > di_b->max_avail)
4949                return -1;
4950        if (di_a->max_avail < di_b->max_avail)
4951                return 1;
4952        if (di_a->total_avail > di_b->total_avail)
4953                return -1;
4954        if (di_a->total_avail < di_b->total_avail)
4955                return 1;
4956        return 0;
4957}
4958
4959static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4960{
4961        if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4962                return;
4963
4964        btrfs_set_fs_incompat(info, RAID56);
4965}
4966
4967static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4968{
4969        if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4970                return;
4971
4972        btrfs_set_fs_incompat(info, RAID1C34);
4973}
4974
4975/*
4976 * Structure used internally for btrfs_alloc_chunk(). Wraps the needed
4977 * parameters.
4978 */
4979struct alloc_chunk_ctl {
4980        u64 start;
4981        u64 type;
4982        /* Total number of stripes to allocate */
4983        int num_stripes;
4984        /* sub_stripes info for map */
4985        int sub_stripes;
4986        /* Stripes per device */
4987        int dev_stripes;
4988        /* Maximum number of devices to use */
4989        int devs_max;
4990        /* Minimum number of devices to use */
4991        int devs_min;
4992        /* ndevs has to be a multiple of this */
4993        int devs_increment;
4994        /* Number of copies */
4995        int ncopies;
4996        /* Number of stripes worth of bytes to store parity information */
4997        int nparity;
4998        u64 max_stripe_size;
4999        u64 max_chunk_size;
5000        u64 dev_extent_min;
5001        u64 stripe_size;
5002        u64 chunk_size;
5003        int ndevs;
5004};
5005
5006static void init_alloc_chunk_ctl_policy_regular(
5007                                struct btrfs_fs_devices *fs_devices,
5008                                struct alloc_chunk_ctl *ctl)
5009{
5010        u64 type = ctl->type;
5011
5012        if (type & BTRFS_BLOCK_GROUP_DATA) {
5013                ctl->max_stripe_size = SZ_1G;
5014                ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
5015        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5016                /* For larger filesystems, use larger metadata chunks */
5017                if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
5018                        ctl->max_stripe_size = SZ_1G;
5019                else
5020                        ctl->max_stripe_size = SZ_256M;
5021                ctl->max_chunk_size = ctl->max_stripe_size;
5022        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5023                ctl->max_stripe_size = SZ_32M;
5024                ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5025                ctl->devs_max = min_t(int, ctl->devs_max,
5026                                      BTRFS_MAX_DEVS_SYS_CHUNK);
5027        } else {
5028                BUG();
5029        }
5030
5031        /* We don't want a chunk larger than 10% of writable space */
5032        ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
5033                                  ctl->max_chunk_size);
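        /*
         * div_factor(x, 1) computes x * 1 / 10, so the clamp above works out
         * to 10% of the currently writable bytes.
         */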
5034        ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
5035}
5036
5037static void init_alloc_chunk_ctl_policy_zoned(
5038                                      struct btrfs_fs_devices *fs_devices,
5039                                      struct alloc_chunk_ctl *ctl)
5040{
5041        u64 zone_size = fs_devices->fs_info->zone_size;
5042        u64 limit;
5043        int min_num_stripes = ctl->devs_min * ctl->dev_stripes;
5044        int min_data_stripes = (min_num_stripes - ctl->nparity) / ctl->ncopies;
5045        u64 min_chunk_size = min_data_stripes * zone_size;
5046        u64 type = ctl->type;
5047
5048        ctl->max_stripe_size = zone_size;
5049        if (type & BTRFS_BLOCK_GROUP_DATA) {
5050                ctl->max_chunk_size = round_down(BTRFS_MAX_DATA_CHUNK_SIZE,
5051                                                 zone_size);
5052        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
5053                ctl->max_chunk_size = ctl->max_stripe_size;
5054        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
5055                ctl->max_chunk_size = 2 * ctl->max_stripe_size;
5056                ctl->devs_max = min_t(int, ctl->devs_max,
5057                                      BTRFS_MAX_DEVS_SYS_CHUNK);
5058        } else {
5059                BUG();
5060        }
5061
5062        /* We don't want a chunk larger than 10% of writable space */
5063        limit = max(round_down(div_factor(fs_devices->total_rw_bytes, 1),
5064                               zone_size),
5065                    min_chunk_size);
5066        ctl->max_chunk_size = min(limit, ctl->max_chunk_size);
5067        ctl->dev_extent_min = zone_size * ctl->dev_stripes;
5068}
5069
5070static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
5071                                 struct alloc_chunk_ctl *ctl)
5072{
5073        int index = btrfs_bg_flags_to_raid_index(ctl->type);
5074
5075        ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
5076        ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
5077        ctl->devs_max = btrfs_raid_array[index].devs_max;
5078        if (!ctl->devs_max)
5079                ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
5080        ctl->devs_min = btrfs_raid_array[index].devs_min;
5081        ctl->devs_increment = btrfs_raid_array[index].devs_increment;
5082        ctl->ncopies = btrfs_raid_array[index].ncopies;
5083        ctl->nparity = btrfs_raid_array[index].nparity;
5084        ctl->ndevs = 0;
5085
5086        switch (fs_devices->chunk_alloc_policy) {
5087        case BTRFS_CHUNK_ALLOC_REGULAR:
5088                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5089                break;
5090        case BTRFS_CHUNK_ALLOC_ZONED:
5091                init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl);
5092                break;
5093        default:
5094                BUG();
5095        }
5096}
5097
5098static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5099                              struct alloc_chunk_ctl *ctl,
5100                              struct btrfs_device_info *devices_info)
5101{
5102        struct btrfs_fs_info *info = fs_devices->fs_info;
5103        struct btrfs_device *device;
5104        u64 total_avail;
5105        u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5106        int ret;
5107        int ndevs = 0;
5108        u64 max_avail;
5109        u64 dev_offset;
5110
5111        /*
5112         * in the first pass through the devices list, we gather information
5113         * about the available holes on each device.
5114         */
5115        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5116                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5117                        WARN(1, KERN_ERR
5118                               "BTRFS: read-only device in alloc_list\n");
5119                        continue;
5120                }
5121
5122                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5123                                        &device->dev_state) ||
5124                    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5125                        continue;
5126
5127                if (device->total_bytes > device->bytes_used)
5128                        total_avail = device->total_bytes - device->bytes_used;
5129                else
5130                        total_avail = 0;
5131
5132                /* If there is no space on this device, skip it. */
5133                if (total_avail < ctl->dev_extent_min)
5134                        continue;
5135
5136                ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5137                                           &max_avail);
5138                if (ret && ret != -ENOSPC)
5139                        return ret;
5140
5141                if (ret == 0)
5142                        max_avail = dev_extent_want;
5143
5144                if (max_avail < ctl->dev_extent_min) {
5145                        if (btrfs_test_opt(info, ENOSPC_DEBUG))
5146                                btrfs_debug(info,
5147                        "%s: devid %llu has no free space, have=%llu want=%llu",
5148                                            __func__, device->devid, max_avail,
5149                                            ctl->dev_extent_min);
5150                        continue;
5151                }
5152
5153                if (ndevs == fs_devices->rw_devices) {
5154                        WARN(1, "%s: found more than %llu devices\n",
5155                             __func__, fs_devices->rw_devices);
5156                        break;
5157                }
5158                devices_info[ndevs].dev_offset = dev_offset;
5159                devices_info[ndevs].max_avail = max_avail;
5160                devices_info[ndevs].total_avail = total_avail;
5161                devices_info[ndevs].dev = device;
5162                ++ndevs;
5163        }
5164        ctl->ndevs = ndevs;
5165
5166        /*
5167         * now sort the devices by hole size / available space
5168         */
5169        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5170             btrfs_cmp_device_info, NULL);
5171
5172        return 0;
5173}
5174
5175static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5176                                      struct btrfs_device_info *devices_info)
5177{
5178        /* Number of stripes that count for block group size */
5179        int data_stripes;
5180
5181        /*
5182         * The primary goal is to maximize the number of stripes, so use as
5183         * many devices as possible, even if the stripes are not maximum sized.
5184         *
5185         * The DUP profile stores more than one stripe per device, so
5186         * max_avail is the total size and we have to adjust.
5187         */
5188        ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5189                                   ctl->dev_stripes);
5190        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5191
5192        /* This will have to be fixed for RAID1 and RAID10 over more drives */
5193        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5194
5195        /*
5196         * Use the number of data stripes to figure out how big this chunk is
5197         * really going to be in terms of logical address space, and compare
5198         * that answer with the max chunk size. If it's higher, we try to
5199         * reduce stripe_size.
5200         */
5201        if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5202                /*
5203                 * Reduce stripe_size, round it up to a 16MB boundary again and
5204                 * then use it, unless it ends up being even bigger than the
5205                 * previous value we had already.
5206                 */
5207                ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5208                                                        data_stripes), SZ_16M),
5209                                       ctl->stripe_size);
5210        }
5211
5212        /* Align to BTRFS_STRIPE_LEN */
5213        ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5214        ctl->chunk_size = ctl->stripe_size * data_stripes;
5215
5216        return 0;
5217}
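/*
 * Worked example for the regular policy above (illustrative numbers): a
 * RAID0 data chunk over 4 writable devices that each have at least 1 GiB
 * free starts with stripe_size = 1 GiB and num_stripes = 4. With ncopies
 * and dev_stripes both 1 and no parity, data_stripes = 4, so the chunk
 * covers 4 GiB of logical space, 1 GiB per device, which is below the
 * 10 GiB BTRFS_MAX_DATA_CHUNK_SIZE cap (assuming 10% of the writable space
 * is at least that large) and therefore needs no reduction.
 */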
5218
5219static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl,
5220                                    struct btrfs_device_info *devices_info)
5221{
5222        u64 zone_size = devices_info[0].dev->zone_info->zone_size;
5223        /* Number of stripes that count for block group size */
5224        int data_stripes;
5225
5226        /*
5227         * It should hold because:
5228         *    dev_extent_min == dev_extent_want == zone_size * dev_stripes
5229         */
5230        ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min);
5231
5232        ctl->stripe_size = zone_size;
5233        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5234        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5235
5236        /* stripe_size is fixed in a zoned filesystem. Reduce ndevs instead. */
5237        if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5238                ctl->ndevs = div_u64(div_u64(ctl->max_chunk_size * ctl->ncopies,
5239                                             ctl->stripe_size) + ctl->nparity,
5240                                     ctl->dev_stripes);
5241                ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5242                data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5243                ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size);
5244        }
5245
5246        ctl->chunk_size = ctl->stripe_size * data_stripes;
5247
5248        return 0;
5249}
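/*
 * Worked example for the zoned policy above (illustrative numbers): with a
 * 1 GiB zone size, a RAID0 data chunk over 16 devices would initially span
 * 16 one-zone stripes, i.e. 16 GiB, exceeding the 10 GiB data chunk cap.
 * Since stripe_size is pinned to the zone size, ndevs is recomputed as
 * (10 GiB * 1 / 1 GiB + 0) / 1 = 10, giving a 10 GiB chunk made of one
 * zone-sized stripe on each of 10 devices.
 */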
5250
5251static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5252                              struct alloc_chunk_ctl *ctl,
5253                              struct btrfs_device_info *devices_info)
5254{
5255        struct btrfs_fs_info *info = fs_devices->fs_info;
5256
5257        /*
5258         * Round down to the number of usable stripes. devs_increment can be
5259         * any number, so we can't use round_down(), which requires a power
5260         * of 2, while rounddown() is safe.
5261         */
5262        ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5263
5264        if (ctl->ndevs < ctl->devs_min) {
5265                if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5266                        btrfs_debug(info,
5267        "%s: not enough devices with free space: have=%d minimum required=%d",
5268                                    __func__, ctl->ndevs, ctl->devs_min);
5269                }
5270                return -ENOSPC;
5271        }
5272
5273        ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5274
5275        switch (fs_devices->chunk_alloc_policy) {
5276        case BTRFS_CHUNK_ALLOC_REGULAR:
5277                return decide_stripe_size_regular(ctl, devices_info);
5278        case BTRFS_CHUNK_ALLOC_ZONED:
5279                return decide_stripe_size_zoned(ctl, devices_info);
5280        default:
5281                BUG();
5282        }
5283}
5284
5285static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
5286                        struct alloc_chunk_ctl *ctl,
5287                        struct btrfs_device_info *devices_info)
5288{
5289        struct btrfs_fs_info *info = trans->fs_info;
5290        struct map_lookup *map = NULL;
5291        struct extent_map_tree *em_tree;
5292        struct btrfs_block_group *block_group;
5293        struct extent_map *em;
5294        u64 start = ctl->start;
5295        u64 type = ctl->type;
5296        int ret;
5297        int i;
5298        int j;
5299
5300        map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5301        if (!map)
5302                return ERR_PTR(-ENOMEM);
5303        map->num_stripes = ctl->num_stripes;
5304
5305        for (i = 0; i < ctl->ndevs; ++i) {
5306                for (j = 0; j < ctl->dev_stripes; ++j) {
5307                        int s = i * ctl->dev_stripes + j;
5308                        map->stripes[s].dev = devices_info[i].dev;
5309                        map->stripes[s].physical = devices_info[i].dev_offset +
5310                                                   j * ctl->stripe_size;
5311                }
5312        }
5313        map->stripe_len = BTRFS_STRIPE_LEN;
5314        map->io_align = BTRFS_STRIPE_LEN;
5315        map->io_width = BTRFS_STRIPE_LEN;
5316        map->type = type;
5317        map->sub_stripes = ctl->sub_stripes;
5318
5319        trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5320
5321        em = alloc_extent_map();
5322        if (!em) {
5323                kfree(map);
5324                return ERR_PTR(-ENOMEM);
5325        }
5326        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5327        em->map_lookup = map;
5328        em->start = start;
5329        em->len = ctl->chunk_size;
5330        em->block_start = 0;
5331        em->block_len = em->len;
5332        em->orig_block_len = ctl->stripe_size;
5333
5334        em_tree = &info->mapping_tree;
5335        write_lock(&em_tree->lock);
5336        ret = add_extent_mapping(em_tree, em, 0);
5337        if (ret) {
5338                write_unlock(&em_tree->lock);
5339                free_extent_map(em);
5340                return ERR_PTR(ret);
5341        }
5342        write_unlock(&em_tree->lock);
5343
5344        block_group = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5345        if (IS_ERR(block_group))
5346                goto error_del_extent;
5347
5348        for (i = 0; i < map->num_stripes; i++) {
5349                struct btrfs_device *dev = map->stripes[i].dev;
5350
5351                btrfs_device_set_bytes_used(dev,
5352                                            dev->bytes_used + ctl->stripe_size);
5353                if (list_empty(&dev->post_commit_list))
5354                        list_add_tail(&dev->post_commit_list,
5355                                      &trans->transaction->dev_update_list);
5356        }
5357
5358        atomic64_sub(ctl->stripe_size * map->num_stripes,
5359                     &info->free_chunk_space);
5360
5361        free_extent_map(em);
5362        check_raid56_incompat_flag(info, type);
5363        check_raid1c34_incompat_flag(info, type);
5364
5365        return block_group;
5366
5367error_del_extent:
5368        write_lock(&em_tree->lock);
5369        remove_extent_mapping(em_tree, em);
5370        write_unlock(&em_tree->lock);
5371
5372        /* One for our allocation */
5373        free_extent_map(em);
5374        /* One for the tree reference */
5375        free_extent_map(em);
5376
5377        return block_group;
5378}
5379
5380struct btrfs_block_group *btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
5381                                            u64 type)
5382{
5383        struct btrfs_fs_info *info = trans->fs_info;
5384        struct btrfs_fs_devices *fs_devices = info->fs_devices;
5385        struct btrfs_device_info *devices_info = NULL;
5386        struct alloc_chunk_ctl ctl;
5387        struct btrfs_block_group *block_group;
5388        int ret;
5389
5390        lockdep_assert_held(&info->chunk_mutex);
5391
5392        if (!alloc_profile_is_valid(type, 0)) {
5393                ASSERT(0);
5394                return ERR_PTR(-EINVAL);
5395        }
5396
5397        if (list_empty(&fs_devices->alloc_list)) {
5398                if (btrfs_test_opt(info, ENOSPC_DEBUG))
5399                        btrfs_debug(info, "%s: no writable device", __func__);
5400                return ERR_PTR(-ENOSPC);
5401        }
5402
5403        if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5404                btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5405                ASSERT(0);
5406                return ERR_PTR(-EINVAL);
5407        }
5408
5409        ctl.start = find_next_chunk(info);
5410        ctl.type = type;
5411        init_alloc_chunk_ctl(fs_devices, &ctl);
5412
5413        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5414                               GFP_NOFS);
5415        if (!devices_info)
5416                return ERR_PTR(-ENOMEM);
5417
5418        ret = gather_device_info(fs_devices, &ctl, devices_info);
5419        if (ret < 0) {
5420                block_group = ERR_PTR(ret);
5421                goto out;
5422        }
5423
5424        ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5425        if (ret < 0) {
5426                block_group = ERR_PTR(ret);
5427                goto out;
5428        }
5429
5430        block_group = create_chunk(trans, &ctl, devices_info);
5431
5432out:
5433        kfree(devices_info);
5434        return block_group;
5435}
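/*
 * Note: btrfs_alloc_chunk() above only sets up the in-memory chunk mapping
 * and block group; the corresponding chunk item is persisted later via
 * btrfs_chunk_alloc_add_chunk_item() below, as its leading comment explains.
 */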
5436
5437/*
5438 * This function, btrfs_chunk_alloc_add_chunk_item(), typically belongs to
5439 * phase 1 of chunk allocation. It belongs to phase 2 only when allocating
5440 * system chunks.
5441 *
5442 * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
5443 * phases.
5444 */
5445int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
5446                                     struct btrfs_block_group *bg)
5447{
5448        struct btrfs_fs_info *fs_info = trans->fs_info;
5449        struct btrfs_root *extent_root = fs_info->extent_root;
5450        struct btrfs_root *chunk_root = fs_info->chunk_root;
5451        struct btrfs_key key;
5452        struct btrfs_chunk *chunk;
5453        struct btrfs_stripe *stripe;
5454        struct extent_map *em;
5455        struct map_lookup *map;
5456        size_t item_size;
5457        int i;
5458        int ret;
5459
5460        /*
5461         * We take the chunk_mutex for 2 reasons:
5462         *
5463         * 1) Updates and insertions in the chunk btree must be done while holding
5464         *    the chunk_mutex, as well as updating the system chunk array in the
5465         *    superblock. See the comment on top of btrfs_chunk_alloc() for the
5466         *    details;
5467         *
5468         * 2) To prevent races with the final phase of a device replace operation
5469         *    that replaces the device object associated with the map's stripes,
5470         *    because the device object's id can change at any time during that
5471         *    final phase of the device replace operation
5472         *    (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
5473         *    replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
5474         *    which would cause a failure when updating the device item, which does
5475         *    not exist, or persisting a stripe of the chunk item with such an ID.
5476         *    Here we can't use the device_list_mutex because our caller already
5477         *    has locked the chunk_mutex, and the final phase of device replace
5478         *    acquires both mutexes - first the device_list_mutex and then the
5479         *    chunk_mutex. Using any of those two mutexes protects us from a
5480         *    concurrent device replace.
5481         */
5482        lockdep_assert_held(&fs_info->chunk_mutex);
5483
5484        em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
5485        if (IS_ERR(em)) {
5486                ret = PTR_ERR(em);
5487                btrfs_abort_transaction(trans, ret);
5488                return ret;
5489        }
5490
5491        map = em->map_lookup;
5492        item_size = btrfs_chunk_item_size(map->num_stripes);
5493
5494        chunk = kzalloc(item_size, GFP_NOFS);
5495        if (!chunk) {
5496                ret = -ENOMEM;
5497                btrfs_abort_transaction(trans, ret);
5498                goto out;
5499        }
5500
5501        for (i = 0; i < map->num_stripes; i++) {
5502                struct btrfs_device *device = map->stripes[i].dev;
5503
5504                ret = btrfs_update_device(trans, device);
5505                if (ret)
5506                        goto out;
5507        }
5508
5509        stripe = &chunk->stripe;
5510        for (i = 0; i < map->num_stripes; i++) {
5511                struct btrfs_device *device = map->stripes[i].dev;
5512                const u64 dev_offset = map->stripes[i].physical;
5513
5514                btrfs_set_stack_stripe_devid(stripe, device->devid);
5515                btrfs_set_stack_stripe_offset(stripe, dev_offset);
5516                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5517                stripe++;
5518        }
5519
5520        btrfs_set_stack_chunk_length(chunk, bg->length);
5521        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5522        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5523        btrfs_set_stack_chunk_type(chunk, map->type);
5524        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5525        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5526        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5527        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5528        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5529
5530        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5531        key.type = BTRFS_CHUNK_ITEM_KEY;
5532        key.offset = bg->start;
5533
5534        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5535        if (ret)
5536                goto out;
5537
5538        bg->chunk_item_inserted = 1;
5539
5540        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5541                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5542                if (ret)
5543                        goto out;
5544        }
5545
5546out:
5547        kfree(chunk);
5548        free_extent_map(em);
5549        return ret;
5550}
5551
5552static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5553{
5554        struct btrfs_fs_info *fs_info = trans->fs_info;
5555        u64 alloc_profile;
5556        struct btrfs_block_group *meta_bg;
5557        struct btrfs_block_group *sys_bg;
5558
5559        /*
5560         * When adding a new device for sprouting, the seed device is read-only
5561         * so we must first allocate a metadata and a system chunk. But before
5562         * adding the block group items to the extent, device and chunk btrees,
5563         * we must first:
5564         *
5565         * 1) Create both chunks without doing any changes to the btrees, as
5566         *    otherwise we would get -ENOSPC since the block groups from the
5567         *    seed device are read-only;
5568         *
5569         * 2) Add the device item for the new sprout device - finishing the setup
5570         *    of a new block group requires updating the device item in the chunk
5571         *    btree, so it must exist when we attempt to do it. The previous step
5572         *    ensures this does not fail with -ENOSPC.
5573         *
5574         * After that we can add the block group items to their btrees:
5575         * update existing device item in the chunk btree, add a new block group
5576         * item to the extent btree, add a new chunk item to the chunk btree and
5577         * finally add the new device extent items to the devices btree.
5578         */
5579
5580        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5581        meta_bg = btrfs_alloc_chunk(trans, alloc_profile);
5582        if (IS_ERR(meta_bg))
5583                return PTR_ERR(meta_bg);
5584
5585        alloc_profile = btrfs_system_alloc_profile(fs_info);
5586        sys_bg = btrfs_alloc_chunk(trans, alloc_profile);
5587        if (IS_ERR(sys_bg))
5588                return PTR_ERR(sys_bg);
5589
5590        return 0;
5591}
5592
5593static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5594{
5595        const int index = btrfs_bg_flags_to_raid_index(map->type);
5596
5597        return btrfs_raid_array[index].tolerated_failures;
5598}
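/*
 * For example, this returns 1 for RAID1/RAID10/RAID5, 2 for RAID6 and
 * RAID1C3, 3 for RAID1C4, and 0 for SINGLE/DUP/RAID0.
 */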
5599
5600int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5601{
5602        struct extent_map *em;
5603        struct map_lookup *map;
5604        int readonly = 0;
5605        int miss_ndevs = 0;
5606        int i;
5607
5608        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5609        if (IS_ERR(em))
5610                return 1;
5611
5612        map = em->map_lookup;
5613        for (i = 0; i < map->num_stripes; i++) {
5614                if (test_bit(BTRFS_DEV_STATE_MISSING,
5615                                        &map->stripes[i].dev->dev_state)) {
5616                        miss_ndevs++;
5617                        continue;
5618                }
5619                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5620                                        &map->stripes[i].dev->dev_state)) {
5621                        readonly = 1;
5622                        goto end;
5623                }
5624        }
5625
5626        /*
5627         * If the number of missing devices is larger than max errors, we
5628         * cannot write the data into that chunk successfully, so set it
5629         * readonly.
5630         */
5631        if (miss_ndevs > btrfs_chunk_max_errors(map))
5632                readonly = 1;
5633end:
5634        free_extent_map(em);
5635        return readonly;
5636}
5637
5638void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5639{
5640        struct extent_map *em;
5641
5642        while (1) {
5643                write_lock(&tree->lock);
5644                em = lookup_extent_mapping(tree, 0, (u64)-1);
5645                if (em)
5646                        remove_extent_mapping(tree, em);
5647                write_unlock(&tree->lock);
5648                if (!em)
5649                        break;
5650                /* once for us */
5651                free_extent_map(em);
5652                /* once for the tree */
5653                free_extent_map(em);
5654        }
5655}
5656
5657int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5658{
5659        struct extent_map *em;
5660        struct map_lookup *map;
5661        int ret;
5662
5663        em = btrfs_get_chunk_map(fs_info, logical, len);
5664        if (IS_ERR(em))
5665                /*
5666                 * We could return errors for these cases, but that could get
5667                 * ugly and we'd probably end up doing the same thing anyway
5668                 * (nothing but exit), so return 1 so the callers don't try
5669                 * to use other copies.
5670                 */
5671                return 1;
5672
5673        map = em->map_lookup;
5674        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5675                ret = map->num_stripes;
5676        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5677                ret = map->sub_stripes;
5678        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5679                ret = 2;
5680        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5681                /*
5682                 * There could be two corrupted data stripes, we need
5683                 * to loop retry in order to rebuild the correct data.
5684                 *
5685                 * Fail a stripe at a time on every retry except the
5686                 * stripe under reconstruction.
5687                 */
5688                ret = map->num_stripes;
5689        else
5690                ret = 1;
5691        free_extent_map(em);
5692
5693        down_read(&fs_info->dev_replace.rwsem);
5694        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5695            fs_info->dev_replace.tgtdev)
5696                ret++;
5697        up_read(&fs_info->dev_replace.rwsem);
5698
5699        return ret;
5700}
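
/*
 * Usage sketch (illustrative only, not part of the original file): callers
 * typically use the copy count to bound a mirror retry loop when a read
 * fails.  do_read() below is a hypothetical helper, shown only to convey the
 * pattern:
 *
 *	int mirror, copies;
 *
 *	copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
 *	for (mirror = 1; mirror <= copies; mirror++) {
 *		if (do_read(fs_info, logical, mirror) == 0)
 *			break;
 *	}
 *
 * For DUP and RAID1* the count is the number of stripes, for RAID10 it is
 * sub_stripes, and it is increased by one while a device replace with a
 * target device is ongoing.
 */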
5701
5702unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5703                                    u64 logical)
5704{
5705        struct extent_map *em;
5706        struct map_lookup *map;
5707        unsigned long len = fs_info->sectorsize;
5708
5709        em = btrfs_get_chunk_map(fs_info, logical, len);
5710
5711        if (!WARN_ON(IS_ERR(em))) {
5712                map = em->map_lookup;
5713                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5714                        len = map->stripe_len * nr_data_stripes(map);
5715                free_extent_map(em);
5716        }
5717        return len;
5718}
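
/*
 * Worked example (illustrative only): for a RAID5 chunk on 3 devices with the
 * default 64KiB stripe_len, nr_data_stripes() is 2, so the function above
 * reports a full stripe length of 2 * 64KiB = 128KiB.  For any profile
 * outside the RAID56 mask it simply returns fs_info->sectorsize.
 */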
5719
5720int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5721{
5722        struct extent_map *em;
5723        struct map_lookup *map;
5724        int ret = 0;
5725
5726        em = btrfs_get_chunk_map(fs_info, logical, len);
5727
5728        if (!WARN_ON(IS_ERR(em))) {
5729                map = em->map_lookup;
5730                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5731                        ret = 1;
5732                free_extent_map(em);
5733        }
5734        return ret;
5735}
5736
5737static int find_live_mirror(struct btrfs_fs_info *fs_info,
5738                            struct map_lookup *map, int first,
5739                            int dev_replace_is_ongoing)
5740{
5741        int i;
5742        int num_stripes;
5743        int preferred_mirror;
5744        int tolerance;
5745        struct btrfs_device *srcdev;
5746
5747        ASSERT((map->type &
5748                 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5749
5750        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5751                num_stripes = map->sub_stripes;
5752        else
5753                num_stripes = map->num_stripes;
5754
5755        switch (fs_info->fs_devices->read_policy) {
5756        default:
5757                /* Shouldn't happen, just warn and use pid instead of failing */
5758                btrfs_warn_rl(fs_info,
5759                              "unknown read_policy type %u, reset to pid",
5760                              fs_info->fs_devices->read_policy);
5761                fs_info->fs_devices->read_policy = BTRFS_READ_POLICY_PID;
5762                fallthrough;
5763        case BTRFS_READ_POLICY_PID:
5764                preferred_mirror = first + (current->pid % num_stripes);
5765                break;
5766        }
5767
5768        if (dev_replace_is_ongoing &&
5769            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5770             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5771                srcdev = fs_info->dev_replace.srcdev;
5772        else
5773                srcdev = NULL;
5774
5775        /*
5776         * try to avoid the drive that is the source drive for a
5777         * dev-replace procedure, only choose it if no other non-missing
5778         * mirror is available
5779         */
5780        for (tolerance = 0; tolerance < 2; tolerance++) {
5781                if (map->stripes[preferred_mirror].dev->bdev &&
5782                    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5783                        return preferred_mirror;
5784                for (i = first; i < first + num_stripes; i++) {
5785                        if (map->stripes[i].dev->bdev &&
5786                            (tolerance || map->stripes[i].dev != srcdev))
5787                                return i;
5788                }
5789        }
5790
5791        /* We couldn't find one that doesn't fail. Just return something
5792         * and the IO error handling code will clean up eventually.
5793         */
5794        return preferred_mirror;
5795}
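
/*
 * Example (illustrative only): with the default BTRFS_READ_POLICY_PID on a
 * RAID1 chunk (num_stripes = 2, first = 0), a reader whose pid is 4211 gets
 * preferred_mirror = 0 + (4211 % 2) = 1, so different processes naturally
 * spread their reads across both copies, unless that mirror is missing or is
 * the source device of a running replace, in which case the fallback loop
 * above picks another live stripe.
 */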
5796
5797/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5798static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5799{
5800        int i;
5801        int again = 1;
5802
5803        while (again) {
5804                again = 0;
5805                for (i = 0; i < num_stripes - 1; i++) {
5806                        /* Swap if parity is on a smaller index */
5807                        if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5808                                swap(bbio->stripes[i], bbio->stripes[i + 1]);
5809                                swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5810                                again = 1;
5811                        }
5812                }
5813        }
5814}
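
/*
 * Example (illustrative only): for a 3 device RAID5 full stripe the caller
 * fills raid_map with the logical address of each data stripe plus the
 * RAID5_P_STRIPE marker, a value larger than any real logical address.  With
 * a rotation that puts parity on the first device, e.g.
 *
 *	raid_map = { RAID5_P_STRIPE, 4096M, 4096M + 64K }
 *
 * the bubble sort above swaps entries (together with the matching stripes[])
 * until the parity marker ends up last:
 *
 *	raid_map = { 4096M, 4096M + 64K, RAID5_P_STRIPE }
 */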
5815
5816static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5817{
5818        struct btrfs_bio *bbio = kzalloc(
5819                 /* the size of the btrfs_bio */
5820                sizeof(struct btrfs_bio) +
5821                /* plus the variable array for the stripes */
5822                sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5823                /* plus the variable array for the tgt dev */
5824                sizeof(int) * (real_stripes) +
5825                /*
5826                 * plus the raid_map, which includes both the tgt dev
5827                 * and the stripes
5828                 */
5829                sizeof(u64) * (total_stripes),
5830                GFP_NOFS|__GFP_NOFAIL);
5831
5832        atomic_set(&bbio->error, 0);
5833        refcount_set(&bbio->refs, 1);
5834
5835        bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5836        bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5837
5838        return bbio;
5839}
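
/*
 * Layout sketch of the single allocation above (illustrative only), for
 * alloc_btrfs_bio(total_stripes = 4, real_stripes = 2):
 *
 *	[ struct btrfs_bio | stripes[0..3] | tgtdev_map[0..1] | raid_map[0..3] ]
 *
 * stripes[] is the flexible array at the end of struct btrfs_bio, and
 * tgtdev_map / raid_map are carved out of the same buffer right after it,
 * which is why they are initialized with pointer arithmetic rather than with
 * separate allocations.
 */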
5840
5841void btrfs_get_bbio(struct btrfs_bio *bbio)
5842{
5843        WARN_ON(!refcount_read(&bbio->refs));
5844        refcount_inc(&bbio->refs);
5845}
5846
5847void btrfs_put_bbio(struct btrfs_bio *bbio)
5848{
5849        if (!bbio)
5850                return;
5851        if (refcount_dec_and_test(&bbio->refs))
5852                kfree(bbio);
5853}
5854
5855/* Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5856/*
5857 * Note that a discard won't be sent to the target device of a device
5858 * replace.
5859 */
5860static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5861                                         u64 logical, u64 *length_ret,
5862                                         struct btrfs_bio **bbio_ret)
5863{
5864        struct extent_map *em;
5865        struct map_lookup *map;
5866        struct btrfs_bio *bbio;
5867        u64 length = *length_ret;
5868        u64 offset;
5869        u64 stripe_nr;
5870        u64 stripe_nr_end;
5871        u64 stripe_end_offset;
5872        u64 stripe_cnt;
5873        u64 stripe_len;
5874        u64 stripe_offset;
5875        u64 num_stripes;
5876        u32 stripe_index;
5877        u32 factor = 0;
5878        u32 sub_stripes = 0;
5879        u64 stripes_per_dev = 0;
5880        u32 remaining_stripes = 0;
5881        u32 last_stripe = 0;
5882        int ret = 0;
5883        int i;
5884
5885        /* discard always returns a bbio */
5886        ASSERT(bbio_ret);
5887
5888        em = btrfs_get_chunk_map(fs_info, logical, length);
5889        if (IS_ERR(em))
5890                return PTR_ERR(em);
5891
5892        map = em->map_lookup;
5893        /* we don't discard raid56 yet */
5894        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5895                ret = -EOPNOTSUPP;
5896                goto out;
5897        }
5898
5899        offset = logical - em->start;
5900        length = min_t(u64, em->start + em->len - logical, length);
5901        *length_ret = length;
5902
5903        stripe_len = map->stripe_len;
5904        /*
5905         * stripe_nr counts the total number of stripes we have to stride
5906         * to get to this block
5907         */
5908        stripe_nr = div64_u64(offset, stripe_len);
5909
5910        /* stripe_offset is the offset of this block in its stripe */
5911        stripe_offset = offset - stripe_nr * stripe_len;
5912
5913        stripe_nr_end = round_up(offset + length, map->stripe_len);
5914        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5915        stripe_cnt = stripe_nr_end - stripe_nr;
5916        stripe_end_offset = stripe_nr_end * map->stripe_len -
5917                            (offset + length);
5918        /*
5919         * after this, stripe_nr is the number of stripes on this
5920         * device we have to walk to find the data, and stripe_index is
5921         * the number of our device in the stripe array
5922         */
5923        num_stripes = 1;
5924        stripe_index = 0;
5925        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5926                         BTRFS_BLOCK_GROUP_RAID10)) {
5927                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5928                        sub_stripes = 1;
5929                else
5930                        sub_stripes = map->sub_stripes;
5931
5932                factor = map->num_stripes / sub_stripes;
5933                num_stripes = min_t(u64, map->num_stripes,
5934                                    sub_stripes * stripe_cnt);
5935                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5936                stripe_index *= sub_stripes;
5937                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5938                                              &remaining_stripes);
5939                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5940                last_stripe *= sub_stripes;
5941        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5942                                BTRFS_BLOCK_GROUP_DUP)) {
5943                num_stripes = map->num_stripes;
5944        } else {
5945                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5946                                        &stripe_index);
5947        }
5948
5949        bbio = alloc_btrfs_bio(num_stripes, 0);
5950        if (!bbio) {
5951                ret = -ENOMEM;
5952                goto out;
5953        }
5954
5955        for (i = 0; i < num_stripes; i++) {
5956                bbio->stripes[i].physical =
5957                        map->stripes[stripe_index].physical +
5958                        stripe_offset + stripe_nr * map->stripe_len;
5959                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5960
5961                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5962                                 BTRFS_BLOCK_GROUP_RAID10)) {
5963                        bbio->stripes[i].length = stripes_per_dev *
5964                                map->stripe_len;
5965
5966                        if (i / sub_stripes < remaining_stripes)
5967                                bbio->stripes[i].length +=
5968                                        map->stripe_len;
5969
5970                        /*
5971                         * Special for the first stripe and
5972                         * the last stripe:
5973                         *
5974                         * |-------|...|-------|
5975                         *     |----------|
5976                         *    off     end_off
5977                         */
5978                        if (i < sub_stripes)
5979                                bbio->stripes[i].length -=
5980                                        stripe_offset;
5981
5982                        if (stripe_index >= last_stripe &&
5983                            stripe_index <= (last_stripe +
5984                                             sub_stripes - 1))
5985                                bbio->stripes[i].length -=
5986                                        stripe_end_offset;
5987
5988                        if (i == sub_stripes - 1)
5989                                stripe_offset = 0;
5990                } else {
5991                        bbio->stripes[i].length = length;
5992                }
5993
5994                stripe_index++;
5995                if (stripe_index == map->num_stripes) {
5996                        stripe_index = 0;
5997                        stripe_nr++;
5998                }
5999        }
6000
6001        *bbio_ret = bbio;
6002        bbio->map_type = map->type;
6003        bbio->num_stripes = num_stripes;
6004out:
6005        free_extent_map(em);
6006        return ret;
6007}
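
/*
 * Worked example (illustrative only): discarding 256KiB at the start of a
 * RAID0 chunk on 2 devices with a 64KiB stripe_len gives
 *
 *	stripe_cnt = 4, factor = 2, stripes_per_dev = 2
 *
 * so the resulting bbio has two stripes, one per device, each covering
 * 2 * 64KiB = 128KiB on its device, which together make up the 256KiB
 * logical range spread across the two disks.
 */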
6008
6009/*
6010 * In dev-replace case, for repair case (that's the only case where the mirror
6011 * is selected explicitly when calling btrfs_map_block), blocks left of the
6012 * left cursor can also be read from the target drive.
6013 *
6014 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
6015 * array of stripes.
6016 * For READ, it also needs to be supported using the same mirror number.
6017 *
6018 * If the requested block is not left of the left cursor, EIO is returned. This
6019 * can happen because btrfs_num_copies() returns one more in the dev-replace
6020 * case.
6021 */
6022static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
6023                                         u64 logical, u64 length,
6024                                         u64 srcdev_devid, int *mirror_num,
6025                                         u64 *physical)
6026{
6027        struct btrfs_bio *bbio = NULL;
6028        int num_stripes;
6029        int index_srcdev = 0;
6030        int found = 0;
6031        u64 physical_of_found = 0;
6032        int i;
6033        int ret = 0;
6034
6035        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
6036                                logical, &length, &bbio, 0, 0);
6037        if (ret) {
6038                ASSERT(bbio == NULL);
6039                return ret;
6040        }
6041
6042        num_stripes = bbio->num_stripes;
6043        if (*mirror_num > num_stripes) {
6044                /*
6045                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
6046                 * that means that the requested area is not left of the left
6047                 * cursor
6048                 */
6049                btrfs_put_bbio(bbio);
6050                return -EIO;
6051        }
6052
6053        /*
6054         * process the rest of the function using the mirror_num of the source
6055         * drive. Therefore look it up first.  At the end, patch the device
6056         * pointer to the one of the target drive.
6057         */
6058        for (i = 0; i < num_stripes; i++) {
6059                if (bbio->stripes[i].dev->devid != srcdev_devid)
6060                        continue;
6061
6062                /*
6063                 * In case of DUP, in order to keep it simple, only add the
6064                 * mirror with the lowest physical address
6065                 */
6066                if (found &&
6067                    physical_of_found <= bbio->stripes[i].physical)
6068                        continue;
6069
6070                index_srcdev = i;
6071                found = 1;
6072                physical_of_found = bbio->stripes[i].physical;
6073        }
6074
6075        btrfs_put_bbio(bbio);
6076
6077        ASSERT(found);
6078        if (!found)
6079                return -EIO;
6080
6081        *mirror_num = index_srcdev + 1;
6082        *physical = physical_of_found;
6083        return ret;
6084}
6085
6086static bool is_block_group_to_copy(struct btrfs_fs_info *fs_info, u64 logical)
6087{
6088        struct btrfs_block_group *cache;
6089        bool ret;
6090
6091        /* Non-zoned filesystems do not use the "to_copy" flag */
6092        if (!btrfs_is_zoned(fs_info))
6093                return false;
6094
6095        cache = btrfs_lookup_block_group(fs_info, logical);
6096
6097        spin_lock(&cache->lock);
6098        ret = cache->to_copy;
6099        spin_unlock(&cache->lock);
6100
6101        btrfs_put_block_group(cache);
6102        return ret;
6103}
6104
6105static void handle_ops_on_dev_replace(enum btrfs_map_op op,
6106                                      struct btrfs_bio **bbio_ret,
6107                                      struct btrfs_dev_replace *dev_replace,
6108                                      u64 logical,
6109                                      int *num_stripes_ret, int *max_errors_ret)
6110{
6111        struct btrfs_bio *bbio = *bbio_ret;
6112        u64 srcdev_devid = dev_replace->srcdev->devid;
6113        int tgtdev_indexes = 0;
6114        int num_stripes = *num_stripes_ret;
6115        int max_errors = *max_errors_ret;
6116        int i;
6117
6118        if (op == BTRFS_MAP_WRITE) {
6119                int index_where_to_add;
6120
6121                /*
6122                 * A block group which has "to_copy" set will eventually be
6123                 * copied by the dev-replace process. We can avoid cloning the IO here.
6124                 */
6125                if (is_block_group_to_copy(dev_replace->srcdev->fs_info, logical))
6126                        return;
6127
6128                /*
6129                 * duplicate the write operations while the dev replace
6130                 * procedure is running. Since the copying of the old disk to
6131                 * the new disk takes place at run time while the filesystem is
6132                 * mounted writable, the regular write operations to the old
6133                 * disk have to be duplicated to go to the new disk as well.
6134                 *
6135                 * Note that device->missing is handled by the caller, and that
6136                 * the write to the old disk is already set up in the stripes
6137                 * array.
6138                 */
6139                index_where_to_add = num_stripes;
6140                for (i = 0; i < num_stripes; i++) {
6141                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
6142                                /* write to new disk, too */
6143                                struct btrfs_bio_stripe *new =
6144                                        bbio->stripes + index_where_to_add;
6145                                struct btrfs_bio_stripe *old =
6146                                        bbio->stripes + i;
6147
6148                                new->physical = old->physical;
6149                                new->length = old->length;
6150                                new->dev = dev_replace->tgtdev;
6151                                bbio->tgtdev_map[i] = index_where_to_add;
6152                                index_where_to_add++;
6153                                max_errors++;
6154                                tgtdev_indexes++;
6155                        }
6156                }
6157                num_stripes = index_where_to_add;
6158        } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
6159                int index_srcdev = 0;
6160                int found = 0;
6161                u64 physical_of_found = 0;
6162
6163                /*
6164                 * During the dev-replace procedure, the target drive can also
6165                 * be used to read data in case it is needed to repair a corrupt
6166                 * block elsewhere. This is possible if the requested area is
6167                 * left of the left cursor. In this area, the target drive is a
6168                 * full copy of the source drive.
6169                 */
6170                for (i = 0; i < num_stripes; i++) {
6171                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
6172                                /*
6173                                 * In case of DUP, in order to keep it simple,
6174                                 * only add the mirror with the lowest physical
6175                                 * address
6176                                 */
6177                                if (found &&
6178                                    physical_of_found <=
6179                                     bbio->stripes[i].physical)
6180                                        continue;
6181                                index_srcdev = i;
6182                                found = 1;
6183                                physical_of_found = bbio->stripes[i].physical;
6184                        }
6185                }
6186                if (found) {
6187                        struct btrfs_bio_stripe *tgtdev_stripe =
6188                                bbio->stripes + num_stripes;
6189
6190                        tgtdev_stripe->physical = physical_of_found;
6191                        tgtdev_stripe->length =
6192                                bbio->stripes[index_srcdev].length;
6193                        tgtdev_stripe->dev = dev_replace->tgtdev;
6194                        bbio->tgtdev_map[index_srcdev] = num_stripes;
6195
6196                        tgtdev_indexes++;
6197                        num_stripes++;
6198                }
6199        }
6200
6201        *num_stripes_ret = num_stripes;
6202        *max_errors_ret = max_errors;
6203        bbio->num_tgtdevs = tgtdev_indexes;
6204        *bbio_ret = bbio;
6205}
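
/*
 * Sketch of the write case above (illustrative only): while devid 1 is being
 * replaced, a write to a RAID1 chunk whose stripes sit on devid 1 and devid 2
 * grows from a 2-stripe bbio to a 3-stripe one, assuming stripes[0] is the
 * one on devid 1:
 *
 *	stripes[0] = devid 1 (source of the replace)
 *	stripes[1] = devid 2
 *	stripes[2] = target device, same physical and length as stripes[0]
 *
 * tgtdev_map[0] records index 2 so later code can find the duplicated stripe,
 * and max_errors is bumped by one because the extra copy is allowed to fail
 * without failing the write as a whole.
 */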
6206
6207static bool need_full_stripe(enum btrfs_map_op op)
6208{
6209        return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6210}
6211
6212/*
6213 * Calculate the geometry of a particular (address, len) tuple. This
6214 * information is used to calculate how big a particular bio can get before it
6215 * straddles a stripe.
6216 *
6217 * @fs_info: the filesystem
6218 * @em:      mapping containing the logical extent
6219 * @op:      type of operation - write or read
6220 * @logical: address that we want to figure out the geometry of
6221 * @io_geom: pointer used to return values
6222 *
6223 * Returns < 0 in case a chunk for the given logical address cannot be found,
6224 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6225 */
6226int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
6227                          enum btrfs_map_op op, u64 logical,
6228                          struct btrfs_io_geometry *io_geom)
6229{
6230        struct map_lookup *map;
6231        u64 len;
6232        u64 offset;
6233        u64 stripe_offset;
6234        u64 stripe_nr;
6235        u64 stripe_len;
6236        u64 raid56_full_stripe_start = (u64)-1;
6237        int data_stripes;
6238
6239        ASSERT(op != BTRFS_MAP_DISCARD);
6240
6241        map = em->map_lookup;
6242        /* Offset of this logical address in the chunk */
6243        offset = logical - em->start;
6244        /* Len of a stripe in a chunk */
6245        stripe_len = map->stripe_len;
6246        /* Stripe where this block falls in */
6247        stripe_nr = div64_u64(offset, stripe_len);
6248        /* Offset of stripe in the chunk */
6249        stripe_offset = stripe_nr * stripe_len;
6250        if (offset < stripe_offset) {
6251                btrfs_crit(fs_info,
6252"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6253                        stripe_offset, offset, em->start, logical, stripe_len);
6254                return -EINVAL;
6255        }
6256
6257        /* stripe_offset is the offset of this block in its stripe */
6258        stripe_offset = offset - stripe_offset;
6259        data_stripes = nr_data_stripes(map);
6260
6261        if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6262                u64 max_len = stripe_len - stripe_offset;
6263
6264                /*
6265                 * In case of raid56, we need to know the stripe aligned start
6266                 */
6267                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6268                        unsigned long full_stripe_len = stripe_len * data_stripes;
6269                        raid56_full_stripe_start = offset;
6270
6271                        /*
6272                         * Allow a write of a full stripe, but make sure we
6273                         * don't allow straddling of stripes
6274                         */
6275                        raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6276                                        full_stripe_len);
6277                        raid56_full_stripe_start *= full_stripe_len;
6278
6279                        /*
6280                         * For writes to RAID[56], allow a full stripeset across
6281                         * all disks. For other RAID types and for RAID[56]
6282                         * reads, just allow a single stripe (on a single disk).
6283                         */
6284                        if (op == BTRFS_MAP_WRITE) {
6285                                max_len = stripe_len * data_stripes -
6286                                          (offset - raid56_full_stripe_start);
6287                        }
6288                }
6289                len = min_t(u64, em->len - offset, max_len);
6290        } else {
6291                len = em->len - offset;
6292        }
6293
6294        io_geom->len = len;
6295        io_geom->offset = offset;
6296        io_geom->stripe_len = stripe_len;
6297        io_geom->stripe_nr = stripe_nr;
6298        io_geom->stripe_offset = stripe_offset;
6299        io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6300
6301        return 0;
6302}
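
/*
 * Worked example (illustrative only): for a RAID0 chunk with a 64KiB
 * stripe_len and @logical pointing 96KiB into the chunk, the math above
 * yields
 *
 *	stripe_nr = 1, stripe_offset = 32KiB, len = 32KiB
 *
 * i.e. a bio starting there may cover at most the remaining 32KiB of the
 * current stripe before it would straddle into the next one.
 */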
6303
6304static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6305                             enum btrfs_map_op op,
6306                             u64 logical, u64 *length,
6307                             struct btrfs_bio **bbio_ret,
6308                             int mirror_num, int need_raid_map)
6309{
6310        struct extent_map *em;
6311        struct map_lookup *map;
6312        u64 stripe_offset;
6313        u64 stripe_nr;
6314        u64 stripe_len;
6315        u32 stripe_index;
6316        int data_stripes;
6317        int i;
6318        int ret = 0;
6319        int num_stripes;
6320        int max_errors = 0;
6321        int tgtdev_indexes = 0;
6322        struct btrfs_bio *bbio = NULL;
6323        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6324        int dev_replace_is_ongoing = 0;
6325        int num_alloc_stripes;
6326        int patch_the_first_stripe_for_dev_replace = 0;
6327        u64 physical_to_patch_in_first_stripe = 0;
6328        u64 raid56_full_stripe_start = (u64)-1;
6329        struct btrfs_io_geometry geom;
6330
6331        ASSERT(bbio_ret);
6332        ASSERT(op != BTRFS_MAP_DISCARD);
6333
6334        em = btrfs_get_chunk_map(fs_info, logical, *length);
6335        ASSERT(!IS_ERR(em));
6336
6337        ret = btrfs_get_io_geometry(fs_info, em, op, logical, &geom);
6338        if (ret < 0)
6339                return ret;
6340
6341        map = em->map_lookup;
6342
6343        *length = geom.len;
6344        stripe_len = geom.stripe_len;
6345        stripe_nr = geom.stripe_nr;
6346        stripe_offset = geom.stripe_offset;
6347        raid56_full_stripe_start = geom.raid56_stripe_offset;
6348        data_stripes = nr_data_stripes(map);
6349
6350        down_read(&dev_replace->rwsem);
6351        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6352        /*
6353         * Hold the semaphore for read during the whole operation, write is
6354         * requested at commit time but must wait.
6355         */
6356        if (!dev_replace_is_ongoing)
6357                up_read(&dev_replace->rwsem);
6358
6359        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6360            !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6361                ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6362                                                    dev_replace->srcdev->devid,
6363                                                    &mirror_num,
6364                                            &physical_to_patch_in_first_stripe);
6365                if (ret)
6366                        goto out;
6367                else
6368                        patch_the_first_stripe_for_dev_replace = 1;
6369        } else if (mirror_num > map->num_stripes) {
6370                mirror_num = 0;
6371        }
6372
6373        num_stripes = 1;
6374        stripe_index = 0;
6375        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6376                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6377                                &stripe_index);
6378                if (!need_full_stripe(op))
6379                        mirror_num = 1;
6380        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6381                if (need_full_stripe(op))
6382                        num_stripes = map->num_stripes;
6383                else if (mirror_num)
6384                        stripe_index = mirror_num - 1;
6385                else {
6386                        stripe_index = find_live_mirror(fs_info, map, 0,
6387                                            dev_replace_is_ongoing);
6388                        mirror_num = stripe_index + 1;
6389                }
6390
6391        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6392                if (need_full_stripe(op)) {
6393                        num_stripes = map->num_stripes;
6394                } else if (mirror_num) {
6395                        stripe_index = mirror_num - 1;
6396                } else {
6397                        mirror_num = 1;
6398                }
6399
6400        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6401                u32 factor = map->num_stripes / map->sub_stripes;
6402
6403                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6404                stripe_index *= map->sub_stripes;
6405
6406                if (need_full_stripe(op))
6407                        num_stripes = map->sub_stripes;
6408                else if (mirror_num)
6409                        stripe_index += mirror_num - 1;
6410                else {
6411                        int old_stripe_index = stripe_index;
6412                        stripe_index = find_live_mirror(fs_info, map,
6413                                              stripe_index,
6414                                              dev_replace_is_ongoing);
6415                        mirror_num = stripe_index - old_stripe_index + 1;
6416                }
6417
6418        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6419                if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6420                        /* push stripe_nr back to the start of the full stripe */
6421                        stripe_nr = div64_u64(raid56_full_stripe_start,
6422                                        stripe_len * data_stripes);
6423
6424                        /* RAID[56] write or recovery. Return all stripes */
6425                        num_stripes = map->num_stripes;
6426                        max_errors = nr_parity_stripes(map);
6427
6428                        *length = map->stripe_len;
6429                        stripe_index = 0;
6430                        stripe_offset = 0;
6431                } else {
6432                        /*
6433                         * Mirror #0 or #1 means the original data block.
6434                         * Mirror #2 is RAID5 parity block.
6435                         * Mirror #3 is RAID6 Q block.
6436                         */
6437                        stripe_nr = div_u64_rem(stripe_nr,
6438                                        data_stripes, &stripe_index);
6439                        if (mirror_num > 1)
6440                                stripe_index = data_stripes + mirror_num - 2;
6441
6442                        /* We distribute the parity blocks across stripes */
6443                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6444                                        &stripe_index);
6445                        if (!need_full_stripe(op) && mirror_num <= 1)
6446                                mirror_num = 1;
6447                }
6448        } else {
6449                /*
6450                 * after this, stripe_nr is the number of stripes on this
6451                 * device we have to walk to find the data, and stripe_index is
6452                 * the number of our device in the stripe array
6453                 */
6454                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6455                                &stripe_index);
6456                mirror_num = stripe_index + 1;
6457        }
6458        if (stripe_index >= map->num_stripes) {
6459                btrfs_crit(fs_info,
6460                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6461                           stripe_index, map->num_stripes);
6462                ret = -EINVAL;
6463                goto out;
6464        }
6465
6466        num_alloc_stripes = num_stripes;
6467        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6468                if (op == BTRFS_MAP_WRITE)
6469                        num_alloc_stripes <<= 1;
6470                if (op == BTRFS_MAP_GET_READ_MIRRORS)
6471                        num_alloc_stripes++;
6472                tgtdev_indexes = num_stripes;
6473        }
6474
6475        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6476        if (!bbio) {
6477                ret = -ENOMEM;
6478                goto out;
6479        }
6480
6481        for (i = 0; i < num_stripes; i++) {
6482                bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6483                        stripe_offset + stripe_nr * map->stripe_len;
6484                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6485                stripe_index++;
6486        }
6487
6488        /* build raid_map */
6489        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6490            (need_full_stripe(op) || mirror_num > 1)) {
6491                u64 tmp;
6492                unsigned rot;
6493
6494                /* Work out the disk rotation on this stripe-set */
6495                div_u64_rem(stripe_nr, num_stripes, &rot);
6496
6497                /* Fill in the logical address of each stripe */
6498                tmp = stripe_nr * data_stripes;
6499                for (i = 0; i < data_stripes; i++)
6500                        bbio->raid_map[(i+rot) % num_stripes] =
6501                                em->start + (tmp + i) * map->stripe_len;
6502
6503                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6504                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6505                        bbio->raid_map[(i+rot+1) % num_stripes] =
6506                                RAID6_Q_STRIPE;
6507
6508                sort_parity_stripes(bbio, num_stripes);
6509        }
6510
6511        if (need_full_stripe(op))
6512                max_errors = btrfs_chunk_max_errors(map);
6513
6514        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6515            need_full_stripe(op)) {
6516                handle_ops_on_dev_replace(op, &bbio, dev_replace, logical,
6517                                          &num_stripes, &max_errors);
6518        }
6519
6520        *bbio_ret = bbio;
6521        bbio->map_type = map->type;
6522        bbio->num_stripes = num_stripes;
6523        bbio->max_errors = max_errors;
6524        bbio->mirror_num = mirror_num;
6525
6526        /*
6527         * this is the case that REQ_READ && dev_replace_is_ongoing &&
6528         * mirror_num == num_stripes + 1 && dev_replace target drive is
6529         * available as a mirror
6530         */
6531        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6532                WARN_ON(num_stripes > 1);
6533                bbio->stripes[0].dev = dev_replace->tgtdev;
6534                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6535                bbio->mirror_num = map->num_stripes + 1;
6536        }
6537out:
6538        if (dev_replace_is_ongoing) {
6539                lockdep_assert_held(&dev_replace->rwsem);
6540                /* Unlock and let waiting writers proceed */
6541                up_read(&dev_replace->rwsem);
6542        }
6543        free_extent_map(em);
6544        return ret;
6545}
6546
6547int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6548                      u64 logical, u64 *length,
6549                      struct btrfs_bio **bbio_ret, int mirror_num)
6550{
6551        if (op == BTRFS_MAP_DISCARD)
6552                return __btrfs_map_block_for_discard(fs_info, logical,
6553                                                     length, bbio_ret);
6554
6555        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6556                                 mirror_num, 0);
6557}
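
/*
 * Usage sketch (illustrative only, error handling trimmed): for some @logical
 * address inside a mapped chunk, a caller maps the range and walks the
 * returned stripes, dropping its reference when done:
 *
 *	struct btrfs_bio *bbio = NULL;
 *	u64 length = fs_info->sectorsize;
 *	int i, ret;
 *
 *	ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &length,
 *			      &bbio, 0);
 *	if (!ret) {
 *		for (i = 0; i < bbio->num_stripes; i++)
 *			pr_info("devid %llu physical %llu\n",
 *				bbio->stripes[i].dev->devid,
 *				bbio->stripes[i].physical);
 *		btrfs_put_bbio(bbio);
 *	}
 *
 * On return *length may have been clamped to the mapped geometry, and the
 * bbio must always be released with btrfs_put_bbio().
 */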
6558
6559/* For Scrub/replace */
6560int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6561                     u64 logical, u64 *length,
6562                     struct btrfs_bio **bbio_ret)
6563{
6564        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6565}
6566
6567static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6568{
6569        bio->bi_private = bbio->private;
6570        bio->bi_end_io = bbio->end_io;
6571        bio_endio(bio);
6572
6573        btrfs_put_bbio(bbio);
6574}
6575
6576static void btrfs_end_bio(struct bio *bio)
6577{
6578        struct btrfs_bio *bbio = bio->bi_private;
6579        int is_orig_bio = 0;
6580
6581        if (bio->bi_status) {
6582                atomic_inc(&bbio->error);
6583                if (bio->bi_status == BLK_STS_IOERR ||
6584                    bio->bi_status == BLK_STS_TARGET) {
6585                        struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6586
6587                        ASSERT(dev->bdev);
6588                        if (btrfs_op(bio) == BTRFS_MAP_WRITE)
6589                                btrfs_dev_stat_inc_and_print(dev,
6590                                                BTRFS_DEV_STAT_WRITE_ERRS);
6591                        else if (!(bio->bi_opf & REQ_RAHEAD))
6592                                btrfs_dev_stat_inc_and_print(dev,
6593                                                BTRFS_DEV_STAT_READ_ERRS);
6594                        if (bio->bi_opf & REQ_PREFLUSH)
6595                                btrfs_dev_stat_inc_and_print(dev,
6596                                                BTRFS_DEV_STAT_FLUSH_ERRS);
6597                }
6598        }
6599
6600        if (bio == bbio->orig_bio)
6601                is_orig_bio = 1;
6602
6603        btrfs_bio_counter_dec(bbio->fs_info);
6604
6605        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6606                if (!is_orig_bio) {
6607                        bio_put(bio);
6608                        bio = bbio->orig_bio;
6609                }
6610
6611                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6612                /* Only send an error to the higher layers if it is
6613                 * beyond the tolerance of the btrfs bio.
6614                 */
6615                if (atomic_read(&bbio->error) > bbio->max_errors) {
6616                        bio->bi_status = BLK_STS_IOERR;
6617                } else {
6618                        /*
6619                         * this bio is actually up to date, we didn't
6620                         * go over the max number of errors
6621                         */
6622                        bio->bi_status = BLK_STS_OK;
6623                }
6624
6625                btrfs_end_bbio(bbio, bio);
6626        } else if (!is_orig_bio) {
6627                bio_put(bio);
6628        }
6629}
6630
6631static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6632                              u64 physical, struct btrfs_device *dev)
6633{
6634        struct btrfs_fs_info *fs_info = bbio->fs_info;
6635
6636        bio->bi_private = bbio;
6637        btrfs_io_bio(bio)->device = dev;
6638        bio->bi_end_io = btrfs_end_bio;
6639        bio->bi_iter.bi_sector = physical >> 9;
6640        /*
6641         * For zone append writes, bi_sector must point to the beginning of
6642         * the zone.
6643         */
6644        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
6645                if (btrfs_dev_is_sequential(dev, physical)) {
6646                        u64 zone_start = round_down(physical, fs_info->zone_size);
6647
6648                        bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
6649                } else {
6650                        bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
6651                        bio->bi_opf |= REQ_OP_WRITE;
6652                }
6653        }
6654        btrfs_debug_in_rcu(fs_info,
6655        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6656                bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
6657                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6658                dev->devid, bio->bi_iter.bi_size);
6659        bio_set_dev(bio, dev->bdev);
6660
6661        btrfs_bio_counter_inc_noblocked(fs_info);
6662
6663        btrfsic_submit_bio(bio);
6664}
6665
6666static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6667{
6668        atomic_inc(&bbio->error);
6669        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6670                /* Should be the original bio. */
6671                WARN_ON(bio != bbio->orig_bio);
6672
6673                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6674                bio->bi_iter.bi_sector = logical >> 9;
6675                if (atomic_read(&bbio->error) > bbio->max_errors)
6676                        bio->bi_status = BLK_STS_IOERR;
6677                else
6678                        bio->bi_status = BLK_STS_OK;
6679                btrfs_end_bbio(bbio, bio);
6680        }
6681}
6682
6683blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6684                           int mirror_num)
6685{
6686        struct btrfs_device *dev;
6687        struct bio *first_bio = bio;
6688        u64 logical = bio->bi_iter.bi_sector << 9;
6689        u64 length = 0;
6690        u64 map_length;
6691        int ret;
6692        int dev_nr;
6693        int total_devs;
6694        struct btrfs_bio *bbio = NULL;
6695
6696        length = bio->bi_iter.bi_size;
6697        map_length = length;
6698
6699        btrfs_bio_counter_inc_blocked(fs_info);
6700        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6701                                &map_length, &bbio, mirror_num, 1);
6702        if (ret) {
6703                btrfs_bio_counter_dec(fs_info);
6704                return errno_to_blk_status(ret);
6705        }
6706
6707        total_devs = bbio->num_stripes;
6708        bbio->orig_bio = first_bio;
6709        bbio->private = first_bio->bi_private;
6710        bbio->end_io = first_bio->bi_end_io;
6711        bbio->fs_info = fs_info;
6712        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6713
6714        if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6715            ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) {
6716                /* In this case, map_length has been set to the length of
6717                 * a single stripe, not the whole write. */
6718                if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
6719                        ret = raid56_parity_write(fs_info, bio, bbio,
6720                                                  map_length);
6721                } else {
6722                        ret = raid56_parity_recover(fs_info, bio, bbio,
6723                                                    map_length, mirror_num, 1);
6724                }
6725
6726                btrfs_bio_counter_dec(fs_info);
6727                return errno_to_blk_status(ret);
6728        }
6729
6730        if (map_length < length) {
6731                btrfs_crit(fs_info,
6732                           "mapping failed logical %llu bio len %llu len %llu",
6733                           logical, length, map_length);
6734                BUG();
6735        }
6736
6737        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6738                dev = bbio->stripes[dev_nr].dev;
6739                if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6740                                                   &dev->dev_state) ||
6741                    (btrfs_op(first_bio) == BTRFS_MAP_WRITE &&
6742                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6743                        bbio_error(bbio, first_bio, logical);
6744                        continue;
6745                }
6746
6747                if (dev_nr < total_devs - 1)
6748                        bio = btrfs_bio_clone(first_bio);
6749                else
6750                        bio = first_bio;
6751
6752                submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6753        }
6754        btrfs_bio_counter_dec(fs_info);
6755        return BLK_STS_OK;
6756}
6757
6758/*
6759 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6760 * return NULL.
6761 *
6762 * If devid and uuid are both specified, the match must be exact, otherwise
6763 * only devid is used.
6764 */
6765struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6766                                       u64 devid, u8 *uuid, u8 *fsid)
6767{
6768        struct btrfs_device *device;
6769        struct btrfs_fs_devices *seed_devs;
6770
6771        if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6772                list_for_each_entry(device, &fs_devices->devices, dev_list) {
6773                        if (device->devid == devid &&
6774                            (!uuid || memcmp(device->uuid, uuid,
6775                                             BTRFS_UUID_SIZE) == 0))
6776                                return device;
6777                }
6778        }
6779
6780        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6781                if (!fsid ||
6782                    !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6783                        list_for_each_entry(device, &seed_devs->devices,
6784                                            dev_list) {
6785                                if (device->devid == devid &&
6786                                    (!uuid || memcmp(device->uuid, uuid,
6787                                                     BTRFS_UUID_SIZE) == 0))
6788                                        return device;
6789                        }
6790                }
6791        }
6792
6793        return NULL;
6794}
6795
6796static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6797                                            u64 devid, u8 *dev_uuid)
6798{
6799        struct btrfs_device *device;
6800        unsigned int nofs_flag;
6801
6802        /*
6803         * We call this under the chunk_mutex, so we want to use NOFS for this
6804         * allocation, however we don't want to change btrfs_alloc_device() to
6805         * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6806         * places.
6807         */
6808        nofs_flag = memalloc_nofs_save();
6809        device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6810        memalloc_nofs_restore(nofs_flag);
6811        if (IS_ERR(device))
6812                return device;
6813
6814        list_add(&device->dev_list, &fs_devices->devices);
6815        device->fs_devices = fs_devices;
6816        fs_devices->num_devices++;
6817
6818        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6819        fs_devices->missing_devices++;
6820
6821        return device;
6822}
6823
6824/**
6825 * btrfs_alloc_device - allocate struct btrfs_device
6826 * @fs_info:    used only for generating a new devid, can be NULL if
6827 *              devid is provided (i.e. @devid != NULL).
6828 * @devid:      a pointer to devid for this device.  If NULL a new devid
6829 *              is generated.
6830 * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6831 *              is generated.
6832 *
6833 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6834 * on error.  Returned struct is not linked onto any lists and must be
6835 * destroyed with btrfs_free_device.
6836 */
6837struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6838                                        const u64 *devid,
6839                                        const u8 *uuid)
6840{
6841        struct btrfs_device *dev;
6842        u64 tmp;
6843
6844        if (WARN_ON(!devid && !fs_info))
6845                return ERR_PTR(-EINVAL);
6846
6847        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
6848        if (!dev)
6849                return ERR_PTR(-ENOMEM);
6850
6851        /*
6852         * Preallocate a bio that's always going to be used for flushing device
6853         * barriers and matches the device lifespan
6854         */
6855        dev->flush_bio = bio_kmalloc(GFP_KERNEL, 0);
6856        if (!dev->flush_bio) {
6857                kfree(dev);
6858                return ERR_PTR(-ENOMEM);
6859        }
6860
6861        INIT_LIST_HEAD(&dev->dev_list);
6862        INIT_LIST_HEAD(&dev->dev_alloc_list);
6863        INIT_LIST_HEAD(&dev->post_commit_list);
6864
6865        atomic_set(&dev->reada_in_flight, 0);
6866        atomic_set(&dev->dev_stats_ccnt, 0);
6867        btrfs_device_data_ordered_init(dev);
6868        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6869        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
6870        extent_io_tree_init(fs_info, &dev->alloc_state,
6871                            IO_TREE_DEVICE_ALLOC_STATE, NULL);
6872
6873        if (devid)
6874                tmp = *devid;
6875        else {
6876                int ret;
6877
6878                ret = find_next_devid(fs_info, &tmp);
6879                if (ret) {
6880                        btrfs_free_device(dev);
6881                        return ERR_PTR(ret);
6882                }
6883        }
6884        dev->devid = tmp;
6885
6886        if (uuid)
6887                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6888        else
6889                generate_random_uuid(dev->uuid);
6890
6891        return dev;
6892}
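
/*
 * Usage sketch (illustrative only): callers either pass an explicit devid and
 * uuid, as add_missing_dev() above does, or let a fresh identity be
 * generated:
 *
 *	struct btrfs_device *dev;
 *
 *	dev = btrfs_alloc_device(fs_info, NULL, NULL);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 *
 * The device is not on any list yet; it must either be linked into an
 * fs_devices list or released again with btrfs_free_device(dev).
 */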
6893
6894static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6895                                        u64 devid, u8 *uuid, bool error)
6896{
6897        if (error)
6898                btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6899                              devid, uuid);
6900        else
6901                btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6902                              devid, uuid);
6903}
6904
6905static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6906{
6907        const int data_stripes = calc_data_stripes(type, num_stripes);
6908
6909        return div_u64(chunk_len, data_stripes);
6910}
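
/*
 * Worked example (illustrative only): a 4GiB RAID6 chunk striped over 6
 * devices has 4 data stripes (two of the six stripes hold parity), so
 * calc_stripe_length() returns 4GiB / 4 = 1GiB, the amount of space each
 * device contributes.  For RAID1 the whole chunk length counts as a single
 * data stripe.
 */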
6911
6912#if BITS_PER_LONG == 32
6913/*
6914 * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
6915 * can't be accessed on 32bit systems.
6916 *
6917 * This function does a mount time check to reject the fs if it already has
6918 * a metadata chunk beyond that limit.
6919 */
6920static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6921                                  u64 logical, u64 length, u64 type)
6922{
6923        if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6924                return 0;
6925
6926        if (logical + length < MAX_LFS_FILESIZE)
6927                return 0;
6928
6929        btrfs_err_32bit_limit(fs_info);
6930        return -EOVERFLOW;
6931}
6932
6933/*
6934 * This is to give early warning for any metadata chunk reaching
6935 * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
6936 * Although we can still access the metadata now, that will no longer be
6937 * possible once the limit is reached.
6938 */
6939static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
6940                                  u64 logical, u64 length, u64 type)
6941{
6942        if (!(type & BTRFS_BLOCK_GROUP_METADATA))
6943                return;
6944
6945        if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
6946                return;
6947
6948        btrfs_warn_32bit_limit(fs_info);
6949}
6950#endif
6951
6952static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6953                          struct btrfs_chunk *chunk)
6954{
6955        struct btrfs_fs_info *fs_info = leaf->fs_info;
6956        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6957        struct map_lookup *map;
6958        struct extent_map *em;
6959        u64 logical;
6960        u64 length;
6961        u64 devid;
6962        u64 type;
6963        u8 uuid[BTRFS_UUID_SIZE];
6964        int num_stripes;
6965        int ret;
6966        int i;
6967
6968        logical = key->offset;
6969        length = btrfs_chunk_length(leaf, chunk);
6970        type = btrfs_chunk_type(leaf, chunk);
6971        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6972
6973#if BITS_PER_LONG == 32
6974        ret = check_32bit_meta_chunk(fs_info, logical, length, type);
6975        if (ret < 0)
6976                return ret;
6977        warn_32bit_meta_chunk(fs_info, logical, length, type);
6978#endif
6979
6980        /*
6981         * Only need to verify chunk item if we're reading from sys chunk array,
6982         * as chunk item in tree block is already verified by tree-checker.
6983         */
6984        if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6985                ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6986                if (ret)
6987                        return ret;
6988        }
6989
6990        read_lock(&map_tree->lock);
6991        em = lookup_extent_mapping(map_tree, logical, 1);
6992        read_unlock(&map_tree->lock);
6993
6994        /* already mapped? */
6995        if (em && em->start <= logical && em->start + em->len > logical) {
6996                free_extent_map(em);
6997                return 0;
6998        } else if (em) {
6999                free_extent_map(em);
7000        }
7001
7002        em = alloc_extent_map();
7003        if (!em)
7004                return -ENOMEM;
7005        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
7006        if (!map) {
7007                free_extent_map(em);
7008                return -ENOMEM;
7009        }
7010
7011        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
7012        em->map_lookup = map;
7013        em->start = logical;
7014        em->len = length;
7015        em->orig_start = 0;
7016        em->block_start = 0;
7017        em->block_len = em->len;
7018
7019        map->num_stripes = num_stripes;
7020        map->io_width = btrfs_chunk_io_width(leaf, chunk);
7021        map->io_align = btrfs_chunk_io_align(leaf, chunk);
7022        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
7023        map->type = type;
7024        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
7025        map->verified_stripes = 0;
7026        em->orig_block_len = calc_stripe_length(type, em->len,
7027                                                map->num_stripes);
7028        for (i = 0; i < num_stripes; i++) {
7029                map->stripes[i].physical =
7030                        btrfs_stripe_offset_nr(leaf, chunk, i);
7031                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
7032                read_extent_buffer(leaf, uuid, (unsigned long)
7033                                   btrfs_stripe_dev_uuid_nr(chunk, i),
7034                                   BTRFS_UUID_SIZE);
7035                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
7036                                                        devid, uuid, NULL);
7037                if (!map->stripes[i].dev &&
7038                    !btrfs_test_opt(fs_info, DEGRADED)) {
7039                        free_extent_map(em);
7040                        btrfs_report_missing_device(fs_info, devid, uuid, true);
7041                        return -ENOENT;
7042                }
7043                if (!map->stripes[i].dev) {
7044                        map->stripes[i].dev =
7045                                add_missing_dev(fs_info->fs_devices, devid,
7046                                                uuid);
7047                        if (IS_ERR(map->stripes[i].dev)) {
7048                                free_extent_map(em);
7049                                btrfs_err(fs_info,
7050                                        "failed to init missing dev %llu: %ld",
7051                                        devid, PTR_ERR(map->stripes[i].dev));
7052                                return PTR_ERR(map->stripes[i].dev);
7053                        }
7054                        btrfs_report_missing_device(fs_info, devid, uuid, false);
7055                }
7056                set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
7057                                &(map->stripes[i].dev->dev_state));
7058
7059        }
7060
7061        write_lock(&map_tree->lock);
7062        ret = add_extent_mapping(map_tree, em, 0);
7063        write_unlock(&map_tree->lock);
7064        if (ret < 0) {
7065                btrfs_err(fs_info,
7066                          "failed to add chunk map, start=%llu len=%llu: %d",
7067                          em->start, em->len, ret);
7068        }
7069        free_extent_map(em);
7070
7071        return ret;
7072}
7073
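/*
 * Copy the fields of an on-disk btrfs_dev_item into the in-memory
 * btrfs_device: devid, sizes and bytes used (also taken as the committed
 * values), I/O geometry and the device UUID.  A devid equal to
 * BTRFS_DEV_REPLACE_DEVID is unexpected here and only triggers a warning.
 */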
7074static void fill_device_from_item(struct extent_buffer *leaf,
7075                                 struct btrfs_dev_item *dev_item,
7076                                 struct btrfs_device *device)
7077{
7078        unsigned long ptr;
7079
7080        device->devid = btrfs_device_id(leaf, dev_item);
7081        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
7082        device->total_bytes = device->disk_total_bytes;
7083        device->commit_total_bytes = device->disk_total_bytes;
7084        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
7085        device->commit_bytes_used = device->bytes_used;
7086        device->type = btrfs_device_type(leaf, dev_item);
7087        device->io_align = btrfs_device_io_align(leaf, dev_item);
7088        device->io_width = btrfs_device_io_width(leaf, dev_item);
7089        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
7090        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
7091        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
7092
7093        ptr = btrfs_device_uuid(dev_item);
7094        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
7095}
7096
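/*
 * Find the fs_devices of the seed filesystem with the given @fsid, for dev
 * items that do not belong to the mounted (sprout) filesystem itself.
 *
 * If the seed fs_devices is already anchored on the sprout's seed_list it is
 * returned directly.  Otherwise the registered fs_devices is cloned, its
 * devices are opened read-only and the clone is added to the seed_list.
 * With the DEGRADED mount option an unknown fsid gets an empty, pre-opened
 * fs_devices instead of failing with -ENOENT.  Caller must hold uuid_mutex.
 */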
7097static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
7098                                                  u8 *fsid)
7099{
7100        struct btrfs_fs_devices *fs_devices;
7101        int ret;
7102
7103        lockdep_assert_held(&uuid_mutex);
7104        ASSERT(fsid);
7105
7106        /* This will match only for multi-device seed fs */
7107        list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
7108                if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
7109                        return fs_devices;
7110
7111
7112        fs_devices = find_fsid(fsid, NULL);
7113        if (!fs_devices) {
7114                if (!btrfs_test_opt(fs_info, DEGRADED))
7115                        return ERR_PTR(-ENOENT);
7116
7117                fs_devices = alloc_fs_devices(fsid, NULL);
7118                if (IS_ERR(fs_devices))
7119                        return fs_devices;
7120
7121                fs_devices->seeding = true;
7122                fs_devices->opened = 1;
7123                return fs_devices;
7124        }
7125
7126        /*
7127         * Upon first call for a seed fs fsid, just create a private copy of the
7128         * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
7129         */
7130        fs_devices = clone_fs_devices(fs_devices);
7131        if (IS_ERR(fs_devices))
7132                return fs_devices;
7133
7134        ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
7135        if (ret) {
7136                free_fs_devices(fs_devices);
7137                return ERR_PTR(ret);
7138        }
7139
7140        if (!fs_devices->seeding) {
7141                close_fs_devices(fs_devices);
7142                free_fs_devices(fs_devices);
7143                return ERR_PTR(-EINVAL);
7144        }
7145
7146        list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
7147
7148        return fs_devices;
7149}
7150
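/*
 * Process one btrfs_dev_item from the chunk tree: find the matching
 * in-memory btrfs_device (switching to the seed fs_devices when the item's
 * fsid differs from ours), create a placeholder for a missing device when
 * mounted DEGRADED, fill the device from the item and account its space in
 * total_rw_bytes and free_chunk_space.
 */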
7151static int read_one_dev(struct extent_buffer *leaf,
7152                        struct btrfs_dev_item *dev_item)
7153{
7154        struct btrfs_fs_info *fs_info = leaf->fs_info;
7155        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7156        struct btrfs_device *device;
7157        u64 devid;
7158        int ret;
7159        u8 fs_uuid[BTRFS_FSID_SIZE];
7160        u8 dev_uuid[BTRFS_UUID_SIZE];
7161
7162        devid = btrfs_device_id(leaf, dev_item);
7163        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
7164                           BTRFS_UUID_SIZE);
7165        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
7166                           BTRFS_FSID_SIZE);
7167
7168        if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
7169                fs_devices = open_seed_devices(fs_info, fs_uuid);
7170                if (IS_ERR(fs_devices))
7171                        return PTR_ERR(fs_devices);
7172        }
7173
7174        device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
7175                                   fs_uuid);
7176        if (!device) {
7177                if (!btrfs_test_opt(fs_info, DEGRADED)) {
7178                        btrfs_report_missing_device(fs_info, devid,
7179                                                        dev_uuid, true);
7180                        return -ENOENT;
7181                }
7182
7183                device = add_missing_dev(fs_devices, devid, dev_uuid);
7184                if (IS_ERR(device)) {
7185                        btrfs_err(fs_info,
7186                                "failed to add missing dev %llu: %ld",
7187                                devid, PTR_ERR(device));
7188                        return PTR_ERR(device);
7189                }
7190                btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
7191        } else {
7192                if (!device->bdev) {
7193                        if (!btrfs_test_opt(fs_info, DEGRADED)) {
7194                                btrfs_report_missing_device(fs_info,
7195                                                devid, dev_uuid, true);
7196                                return -ENOENT;
7197                        }
7198                        btrfs_report_missing_device(fs_info, devid,
7199                                                        dev_uuid, false);
7200                }
7201
7202                if (!device->bdev &&
7203                    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
7204                        /*
7205                         * This happens when a device that was properly set up
7206                         * in the device info lists suddenly goes bad.
7207                         * device->bdev is NULL, so we have to mark the device
7208                         * missing (BTRFS_DEV_STATE_MISSING) here.
7209                         */
7210                        device->fs_devices->missing_devices++;
7211                        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
7212                }
7213
7214                /* Move the device to its own fs_devices */
7215                if (device->fs_devices != fs_devices) {
7216                        ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
7217                                                        &device->dev_state));
7218
7219                        list_move(&device->dev_list, &fs_devices->devices);
7220                        device->fs_devices->num_devices--;
7221                        fs_devices->num_devices++;
7222
7223                        device->fs_devices->missing_devices--;
7224                        fs_devices->missing_devices++;
7225
7226                        device->fs_devices = fs_devices;
7227                }
7228        }
7229
7230        if (device->fs_devices != fs_info->fs_devices) {
7231                BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
7232                if (device->generation !=
7233                    btrfs_device_generation(leaf, dev_item))
7234                        return -EINVAL;
7235        }
7236
7237        fill_device_from_item(leaf, dev_item, device);
7238        if (device->bdev) {
7239                u64 max_total_bytes = i_size_read(device->bdev->bd_inode);
7240
7241                if (device->total_bytes > max_total_bytes) {
7242                        btrfs_err(fs_info,
7243                        "device total_bytes should be at most %llu but found %llu",
7244                                  max_total_bytes, device->total_bytes);
7245                        return -EINVAL;
7246                }
7247        }
7248        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
7249        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
7250           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
7251                device->fs_devices->total_rw_bytes += device->total_bytes;
7252                atomic64_add(device->total_bytes - device->bytes_used,
7253                                &fs_info->free_chunk_space);
7254        }
7255        ret = 0;
7256        return ret;
7257}
7258
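/*
 * Read the sys_chunk_array embedded in the super block and add the SYSTEM
 * chunks it describes to the chunk mapping tree.  This must happen before
 * the chunk tree itself can be read, as the chunk tree lives in those
 * SYSTEM chunks.  The array is copied into a temporary extent buffer so
 * that the regular chunk/stripe accessors can be used on it.
 */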
7259int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7260{
7261        struct btrfs_root *root = fs_info->tree_root;
7262        struct btrfs_super_block *super_copy = fs_info->super_copy;
7263        struct extent_buffer *sb;
7264        struct btrfs_disk_key *disk_key;
7265        struct btrfs_chunk *chunk;
7266        u8 *array_ptr;
7267        unsigned long sb_array_offset;
7268        int ret = 0;
7269        u32 num_stripes;
7270        u32 array_size;
7271        u32 len = 0;
7272        u32 cur_offset;
7273        u64 type;
7274        struct btrfs_key key;
7275
7276        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7277        /*
7278         * This will create an extent buffer of nodesize; the superblock size
7279         * is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
7280         * overallocate, but we can keep it as-is since only the first page is used.
7281         */
7282        sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
7283                                          root->root_key.objectid, 0);
7284        if (IS_ERR(sb))
7285                return PTR_ERR(sb);
7286        set_extent_buffer_uptodate(sb);
7287        /*
7288         * The sb extent buffer is artificial and just used to read the system array.
7289         * The set_extent_buffer_uptodate() call does not properly mark all its
7290         * pages up-to-date when the page is larger: the extent does not cover
7291         * the whole page and consequently check_page_uptodate does not find all
7292         * the page's extents up-to-date (the hole beyond sb);
7293         * write_extent_buffer then triggers a WARN_ON.
7294         *
7295         * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
7296         * but sb spans only this function. Add an explicit SetPageUptodate call
7297         * to silence the warning, e.g. on PowerPC 64.
7298         */
7299        if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7300                SetPageUptodate(sb->pages[0]);
7301
7302        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7303        array_size = btrfs_super_sys_array_size(super_copy);
7304
7305        array_ptr = super_copy->sys_chunk_array;
7306        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7307        cur_offset = 0;
7308
7309        while (cur_offset < array_size) {
7310                disk_key = (struct btrfs_disk_key *)array_ptr;
7311                len = sizeof(*disk_key);
7312                if (cur_offset + len > array_size)
7313                        goto out_short_read;
7314
7315                btrfs_disk_key_to_cpu(&key, disk_key);
7316
7317                array_ptr += len;
7318                sb_array_offset += len;
7319                cur_offset += len;
7320
7321                if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7322                        btrfs_err(fs_info,
7323                            "unexpected item type %u in sys_array at offset %u",
7324                                  (u32)key.type, cur_offset);
7325                        ret = -EIO;
7326                        break;
7327                }
7328
7329                chunk = (struct btrfs_chunk *)sb_array_offset;
7330                /*
7331                 * At least one btrfs_chunk with one stripe must be present;
7332                 * the exact stripe count check comes afterwards.
7333                 */
7334                len = btrfs_chunk_item_size(1);
7335                if (cur_offset + len > array_size)
7336                        goto out_short_read;
7337
7338                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7339                if (!num_stripes) {
7340                        btrfs_err(fs_info,
7341                        "invalid number of stripes %u in sys_array at offset %u",
7342                                  num_stripes, cur_offset);
7343                        ret = -EIO;
7344                        break;
7345                }
7346
7347                type = btrfs_chunk_type(sb, chunk);
7348                if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7349                        btrfs_err(fs_info,
7350                        "invalid chunk type %llu in sys_array at offset %u",
7351                                  type, cur_offset);
7352                        ret = -EIO;
7353                        break;
7354                }
7355
7356                len = btrfs_chunk_item_size(num_stripes);
7357                if (cur_offset + len > array_size)
7358                        goto out_short_read;
7359
7360                ret = read_one_chunk(&key, sb, chunk);
7361                if (ret)
7362                        break;
7363
7364                array_ptr += len;
7365                sb_array_offset += len;
7366                cur_offset += len;
7367        }
7368        clear_extent_buffer_uptodate(sb);
7369        free_extent_buffer_stale(sb);
7370        return ret;
7371
7372out_short_read:
7373        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7374                        len, cur_offset);
7375        clear_extent_buffer_uptodate(sb);
7376        free_extent_buffer_stale(sb);
7377        return -EIO;
7378}
7379
7380/*
7381 * Check if all chunks in the fs are OK for read-write degraded mount
7382 *
7383 * If the @failing_dev is specified, it's accounted as missing.
7384 *
7385 * Return true if all chunks meet the minimal RW mount requirements.
7386 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7387 */
7388bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7389                                        struct btrfs_device *failing_dev)
7390{
7391        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7392        struct extent_map *em;
7393        u64 next_start = 0;
7394        bool ret = true;
7395
7396        read_lock(&map_tree->lock);
7397        em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7398        read_unlock(&map_tree->lock);
7399        /* No chunk at all? Return false anyway */
7400        if (!em) {
7401                ret = false;
7402                goto out;
7403        }
7404        while (em) {
7405                struct map_lookup *map;
7406                int missing = 0;
7407                int max_tolerated;
7408                int i;
7409
7410                map = em->map_lookup;
7411                max_tolerated =
7412                        btrfs_get_num_tolerated_disk_barrier_failures(
7413                                        map->type);
7414                for (i = 0; i < map->num_stripes; i++) {
7415                        struct btrfs_device *dev = map->stripes[i].dev;
7416
7417                        if (!dev || !dev->bdev ||
7418                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7419                            dev->last_flush_error)
7420                                missing++;
7421                        else if (failing_dev && failing_dev == dev)
7422                                missing++;
7423                }
7424                if (missing > max_tolerated) {
7425                        if (!failing_dev)
7426                                btrfs_warn(fs_info,
7427        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7428                                   em->start, missing, max_tolerated);
7429                        free_extent_map(em);
7430                        ret = false;
7431                        goto out;
7432                }
7433                next_start = extent_map_end(em);
7434                free_extent_map(em);
7435
7436                read_lock(&map_tree->lock);
7437                em = lookup_extent_mapping(map_tree, next_start,
7438                                           (u64)(-1) - next_start);
7439                read_unlock(&map_tree->lock);
7440        }
7441out:
7442        return ret;
7443}
7444
7445static void readahead_tree_node_children(struct extent_buffer *node)
7446{
7447        int i;
7448        const int nr_items = btrfs_header_nritems(node);
7449
7450        for (i = 0; i < nr_items; i++)
7451                btrfs_readahead_node_child(node, i);
7452}
7453
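/*
 * Read the whole chunk tree at mount time: first all device items (setting
 * up the btrfs_device structures), then all chunk items (building the
 * logical->physical chunk mappings).  Finally the device count and the
 * accumulated rw bytes are validated against the super block.
 */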
7454int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7455{
7456        struct btrfs_root *root = fs_info->chunk_root;
7457        struct btrfs_path *path;
7458        struct extent_buffer *leaf;
7459        struct btrfs_key key;
7460        struct btrfs_key found_key;
7461        int ret;
7462        int slot;
7463        u64 total_dev = 0;
7464        u64 last_ra_node = 0;
7465
7466        path = btrfs_alloc_path();
7467        if (!path)
7468                return -ENOMEM;
7469
7470        /*
7471         * uuid_mutex is needed only when mounting a sprout FS, i.e. when
7472         * seed devices may have to be opened via open_seed_devices().
7473         */
7474        mutex_lock(&uuid_mutex);
7475
7476        /*
7477         * It is possible for mount and umount to race in such a way that
7478         * we execute this code path, but open_fs_devices failed to clear
7479         * total_rw_bytes. We certainly want it cleared before reading the
7480         * device items, so clear it here.
7481         */
7482        fs_info->fs_devices->total_rw_bytes = 0;
7483
7484        /*
7485         * Read all device items, and then all the chunk items. All
7486         * device items are found before any chunk item (their object id
7487         * is smaller than the lowest possible object id for a chunk
7488         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7489         */
7490        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7491        key.offset = 0;
7492        key.type = 0;
7493        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7494        if (ret < 0)
7495                goto error;
7496        while (1) {
7497                struct extent_buffer *node;
7498
7499                leaf = path->nodes[0];
7500                slot = path->slots[0];
7501                if (slot >= btrfs_header_nritems(leaf)) {
7502                        ret = btrfs_next_leaf(root, path);
7503                        if (ret == 0)
7504                                continue;
7505                        if (ret < 0)
7506                                goto error;
7507                        break;
7508                }
7509                /*
7510                 * The nodes on level 1 are not locked, but locking is not
7511                 * needed at mount time as nothing else can access the tree.
7512                 */
7513                node = path->nodes[1];
7514                if (node) {
7515                        if (last_ra_node != node->start) {
7516                                readahead_tree_node_children(node);
7517                                last_ra_node = node->start;
7518                        }
7519                }
7520                btrfs_item_key_to_cpu(leaf, &found_key, slot);
7521                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7522                        struct btrfs_dev_item *dev_item;
7523                        dev_item = btrfs_item_ptr(leaf, slot,
7524                                                  struct btrfs_dev_item);
7525                        ret = read_one_dev(leaf, dev_item);
7526                        if (ret)
7527                                goto error;
7528                        total_dev++;
7529                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7530                        struct btrfs_chunk *chunk;
7531
7532                        /*
7533                         * We are only called at mount time, so no need to take
7534                         * fs_info->chunk_mutex. Plus, to avoid lockdep warnings,
7535                         * we always lock first fs_info->chunk_mutex before
7536                         * acquiring any locks on the chunk tree. This is a
7537                         * requirement for chunk allocation, see the comment on
7538                         * top of btrfs_chunk_alloc() for details.
7539                         */
7540                        ASSERT(!test_bit(BTRFS_FS_OPEN, &fs_info->flags));
7541                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7542                        ret = read_one_chunk(&found_key, leaf, chunk);
7543                        if (ret)
7544                                goto error;
7545                }
7546                path->slots[0]++;
7547        }
7548
7549        /*
7550         * After loading chunk tree, we've got all device information,
7551         * do another round of validation checks.
7552         */
7553        if (total_dev != fs_info->fs_devices->total_devices) {
7554                btrfs_err(fs_info,
7555           "super_num_devices %llu mismatch with num_devices %llu found here",
7556                          btrfs_super_num_devices(fs_info->super_copy),
7557                          total_dev);
7558                ret = -EINVAL;
7559                goto error;
7560        }
7561        if (btrfs_super_total_bytes(fs_info->super_copy) <
7562            fs_info->fs_devices->total_rw_bytes) {
7563                btrfs_err(fs_info,
7564        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7565                          btrfs_super_total_bytes(fs_info->super_copy),
7566                          fs_info->fs_devices->total_rw_bytes);
7567                ret = -EINVAL;
7568                goto error;
7569        }
7570        ret = 0;
7571error:
7572        mutex_unlock(&uuid_mutex);
7573
7574        btrfs_free_path(path);
7575        return ret;
7576}
7577
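/*
 * Point every known device, including the devices of all seed filesystems,
 * and their fs_devices at the given fs_info.  Done under the
 * device_list_mutex of the sprout's fs_devices.
 */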
7578void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7579{
7580        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7581        struct btrfs_device *device;
7582
7583        fs_devices->fs_info = fs_info;
7584
7585        mutex_lock(&fs_devices->device_list_mutex);
7586        list_for_each_entry(device, &fs_devices->devices, dev_list)
7587                device->fs_info = fs_info;
7588
7589        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7590                list_for_each_entry(device, &seed_devs->devices, dev_list)
7591                        device->fs_info = fs_info;
7592
7593                seed_devs->fs_info = fs_info;
7594        }
7595        mutex_unlock(&fs_devices->device_list_mutex);
7596}
7597
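/*
 * A btrfs_dev_stats_item is an array of __le64 counters.  The two helpers
 * below read/write the counter at @index directly from/to the extent
 * buffer, computing the byte offset from the item pointer and the offset
 * of the values[] member.
 */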
7598static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7599                                 const struct btrfs_dev_stats_item *ptr,
7600                                 int index)
7601{
7602        u64 val;
7603
7604        read_extent_buffer(eb, &val,
7605                           offsetof(struct btrfs_dev_stats_item, values) +
7606                            ((unsigned long)ptr) + (index * sizeof(u64)),
7607                           sizeof(val));
7608        return val;
7609}
7610
7611static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7612                                      struct btrfs_dev_stats_item *ptr,
7613                                      int index, u64 val)
7614{
7615        write_extent_buffer(eb, &val,
7616                            offsetof(struct btrfs_dev_stats_item, values) +
7617                             ((unsigned long)ptr) + (index * sizeof(u64)),
7618                            sizeof(val));
7619}
7620
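/*
 * Load the persistent statistics for @device from the device tree.  If
 * there is no dev root or no stats item for this devid, all counters start
 * at zero.  Counters that an (older, smaller) on-disk item does not contain
 * are zeroed as well.
 */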
7621static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7622                                       struct btrfs_path *path)
7623{
7624        struct btrfs_dev_stats_item *ptr;
7625        struct extent_buffer *eb;
7626        struct btrfs_key key;
7627        int item_size;
7628        int i, ret, slot;
7629
7630        if (!device->fs_info->dev_root)
7631                return 0;
7632
7633        key.objectid = BTRFS_DEV_STATS_OBJECTID;
7634        key.type = BTRFS_PERSISTENT_ITEM_KEY;
7635        key.offset = device->devid;
7636        ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7637        if (ret) {
7638                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7639                        btrfs_dev_stat_set(device, i, 0);
7640                device->dev_stats_valid = 1;
7641                btrfs_release_path(path);
7642                return ret < 0 ? ret : 0;
7643        }
7644        slot = path->slots[0];
7645        eb = path->nodes[0];
7646        item_size = btrfs_item_size_nr(eb, slot);
7647
7648        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7649
7650        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7651                if (item_size >= (1 + i) * sizeof(__le64))
7652                        btrfs_dev_stat_set(device, i,
7653                                           btrfs_dev_stats_value(eb, ptr, i));
7654                else
7655                        btrfs_dev_stat_set(device, i, 0);
7656        }
7657
7658        device->dev_stats_valid = 1;
7659        btrfs_dev_stat_print_on_load(device);
7660        btrfs_release_path(path);
7661
7662        return 0;
7663}
7664
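/*
 * Load the persistent statistics for every device of this filesystem and
 * of all its seed filesystems.
 */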
7665int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7666{
7667        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7668        struct btrfs_device *device;
7669        struct btrfs_path *path = NULL;
7670        int ret = 0;
7671
7672        path = btrfs_alloc_path();
7673        if (!path)
7674                return -ENOMEM;
7675
7676        mutex_lock(&fs_devices->device_list_mutex);
7677        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7678                ret = btrfs_device_init_dev_stats(device, path);
7679                if (ret)
7680                        goto out;
7681        }
7682        list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7683                list_for_each_entry(device, &seed_devs->devices, dev_list) {
7684                        ret = btrfs_device_init_dev_stats(device, path);
7685                        if (ret)
7686                                goto out;
7687                }
7688        }
7689out:
7690        mutex_unlock(&fs_devices->device_list_mutex);
7691
7692        btrfs_free_path(path);
7693        return ret;
7694}
7695
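/*
 * Write the in-memory error counters of @device into its persistent
 * dev_stats item in the device tree, creating the item if it does not
 * exist yet and replacing it if the existing item is too small to hold all
 * counters.
 */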
7696static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7697                                struct btrfs_device *device)
7698{
7699        struct btrfs_fs_info *fs_info = trans->fs_info;
7700        struct btrfs_root *dev_root = fs_info->dev_root;
7701        struct btrfs_path *path;
7702        struct btrfs_key key;
7703        struct extent_buffer *eb;
7704        struct btrfs_dev_stats_item *ptr;
7705        int ret;
7706        int i;
7707
7708        key.objectid = BTRFS_DEV_STATS_OBJECTID;
7709        key.type = BTRFS_PERSISTENT_ITEM_KEY;
7710        key.offset = device->devid;
7711
7712        path = btrfs_alloc_path();
7713        if (!path)
7714                return -ENOMEM;
7715        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7716        if (ret < 0) {
7717                btrfs_warn_in_rcu(fs_info,
7718                        "error %d while searching for dev_stats item for device %s",
7719                              ret, rcu_str_deref(device->name));
7720                goto out;
7721        }
7722
7723        if (ret == 0 &&
7724            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7725                /* need to delete old one and insert a new one */
7726                ret = btrfs_del_item(trans, dev_root, path);
7727                if (ret != 0) {
7728                        btrfs_warn_in_rcu(fs_info,
7729                                "delete too small dev_stats item for device %s failed %d",
7730                                      rcu_str_deref(device->name), ret);
7731                        goto out;
7732                }
7733                ret = 1;
7734        }
7735
7736        if (ret == 1) {
7737                /* need to insert a new item */
7738                btrfs_release_path(path);
7739                ret = btrfs_insert_empty_item(trans, dev_root, path,
7740                                              &key, sizeof(*ptr));
7741                if (ret < 0) {
7742                        btrfs_warn_in_rcu(fs_info,
7743                                "insert dev_stats item for device %s failed %d",
7744                                rcu_str_deref(device->name), ret);
7745                        goto out;
7746                }
7747        }
7748
7749        eb = path->nodes[0];
7750        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7751        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7752                btrfs_set_dev_stats_value(eb, ptr, i,
7753                                          btrfs_dev_stat_read(device, i));
7754        btrfs_mark_buffer_dirty(eb);
7755
7756out:
7757        btrfs_free_path(path);
7758        return ret;
7759}
7760
7761/*
7762 * Called from commit_transaction(). Writes all changed device stats to disk.
7763 */
7764int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7765{
7766        struct btrfs_fs_info *fs_info = trans->fs_info;
7767        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7768        struct btrfs_device *device;
7769        int stats_cnt;
7770        int ret = 0;
7771
7772        mutex_lock(&fs_devices->device_list_mutex);
7773        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7774                stats_cnt = atomic_read(&device->dev_stats_ccnt);
7775                if (!device->dev_stats_valid || stats_cnt == 0)
7776                        continue;
7777
7778
7779                /*
7780                 * There is a LOAD-LOAD control dependency between the value of
7781                 * dev_stats_ccnt and updating the on-disk values which requires
7782                 * reading the in-memory counters. Such control dependencies
7783                 * require explicit read memory barriers.
7784                 *
7785                 * This memory barrier pairs with smp_mb__before_atomic in
7786                 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7787                 * barrier implied by atomic_xchg in
7788                 * btrfs_dev_stats_read_and_reset().
7789                 */
7790                smp_rmb();
7791
7792                ret = update_dev_stat_item(trans, device);
7793                if (!ret)
7794                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7795        }
7796        mutex_unlock(&fs_devices->device_list_mutex);
7797
7798        return ret;
7799}
7800
7801void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7802{
7803        btrfs_dev_stat_inc(dev, index);
7804        btrfs_dev_stat_print_on_error(dev);
7805}
7806
7807static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7808{
7809        if (!dev->dev_stats_valid)
7810                return;
7811        btrfs_err_rl_in_rcu(dev->fs_info,
7812                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7813                           rcu_str_deref(dev->name),
7814                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7815                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7816                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7817                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7818                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7819}
7820
7821static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7822{
7823        int i;
7824
7825        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7826                if (btrfs_dev_stat_read(dev, i) != 0)
7827                        break;
7828        if (i == BTRFS_DEV_STAT_VALUES_MAX)
7829                return; /* all values == 0, suppress message */
7830
7831        btrfs_info_in_rcu(dev->fs_info,
7832                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7833               rcu_str_deref(dev->name),
7834               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7835               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7836               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7837               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7838               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7839}
7840
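/*
 * Fill in a struct btrfs_ioctl_get_dev_stats request: copy the counters of
 * the device identified by stats->devid into stats->values, optionally
 * resetting them when BTRFS_DEV_STATS_RESET is set in stats->flags.
 */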
7841int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7842                        struct btrfs_ioctl_get_dev_stats *stats)
7843{
7844        struct btrfs_device *dev;
7845        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7846        int i;
7847
7848        mutex_lock(&fs_devices->device_list_mutex);
7849        dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL);
7850        mutex_unlock(&fs_devices->device_list_mutex);
7851
7852        if (!dev) {
7853                btrfs_warn(fs_info, "get dev_stats failed, device not found");
7854                return -ENODEV;
7855        } else if (!dev->dev_stats_valid) {
7856                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7857                return -ENODEV;
7858        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7859                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7860                        if (stats->nr_items > i)
7861                                stats->values[i] =
7862                                        btrfs_dev_stat_read_and_reset(dev, i);
7863                        else
7864                                btrfs_dev_stat_set(dev, i, 0);
7865                }
7866                btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7867                           current->comm, task_pid_nr(current));
7868        } else {
7869                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7870                        if (stats->nr_items > i)
7871                                stats->values[i] = btrfs_dev_stat_read(dev, i);
7872        }
7873        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7874                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7875        return 0;
7876}
7877
7878/*
7879 * Update the size and bytes used for each device where it changed.  This is
7880 * delayed since we would otherwise get errors while writing out the
7881 * superblocks.
7882 *
7883 * Must be invoked during transaction commit.
7884 */
7885void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7886{
7887        struct btrfs_device *curr, *next;
7888
7889        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7890
7891        if (list_empty(&trans->dev_update_list))
7892                return;
7893
7894        /*
7895         * We don't need the device_list_mutex here.  This list is owned by the
7896         * transaction and the transaction must complete before the device is
7897         * released.
7898         */
7899        mutex_lock(&trans->fs_info->chunk_mutex);
7900        list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7901                                 post_commit_list) {
7902                list_del_init(&curr->post_commit_list);
7903                curr->commit_total_bytes = curr->disk_total_bytes;
7904                curr->commit_bytes_used = curr->bytes_used;
7905        }
7906        mutex_unlock(&trans->fs_info->chunk_mutex);
7907}
7908
7909/*
7910 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7911 */
7912int btrfs_bg_type_to_factor(u64 flags)
7913{
7914        const int index = btrfs_bg_flags_to_raid_index(flags);
7915
7916        return btrfs_raid_array[index].ncopies;
7917}
7918
7919
7920
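/*
 * Cross-check a single dev extent against the chunk mappings: the chunk at
 * @chunk_offset must exist, the extent length must equal the chunk's
 * per-device stripe length, the (devid, physical offset) pair must match
 * one of the chunk's stripes, and the extent must lie within the device
 * (and be zone aligned on zoned devices).  Each match bumps
 * map->verified_stripes for the later per-chunk check.
 */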
7921static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7922                                 u64 chunk_offset, u64 devid,
7923                                 u64 physical_offset, u64 physical_len)
7924{
7925        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7926        struct extent_map *em;
7927        struct map_lookup *map;
7928        struct btrfs_device *dev;
7929        u64 stripe_len;
7930        bool found = false;
7931        int ret = 0;
7932        int i;
7933
7934        read_lock(&em_tree->lock);
7935        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7936        read_unlock(&em_tree->lock);
7937
7938        if (!em) {
7939                btrfs_err(fs_info,
7940"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7941                          physical_offset, devid);
7942                ret = -EUCLEAN;
7943                goto out;
7944        }
7945
7946        map = em->map_lookup;
7947        stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7948        if (physical_len != stripe_len) {
7949                btrfs_err(fs_info,
7950"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7951                          physical_offset, devid, em->start, physical_len,
7952                          stripe_len);
7953                ret = -EUCLEAN;
7954                goto out;
7955        }
7956
7957        for (i = 0; i < map->num_stripes; i++) {
7958                if (map->stripes[i].dev->devid == devid &&
7959                    map->stripes[i].physical == physical_offset) {
7960                        found = true;
7961                        if (map->verified_stripes >= map->num_stripes) {
7962                                btrfs_err(fs_info,
7963                                "too many dev extents for chunk %llu found",
7964                                          em->start);
7965                                ret = -EUCLEAN;
7966                                goto out;
7967                        }
7968                        map->verified_stripes++;
7969                        break;
7970                }
7971        }
7972        if (!found) {
7973                btrfs_err(fs_info,
7974        "dev extent physical offset %llu devid %llu has no corresponding chunk",
7975                        physical_offset, devid);
7976                ret = -EUCLEAN;
7977        }
7978
7979        /* Make sure no dev extent is beyond device boundary */
7980        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL);
7981        if (!dev) {
7982                btrfs_err(fs_info, "failed to find devid %llu", devid);
7983                ret = -EUCLEAN;
7984                goto out;
7985        }
7986
7987        if (physical_offset + physical_len > dev->disk_total_bytes) {
7988                btrfs_err(fs_info,
7989"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7990                          devid, physical_offset, physical_len,
7991                          dev->disk_total_bytes);
7992                ret = -EUCLEAN;
7993                goto out;
7994        }
7995
7996        if (dev->zone_info) {
7997                u64 zone_size = dev->zone_info->zone_size;
7998
7999                if (!IS_ALIGNED(physical_offset, zone_size) ||
8000                    !IS_ALIGNED(physical_len, zone_size)) {
8001                        btrfs_err(fs_info,
8002"zoned: dev extent devid %llu physical offset %llu len %llu is not aligned to device zone",
8003                                  devid, physical_offset, physical_len);
8004                        ret = -EUCLEAN;
8005                        goto out;
8006                }
8007        }
8008
8009out:
8010        free_extent_map(em);
8011        return ret;
8012}
8013
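/*
 * After all dev extents have been verified, make sure every chunk in the
 * mapping tree had a matching dev extent for each of its stripes
 * (verified_stripes == num_stripes).
 */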
8014static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
8015{
8016        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
8017        struct extent_map *em;
8018        struct rb_node *node;
8019        int ret = 0;
8020
8021        read_lock(&em_tree->lock);
8022        for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
8023                em = rb_entry(node, struct extent_map, rb_node);
8024                if (em->map_lookup->num_stripes !=
8025                    em->map_lookup->verified_stripes) {
8026                        btrfs_err(fs_info,
8027                        "chunk %llu has missing dev extent, have %d expect %d",
8028                                  em->start, em->map_lookup->verified_stripes,
8029                                  em->map_lookup->num_stripes);
8030                        ret = -EUCLEAN;
8031                        goto out;
8032                }
8033        }
8034out:
8035        read_unlock(&em_tree->lock);
8036        return ret;
8037}
8038
8039/*
8040 * Ensure that all dev extents are mapped to correct chunk, otherwise
8041 * later chunk allocation/free would cause unexpected behavior.
8042 *
8043 * NOTE: This will iterate through the whole device tree, which should be of
8044 * the same size level as the chunk tree.  This slightly increases mount time.
8045 */
8046int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
8047{
8048        struct btrfs_path *path;
8049        struct btrfs_root *root = fs_info->dev_root;
8050        struct btrfs_key key;
8051        u64 prev_devid = 0;
8052        u64 prev_dev_ext_end = 0;
8053        int ret = 0;
8054
8055        /*
8056         * If we don't have a dev_root it is because we mounted with
8057         * ignorebadroots and failed to load the root, so we obviously want to
8058         * skip the verification in that case.
8059         *
8060         * However, even if the dev root is fine but the tree itself is
8061         * corrupted, we'd still fail to mount here.  This verification is only
8062         * meant to make sure writes can happen safely, so just bypass the
8063         * check completely in the case of IGNOREBADROOTS.
8064         */
8065        if (btrfs_test_opt(fs_info, IGNOREBADROOTS))
8066                return 0;
8067
8068        key.objectid = 1;
8069        key.type = BTRFS_DEV_EXTENT_KEY;
8070        key.offset = 0;
8071
8072        path = btrfs_alloc_path();
8073        if (!path)
8074                return -ENOMEM;
8075
8076        path->reada = READA_FORWARD;
8077        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
8078        if (ret < 0)
8079                goto out;
8080
8081        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
8082                ret = btrfs_next_leaf(root, path);
8083                if (ret < 0)
8084                        goto out;
8085                /* No dev extents at all? Not good */
8086                if (ret > 0) {
8087                        ret = -EUCLEAN;
8088                        goto out;
8089                }
8090        }
8091        while (1) {
8092                struct extent_buffer *leaf = path->nodes[0];
8093                struct btrfs_dev_extent *dext;
8094                int slot = path->slots[0];
8095                u64 chunk_offset;
8096                u64 physical_offset;
8097                u64 physical_len;
8098                u64 devid;
8099
8100                btrfs_item_key_to_cpu(leaf, &key, slot);
8101                if (key.type != BTRFS_DEV_EXTENT_KEY)
8102                        break;
8103                devid = key.objectid;
8104                physical_offset = key.offset;
8105
8106                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
8107                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
8108                physical_len = btrfs_dev_extent_length(leaf, dext);
8109
8110                /* Check if this dev extent overlaps with the previous one */
8111                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
8112                        btrfs_err(fs_info,
8113"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
8114                                  devid, physical_offset, prev_dev_ext_end);
8115                        ret = -EUCLEAN;
8116                        goto out;
8117                }
8118
8119                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
8120                                            physical_offset, physical_len);
8121                if (ret < 0)
8122                        goto out;
8123                prev_devid = devid;
8124                prev_dev_ext_end = physical_offset + physical_len;
8125
8126                ret = btrfs_next_item(root, path);
8127                if (ret < 0)
8128                        goto out;
8129                if (ret > 0) {
8130                        ret = 0;
8131                        break;
8132                }
8133        }
8134
8135        /* Ensure all chunks have corresponding dev extents */
8136        ret = verify_chunk_dev_extent_mapping(fs_info);
8137out:
8138        btrfs_free_path(path);
8139        return ret;
8140}
8141
8142/*
8143 * Check whether the given block group or device is pinned by any inode being
8144 * used as a swapfile.
8145 */
8146bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
8147{
8148        struct btrfs_swapfile_pin *sp;
8149        struct rb_node *node;
8150
8151        spin_lock(&fs_info->swapfile_pins_lock);
8152        node = fs_info->swapfile_pins.rb_node;
8153        while (node) {
8154                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
8155                if (ptr < sp->ptr)
8156                        node = node->rb_left;
8157                else if (ptr > sp->ptr)
8158                        node = node->rb_right;
8159                else
8160                        break;
8161        }
8162        spin_unlock(&fs_info->swapfile_pins_lock);
8163        return node != NULL;
8164}
8165
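/*
 * Worker for btrfs_repair_one_zone(): relocate the block group passed in
 * via @data to repair an I/O failure on a zoned filesystem.  Requires the
 * BALANCE exclusive operation; if another exclusive operation is running
 * the repair is skipped with -EBUSY.
 */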
8166static int relocating_repair_kthread(void *data)
8167{
8168        struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
8169        struct btrfs_fs_info *fs_info = cache->fs_info;
8170        u64 target;
8171        int ret = 0;
8172
8173        target = cache->start;
8174        btrfs_put_block_group(cache);
8175
8176        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
8177                btrfs_info(fs_info,
8178                           "zoned: skip relocating block group %llu to repair: EBUSY",
8179                           target);
8180                return -EBUSY;
8181        }
8182
8183        mutex_lock(&fs_info->reclaim_bgs_lock);
8184
8185        /* Ensure block group still exists */
8186        cache = btrfs_lookup_block_group(fs_info, target);
8187        if (!cache)
8188                goto out;
8189
8190        if (!cache->relocating_repair)
8191                goto out;
8192
8193        ret = btrfs_may_alloc_data_chunk(fs_info, target);
8194        if (ret < 0)
8195                goto out;
8196
8197        btrfs_info(fs_info,
8198                   "zoned: relocating block group %llu to repair IO failure",
8199                   target);
8200        ret = btrfs_relocate_chunk(fs_info, target);
8201
8202out:
8203        if (cache)
8204                btrfs_put_block_group(cache);
8205        mutex_unlock(&fs_info->reclaim_bgs_lock);
8206        btrfs_exclop_finish(fs_info);
8207
8208        return ret;
8209}
8210
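/*
 * Kick off relocation of the block group containing @logical to repair an
 * I/O failure on a zoned filesystem.  The actual work is done by
 * relocating_repair_kthread(); nothing is done on DEGRADED mounts or when
 * a repair of this block group is already in flight.
 */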
8211int btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical)
8212{
8213        struct btrfs_block_group *cache;
8214
8215        /* Do not attempt to repair in degraded state */
8216        if (btrfs_test_opt(fs_info, DEGRADED))
8217                return 0;
8218
8219        cache = btrfs_lookup_block_group(fs_info, logical);
8220        if (!cache)
8221                return 0;
8222
8223        spin_lock(&cache->lock);
8224        if (cache->relocating_repair) {
8225                spin_unlock(&cache->lock);
8226                btrfs_put_block_group(cache);
8227                return 0;
8228        }
8229        cache->relocating_repair = 1;
8230        spin_unlock(&cache->lock);
8231
8232        kthread_run(relocating_repair_kthread, cache,
8233                    "btrfs-relocating-repair");
8234
8235        return 0;
8236}
8237