linux/fs/btrfs/volumes.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2007 Oracle.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/bio.h>
   8#include <linux/slab.h>
   9#include <linux/blkdev.h>
  10#include <linux/ratelimit.h>
  11#include <linux/kthread.h>
  12#include <linux/raid/pq.h>
  13#include <linux/semaphore.h>
  14#include <linux/uuid.h>
  15#include <linux/list_sort.h>
  16#include "misc.h"
  17#include "ctree.h"
  18#include "extent_map.h"
  19#include "disk-io.h"
  20#include "transaction.h"
  21#include "print-tree.h"
  22#include "volumes.h"
  23#include "raid56.h"
  24#include "async-thread.h"
  25#include "check-integrity.h"
  26#include "rcu-string.h"
  27#include "dev-replace.h"
  28#include "sysfs.h"
  29#include "tree-checker.h"
  30#include "space-info.h"
  31#include "block-group.h"
  32#include "discard.h"
  33
  34const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  35        [BTRFS_RAID_RAID10] = {
  36                .sub_stripes    = 2,
  37                .dev_stripes    = 1,
  38                .devs_max       = 0,    /* 0 == as many as possible */
  39                .devs_min       = 4,
  40                .tolerated_failures = 1,
  41                .devs_increment = 2,
  42                .ncopies        = 2,
  43                .nparity        = 0,
  44                .raid_name      = "raid10",
  45                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  46                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  47        },
  48        [BTRFS_RAID_RAID1] = {
  49                .sub_stripes    = 1,
  50                .dev_stripes    = 1,
  51                .devs_max       = 2,
  52                .devs_min       = 2,
  53                .tolerated_failures = 1,
  54                .devs_increment = 2,
  55                .ncopies        = 2,
  56                .nparity        = 0,
  57                .raid_name      = "raid1",
  58                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  59                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  60        },
  61        [BTRFS_RAID_RAID1C3] = {
  62                .sub_stripes    = 1,
  63                .dev_stripes    = 1,
  64                .devs_max       = 3,
  65                .devs_min       = 3,
  66                .tolerated_failures = 2,
  67                .devs_increment = 3,
  68                .ncopies        = 3,
  69                .nparity        = 0,
  70                .raid_name      = "raid1c3",
  71                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  72                .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  73        },
  74        [BTRFS_RAID_RAID1C4] = {
  75                .sub_stripes    = 1,
  76                .dev_stripes    = 1,
  77                .devs_max       = 4,
  78                .devs_min       = 4,
  79                .tolerated_failures = 3,
  80                .devs_increment = 4,
  81                .ncopies        = 4,
  82                .nparity        = 0,
  83                .raid_name      = "raid1c4",
  84                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  85                .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  86        },
  87        [BTRFS_RAID_DUP] = {
  88                .sub_stripes    = 1,
  89                .dev_stripes    = 2,
  90                .devs_max       = 1,
  91                .devs_min       = 1,
  92                .tolerated_failures = 0,
  93                .devs_increment = 1,
  94                .ncopies        = 2,
  95                .nparity        = 0,
  96                .raid_name      = "dup",
  97                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
  98                .mindev_error   = 0,
  99        },
 100        [BTRFS_RAID_RAID0] = {
 101                .sub_stripes    = 1,
 102                .dev_stripes    = 1,
 103                .devs_max       = 0,
 104                .devs_min       = 2,
 105                .tolerated_failures = 0,
 106                .devs_increment = 1,
 107                .ncopies        = 1,
 108                .nparity        = 0,
 109                .raid_name      = "raid0",
 110                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 111                .mindev_error   = 0,
 112        },
 113        [BTRFS_RAID_SINGLE] = {
 114                .sub_stripes    = 1,
 115                .dev_stripes    = 1,
 116                .devs_max       = 1,
 117                .devs_min       = 1,
 118                .tolerated_failures = 0,
 119                .devs_increment = 1,
 120                .ncopies        = 1,
 121                .nparity        = 0,
 122                .raid_name      = "single",
 123                .bg_flag        = 0,
 124                .mindev_error   = 0,
 125        },
 126        [BTRFS_RAID_RAID5] = {
 127                .sub_stripes    = 1,
 128                .dev_stripes    = 1,
 129                .devs_max       = 0,
 130                .devs_min       = 2,
 131                .tolerated_failures = 1,
 132                .devs_increment = 1,
 133                .ncopies        = 1,
 134                .nparity        = 1,
 135                .raid_name      = "raid5",
 136                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 137                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 138        },
 139        [BTRFS_RAID_RAID6] = {
 140                .sub_stripes    = 1,
 141                .dev_stripes    = 1,
 142                .devs_max       = 0,
 143                .devs_min       = 3,
 144                .tolerated_failures = 2,
 145                .devs_increment = 1,
 146                .ncopies        = 1,
 147                .nparity        = 2,
 148                .raid_name      = "raid6",
 149                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 150                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 151        },
 152};
 153
 154const char *btrfs_bg_type_to_raid_name(u64 flags)
 155{
 156        const int index = btrfs_bg_flags_to_raid_index(flags);
 157
 158        if (index >= BTRFS_NR_RAID_TYPES)
 159                return NULL;
 160
 161        return btrfs_raid_array[index].raid_name;
 162}
 163
 164/*
 165 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 166 * bytes including terminating null byte.
 167 */
 168void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 169{
 170        int i;
 171        int ret;
 172        char *bp = buf;
 173        u64 flags = bg_flags;
 174        u32 size_bp = size_buf;
 175
 176        if (!flags) {
 177                strcpy(bp, "NONE");
 178                return;
 179        }
 180
 181#define DESCRIBE_FLAG(flag, desc)                                               \
 182        do {                                                            \
 183                if (flags & (flag)) {                                   \
 184                        ret = snprintf(bp, size_bp, "%s|", (desc));     \
 185                        if (ret < 0 || ret >= size_bp)                  \
 186                                goto out_overflow;                      \
 187                        size_bp -= ret;                                 \
 188                        bp += ret;                                      \
 189                        flags &= ~(flag);                               \
 190                }                                                       \
 191        } while (0)
 192
 193        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 194        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 195        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 196
 197        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 198        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 199                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 200                              btrfs_raid_array[i].raid_name);
 201#undef DESCRIBE_FLAG
 202
 203        if (flags) {
 204                ret = snprintf(bp, size_bp, "0x%llx|", flags);
 205                size_bp -= ret;
 206        }
 207
 208        if (size_bp < size_buf)
 209                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 210
 211        /*
 212         * The text is trimmed, it's up to the caller to provide sufficiently
 213         * large buffer
 214         */
 215out_overflow:;
 216}
 217
 218static int init_first_rw_device(struct btrfs_trans_handle *trans);
 219static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 220static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 221static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 222static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 223                             enum btrfs_map_op op,
 224                             u64 logical, u64 *length,
 225                             struct btrfs_bio **bbio_ret,
 226                             int mirror_num, int need_raid_map);
 227
 228/*
 229 * Device locking
 230 * ==============
 231 *
 232 * There are several mutexes that protect manipulation of devices and low-level
 233 * structures like chunks but not block groups, extents or files
 234 *
 235 * uuid_mutex (global lock)
 236 * ------------------------
 237 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 238 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 239 * device) or requested by the device= mount option
 240 *
 241 * the mutex can be very coarse and can cover long-running operations
 242 *
 243 * protects: updates to fs_devices counters like missing devices, rw devices,
 244 * seeding, structure cloning, opening/closing devices at mount/umount time
 245 *
 246 * global::fs_devs - add, remove, updates to the global list
 247 *
 248 * does not protect: manipulation of the fs_devices::devices list!
 249 *
 250 * btrfs_device::name - renames (write side), read is RCU
 251 *
 252 * fs_devices::device_list_mutex (per-fs, with RCU)
 253 * ------------------------------------------------
 254 * protects updates to fs_devices::devices, ie. adding and deleting
 255 *
 256 * simple list traversal with read-only actions can be done with RCU protection
 257 *
 258 * may be used to exclude some operations from running concurrently without any
 259 * modifications to the list (see write_all_supers)
 260 *
 261 * balance_mutex
 262 * -------------
 263 * protects balance structures (status, state) and context accessed from
 264 * several places (internally, ioctl)
 265 *
 266 * chunk_mutex
 267 * -----------
 268 * protects chunks, adding or removing during allocation, trim or when a new
 269 * device is added/removed. Additionally it also protects post_commit_list of
 270 * individual devices, since they can be added to the transaction's
 271 * post_commit_list only with chunk_mutex held.
 272 *
 273 * cleaner_mutex
 274 * -------------
 275 * a big lock that is held by the cleaner thread and prevents running subvolume
 276 * cleaning together with relocation or delayed iputs
 277 *
 278 *
 279 * Lock nesting
 280 * ============
 281 *
 282 * uuid_mutex
 283 *   device_list_mutex
 284 *     chunk_mutex
 285 *   balance_mutex
 286 *
 287 *
 288 * Exclusive operations, BTRFS_FS_EXCL_OP
 289 * ======================================
 290 *
 291 * Maintains the exclusivity of the following operations that apply to the
 292 * whole filesystem and cannot run in parallel.
 293 *
 294 * - Balance (*)
 295 * - Device add
 296 * - Device remove
 297 * - Device replace (*)
 298 * - Resize
 299 *
 300 * The device operations (as above) can be in one of the following states:
 301 *
 302 * - Running state
 303 * - Paused state
 304 * - Completed state
 305 *
 306 * Only device operations marked with (*) can go into the Paused state for the
 307 * following reasons:
 308 *
 309 * - ioctl (only Balance can be Paused through ioctl)
 310 * - filesystem remounted as read-only
 311 * - filesystem unmounted and mounted as read-only
 312 * - system power-cycle and filesystem mounted as read-only
 313 * - filesystem or device errors leading to forced read-only
 314 *
 315 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 316 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 317 * A device operation in Paused or Running state can be canceled or resumed
 318 * either by ioctl (Balance only) or when remounted as read-write.
 319 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 320 * completed.
 321 */
 322
 323DEFINE_MUTEX(uuid_mutex);
 324static LIST_HEAD(fs_uuids);
 325struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 326{
 327        return &fs_uuids;
 328}
 329
 330/*
 331 * alloc_fs_devices - allocate struct btrfs_fs_devices
 332 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 333 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 334 *
 335 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 336 * The returned struct is not linked onto any lists and can be destroyed with
 337 * kfree() right away.
 338 */
 339static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 340                                                 const u8 *metadata_fsid)
 341{
 342        struct btrfs_fs_devices *fs_devs;
 343
 344        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 345        if (!fs_devs)
 346                return ERR_PTR(-ENOMEM);
 347
 348        mutex_init(&fs_devs->device_list_mutex);
 349
 350        INIT_LIST_HEAD(&fs_devs->devices);
 351        INIT_LIST_HEAD(&fs_devs->alloc_list);
 352        INIT_LIST_HEAD(&fs_devs->fs_list);
 353        if (fsid)
 354                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 355
 356        if (metadata_fsid)
 357                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 358        else if (fsid)
 359                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 360
 361        return fs_devs;
 362}
 363
 364void btrfs_free_device(struct btrfs_device *device)
 365{
 366        WARN_ON(!list_empty(&device->post_commit_list));
 367        rcu_string_free(device->name);
 368        extent_io_tree_release(&device->alloc_state);
 369        bio_put(device->flush_bio);
 370        kfree(device);
 371}
 372
 373static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 374{
 375        struct btrfs_device *device;
 376        WARN_ON(fs_devices->opened);
 377        while (!list_empty(&fs_devices->devices)) {
 378                device = list_entry(fs_devices->devices.next,
 379                                    struct btrfs_device, dev_list);
 380                list_del(&device->dev_list);
 381                btrfs_free_device(device);
 382        }
 383        kfree(fs_devices);
 384}
 385
 386void __exit btrfs_cleanup_fs_uuids(void)
 387{
 388        struct btrfs_fs_devices *fs_devices;
 389
 390        while (!list_empty(&fs_uuids)) {
 391                fs_devices = list_entry(fs_uuids.next,
 392                                        struct btrfs_fs_devices, fs_list);
 393                list_del(&fs_devices->fs_list);
 394                free_fs_devices(fs_devices);
 395        }
 396}
 397
 398/*
 399 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 400 * Returned struct is not linked onto any lists and must be destroyed using
 401 * btrfs_free_device.
 402 */
 403static struct btrfs_device *__alloc_device(void)
 404{
 405        struct btrfs_device *dev;
 406
 407        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 408        if (!dev)
 409                return ERR_PTR(-ENOMEM);
 410
 411        /*
 412         * Preallocate a bio that's always going to be used for flushing device
 413         * barriers and matches the device lifespan
 414         */
 415        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
 416        if (!dev->flush_bio) {
 417                kfree(dev);
 418                return ERR_PTR(-ENOMEM);
 419        }
 420
 421        INIT_LIST_HEAD(&dev->dev_list);
 422        INIT_LIST_HEAD(&dev->dev_alloc_list);
 423        INIT_LIST_HEAD(&dev->post_commit_list);
 424
 425        atomic_set(&dev->reada_in_flight, 0);
 426        atomic_set(&dev->dev_stats_ccnt, 0);
 427        btrfs_device_data_ordered_init(dev);
 428        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 429        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 430        extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
 431
 432        return dev;
 433}
 434
 435static noinline struct btrfs_fs_devices *find_fsid(
 436                const u8 *fsid, const u8 *metadata_fsid)
 437{
 438        struct btrfs_fs_devices *fs_devices;
 439
 440        ASSERT(fsid);
 441
 442        /* Handle non-split brain cases */
 443        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 444                if (metadata_fsid) {
 445                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 446                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 447                                      BTRFS_FSID_SIZE) == 0)
 448                                return fs_devices;
 449                } else {
 450                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 451                                return fs_devices;
 452                }
 453        }
 454        return NULL;
 455}
 456
 457static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 458                                struct btrfs_super_block *disk_super)
 459{
 460
 461        struct btrfs_fs_devices *fs_devices;
 462
 463        /*
 464         * Handle scanned device having completed its fsid change but
 465         * belonging to a fs_devices that was created by first scanning
 466         * a device which didn't have its fsid/metadata_uuid changed
 467         * at all and the CHANGING_FSID_V2 flag set.
 468         */
 469        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 470                if (fs_devices->fsid_change &&
 471                    memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 472                           BTRFS_FSID_SIZE) == 0 &&
 473                    memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 474                           BTRFS_FSID_SIZE) == 0) {
 475                        return fs_devices;
 476                }
 477        }
 478        /*
 479         * Handle scanned device having completed its fsid change but
 480         * belonging to a fs_devices that was created by a device that
 481         * has an outdated pair of fsid/metadata_uuid and
 482         * CHANGING_FSID_V2 flag set.
 483         */
 484        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 485                if (fs_devices->fsid_change &&
 486                    memcmp(fs_devices->metadata_uuid,
 487                           fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 488                    memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 489                           BTRFS_FSID_SIZE) == 0) {
 490                        return fs_devices;
 491                }
 492        }
 493
 494        return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 495}
 496
 497
 498static int
 499btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 500                      int flush, struct block_device **bdev,
 501                      struct btrfs_super_block **disk_super)
 502{
 503        int ret;
 504
 505        *bdev = blkdev_get_by_path(device_path, flags, holder);
 506
 507        if (IS_ERR(*bdev)) {
 508                ret = PTR_ERR(*bdev);
 509                goto error;
 510        }
 511
 512        if (flush)
 513                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 514        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 515        if (ret) {
 516                blkdev_put(*bdev, flags);
 517                goto error;
 518        }
 519        invalidate_bdev(*bdev);
 520        *disk_super = btrfs_read_dev_super(*bdev);
 521        if (IS_ERR(*disk_super)) {
 522                ret = PTR_ERR(*disk_super);
 523                blkdev_put(*bdev, flags);
 524                goto error;
 525        }
 526
 527        return 0;
 528
 529error:
 530        *bdev = NULL;
 531        return ret;
 532}
 533
 534static bool device_path_matched(const char *path, struct btrfs_device *device)
 535{
 536        int found;
 537
 538        rcu_read_lock();
 539        found = strcmp(rcu_str_deref(device->name), path);
 540        rcu_read_unlock();
 541
 542        return found == 0;
 543}
 544
 545/*
 546 *  Search and remove all stale (devices which are not mounted) devices.
 547 *  When both inputs are NULL, it will search and release all stale devices.
 548 *  path:       Optional. When provided will it release all unmounted devices
 549 *              matching this path only.
 550 *  skip_dev:   Optional. Will skip this device when searching for the stale
 551 *              devices.
 552 *  Return:     0 for success or if @path is NULL.
 553 *              -EBUSY if @path is a mounted device.
 554 *              -ENOENT if @path does not match any device in the list.
 555 */
 556static int btrfs_free_stale_devices(const char *path,
 557                                     struct btrfs_device *skip_device)
 558{
 559        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 560        struct btrfs_device *device, *tmp_device;
 561        int ret = 0;
 562
 563        if (path)
 564                ret = -ENOENT;
 565
 566        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 567
 568                mutex_lock(&fs_devices->device_list_mutex);
 569                list_for_each_entry_safe(device, tmp_device,
 570                                         &fs_devices->devices, dev_list) {
 571                        if (skip_device && skip_device == device)
 572                                continue;
 573                        if (path && !device->name)
 574                                continue;
 575                        if (path && !device_path_matched(path, device))
 576                                continue;
 577                        if (fs_devices->opened) {
 578                                /* for an already deleted device return 0 */
 579                                if (path && ret != 0)
 580                                        ret = -EBUSY;
 581                                break;
 582                        }
 583
 584                        /* delete the stale device */
 585                        fs_devices->num_devices--;
 586                        list_del(&device->dev_list);
 587                        btrfs_free_device(device);
 588
 589                        ret = 0;
 590                        if (fs_devices->num_devices == 0)
 591                                break;
 592                }
 593                mutex_unlock(&fs_devices->device_list_mutex);
 594
 595                if (fs_devices->num_devices == 0) {
 596                        btrfs_sysfs_remove_fsid(fs_devices);
 597                        list_del(&fs_devices->fs_list);
 598                        free_fs_devices(fs_devices);
 599                }
 600        }
 601
 602        return ret;
 603}
 604
 605static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 606                        struct btrfs_device *device, fmode_t flags,
 607                        void *holder)
 608{
 609        struct request_queue *q;
 610        struct block_device *bdev;
 611        struct btrfs_super_block *disk_super;
 612        u64 devid;
 613        int ret;
 614
 615        if (device->bdev)
 616                return -EINVAL;
 617        if (!device->name)
 618                return -EINVAL;
 619
 620        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 621                                    &bdev, &disk_super);
 622        if (ret)
 623                return ret;
 624
 625        devid = btrfs_stack_device_id(&disk_super->dev_item);
 626        if (devid != device->devid)
 627                goto error_free_page;
 628
 629        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 630                goto error_free_page;
 631
 632        device->generation = btrfs_super_generation(disk_super);
 633
 634        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 635                if (btrfs_super_incompat_flags(disk_super) &
 636                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 637                        pr_err(
 638                "BTRFS: Invalid seeding and uuid-changed device detected\n");
 639                        goto error_free_page;
 640                }
 641
 642                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 643                fs_devices->seeding = true;
 644        } else {
 645                if (bdev_read_only(bdev))
 646                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 647                else
 648                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 649        }
 650
 651        q = bdev_get_queue(bdev);
 652        if (!blk_queue_nonrot(q))
 653                fs_devices->rotating = true;
 654
 655        device->bdev = bdev;
 656        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 657        device->mode = flags;
 658
 659        fs_devices->open_devices++;
 660        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 661            device->devid != BTRFS_DEV_REPLACE_DEVID) {
 662                fs_devices->rw_devices++;
 663                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 664        }
 665        btrfs_release_disk_super(disk_super);
 666
 667        return 0;
 668
 669error_free_page:
 670        btrfs_release_disk_super(disk_super);
 671        blkdev_put(bdev, flags);
 672
 673        return -EINVAL;
 674}
 675
 676/*
 677 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 678 * being created with a disk that has already completed its fsid change. Such
 679 * disk can belong to an fs which has its FSID changed or to one which doesn't.
 680 * Handle both cases here.
 681 */
 682static struct btrfs_fs_devices *find_fsid_inprogress(
 683                                        struct btrfs_super_block *disk_super)
 684{
 685        struct btrfs_fs_devices *fs_devices;
 686
 687        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 688                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 689                           BTRFS_FSID_SIZE) != 0 &&
 690                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 691                           BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 692                        return fs_devices;
 693                }
 694        }
 695
 696        return find_fsid(disk_super->fsid, NULL);
 697}
 698
 699
 700static struct btrfs_fs_devices *find_fsid_changed(
 701                                        struct btrfs_super_block *disk_super)
 702{
 703        struct btrfs_fs_devices *fs_devices;
 704
 705        /*
 706         * Handles the case where scanned device is part of an fs that had
 707         * multiple successful changes of FSID but curently device didn't
 708         * observe it. Meaning our fsid will be different than theirs. We need
 709         * to handle two subcases :
 710         *  1 - The fs still continues to have different METADATA/FSID uuids.
 711         *  2 - The fs is switched back to its original FSID (METADATA/FSID
 712         *  are equal).
 713         */
 714        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 715                /* Changed UUIDs */
 716                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 717                           BTRFS_FSID_SIZE) != 0 &&
 718                    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 719                           BTRFS_FSID_SIZE) == 0 &&
 720                    memcmp(fs_devices->fsid, disk_super->fsid,
 721                           BTRFS_FSID_SIZE) != 0)
 722                        return fs_devices;
 723
 724                /* Unchanged UUIDs */
 725                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 726                           BTRFS_FSID_SIZE) == 0 &&
 727                    memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 728                           BTRFS_FSID_SIZE) == 0)
 729                        return fs_devices;
 730        }
 731
 732        return NULL;
 733}
 734
 735static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 736                                struct btrfs_super_block *disk_super)
 737{
 738        struct btrfs_fs_devices *fs_devices;
 739
 740        /*
 741         * Handle the case where the scanned device is part of an fs whose last
 742         * metadata UUID change reverted it to the original FSID. At the same
 743         * time * fs_devices was first created by another constitutent device
 744         * which didn't fully observe the operation. This results in an
 745         * btrfs_fs_devices created with metadata/fsid different AND
 746         * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 747         * fs_devices equal to the FSID of the disk.
 748         */
 749        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 750                if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 751                           BTRFS_FSID_SIZE) != 0 &&
 752                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 753                           BTRFS_FSID_SIZE) == 0 &&
 754                    fs_devices->fsid_change)
 755                        return fs_devices;
 756        }
 757
 758        return NULL;
 759}
 760/*
 761 * Add new device to list of registered devices
 762 *
 763 * Returns:
 764 * device pointer which was just added or updated when successful
 765 * error pointer when failed
 766 */
 767static noinline struct btrfs_device *device_list_add(const char *path,
 768                           struct btrfs_super_block *disk_super,
 769                           bool *new_device_added)
 770{
 771        struct btrfs_device *device;
 772        struct btrfs_fs_devices *fs_devices = NULL;
 773        struct rcu_string *name;
 774        u64 found_transid = btrfs_super_generation(disk_super);
 775        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 776        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 777                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 778        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 779                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 780
 781        if (fsid_change_in_progress) {
 782                if (!has_metadata_uuid)
 783                        fs_devices = find_fsid_inprogress(disk_super);
 784                else
 785                        fs_devices = find_fsid_changed(disk_super);
 786        } else if (has_metadata_uuid) {
 787                fs_devices = find_fsid_with_metadata_uuid(disk_super);
 788        } else {
 789                fs_devices = find_fsid_reverted_metadata(disk_super);
 790                if (!fs_devices)
 791                        fs_devices = find_fsid(disk_super->fsid, NULL);
 792        }
 793
 794
 795        if (!fs_devices) {
 796                if (has_metadata_uuid)
 797                        fs_devices = alloc_fs_devices(disk_super->fsid,
 798                                                      disk_super->metadata_uuid);
 799                else
 800                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 801
 802                if (IS_ERR(fs_devices))
 803                        return ERR_CAST(fs_devices);
 804
 805                fs_devices->fsid_change = fsid_change_in_progress;
 806
 807                mutex_lock(&fs_devices->device_list_mutex);
 808                list_add(&fs_devices->fs_list, &fs_uuids);
 809
 810                device = NULL;
 811        } else {
 812                mutex_lock(&fs_devices->device_list_mutex);
 813                device = btrfs_find_device(fs_devices, devid,
 814                                disk_super->dev_item.uuid, NULL, false);
 815
 816                /*
 817                 * If this disk has been pulled into an fs devices created by
 818                 * a device which had the CHANGING_FSID_V2 flag then replace the
 819                 * metadata_uuid/fsid values of the fs_devices.
 820                 */
 821                if (fs_devices->fsid_change &&
 822                    found_transid > fs_devices->latest_generation) {
 823                        memcpy(fs_devices->fsid, disk_super->fsid,
 824                                        BTRFS_FSID_SIZE);
 825
 826                        if (has_metadata_uuid)
 827                                memcpy(fs_devices->metadata_uuid,
 828                                       disk_super->metadata_uuid,
 829                                       BTRFS_FSID_SIZE);
 830                        else
 831                                memcpy(fs_devices->metadata_uuid,
 832                                       disk_super->fsid, BTRFS_FSID_SIZE);
 833
 834                        fs_devices->fsid_change = false;
 835                }
 836        }
 837
 838        if (!device) {
 839                if (fs_devices->opened) {
 840                        mutex_unlock(&fs_devices->device_list_mutex);
 841                        return ERR_PTR(-EBUSY);
 842                }
 843
 844                device = btrfs_alloc_device(NULL, &devid,
 845                                            disk_super->dev_item.uuid);
 846                if (IS_ERR(device)) {
 847                        mutex_unlock(&fs_devices->device_list_mutex);
 848                        /* we can safely leave the fs_devices entry around */
 849                        return device;
 850                }
 851
 852                name = rcu_string_strdup(path, GFP_NOFS);
 853                if (!name) {
 854                        btrfs_free_device(device);
 855                        mutex_unlock(&fs_devices->device_list_mutex);
 856                        return ERR_PTR(-ENOMEM);
 857                }
 858                rcu_assign_pointer(device->name, name);
 859
 860                list_add_rcu(&device->dev_list, &fs_devices->devices);
 861                fs_devices->num_devices++;
 862
 863                device->fs_devices = fs_devices;
 864                *new_device_added = true;
 865
 866                if (disk_super->label[0])
 867                        pr_info(
 868        "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 869                                disk_super->label, devid, found_transid, path,
 870                                current->comm, task_pid_nr(current));
 871                else
 872                        pr_info(
 873        "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 874                                disk_super->fsid, devid, found_transid, path,
 875                                current->comm, task_pid_nr(current));
 876
 877        } else if (!device->name || strcmp(device->name->str, path)) {
 878                /*
 879                 * When FS is already mounted.
 880                 * 1. If you are here and if the device->name is NULL that
 881                 *    means this device was missing at time of FS mount.
 882                 * 2. If you are here and if the device->name is different
 883                 *    from 'path' that means either
 884                 *      a. The same device disappeared and reappeared with
 885                 *         different name. or
 886                 *      b. The missing-disk-which-was-replaced, has
 887                 *         reappeared now.
 888                 *
 889                 * We must allow 1 and 2a above. But 2b would be a spurious
 890                 * and unintentional.
 891                 *
 892                 * Further in case of 1 and 2a above, the disk at 'path'
 893                 * would have missed some transaction when it was away and
 894                 * in case of 2a the stale bdev has to be updated as well.
 895                 * 2b must not be allowed at all time.
 896                 */
 897
 898                /*
 899                 * For now, we do allow update to btrfs_fs_device through the
 900                 * btrfs dev scan cli after FS has been mounted.  We're still
 901                 * tracking a problem where systems fail mount by subvolume id
 902                 * when we reject replacement on a mounted FS.
 903                 */
 904                if (!fs_devices->opened && found_transid < device->generation) {
 905                        /*
 906                         * That is if the FS is _not_ mounted and if you
 907                         * are here, that means there is more than one
 908                         * disk with same uuid and devid.We keep the one
 909                         * with larger generation number or the last-in if
 910                         * generation are equal.
 911                         */
 912                        mutex_unlock(&fs_devices->device_list_mutex);
 913                        return ERR_PTR(-EEXIST);
 914                }
 915
 916                /*
 917                 * We are going to replace the device path for a given devid,
 918                 * make sure it's the same device if the device is mounted
 919                 */
 920                if (device->bdev) {
 921                        struct block_device *path_bdev;
 922
 923                        path_bdev = lookup_bdev(path);
 924                        if (IS_ERR(path_bdev)) {
 925                                mutex_unlock(&fs_devices->device_list_mutex);
 926                                return ERR_CAST(path_bdev);
 927                        }
 928
 929                        if (device->bdev != path_bdev) {
 930                                bdput(path_bdev);
 931                                mutex_unlock(&fs_devices->device_list_mutex);
 932                                btrfs_warn_in_rcu(device->fs_info,
 933                        "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
 934                                        disk_super->fsid, devid,
 935                                        rcu_str_deref(device->name), path);
 936                                return ERR_PTR(-EEXIST);
 937                        }
 938                        bdput(path_bdev);
 939                        btrfs_info_in_rcu(device->fs_info,
 940                                "device fsid %pU devid %llu moved old:%s new:%s",
 941                                disk_super->fsid, devid,
 942                                rcu_str_deref(device->name), path);
 943                }
 944
 945                name = rcu_string_strdup(path, GFP_NOFS);
 946                if (!name) {
 947                        mutex_unlock(&fs_devices->device_list_mutex);
 948                        return ERR_PTR(-ENOMEM);
 949                }
 950                rcu_string_free(device->name);
 951                rcu_assign_pointer(device->name, name);
 952                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
 953                        fs_devices->missing_devices--;
 954                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
 955                }
 956        }
 957
 958        /*
 959         * Unmount does not free the btrfs_device struct but would zero
 960         * generation along with most of the other members. So just update
 961         * it back. We need it to pick the disk with largest generation
 962         * (as above).
 963         */
 964        if (!fs_devices->opened) {
 965                device->generation = found_transid;
 966                fs_devices->latest_generation = max_t(u64, found_transid,
 967                                                fs_devices->latest_generation);
 968        }
 969
 970        fs_devices->total_devices = btrfs_super_num_devices(disk_super);
 971
 972        mutex_unlock(&fs_devices->device_list_mutex);
 973        return device;
 974}
 975
 976static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 977{
 978        struct btrfs_fs_devices *fs_devices;
 979        struct btrfs_device *device;
 980        struct btrfs_device *orig_dev;
 981        int ret = 0;
 982
 983        fs_devices = alloc_fs_devices(orig->fsid, NULL);
 984        if (IS_ERR(fs_devices))
 985                return fs_devices;
 986
 987        mutex_lock(&orig->device_list_mutex);
 988        fs_devices->total_devices = orig->total_devices;
 989
 990        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 991                struct rcu_string *name;
 992
 993                device = btrfs_alloc_device(NULL, &orig_dev->devid,
 994                                            orig_dev->uuid);
 995                if (IS_ERR(device)) {
 996                        ret = PTR_ERR(device);
 997                        goto error;
 998                }
 999
1000                /*
1001                 * This is ok to do without rcu read locked because we hold the
1002                 * uuid mutex so nothing we touch in here is going to disappear.
1003                 */
1004                if (orig_dev->name) {
1005                        name = rcu_string_strdup(orig_dev->name->str,
1006                                        GFP_KERNEL);
1007                        if (!name) {
1008                                btrfs_free_device(device);
1009                                ret = -ENOMEM;
1010                                goto error;
1011                        }
1012                        rcu_assign_pointer(device->name, name);
1013                }
1014
1015                list_add(&device->dev_list, &fs_devices->devices);
1016                device->fs_devices = fs_devices;
1017                fs_devices->num_devices++;
1018        }
1019        mutex_unlock(&orig->device_list_mutex);
1020        return fs_devices;
1021error:
1022        mutex_unlock(&orig->device_list_mutex);
1023        free_fs_devices(fs_devices);
1024        return ERR_PTR(ret);
1025}
1026
1027/*
1028 * After we have read the system tree and know devids belonging to
1029 * this filesystem, remove the device which does not belong there.
1030 */
1031void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1032{
1033        struct btrfs_device *device, *next;
1034        struct btrfs_device *latest_dev = NULL;
1035
1036        mutex_lock(&uuid_mutex);
1037again:
1038        /* This is the initialized path, it is safe to release the devices. */
1039        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1040                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
1041                                                        &device->dev_state)) {
1042                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1043                             &device->dev_state) &&
1044                            !test_bit(BTRFS_DEV_STATE_MISSING,
1045                                      &device->dev_state) &&
1046                             (!latest_dev ||
1047                              device->generation > latest_dev->generation)) {
1048                                latest_dev = device;
1049                        }
1050                        continue;
1051                }
1052
1053                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
1054                        /*
1055                         * In the first step, keep the device which has
1056                         * the correct fsid and the devid that is used
1057                         * for the dev_replace procedure.
1058                         * In the second step, the dev_replace state is
1059                         * read from the device tree and it is known
1060                         * whether the procedure is really active or
1061                         * not, which means whether this device is
1062                         * used or whether it should be removed.
1063                         */
1064                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1065                                                  &device->dev_state)) {
1066                                continue;
1067                        }
1068                }
1069                if (device->bdev) {
1070                        blkdev_put(device->bdev, device->mode);
1071                        device->bdev = NULL;
1072                        fs_devices->open_devices--;
1073                }
1074                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1075                        list_del_init(&device->dev_alloc_list);
1076                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1077                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1078                                      &device->dev_state))
1079                                fs_devices->rw_devices--;
1080                }
1081                list_del_init(&device->dev_list);
1082                fs_devices->num_devices--;
1083                btrfs_free_device(device);
1084        }
1085
1086        if (fs_devices->seed) {
1087                fs_devices = fs_devices->seed;
1088                goto again;
1089        }
1090
1091        fs_devices->latest_bdev = latest_dev->bdev;
1092
1093        mutex_unlock(&uuid_mutex);
1094}
1095
1096static void btrfs_close_bdev(struct btrfs_device *device)
1097{
1098        if (!device->bdev)
1099                return;
1100
1101        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1102                sync_blockdev(device->bdev);
1103                invalidate_bdev(device->bdev);
1104        }
1105
1106        blkdev_put(device->bdev, device->mode);
1107}
1108
1109static void btrfs_close_one_device(struct btrfs_device *device)
1110{
1111        struct btrfs_fs_devices *fs_devices = device->fs_devices;
1112
1113        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1114            device->devid != BTRFS_DEV_REPLACE_DEVID) {
1115                list_del_init(&device->dev_alloc_list);
1116                fs_devices->rw_devices--;
1117        }
1118
1119        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
1120                fs_devices->missing_devices--;
1121
1122        btrfs_close_bdev(device);
1123        if (device->bdev) {
1124                fs_devices->open_devices--;
1125                device->bdev = NULL;
1126        }
1127        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1128
1129        device->fs_info = NULL;
1130        atomic_set(&device->dev_stats_ccnt, 0);
1131        extent_io_tree_release(&device->alloc_state);
1132
1133        /* Verify the device is back in a pristine state  */
1134        ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1135        ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1136        ASSERT(list_empty(&device->dev_alloc_list));
1137        ASSERT(list_empty(&device->post_commit_list));
1138        ASSERT(atomic_read(&device->reada_in_flight) == 0);
1139}
1140
1141static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
1142{
1143        struct btrfs_device *device, *tmp;
1144
1145        if (--fs_devices->opened > 0)
1146                return 0;
1147
1148        mutex_lock(&fs_devices->device_list_mutex);
1149        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
1150                btrfs_close_one_device(device);
1151        }
1152        mutex_unlock(&fs_devices->device_list_mutex);
1153
1154        WARN_ON(fs_devices->open_devices);
1155        WARN_ON(fs_devices->rw_devices);
1156        fs_devices->opened = 0;
1157        fs_devices->seeding = false;
1158
1159        return 0;
1160}
1161
1162int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1163{
1164        struct btrfs_fs_devices *seed_devices = NULL;
1165        int ret;
1166
1167        mutex_lock(&uuid_mutex);
1168        ret = close_fs_devices(fs_devices);
1169        if (!fs_devices->opened) {
1170                seed_devices = fs_devices->seed;
1171                fs_devices->seed = NULL;
1172        }
1173        mutex_unlock(&uuid_mutex);
1174
1175        while (seed_devices) {
1176                fs_devices = seed_devices;
1177                seed_devices = fs_devices->seed;
1178                close_fs_devices(fs_devices);
1179                free_fs_devices(fs_devices);
1180        }
1181        return ret;
1182}
1183
1184static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1185                                fmode_t flags, void *holder)
1186{
1187        struct btrfs_device *device;
1188        struct btrfs_device *latest_dev = NULL;
1189
1190        flags |= FMODE_EXCL;
1191
1192        list_for_each_entry(device, &fs_devices->devices, dev_list) {
1193                /* Just open everything we can; ignore failures here */
1194                if (btrfs_open_one_device(fs_devices, device, flags, holder))
1195                        continue;
1196
1197                if (!latest_dev ||
1198                    device->generation > latest_dev->generation)
1199                        latest_dev = device;
1200        }
1201        if (fs_devices->open_devices == 0)
1202                return -EINVAL;
1203
1204        fs_devices->opened = 1;
1205        fs_devices->latest_bdev = latest_dev->bdev;
1206        fs_devices->total_rw_bytes = 0;
1207        fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1208
1209        return 0;
1210}
1211
1212static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
1213{
1214        struct btrfs_device *dev1, *dev2;
1215
1216        dev1 = list_entry(a, struct btrfs_device, dev_list);
1217        dev2 = list_entry(b, struct btrfs_device, dev_list);
1218
1219        if (dev1->devid < dev2->devid)
1220                return -1;
1221        else if (dev1->devid > dev2->devid)
1222                return 1;
1223        return 0;
1224}
1225
1226int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1227                       fmode_t flags, void *holder)
1228{
1229        int ret;
1230
1231        lockdep_assert_held(&uuid_mutex);
1232
1233        mutex_lock(&fs_devices->device_list_mutex);
1234        if (fs_devices->opened) {
1235                fs_devices->opened++;
1236                ret = 0;
1237        } else {
1238                list_sort(NULL, &fs_devices->devices, devid_cmp);
1239                ret = open_fs_devices(fs_devices, flags, holder);
1240        }
1241        mutex_unlock(&fs_devices->device_list_mutex);
1242
1243        return ret;
1244}
1245
1246void btrfs_release_disk_super(struct btrfs_super_block *super)
1247{
1248        struct page *page = virt_to_page(super);
1249
1250        put_page(page);
1251}
1252
1253static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1254                                                       u64 bytenr)
1255{
1256        struct btrfs_super_block *disk_super;
1257        struct page *page;
1258        void *p;
1259        pgoff_t index;
1260
1261        /* make sure our super fits in the device */
1262        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1263                return ERR_PTR(-EINVAL);
1264
1265        /* make sure our super fits in the page */
1266        if (sizeof(*disk_super) > PAGE_SIZE)
1267                return ERR_PTR(-EINVAL);
1268
1269        /* make sure our super doesn't straddle pages on disk */
1270        index = bytenr >> PAGE_SHIFT;
1271        if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1272                return ERR_PTR(-EINVAL);
1273
1274        /* pull in the page with our super */
1275        page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1276
1277        if (IS_ERR(page))
1278                return ERR_CAST(page);
1279
1280        p = page_address(page);
1281
1282        /* align our pointer to the offset of the super block */
1283        disk_super = p + offset_in_page(bytenr);
1284
1285        if (btrfs_super_bytenr(disk_super) != bytenr ||
1286            btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1287                btrfs_release_disk_super(p);
1288                return ERR_PTR(-EINVAL);
1289        }
1290
1291        if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1292                disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1293
1294        return disk_super;
1295}
1296
1297int btrfs_forget_devices(const char *path)
1298{
1299        int ret;
1300
1301        mutex_lock(&uuid_mutex);
1302        ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1303        mutex_unlock(&uuid_mutex);
1304
1305        return ret;
1306}
1307
1308/*
1309 * Look for a btrfs signature on a device. This may be called out of the mount path
1310 * and we are not allowed to call set_blocksize during the scan. The superblock
1311 * is read via pagecache
1312 */
1313struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1314                                           void *holder)
1315{
1316        struct btrfs_super_block *disk_super;
1317        bool new_device_added = false;
1318        struct btrfs_device *device = NULL;
1319        struct block_device *bdev;
1320        u64 bytenr;
1321
1322        lockdep_assert_held(&uuid_mutex);
1323
1324        /*
1325         * we would like to check all the supers, but that would make
1326         * a btrfs mount succeed after a mkfs from a different FS.
1327         * So, we need to add a special mount option to scan for
1328         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1329         */
1330        bytenr = btrfs_sb_offset(0);
1331        flags |= FMODE_EXCL;
1332
1333        bdev = blkdev_get_by_path(path, flags, holder);
1334        if (IS_ERR(bdev))
1335                return ERR_CAST(bdev);
1336
1337        disk_super = btrfs_read_disk_super(bdev, bytenr);
1338        if (IS_ERR(disk_super)) {
1339                device = ERR_CAST(disk_super);
1340                goto error_bdev_put;
1341        }
1342
1343        device = device_list_add(path, disk_super, &new_device_added);
1344        if (!IS_ERR(device)) {
1345                if (new_device_added)
1346                        btrfs_free_stale_devices(path, device);
1347        }
1348
1349        btrfs_release_disk_super(disk_super);
1350
1351error_bdev_put:
1352        blkdev_put(bdev, flags);
1353
1354        return device;
1355}
1356
1357/*
1358 * Try to find a chunk that intersects [start, start + len] range and when one
1359 * such is found, record the end of it in *start
1360 */
1361static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1362                                    u64 len)
1363{
1364        u64 physical_start, physical_end;
1365
1366        lockdep_assert_held(&device->fs_info->chunk_mutex);
1367
1368        if (!find_first_extent_bit(&device->alloc_state, *start,
1369                                   &physical_start, &physical_end,
1370                                   CHUNK_ALLOCATED, NULL)) {
1371
1372                if (in_range(physical_start, *start, len) ||
1373                    in_range(*start, physical_start,
1374                             physical_end - physical_start)) {
1375                        *start = physical_end + 1;
1376                        return true;
1377                }
1378        }
1379        return false;
1380}
1381
1382static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1383{
1384        switch (device->fs_devices->chunk_alloc_policy) {
1385        case BTRFS_CHUNK_ALLOC_REGULAR:
1386                /*
1387                 * We don't want to overwrite the superblock on the drive nor
1388                 * any area used by the boot loader (grub for example), so we
1389                 * make sure to start at an offset of at least 1MB.
1390                 */
1391                return max_t(u64, start, SZ_1M);
1392        default:
1393                BUG();
1394        }
1395}
1396
1397/**
1398 * dev_extent_hole_check - check if specified hole is suitable for allocation
1399 * @device:     the device which we have the hole
1400 * @hole_start: starting position of the hole
1401 * @hole_size:  the size of the hole
1402 * @num_bytes:  the size of the free space that we need
1403 *
1404 * This function may modify @hole_start and @hole_end to reflect the suitable
1405 * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1406 */
1407static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1408                                  u64 *hole_size, u64 num_bytes)
1409{
1410        bool changed = false;
1411        u64 hole_end = *hole_start + *hole_size;
1412
1413        /*
1414         * Check before we set max_hole_start, otherwise we could end up
1415         * sending back this offset anyway.
1416         */
1417        if (contains_pending_extent(device, hole_start, *hole_size)) {
1418                if (hole_end >= *hole_start)
1419                        *hole_size = hole_end - *hole_start;
1420                else
1421                        *hole_size = 0;
1422                changed = true;
1423        }
1424
1425        switch (device->fs_devices->chunk_alloc_policy) {
1426        case BTRFS_CHUNK_ALLOC_REGULAR:
1427                /* No extra check */
1428                break;
1429        default:
1430                BUG();
1431        }
1432
1433        return changed;
1434}
1435
1436/*
1437 * find_free_dev_extent_start - find free space in the specified device
1438 * @device:       the device which we search the free space in
1439 * @num_bytes:    the size of the free space that we need
1440 * @search_start: the position from which to begin the search
1441 * @start:        store the start of the free space.
1442 * @len:          the size of the free space. that we find, or the size
1443 *                of the max free space if we don't find suitable free space
1444 *
1445 * this uses a pretty simple search, the expectation is that it is
1446 * called very infrequently and that a given device has a small number
1447 * of extents
1448 *
1449 * @start is used to store the start of the free space if we find. But if we
1450 * don't find suitable free space, it will be used to store the start position
1451 * of the max free space.
1452 *
1453 * @len is used to store the size of the free space that we find.
1454 * But if we don't find suitable free space, it is used to store the size of
1455 * the max free space.
1456 *
1457 * NOTE: This function will search *commit* root of device tree, and does extra
1458 * check to ensure dev extents are not double allocated.
1459 * This makes the function safe to allocate dev extents but may not report
1460 * correct usable device space, as device extent freed in current transaction
1461 * is not reported as avaiable.
1462 */
1463static int find_free_dev_extent_start(struct btrfs_device *device,
1464                                u64 num_bytes, u64 search_start, u64 *start,
1465                                u64 *len)
1466{
1467        struct btrfs_fs_info *fs_info = device->fs_info;
1468        struct btrfs_root *root = fs_info->dev_root;
1469        struct btrfs_key key;
1470        struct btrfs_dev_extent *dev_extent;
1471        struct btrfs_path *path;
1472        u64 hole_size;
1473        u64 max_hole_start;
1474        u64 max_hole_size;
1475        u64 extent_end;
1476        u64 search_end = device->total_bytes;
1477        int ret;
1478        int slot;
1479        struct extent_buffer *l;
1480
1481        search_start = dev_extent_search_start(device, search_start);
1482
1483        path = btrfs_alloc_path();
1484        if (!path)
1485                return -ENOMEM;
1486
1487        max_hole_start = search_start;
1488        max_hole_size = 0;
1489
1490again:
1491        if (search_start >= search_end ||
1492                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1493                ret = -ENOSPC;
1494                goto out;
1495        }
1496
1497        path->reada = READA_FORWARD;
1498        path->search_commit_root = 1;
1499        path->skip_locking = 1;
1500
1501        key.objectid = device->devid;
1502        key.offset = search_start;
1503        key.type = BTRFS_DEV_EXTENT_KEY;
1504
1505        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1506        if (ret < 0)
1507                goto out;
1508        if (ret > 0) {
1509                ret = btrfs_previous_item(root, path, key.objectid, key.type);
1510                if (ret < 0)
1511                        goto out;
1512        }
1513
1514        while (1) {
1515                l = path->nodes[0];
1516                slot = path->slots[0];
1517                if (slot >= btrfs_header_nritems(l)) {
1518                        ret = btrfs_next_leaf(root, path);
1519                        if (ret == 0)
1520                                continue;
1521                        if (ret < 0)
1522                                goto out;
1523
1524                        break;
1525                }
1526                btrfs_item_key_to_cpu(l, &key, slot);
1527
1528                if (key.objectid < device->devid)
1529                        goto next;
1530
1531                if (key.objectid > device->devid)
1532                        break;
1533
1534                if (key.type != BTRFS_DEV_EXTENT_KEY)
1535                        goto next;
1536
1537                if (key.offset > search_start) {
1538                        hole_size = key.offset - search_start;
1539                        dev_extent_hole_check(device, &search_start, &hole_size,
1540                                              num_bytes);
1541
1542                        if (hole_size > max_hole_size) {
1543                                max_hole_start = search_start;
1544                                max_hole_size = hole_size;
1545                        }
1546
1547                        /*
1548                         * If this free space is greater than which we need,
1549                         * it must be the max free space that we have found
1550                         * until now, so max_hole_start must point to the start
1551                         * of this free space and the length of this free space
1552                         * is stored in max_hole_size. Thus, we return
1553                         * max_hole_start and max_hole_size and go back to the
1554                         * caller.
1555                         */
1556                        if (hole_size >= num_bytes) {
1557                                ret = 0;
1558                                goto out;
1559                        }
1560                }
1561
1562                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1563                extent_end = key.offset + btrfs_dev_extent_length(l,
1564                                                                  dev_extent);
1565                if (extent_end > search_start)
1566                        search_start = extent_end;
1567next:
1568                path->slots[0]++;
1569                cond_resched();
1570        }
1571
1572        /*
1573         * At this point, search_start should be the end of
1574         * allocated dev extents, and when shrinking the device,
1575         * search_end may be smaller than search_start.
1576         */
1577        if (search_end > search_start) {
1578                hole_size = search_end - search_start;
1579                if (dev_extent_hole_check(device, &search_start, &hole_size,
1580                                          num_bytes)) {
1581                        btrfs_release_path(path);
1582                        goto again;
1583                }
1584
1585                if (hole_size > max_hole_size) {
1586                        max_hole_start = search_start;
1587                        max_hole_size = hole_size;
1588                }
1589        }
1590
1591        /* See above. */
1592        if (max_hole_size < num_bytes)
1593                ret = -ENOSPC;
1594        else
1595                ret = 0;
1596
1597out:
1598        btrfs_free_path(path);
1599        *start = max_hole_start;
1600        if (len)
1601                *len = max_hole_size;
1602        return ret;
1603}
1604
1605int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1606                         u64 *start, u64 *len)
1607{
1608        /* FIXME use last free of some kind */
1609        return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1610}
1611
1612static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1613                          struct btrfs_device *device,
1614                          u64 start, u64 *dev_extent_len)
1615{
1616        struct btrfs_fs_info *fs_info = device->fs_info;
1617        struct btrfs_root *root = fs_info->dev_root;
1618        int ret;
1619        struct btrfs_path *path;
1620        struct btrfs_key key;
1621        struct btrfs_key found_key;
1622        struct extent_buffer *leaf = NULL;
1623        struct btrfs_dev_extent *extent = NULL;
1624
1625        path = btrfs_alloc_path();
1626        if (!path)
1627                return -ENOMEM;
1628
1629        key.objectid = device->devid;
1630        key.offset = start;
1631        key.type = BTRFS_DEV_EXTENT_KEY;
1632again:
1633        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1634        if (ret > 0) {
1635                ret = btrfs_previous_item(root, path, key.objectid,
1636                                          BTRFS_DEV_EXTENT_KEY);
1637                if (ret)
1638                        goto out;
1639                leaf = path->nodes[0];
1640                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1641                extent = btrfs_item_ptr(leaf, path->slots[0],
1642                                        struct btrfs_dev_extent);
1643                BUG_ON(found_key.offset > start || found_key.offset +
1644                       btrfs_dev_extent_length(leaf, extent) < start);
1645                key = found_key;
1646                btrfs_release_path(path);
1647                goto again;
1648        } else if (ret == 0) {
1649                leaf = path->nodes[0];
1650                extent = btrfs_item_ptr(leaf, path->slots[0],
1651                                        struct btrfs_dev_extent);
1652        } else {
1653                btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1654                goto out;
1655        }
1656
1657        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1658
1659        ret = btrfs_del_item(trans, root, path);
1660        if (ret) {
1661                btrfs_handle_fs_error(fs_info, ret,
1662                                      "Failed to remove dev extent item");
1663        } else {
1664                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1665        }
1666out:
1667        btrfs_free_path(path);
1668        return ret;
1669}
1670
1671static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1672                                  struct btrfs_device *device,
1673                                  u64 chunk_offset, u64 start, u64 num_bytes)
1674{
1675        int ret;
1676        struct btrfs_path *path;
1677        struct btrfs_fs_info *fs_info = device->fs_info;
1678        struct btrfs_root *root = fs_info->dev_root;
1679        struct btrfs_dev_extent *extent;
1680        struct extent_buffer *leaf;
1681        struct btrfs_key key;
1682
1683        WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1684        WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1685        path = btrfs_alloc_path();
1686        if (!path)
1687                return -ENOMEM;
1688
1689        key.objectid = device->devid;
1690        key.offset = start;
1691        key.type = BTRFS_DEV_EXTENT_KEY;
1692        ret = btrfs_insert_empty_item(trans, root, path, &key,
1693                                      sizeof(*extent));
1694        if (ret)
1695                goto out;
1696
1697        leaf = path->nodes[0];
1698        extent = btrfs_item_ptr(leaf, path->slots[0],
1699                                struct btrfs_dev_extent);
1700        btrfs_set_dev_extent_chunk_tree(leaf, extent,
1701                                        BTRFS_CHUNK_TREE_OBJECTID);
1702        btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1703                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1704        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1705
1706        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1707        btrfs_mark_buffer_dirty(leaf);
1708out:
1709        btrfs_free_path(path);
1710        return ret;
1711}
1712
1713static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1714{
1715        struct extent_map_tree *em_tree;
1716        struct extent_map *em;
1717        struct rb_node *n;
1718        u64 ret = 0;
1719
1720        em_tree = &fs_info->mapping_tree;
1721        read_lock(&em_tree->lock);
1722        n = rb_last(&em_tree->map.rb_root);
1723        if (n) {
1724                em = rb_entry(n, struct extent_map, rb_node);
1725                ret = em->start + em->len;
1726        }
1727        read_unlock(&em_tree->lock);
1728
1729        return ret;
1730}
1731
1732static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1733                                    u64 *devid_ret)
1734{
1735        int ret;
1736        struct btrfs_key key;
1737        struct btrfs_key found_key;
1738        struct btrfs_path *path;
1739
1740        path = btrfs_alloc_path();
1741        if (!path)
1742                return -ENOMEM;
1743
1744        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1745        key.type = BTRFS_DEV_ITEM_KEY;
1746        key.offset = (u64)-1;
1747
1748        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1749        if (ret < 0)
1750                goto error;
1751
1752        if (ret == 0) {
1753                /* Corruption */
1754                btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1755                ret = -EUCLEAN;
1756                goto error;
1757        }
1758
1759        ret = btrfs_previous_item(fs_info->chunk_root, path,
1760                                  BTRFS_DEV_ITEMS_OBJECTID,
1761                                  BTRFS_DEV_ITEM_KEY);
1762        if (ret) {
1763                *devid_ret = 1;
1764        } else {
1765                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1766                                      path->slots[0]);
1767                *devid_ret = found_key.offset + 1;
1768        }
1769        ret = 0;
1770error:
1771        btrfs_free_path(path);
1772        return ret;
1773}
1774
1775/*
1776 * the device information is stored in the chunk root
1777 * the btrfs_device struct should be fully filled in
1778 */
1779static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1780                            struct btrfs_device *device)
1781{
1782        int ret;
1783        struct btrfs_path *path;
1784        struct btrfs_dev_item *dev_item;
1785        struct extent_buffer *leaf;
1786        struct btrfs_key key;
1787        unsigned long ptr;
1788
1789        path = btrfs_alloc_path();
1790        if (!path)
1791                return -ENOMEM;
1792
1793        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1794        key.type = BTRFS_DEV_ITEM_KEY;
1795        key.offset = device->devid;
1796
1797        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1798                                      &key, sizeof(*dev_item));
1799        if (ret)
1800                goto out;
1801
1802        leaf = path->nodes[0];
1803        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1804
1805        btrfs_set_device_id(leaf, dev_item, device->devid);
1806        btrfs_set_device_generation(leaf, dev_item, 0);
1807        btrfs_set_device_type(leaf, dev_item, device->type);
1808        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1809        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1810        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1811        btrfs_set_device_total_bytes(leaf, dev_item,
1812                                     btrfs_device_get_disk_total_bytes(device));
1813        btrfs_set_device_bytes_used(leaf, dev_item,
1814                                    btrfs_device_get_bytes_used(device));
1815        btrfs_set_device_group(leaf, dev_item, 0);
1816        btrfs_set_device_seek_speed(leaf, dev_item, 0);
1817        btrfs_set_device_bandwidth(leaf, dev_item, 0);
1818        btrfs_set_device_start_offset(leaf, dev_item, 0);
1819
1820        ptr = btrfs_device_uuid(dev_item);
1821        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1822        ptr = btrfs_device_fsid(dev_item);
1823        write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1824                            ptr, BTRFS_FSID_SIZE);
1825        btrfs_mark_buffer_dirty(leaf);
1826
1827        ret = 0;
1828out:
1829        btrfs_free_path(path);
1830        return ret;
1831}
1832
1833/*
1834 * Function to update ctime/mtime for a given device path.
1835 * Mainly used for ctime/mtime based probe like libblkid.
1836 */
1837static void update_dev_time(const char *path_name)
1838{
1839        struct file *filp;
1840
1841        filp = filp_open(path_name, O_RDWR, 0);
1842        if (IS_ERR(filp))
1843                return;
1844        file_update_time(filp);
1845        filp_close(filp, NULL);
1846}
1847
1848static int btrfs_rm_dev_item(struct btrfs_device *device)
1849{
1850        struct btrfs_root *root = device->fs_info->chunk_root;
1851        int ret;
1852        struct btrfs_path *path;
1853        struct btrfs_key key;
1854        struct btrfs_trans_handle *trans;
1855
1856        path = btrfs_alloc_path();
1857        if (!path)
1858                return -ENOMEM;
1859
1860        trans = btrfs_start_transaction(root, 0);
1861        if (IS_ERR(trans)) {
1862                btrfs_free_path(path);
1863                return PTR_ERR(trans);
1864        }
1865        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1866        key.type = BTRFS_DEV_ITEM_KEY;
1867        key.offset = device->devid;
1868
1869        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1870        if (ret) {
1871                if (ret > 0)
1872                        ret = -ENOENT;
1873                btrfs_abort_transaction(trans, ret);
1874                btrfs_end_transaction(trans);
1875                goto out;
1876        }
1877
1878        ret = btrfs_del_item(trans, root, path);
1879        if (ret) {
1880                btrfs_abort_transaction(trans, ret);
1881                btrfs_end_transaction(trans);
1882        }
1883
1884out:
1885        btrfs_free_path(path);
1886        if (!ret)
1887                ret = btrfs_commit_transaction(trans);
1888        return ret;
1889}
1890
1891/*
1892 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1893 * filesystem. It's up to the caller to adjust that number regarding eg. device
1894 * replace.
1895 */
1896static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1897                u64 num_devices)
1898{
1899        u64 all_avail;
1900        unsigned seq;
1901        int i;
1902
1903        do {
1904                seq = read_seqbegin(&fs_info->profiles_lock);
1905
1906                all_avail = fs_info->avail_data_alloc_bits |
1907                            fs_info->avail_system_alloc_bits |
1908                            fs_info->avail_metadata_alloc_bits;
1909        } while (read_seqretry(&fs_info->profiles_lock, seq));
1910
1911        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1912                if (!(all_avail & btrfs_raid_array[i].bg_flag))
1913                        continue;
1914
1915                if (num_devices < btrfs_raid_array[i].devs_min) {
1916                        int ret = btrfs_raid_array[i].mindev_error;
1917
1918                        if (ret)
1919                                return ret;
1920                }
1921        }
1922
1923        return 0;
1924}
1925
1926static struct btrfs_device * btrfs_find_next_active_device(
1927                struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
1928{
1929        struct btrfs_device *next_device;
1930
1931        list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1932                if (next_device != device &&
1933                    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
1934                    && next_device->bdev)
1935                        return next_device;
1936        }
1937
1938        return NULL;
1939}
1940
1941/*
1942 * Helper function to check if the given device is part of s_bdev / latest_bdev
1943 * and replace it with the provided or the next active device, in the context
1944 * where this function called, there should be always be another device (or
1945 * this_dev) which is active.
1946 */
1947void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
1948                                     struct btrfs_device *this_dev)
1949{
1950        struct btrfs_fs_info *fs_info = device->fs_info;
1951        struct btrfs_device *next_device;
1952
1953        if (this_dev)
1954                next_device = this_dev;
1955        else
1956                next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1957                                                                device);
1958        ASSERT(next_device);
1959
1960        if (fs_info->sb->s_bdev &&
1961                        (fs_info->sb->s_bdev == device->bdev))
1962                fs_info->sb->s_bdev = next_device->bdev;
1963
1964        if (fs_info->fs_devices->latest_bdev == device->bdev)
1965                fs_info->fs_devices->latest_bdev = next_device->bdev;
1966}
1967
1968/*
1969 * Return btrfs_fs_devices::num_devices excluding the device that's being
1970 * currently replaced.
1971 */
1972static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
1973{
1974        u64 num_devices = fs_info->fs_devices->num_devices;
1975
1976        down_read(&fs_info->dev_replace.rwsem);
1977        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1978                ASSERT(num_devices > 1);
1979                num_devices--;
1980        }
1981        up_read(&fs_info->dev_replace.rwsem);
1982
1983        return num_devices;
1984}
1985
1986static void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
1987                                      struct block_device *bdev,
1988                                      const char *device_path)
1989{
1990        struct btrfs_super_block *disk_super;
1991        int copy_num;
1992
1993        if (!bdev)
1994                return;
1995
1996        for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
1997                struct page *page;
1998                int ret;
1999
2000                disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2001                if (IS_ERR(disk_super))
2002                        continue;
2003
2004                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2005
2006                page = virt_to_page(disk_super);
2007                set_page_dirty(page);
2008                lock_page(page);
2009                /* write_on_page() unlocks the page */
2010                ret = write_one_page(page);
2011                if (ret)
2012                        btrfs_warn(fs_info,
2013                                "error clearing superblock number %d (%d)",
2014                                copy_num, ret);
2015                btrfs_release_disk_super(disk_super);
2016
2017        }
2018
2019        /* Notify udev that device has changed */
2020        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2021
2022        /* Update ctime/mtime for device path for libblkid */
2023        update_dev_time(device_path);
2024}
2025
2026int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2027                u64 devid)
2028{
2029        struct btrfs_device *device;
2030        struct btrfs_fs_devices *cur_devices;
2031        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2032        u64 num_devices;
2033        int ret = 0;
2034
2035        mutex_lock(&uuid_mutex);
2036
2037        num_devices = btrfs_num_devices(fs_info);
2038
2039        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2040        if (ret)
2041                goto out;
2042
2043        device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2044
2045        if (IS_ERR(device)) {
2046                if (PTR_ERR(device) == -ENOENT &&
2047                    strcmp(device_path, "missing") == 0)
2048                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2049                else
2050                        ret = PTR_ERR(device);
2051                goto out;
2052        }
2053
2054        if (btrfs_pinned_by_swapfile(fs_info, device)) {
2055                btrfs_warn_in_rcu(fs_info,
2056                  "cannot remove device %s (devid %llu) due to active swapfile",
2057                                  rcu_str_deref(device->name), device->devid);
2058                ret = -ETXTBSY;
2059                goto out;
2060        }
2061
2062        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2063                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2064                goto out;
2065        }
2066
2067        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2068            fs_info->fs_devices->rw_devices == 1) {
2069                ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2070                goto out;
2071        }
2072
2073        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2074                mutex_lock(&fs_info->chunk_mutex);
2075                list_del_init(&device->dev_alloc_list);
2076                device->fs_devices->rw_devices--;
2077                mutex_unlock(&fs_info->chunk_mutex);
2078        }
2079
2080        mutex_unlock(&uuid_mutex);
2081        ret = btrfs_shrink_device(device, 0);
2082        mutex_lock(&uuid_mutex);
2083        if (ret)
2084                goto error_undo;
2085
2086        /*
2087         * TODO: the superblock still includes this device in its num_devices
2088         * counter although write_all_supers() is not locked out. This
2089         * could give a filesystem state which requires a degraded mount.
2090         */
2091        ret = btrfs_rm_dev_item(device);
2092        if (ret)
2093                goto error_undo;
2094
2095        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2096        btrfs_scrub_cancel_dev(device);
2097
2098        /*
2099         * the device list mutex makes sure that we don't change
2100         * the device list while someone else is writing out all
2101         * the device supers. Whoever is writing all supers, should
2102         * lock the device list mutex before getting the number of
2103         * devices in the super block (super_copy). Conversely,
2104         * whoever updates the number of devices in the super block
2105         * (super_copy) should hold the device list mutex.
2106         */
2107
2108        /*
2109         * In normal cases the cur_devices == fs_devices. But in case
2110         * of deleting a seed device, the cur_devices should point to
2111         * its own fs_devices listed under the fs_devices->seed.
2112         */
2113        cur_devices = device->fs_devices;
2114        mutex_lock(&fs_devices->device_list_mutex);
2115        list_del_rcu(&device->dev_list);
2116
2117        cur_devices->num_devices--;
2118        cur_devices->total_devices--;
2119        /* Update total_devices of the parent fs_devices if it's seed */
2120        if (cur_devices != fs_devices)
2121                fs_devices->total_devices--;
2122
2123        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2124                cur_devices->missing_devices--;
2125
2126        btrfs_assign_next_active_device(device, NULL);
2127
2128        if (device->bdev) {
2129                cur_devices->open_devices--;
2130                /* remove sysfs entry */
2131                btrfs_sysfs_remove_devices_dir(fs_devices, device);
2132        }
2133
2134        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2135        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2136        mutex_unlock(&fs_devices->device_list_mutex);
2137
2138        /*
2139         * at this point, the device is zero sized and detached from
2140         * the devices list.  All that's left is to zero out the old
2141         * supers and free the device.
2142         */
2143        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2144                btrfs_scratch_superblocks(fs_info, device->bdev,
2145                                          device->name->str);
2146
2147        btrfs_close_bdev(device);
2148        synchronize_rcu();
2149        btrfs_free_device(device);
2150
2151        if (cur_devices->open_devices == 0) {
2152                while (fs_devices) {
2153                        if (fs_devices->seed == cur_devices) {
2154                                fs_devices->seed = cur_devices->seed;
2155                                break;
2156                        }
2157                        fs_devices = fs_devices->seed;
2158                }
2159                cur_devices->seed = NULL;
2160                close_fs_devices(cur_devices);
2161                free_fs_devices(cur_devices);
2162        }
2163
2164out:
2165        mutex_unlock(&uuid_mutex);
2166        return ret;
2167
2168error_undo:
2169        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2170                mutex_lock(&fs_info->chunk_mutex);
2171                list_add(&device->dev_alloc_list,
2172                         &fs_devices->alloc_list);
2173                device->fs_devices->rw_devices++;
2174                mutex_unlock(&fs_info->chunk_mutex);
2175        }
2176        goto out;
2177}
2178
2179void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2180{
2181        struct btrfs_fs_devices *fs_devices;
2182
2183        lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2184
2185        /*
2186         * in case of fs with no seed, srcdev->fs_devices will point
2187         * to fs_devices of fs_info. However when the dev being replaced is
2188         * a seed dev it will point to the seed's local fs_devices. In short
2189         * srcdev will have its correct fs_devices in both the cases.
2190         */
2191        fs_devices = srcdev->fs_devices;
2192
2193        list_del_rcu(&srcdev->dev_list);
2194        list_del(&srcdev->dev_alloc_list);
2195        fs_devices->num_devices--;
2196        if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2197                fs_devices->missing_devices--;
2198
2199        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2200                fs_devices->rw_devices--;
2201
2202        if (srcdev->bdev)
2203                fs_devices->open_devices--;
2204}
2205
2206void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2207{
2208        struct btrfs_fs_info *fs_info = srcdev->fs_info;
2209        struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2210
2211        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2212                /* zero out the old super if it is writable */
2213                btrfs_scratch_superblocks(fs_info, srcdev->bdev,
2214                                          srcdev->name->str);
2215        }
2216
2217        btrfs_close_bdev(srcdev);
2218        synchronize_rcu();
2219        btrfs_free_device(srcdev);
2220
2221        /* if this is no devs we rather delete the fs_devices */
2222        if (!fs_devices->num_devices) {
2223                struct btrfs_fs_devices *tmp_fs_devices;
2224
2225                /*
2226                 * On a mounted FS, num_devices can't be zero unless it's a
2227                 * seed. In case of a seed device being replaced, the replace
2228                 * target added to the sprout FS, so there will be no more
2229                 * device left under the seed FS.
2230                 */
2231                ASSERT(fs_devices->seeding);
2232
2233                tmp_fs_devices = fs_info->fs_devices;
2234                while (tmp_fs_devices) {
2235                        if (tmp_fs_devices->seed == fs_devices) {
2236                                tmp_fs_devices->seed = fs_devices->seed;
2237                                break;
2238                        }
2239                        tmp_fs_devices = tmp_fs_devices->seed;
2240                }
2241                fs_devices->seed = NULL;
2242                close_fs_devices(fs_devices);
2243                free_fs_devices(fs_devices);
2244        }
2245}
2246
2247void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2248{
2249        struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2250
2251        mutex_lock(&fs_devices->device_list_mutex);
2252
2253        btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev);
2254
2255        if (tgtdev->bdev)
2256                fs_devices->open_devices--;
2257
2258        fs_devices->num_devices--;
2259
2260        btrfs_assign_next_active_device(tgtdev, NULL);
2261
2262        list_del_rcu(&tgtdev->dev_list);
2263
2264        mutex_unlock(&fs_devices->device_list_mutex);
2265
2266        /*
2267         * The update_dev_time() with in btrfs_scratch_superblocks()
2268         * may lead to a call to btrfs_show_devname() which will try
2269         * to hold device_list_mutex. And here this device
2270         * is already out of device list, so we don't have to hold
2271         * the device_list_mutex lock.
2272         */
2273        btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2274                                  tgtdev->name->str);
2275
2276        btrfs_close_bdev(tgtdev);
2277        synchronize_rcu();
2278        btrfs_free_device(tgtdev);
2279}
2280
2281static struct btrfs_device *btrfs_find_device_by_path(
2282                struct btrfs_fs_info *fs_info, const char *device_path)
2283{
2284        int ret = 0;
2285        struct btrfs_super_block *disk_super;
2286        u64 devid;
2287        u8 *dev_uuid;
2288        struct block_device *bdev;
2289        struct btrfs_device *device;
2290
2291        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2292                                    fs_info->bdev_holder, 0, &bdev, &disk_super);
2293        if (ret)
2294                return ERR_PTR(ret);
2295
2296        devid = btrfs_stack_device_id(&disk_super->dev_item);
2297        dev_uuid = disk_super->dev_item.uuid;
2298        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2299                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2300                                           disk_super->metadata_uuid, true);
2301        else
2302                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2303                                           disk_super->fsid, true);
2304
2305        btrfs_release_disk_super(disk_super);
2306        if (!device)
2307                device = ERR_PTR(-ENOENT);
2308        blkdev_put(bdev, FMODE_READ);
2309        return device;
2310}
2311
2312/*
2313 * Lookup a device given by device id, or the path if the id is 0.
2314 */
2315struct btrfs_device *btrfs_find_device_by_devspec(
2316                struct btrfs_fs_info *fs_info, u64 devid,
2317                const char *device_path)
2318{
2319        struct btrfs_device *device;
2320
2321        if (devid) {
2322                device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2323                                           NULL, true);
2324                if (!device)
2325                        return ERR_PTR(-ENOENT);
2326                return device;
2327        }
2328
2329        if (!device_path || !device_path[0])
2330                return ERR_PTR(-EINVAL);
2331
2332        if (strcmp(device_path, "missing") == 0) {
2333                /* Find first missing device */
2334                list_for_each_entry(device, &fs_info->fs_devices->devices,
2335                                    dev_list) {
2336                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2337                                     &device->dev_state) && !device->bdev)
2338                                return device;
2339                }
2340                return ERR_PTR(-ENOENT);
2341        }
2342
2343        return btrfs_find_device_by_path(fs_info, device_path);
2344}
2345
2346/*
2347 * does all the dirty work required for changing file system's UUID.
2348 */
2349static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2350{
2351        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2352        struct btrfs_fs_devices *old_devices;
2353        struct btrfs_fs_devices *seed_devices;
2354        struct btrfs_super_block *disk_super = fs_info->super_copy;
2355        struct btrfs_device *device;
2356        u64 super_flags;
2357
2358        lockdep_assert_held(&uuid_mutex);
2359        if (!fs_devices->seeding)
2360                return -EINVAL;
2361
2362        seed_devices = alloc_fs_devices(NULL, NULL);
2363        if (IS_ERR(seed_devices))
2364                return PTR_ERR(seed_devices);
2365
2366        old_devices = clone_fs_devices(fs_devices);
2367        if (IS_ERR(old_devices)) {
2368                kfree(seed_devices);
2369                return PTR_ERR(old_devices);
2370        }
2371
2372        list_add(&old_devices->fs_list, &fs_uuids);
2373
2374        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2375        seed_devices->opened = 1;
2376        INIT_LIST_HEAD(&seed_devices->devices);
2377        INIT_LIST_HEAD(&seed_devices->alloc_list);
2378        mutex_init(&seed_devices->device_list_mutex);
2379
2380        mutex_lock(&fs_devices->device_list_mutex);
2381        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2382                              synchronize_rcu);
2383        list_for_each_entry(device, &seed_devices->devices, dev_list)
2384                device->fs_devices = seed_devices;
2385
2386        mutex_lock(&fs_info->chunk_mutex);
2387        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2388        mutex_unlock(&fs_info->chunk_mutex);
2389
2390        fs_devices->seeding = false;
2391        fs_devices->num_devices = 0;
2392        fs_devices->open_devices = 0;
2393        fs_devices->missing_devices = 0;
2394        fs_devices->rotating = false;
2395        fs_devices->seed = seed_devices;
2396
2397        generate_random_uuid(fs_devices->fsid);
2398        memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2399        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2400        mutex_unlock(&fs_devices->device_list_mutex);
2401
2402        super_flags = btrfs_super_flags(disk_super) &
2403                      ~BTRFS_SUPER_FLAG_SEEDING;
2404        btrfs_set_super_flags(disk_super, super_flags);
2405
2406        return 0;
2407}
2408
2409/*
2410 * Store the expected generation for seed devices in device items.
2411 */
2412static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2413{
2414        struct btrfs_fs_info *fs_info = trans->fs_info;
2415        struct btrfs_root *root = fs_info->chunk_root;
2416        struct btrfs_path *path;
2417        struct extent_buffer *leaf;
2418        struct btrfs_dev_item *dev_item;
2419        struct btrfs_device *device;
2420        struct btrfs_key key;
2421        u8 fs_uuid[BTRFS_FSID_SIZE];
2422        u8 dev_uuid[BTRFS_UUID_SIZE];
2423        u64 devid;
2424        int ret;
2425
2426        path = btrfs_alloc_path();
2427        if (!path)
2428                return -ENOMEM;
2429
2430        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2431        key.offset = 0;
2432        key.type = BTRFS_DEV_ITEM_KEY;
2433
2434        while (1) {
2435                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2436                if (ret < 0)
2437                        goto error;
2438
2439                leaf = path->nodes[0];
2440next_slot:
2441                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2442                        ret = btrfs_next_leaf(root, path);
2443                        if (ret > 0)
2444                                break;
2445                        if (ret < 0)
2446                                goto error;
2447                        leaf = path->nodes[0];
2448                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2449                        btrfs_release_path(path);
2450                        continue;
2451                }
2452
2453                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2454                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2455                    key.type != BTRFS_DEV_ITEM_KEY)
2456                        break;
2457
2458                dev_item = btrfs_item_ptr(leaf, path->slots[0],
2459                                          struct btrfs_dev_item);
2460                devid = btrfs_device_id(leaf, dev_item);
2461                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2462                                   BTRFS_UUID_SIZE);
2463                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2464                                   BTRFS_FSID_SIZE);
2465                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2466                                           fs_uuid, true);
2467                BUG_ON(!device); /* Logic error */
2468
2469                if (device->fs_devices->seeding) {
2470                        btrfs_set_device_generation(leaf, dev_item,
2471                                                    device->generation);
2472                        btrfs_mark_buffer_dirty(leaf);
2473                }
2474
2475                path->slots[0]++;
2476                goto next_slot;
2477        }
2478        ret = 0;
2479error:
2480        btrfs_free_path(path);
2481        return ret;
2482}
2483
2484int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2485{
2486        struct btrfs_root *root = fs_info->dev_root;
2487        struct request_queue *q;
2488        struct btrfs_trans_handle *trans;
2489        struct btrfs_device *device;
2490        struct block_device *bdev;
2491        struct super_block *sb = fs_info->sb;
2492        struct rcu_string *name;
2493        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2494        u64 orig_super_total_bytes;
2495        u64 orig_super_num_devices;
2496        int seeding_dev = 0;
2497        int ret = 0;
2498        bool unlocked = false;
2499
2500        if (sb_rdonly(sb) && !fs_devices->seeding)
2501                return -EROFS;
2502
2503        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2504                                  fs_info->bdev_holder);
2505        if (IS_ERR(bdev))
2506                return PTR_ERR(bdev);
2507
2508        if (fs_devices->seeding) {
2509                seeding_dev = 1;
2510                down_write(&sb->s_umount);
2511                mutex_lock(&uuid_mutex);
2512        }
2513
2514        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2515
2516        mutex_lock(&fs_devices->device_list_mutex);
2517        list_for_each_entry(device, &fs_devices->devices, dev_list) {
2518                if (device->bdev == bdev) {
2519                        ret = -EEXIST;
2520                        mutex_unlock(
2521                                &fs_devices->device_list_mutex);
2522                        goto error;
2523                }
2524        }
2525        mutex_unlock(&fs_devices->device_list_mutex);
2526
2527        device = btrfs_alloc_device(fs_info, NULL, NULL);
2528        if (IS_ERR(device)) {
2529                /* we can safely leave the fs_devices entry around */
2530                ret = PTR_ERR(device);
2531                goto error;
2532        }
2533
2534        name = rcu_string_strdup(device_path, GFP_KERNEL);
2535        if (!name) {
2536                ret = -ENOMEM;
2537                goto error_free_device;
2538        }
2539        rcu_assign_pointer(device->name, name);
2540
2541        trans = btrfs_start_transaction(root, 0);
2542        if (IS_ERR(trans)) {
2543                ret = PTR_ERR(trans);
2544                goto error_free_device;
2545        }
2546
2547        q = bdev_get_queue(bdev);
2548        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2549        device->generation = trans->transid;
2550        device->io_width = fs_info->sectorsize;
2551        device->io_align = fs_info->sectorsize;
2552        device->sector_size = fs_info->sectorsize;
2553        device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2554                                         fs_info->sectorsize);
2555        device->disk_total_bytes = device->total_bytes;
2556        device->commit_total_bytes = device->total_bytes;
2557        device->fs_info = fs_info;
2558        device->bdev = bdev;
2559        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2560        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2561        device->mode = FMODE_EXCL;
2562        device->dev_stats_valid = 1;
2563        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2564
2565        if (seeding_dev) {
2566                sb->s_flags &= ~SB_RDONLY;
2567                ret = btrfs_prepare_sprout(fs_info);
2568                if (ret) {
2569                        btrfs_abort_transaction(trans, ret);
2570                        goto error_trans;
2571                }
2572        }
2573
2574        device->fs_devices = fs_devices;
2575
2576        mutex_lock(&fs_devices->device_list_mutex);
2577        mutex_lock(&fs_info->chunk_mutex);
2578        list_add_rcu(&device->dev_list, &fs_devices->devices);
2579        list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2580        fs_devices->num_devices++;
2581        fs_devices->open_devices++;
2582        fs_devices->rw_devices++;
2583        fs_devices->total_devices++;
2584        fs_devices->total_rw_bytes += device->total_bytes;
2585
2586        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2587
2588        if (!blk_queue_nonrot(q))
2589                fs_devices->rotating = true;
2590
2591        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2592        btrfs_set_super_total_bytes(fs_info->super_copy,
2593                round_down(orig_super_total_bytes + device->total_bytes,
2594                           fs_info->sectorsize));
2595
2596        orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2597        btrfs_set_super_num_devices(fs_info->super_copy,
2598                                    orig_super_num_devices + 1);
2599
2600        /* add sysfs device entry */
2601        btrfs_sysfs_add_devices_dir(fs_devices, device);
2602
2603        /*
2604         * we've got more storage, clear any full flags on the space
2605         * infos
2606         */
2607        btrfs_clear_space_info_full(fs_info);
2608
2609        mutex_unlock(&fs_info->chunk_mutex);
2610        mutex_unlock(&fs_devices->device_list_mutex);
2611
2612        if (seeding_dev) {
2613                mutex_lock(&fs_info->chunk_mutex);
2614                ret = init_first_rw_device(trans);
2615                mutex_unlock(&fs_info->chunk_mutex);
2616                if (ret) {
2617                        btrfs_abort_transaction(trans, ret);
2618                        goto error_sysfs;
2619                }
2620        }
2621
2622        ret = btrfs_add_dev_item(trans, device);
2623        if (ret) {
2624                btrfs_abort_transaction(trans, ret);
2625                goto error_sysfs;
2626        }
2627
2628        if (seeding_dev) {
2629                ret = btrfs_finish_sprout(trans);
2630                if (ret) {
2631                        btrfs_abort_transaction(trans, ret);
2632                        goto error_sysfs;
2633                }
2634
2635                btrfs_sysfs_update_sprout_fsid(fs_devices,
2636                                fs_info->fs_devices->fsid);
2637        }
2638
2639        ret = btrfs_commit_transaction(trans);
2640
2641        if (seeding_dev) {
2642                mutex_unlock(&uuid_mutex);
2643                up_write(&sb->s_umount);
2644                unlocked = true;
2645
2646                if (ret) /* transaction commit */
2647                        return ret;
2648
2649                ret = btrfs_relocate_sys_chunks(fs_info);
2650                if (ret < 0)
2651                        btrfs_handle_fs_error(fs_info, ret,
2652                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2653                trans = btrfs_attach_transaction(root);
2654                if (IS_ERR(trans)) {
2655                        if (PTR_ERR(trans) == -ENOENT)
2656                                return 0;
2657                        ret = PTR_ERR(trans);
2658                        trans = NULL;
2659                        goto error_sysfs;
2660                }
2661                ret = btrfs_commit_transaction(trans);
2662        }
2663
2664        /*
2665         * Now that we have written a new super block to this device, check all
2666         * other fs_devices list if device_path alienates any other scanned
2667         * device.
2668         * We can ignore the return value as it typically returns -EINVAL and
2669         * only succeeds if the device was an alien.
2670         */
2671        btrfs_forget_devices(device_path);
2672
2673        /* Update ctime/mtime for blkid or udev */
2674        update_dev_time(device_path);
2675
2676        return ret;
2677
2678error_sysfs:
2679        btrfs_sysfs_remove_devices_dir(fs_devices, device);
2680        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2681        mutex_lock(&fs_info->chunk_mutex);
2682        list_del_rcu(&device->dev_list);
2683        list_del(&device->dev_alloc_list);
2684        fs_info->fs_devices->num_devices--;
2685        fs_info->fs_devices->open_devices--;
2686        fs_info->fs_devices->rw_devices--;
2687        fs_info->fs_devices->total_devices--;
2688        fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2689        atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2690        btrfs_set_super_total_bytes(fs_info->super_copy,
2691                                    orig_super_total_bytes);
2692        btrfs_set_super_num_devices(fs_info->super_copy,
2693                                    orig_super_num_devices);
2694        mutex_unlock(&fs_info->chunk_mutex);
2695        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2696error_trans:
2697        if (seeding_dev)
2698                sb->s_flags |= SB_RDONLY;
2699        if (trans)
2700                btrfs_end_transaction(trans);
2701error_free_device:
2702        btrfs_free_device(device);
2703error:
2704        blkdev_put(bdev, FMODE_EXCL);
2705        if (seeding_dev && !unlocked) {
2706                mutex_unlock(&uuid_mutex);
2707                up_write(&sb->s_umount);
2708        }
2709        return ret;
2710}
2711
2712static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2713                                        struct btrfs_device *device)
2714{
2715        int ret;
2716        struct btrfs_path *path;
2717        struct btrfs_root *root = device->fs_info->chunk_root;
2718        struct btrfs_dev_item *dev_item;
2719        struct extent_buffer *leaf;
2720        struct btrfs_key key;
2721
2722        path = btrfs_alloc_path();
2723        if (!path)
2724                return -ENOMEM;
2725
2726        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2727        key.type = BTRFS_DEV_ITEM_KEY;
2728        key.offset = device->devid;
2729
2730        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2731        if (ret < 0)
2732                goto out;
2733
2734        if (ret > 0) {
2735                ret = -ENOENT;
2736                goto out;
2737        }
2738
2739        leaf = path->nodes[0];
2740        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2741
2742        btrfs_set_device_id(leaf, dev_item, device->devid);
2743        btrfs_set_device_type(leaf, dev_item, device->type);
2744        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2745        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2746        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2747        btrfs_set_device_total_bytes(leaf, dev_item,
2748                                     btrfs_device_get_disk_total_bytes(device));
2749        btrfs_set_device_bytes_used(leaf, dev_item,
2750                                    btrfs_device_get_bytes_used(device));
2751        btrfs_mark_buffer_dirty(leaf);
2752
2753out:
2754        btrfs_free_path(path);
2755        return ret;
2756}
2757
2758int btrfs_grow_device(struct btrfs_trans_handle *trans,
2759                      struct btrfs_device *device, u64 new_size)
2760{
2761        struct btrfs_fs_info *fs_info = device->fs_info;
2762        struct btrfs_super_block *super_copy = fs_info->super_copy;
2763        u64 old_total;
2764        u64 diff;
2765
2766        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2767                return -EACCES;
2768
2769        new_size = round_down(new_size, fs_info->sectorsize);
2770
2771        mutex_lock(&fs_info->chunk_mutex);
2772        old_total = btrfs_super_total_bytes(super_copy);
2773        diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2774
2775        if (new_size <= device->total_bytes ||
2776            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2777                mutex_unlock(&fs_info->chunk_mutex);
2778                return -EINVAL;
2779        }
2780
2781        btrfs_set_super_total_bytes(super_copy,
2782                        round_down(old_total + diff, fs_info->sectorsize));
2783        device->fs_devices->total_rw_bytes += diff;
2784
2785        btrfs_device_set_total_bytes(device, new_size);
2786        btrfs_device_set_disk_total_bytes(device, new_size);
2787        btrfs_clear_space_info_full(device->fs_info);
2788        if (list_empty(&device->post_commit_list))
2789                list_add_tail(&device->post_commit_list,
2790                              &trans->transaction->dev_update_list);
2791        mutex_unlock(&fs_info->chunk_mutex);
2792
2793        return btrfs_update_device(trans, device);
2794}
2795
2796static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2797{
2798        struct btrfs_fs_info *fs_info = trans->fs_info;
2799        struct btrfs_root *root = fs_info->chunk_root;
2800        int ret;
2801        struct btrfs_path *path;
2802        struct btrfs_key key;
2803
2804        path = btrfs_alloc_path();
2805        if (!path)
2806                return -ENOMEM;
2807
2808        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2809        key.offset = chunk_offset;
2810        key.type = BTRFS_CHUNK_ITEM_KEY;
2811
2812        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2813        if (ret < 0)
2814                goto out;
2815        else if (ret > 0) { /* Logic error or corruption */
2816                btrfs_handle_fs_error(fs_info, -ENOENT,
2817                                      "Failed lookup while freeing chunk.");
2818                ret = -ENOENT;
2819                goto out;
2820        }
2821
2822        ret = btrfs_del_item(trans, root, path);
2823        if (ret < 0)
2824                btrfs_handle_fs_error(fs_info, ret,
2825                                      "Failed to delete chunk item.");
2826out:
2827        btrfs_free_path(path);
2828        return ret;
2829}
2830
2831static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2832{
2833        struct btrfs_super_block *super_copy = fs_info->super_copy;
2834        struct btrfs_disk_key *disk_key;
2835        struct btrfs_chunk *chunk;
2836        u8 *ptr;
2837        int ret = 0;
2838        u32 num_stripes;
2839        u32 array_size;
2840        u32 len = 0;
2841        u32 cur;
2842        struct btrfs_key key;
2843
2844        mutex_lock(&fs_info->chunk_mutex);
2845        array_size = btrfs_super_sys_array_size(super_copy);
2846
2847        ptr = super_copy->sys_chunk_array;
2848        cur = 0;
2849
2850        while (cur < array_size) {
2851                disk_key = (struct btrfs_disk_key *)ptr;
2852                btrfs_disk_key_to_cpu(&key, disk_key);
2853
2854                len = sizeof(*disk_key);
2855
2856                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2857                        chunk = (struct btrfs_chunk *)(ptr + len);
2858                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2859                        len += btrfs_chunk_item_size(num_stripes);
2860                } else {
2861                        ret = -EIO;
2862                        break;
2863                }
2864                if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2865                    key.offset == chunk_offset) {
2866                        memmove(ptr, ptr + len, array_size - (cur + len));
2867                        array_size -= len;
2868                        btrfs_set_super_sys_array_size(super_copy, array_size);
2869                } else {
2870                        ptr += len;
2871                        cur += len;
2872                }
2873        }
2874        mutex_unlock(&fs_info->chunk_mutex);
2875        return ret;
2876}
2877
2878/*
2879 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2880 * @logical: Logical block offset in bytes.
2881 * @length: Length of extent in bytes.
2882 *
2883 * Return: Chunk mapping or ERR_PTR.
2884 */
2885struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2886                                       u64 logical, u64 length)
2887{
2888        struct extent_map_tree *em_tree;
2889        struct extent_map *em;
2890
2891        em_tree = &fs_info->mapping_tree;
2892        read_lock(&em_tree->lock);
2893        em = lookup_extent_mapping(em_tree, logical, length);
2894        read_unlock(&em_tree->lock);
2895
2896        if (!em) {
2897                btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2898                           logical, length);
2899                return ERR_PTR(-EINVAL);
2900        }
2901
2902        if (em->start > logical || em->start + em->len < logical) {
2903                btrfs_crit(fs_info,
2904                           "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2905                           logical, length, em->start, em->start + em->len);
2906                free_extent_map(em);
2907                return ERR_PTR(-EINVAL);
2908        }
2909
2910        /* callers are responsible for dropping em's ref. */
2911        return em;
2912}
2913
2914int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2915{
2916        struct btrfs_fs_info *fs_info = trans->fs_info;
2917        struct extent_map *em;
2918        struct map_lookup *map;
2919        u64 dev_extent_len = 0;
2920        int i, ret = 0;
2921        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2922
2923        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2924        if (IS_ERR(em)) {
2925                /*
2926                 * This is a logic error, but we don't want to just rely on the
2927                 * user having built with ASSERT enabled, so if ASSERT doesn't
2928                 * do anything we still error out.
2929                 */
2930                ASSERT(0);
2931                return PTR_ERR(em);
2932        }
2933        map = em->map_lookup;
2934        mutex_lock(&fs_info->chunk_mutex);
2935        check_system_chunk(trans, map->type);
2936        mutex_unlock(&fs_info->chunk_mutex);
2937
2938        /*
2939         * Take the device list mutex to prevent races with the final phase of
2940         * a device replace operation that replaces the device object associated
2941         * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2942         */
2943        mutex_lock(&fs_devices->device_list_mutex);
2944        for (i = 0; i < map->num_stripes; i++) {
2945                struct btrfs_device *device = map->stripes[i].dev;
2946                ret = btrfs_free_dev_extent(trans, device,
2947                                            map->stripes[i].physical,
2948                                            &dev_extent_len);
2949                if (ret) {
2950                        mutex_unlock(&fs_devices->device_list_mutex);
2951                        btrfs_abort_transaction(trans, ret);
2952                        goto out;
2953                }
2954
2955                if (device->bytes_used > 0) {
2956                        mutex_lock(&fs_info->chunk_mutex);
2957                        btrfs_device_set_bytes_used(device,
2958                                        device->bytes_used - dev_extent_len);
2959                        atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2960                        btrfs_clear_space_info_full(fs_info);
2961                        mutex_unlock(&fs_info->chunk_mutex);
2962                }
2963
2964                ret = btrfs_update_device(trans, device);
2965                if (ret) {
2966                        mutex_unlock(&fs_devices->device_list_mutex);
2967                        btrfs_abort_transaction(trans, ret);
2968                        goto out;
2969                }
2970        }
2971        mutex_unlock(&fs_devices->device_list_mutex);
2972
2973        ret = btrfs_free_chunk(trans, chunk_offset);
2974        if (ret) {
2975                btrfs_abort_transaction(trans, ret);
2976                goto out;
2977        }
2978
2979        trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2980
2981        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2982                ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2983                if (ret) {
2984                        btrfs_abort_transaction(trans, ret);
2985                        goto out;
2986                }
2987        }
2988
2989        ret = btrfs_remove_block_group(trans, chunk_offset, em);
2990        if (ret) {
2991                btrfs_abort_transaction(trans, ret);
2992                goto out;
2993        }
2994
2995out:
2996        /* once for us */
2997        free_extent_map(em);
2998        return ret;
2999}
3000
3001static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3002{
3003        struct btrfs_root *root = fs_info->chunk_root;
3004        struct btrfs_trans_handle *trans;
3005        struct btrfs_block_group *block_group;
3006        int ret;
3007
3008        /*
3009         * Prevent races with automatic removal of unused block groups.
3010         * After we relocate and before we remove the chunk with offset
3011         * chunk_offset, automatic removal of the block group can kick in,
3012         * resulting in a failure when calling btrfs_remove_chunk() below.
3013         *
3014         * Make sure to acquire this mutex before doing a tree search (dev
3015         * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3016         * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3017         * we release the path used to search the chunk/dev tree and before
3018         * the current task acquires this mutex and calls us.
3019         */
3020        lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3021
3022        /* step one, relocate all the extents inside this chunk */
3023        btrfs_scrub_pause(fs_info);
3024        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3025        btrfs_scrub_continue(fs_info);
3026        if (ret)
3027                return ret;
3028
3029        block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3030        if (!block_group)
3031                return -ENOENT;
3032        btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3033        btrfs_put_block_group(block_group);
3034
3035        trans = btrfs_start_trans_remove_block_group(root->fs_info,
3036                                                     chunk_offset);
3037        if (IS_ERR(trans)) {
3038                ret = PTR_ERR(trans);
3039                btrfs_handle_fs_error(root->fs_info, ret, NULL);
3040                return ret;
3041        }
3042
3043        /*
3044         * step two, delete the device extents and the
3045         * chunk tree entries
3046         */
3047        ret = btrfs_remove_chunk(trans, chunk_offset);
3048        btrfs_end_transaction(trans);
3049        return ret;
3050}
3051
3052static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3053{
3054        struct btrfs_root *chunk_root = fs_info->chunk_root;
3055        struct btrfs_path *path;
3056        struct extent_buffer *leaf;
3057        struct btrfs_chunk *chunk;
3058        struct btrfs_key key;
3059        struct btrfs_key found_key;
3060        u64 chunk_type;
3061        bool retried = false;
3062        int failed = 0;
3063        int ret;
3064
3065        path = btrfs_alloc_path();
3066        if (!path)
3067                return -ENOMEM;
3068
3069again:
3070        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3071        key.offset = (u64)-1;
3072        key.type = BTRFS_CHUNK_ITEM_KEY;
3073
3074        while (1) {
3075                mutex_lock(&fs_info->delete_unused_bgs_mutex);
3076                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3077                if (ret < 0) {
3078                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3079                        goto error;
3080                }
3081                BUG_ON(ret == 0); /* Corruption */
3082
3083                ret = btrfs_previous_item(chunk_root, path, key.objectid,
3084                                          key.type);
3085                if (ret)
3086                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3087                if (ret < 0)
3088                        goto error;
3089                if (ret > 0)
3090                        break;
3091
3092                leaf = path->nodes[0];
3093                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3094
3095                chunk = btrfs_item_ptr(leaf, path->slots[0],
3096                                       struct btrfs_chunk);
3097                chunk_type = btrfs_chunk_type(leaf, chunk);
3098                btrfs_release_path(path);
3099
3100                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3101                        ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3102                        if (ret == -ENOSPC)
3103                                failed++;
3104                        else
3105                                BUG_ON(ret);
3106                }
3107                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3108
3109                if (found_key.offset == 0)
3110                        break;
3111                key.offset = found_key.offset - 1;
3112        }
3113        ret = 0;
3114        if (failed && !retried) {
3115                failed = 0;
3116                retried = true;
3117                goto again;
3118        } else if (WARN_ON(failed && retried)) {
3119                ret = -ENOSPC;
3120        }
3121error:
3122        btrfs_free_path(path);
3123        return ret;
3124}
3125
3126/*
3127 * return 1 : allocate a data chunk successfully,
3128 * return <0: errors during allocating a data chunk,
3129 * return 0 : no need to allocate a data chunk.
3130 */
3131static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3132                                      u64 chunk_offset)
3133{
3134        struct btrfs_block_group *cache;
3135        u64 bytes_used;
3136        u64 chunk_type;
3137
3138        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3139        ASSERT(cache);
3140        chunk_type = cache->flags;
3141        btrfs_put_block_group(cache);
3142
3143        if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3144                return 0;
3145
3146        spin_lock(&fs_info->data_sinfo->lock);
3147        bytes_used = fs_info->data_sinfo->bytes_used;
3148        spin_unlock(&fs_info->data_sinfo->lock);
3149
3150        if (!bytes_used) {
3151                struct btrfs_trans_handle *trans;
3152                int ret;
3153
3154                trans = btrfs_join_transaction(fs_info->tree_root);
3155                if (IS_ERR(trans))
3156                        return PTR_ERR(trans);
3157
3158                ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3159                btrfs_end_transaction(trans);
3160                if (ret < 0)
3161                        return ret;
3162                return 1;
3163        }
3164
3165        return 0;
3166}
3167
3168static int insert_balance_item(struct btrfs_fs_info *fs_info,
3169                               struct btrfs_balance_control *bctl)
3170{
3171        struct btrfs_root *root = fs_info->tree_root;
3172        struct btrfs_trans_handle *trans;
3173        struct btrfs_balance_item *item;
3174        struct btrfs_disk_balance_args disk_bargs;
3175        struct btrfs_path *path;
3176        struct extent_buffer *leaf;
3177        struct btrfs_key key;
3178        int ret, err;
3179
3180        path = btrfs_alloc_path();
3181        if (!path)
3182                return -ENOMEM;
3183
3184        trans = btrfs_start_transaction(root, 0);
3185        if (IS_ERR(trans)) {
3186                btrfs_free_path(path);
3187                return PTR_ERR(trans);
3188        }
3189
3190        key.objectid = BTRFS_BALANCE_OBJECTID;
3191        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3192        key.offset = 0;
3193
3194        ret = btrfs_insert_empty_item(trans, root, path, &key,
3195                                      sizeof(*item));
3196        if (ret)
3197                goto out;
3198
3199        leaf = path->nodes[0];
3200        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3201
3202        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3203
3204        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3205        btrfs_set_balance_data(leaf, item, &disk_bargs);
3206        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3207        btrfs_set_balance_meta(leaf, item, &disk_bargs);
3208        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3209        btrfs_set_balance_sys(leaf, item, &disk_bargs);
3210
3211        btrfs_set_balance_flags(leaf, item, bctl->flags);
3212
3213        btrfs_mark_buffer_dirty(leaf);
3214out:
3215        btrfs_free_path(path);
3216        err = btrfs_commit_transaction(trans);
3217        if (err && !ret)
3218                ret = err;
3219        return ret;
3220}
3221
3222static int del_balance_item(struct btrfs_fs_info *fs_info)
3223{
3224        struct btrfs_root *root = fs_info->tree_root;
3225        struct btrfs_trans_handle *trans;
3226        struct btrfs_path *path;
3227        struct btrfs_key key;
3228        int ret, err;
3229
3230        path = btrfs_alloc_path();
3231        if (!path)
3232                return -ENOMEM;
3233
3234        trans = btrfs_start_transaction(root, 0);
3235        if (IS_ERR(trans)) {
3236                btrfs_free_path(path);
3237                return PTR_ERR(trans);
3238        }
3239
3240        key.objectid = BTRFS_BALANCE_OBJECTID;
3241        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3242        key.offset = 0;
3243
3244        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3245        if (ret < 0)
3246                goto out;
3247        if (ret > 0) {
3248                ret = -ENOENT;
3249                goto out;
3250        }
3251
3252        ret = btrfs_del_item(trans, root, path);
3253out:
3254        btrfs_free_path(path);
3255        err = btrfs_commit_transaction(trans);
3256        if (err && !ret)
3257                ret = err;
3258        return ret;
3259}
3260
3261/*
3262 * This is a heuristic used to reduce the number of chunks balanced on
3263 * resume after balance was interrupted.
3264 */
3265static void update_balance_args(struct btrfs_balance_control *bctl)
3266{
3267        /*
3268         * Turn on soft mode for chunk types that were being converted.
3269         */
3270        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3271                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3272        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3273                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3274        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3275                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3276
3277        /*
3278         * Turn on usage filter if is not already used.  The idea is
3279         * that chunks that we have already balanced should be
3280         * reasonably full.  Don't do it for chunks that are being
3281         * converted - that will keep us from relocating unconverted
3282         * (albeit full) chunks.
3283         */
3284        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3285            !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3286            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3287                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3288                bctl->data.usage = 90;
3289        }
3290        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3291            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3292            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3293                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3294                bctl->sys.usage = 90;
3295        }
3296        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3297            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3298            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3299                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3300                bctl->meta.usage = 90;
3301        }
3302}
3303
3304/*
3305 * Clear the balance status in fs_info and delete the balance item from disk.
3306 */
3307static void reset_balance_state(struct btrfs_fs_info *fs_info)
3308{
3309        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3310        int ret;
3311
3312        BUG_ON(!fs_info->balance_ctl);
3313
3314        spin_lock(&fs_info->balance_lock);
3315        fs_info->balance_ctl = NULL;
3316        spin_unlock(&fs_info->balance_lock);
3317
3318        kfree(bctl);
3319        ret = del_balance_item(fs_info);
3320        if (ret)
3321                btrfs_handle_fs_error(fs_info, ret, NULL);
3322}
3323
3324/*
3325 * Balance filters.  Return 1 if chunk should be filtered out
3326 * (should not be balanced).
3327 */
3328static int chunk_profiles_filter(u64 chunk_type,
3329                                 struct btrfs_balance_args *bargs)
3330{
3331        chunk_type = chunk_to_extended(chunk_type) &
3332                                BTRFS_EXTENDED_PROFILE_MASK;
3333
3334        if (bargs->profiles & chunk_type)
3335                return 0;
3336
3337        return 1;
3338}
3339
3340static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3341                              struct btrfs_balance_args *bargs)
3342{
3343        struct btrfs_block_group *cache;
3344        u64 chunk_used;
3345        u64 user_thresh_min;
3346        u64 user_thresh_max;
3347        int ret = 1;
3348
3349        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3350        chunk_used = cache->used;
3351
3352        if (bargs->usage_min == 0)
3353                user_thresh_min = 0;
3354        else
3355                user_thresh_min = div_factor_fine(cache->length,
3356                                                  bargs->usage_min);
3357
3358        if (bargs->usage_max == 0)
3359                user_thresh_max = 1;
3360        else if (bargs->usage_max > 100)
3361                user_thresh_max = cache->length;
3362        else
3363                user_thresh_max = div_factor_fine(cache->length,
3364                                                  bargs->usage_max);
3365
3366        if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3367                ret = 0;
3368
3369        btrfs_put_block_group(cache);
3370        return ret;
3371}
3372
3373static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3374                u64 chunk_offset, struct btrfs_balance_args *bargs)
3375{
3376        struct btrfs_block_group *cache;
3377        u64 chunk_used, user_thresh;
3378        int ret = 1;
3379
3380        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3381        chunk_used = cache->used;
3382
3383        if (bargs->usage_min == 0)
3384                user_thresh = 1;
3385        else if (bargs->usage > 100)
3386                user_thresh = cache->length;
3387        else
3388                user_thresh = div_factor_fine(cache->length, bargs->usage);
3389
3390        if (chunk_used < user_thresh)
3391                ret = 0;
3392
3393        btrfs_put_block_group(cache);
3394        return ret;
3395}
3396
3397static int chunk_devid_filter(struct extent_buffer *leaf,
3398                              struct btrfs_chunk *chunk,
3399                              struct btrfs_balance_args *bargs)
3400{
3401        struct btrfs_stripe *stripe;
3402        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3403        int i;
3404
3405        for (i = 0; i < num_stripes; i++) {
3406                stripe = btrfs_stripe_nr(chunk, i);
3407                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3408                        return 0;
3409        }
3410
3411        return 1;
3412}
3413
3414static u64 calc_data_stripes(u64 type, int num_stripes)
3415{
3416        const int index = btrfs_bg_flags_to_raid_index(type);
3417        const int ncopies = btrfs_raid_array[index].ncopies;
3418        const int nparity = btrfs_raid_array[index].nparity;
3419
3420        if (nparity)
3421                return num_stripes - nparity;
3422        else
3423                return num_stripes / ncopies;
3424}
3425
3426/* [pstart, pend) */
3427static int chunk_drange_filter(struct extent_buffer *leaf,
3428                               struct btrfs_chunk *chunk,
3429                               struct btrfs_balance_args *bargs)
3430{
3431        struct btrfs_stripe *stripe;
3432        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3433        u64 stripe_offset;
3434        u64 stripe_length;
3435        u64 type;
3436        int factor;
3437        int i;
3438
3439        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3440                return 0;
3441
3442        type = btrfs_chunk_type(leaf, chunk);
3443        factor = calc_data_stripes(type, num_stripes);
3444
3445        for (i = 0; i < num_stripes; i++) {
3446                stripe = btrfs_stripe_nr(chunk, i);
3447                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3448                        continue;
3449
3450                stripe_offset = btrfs_stripe_offset(leaf, stripe);
3451                stripe_length = btrfs_chunk_length(leaf, chunk);
3452                stripe_length = div_u64(stripe_length, factor);
3453
3454                if (stripe_offset < bargs->pend &&
3455                    stripe_offset + stripe_length > bargs->pstart)
3456                        return 0;
3457        }
3458
3459        return 1;
3460}
3461
3462/* [vstart, vend) */
3463static int chunk_vrange_filter(struct extent_buffer *leaf,
3464                               struct btrfs_chunk *chunk,
3465                               u64 chunk_offset,
3466                               struct btrfs_balance_args *bargs)
3467{
3468        if (chunk_offset < bargs->vend &&
3469            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3470                /* at least part of the chunk is inside this vrange */
3471                return 0;
3472
3473        return 1;
3474}
3475
3476static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3477                               struct btrfs_chunk *chunk,
3478                               struct btrfs_balance_args *bargs)
3479{
3480        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3481
3482        if (bargs->stripes_min <= num_stripes
3483                        && num_stripes <= bargs->stripes_max)
3484                return 0;
3485
3486        return 1;
3487}
3488
3489static int chunk_soft_convert_filter(u64 chunk_type,
3490                                     struct btrfs_balance_args *bargs)
3491{
3492        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3493                return 0;
3494
3495        chunk_type = chunk_to_extended(chunk_type) &
3496                                BTRFS_EXTENDED_PROFILE_MASK;
3497
3498        if (bargs->target == chunk_type)
3499                return 1;
3500
3501        return 0;
3502}
3503
3504static int should_balance_chunk(struct extent_buffer *leaf,
3505                                struct btrfs_chunk *chunk, u64 chunk_offset)
3506{
3507        struct btrfs_fs_info *fs_info = leaf->fs_info;
3508        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3509        struct btrfs_balance_args *bargs = NULL;
3510        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3511
3512        /* type filter */
3513        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3514              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3515                return 0;
3516        }
3517
3518        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3519                bargs = &bctl->data;
3520        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3521                bargs = &bctl->sys;
3522        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3523                bargs = &bctl->meta;
3524
3525        /* profiles filter */
3526        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3527            chunk_profiles_filter(chunk_type, bargs)) {
3528                return 0;
3529        }
3530
3531        /* usage filter */
3532        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3533            chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3534                return 0;
3535        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3536            chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3537                return 0;
3538        }
3539
3540        /* devid filter */
3541        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3542            chunk_devid_filter(leaf, chunk, bargs)) {
3543                return 0;
3544        }
3545
3546        /* drange filter, makes sense only with devid filter */
3547        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3548            chunk_drange_filter(leaf, chunk, bargs)) {
3549                return 0;
3550        }
3551
3552        /* vrange filter */
3553        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3554            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3555                return 0;
3556        }
3557
3558        /* stripes filter */
3559        if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3560            chunk_stripes_range_filter(leaf, chunk, bargs)) {
3561                return 0;
3562        }
3563
3564        /* soft profile changing mode */
3565        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3566            chunk_soft_convert_filter(chunk_type, bargs)) {
3567                return 0;
3568        }
3569
3570        /*
3571         * limited by count, must be the last filter
3572         */
3573        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3574                if (bargs->limit == 0)
3575                        return 0;
3576                else
3577                        bargs->limit--;
3578        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3579                /*
3580                 * Same logic as the 'limit' filter; the minimum cannot be
3581                 * determined here because we do not have the global information
3582                 * about the count of all chunks that satisfy the filters.
3583                 */
3584                if (bargs->limit_max == 0)
3585                        return 0;
3586                else
3587                        bargs->limit_max--;
3588        }
3589
3590        return 1;
3591}
3592
3593static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3594{
3595        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3596        struct btrfs_root *chunk_root = fs_info->chunk_root;
3597        u64 chunk_type;
3598        struct btrfs_chunk *chunk;
3599        struct btrfs_path *path = NULL;
3600        struct btrfs_key key;
3601        struct btrfs_key found_key;
3602        struct extent_buffer *leaf;
3603        int slot;
3604        int ret;
3605        int enospc_errors = 0;
3606        bool counting = true;
3607        /* The single value limit and min/max limits use the same bytes in the */
3608        u64 limit_data = bctl->data.limit;
3609        u64 limit_meta = bctl->meta.limit;
3610        u64 limit_sys = bctl->sys.limit;
3611        u32 count_data = 0;
3612        u32 count_meta = 0;
3613        u32 count_sys = 0;
3614        int chunk_reserved = 0;
3615
3616        path = btrfs_alloc_path();
3617        if (!path) {
3618                ret = -ENOMEM;
3619                goto error;
3620        }
3621
3622        /* zero out stat counters */
3623        spin_lock(&fs_info->balance_lock);
3624        memset(&bctl->stat, 0, sizeof(bctl->stat));
3625        spin_unlock(&fs_info->balance_lock);
3626again:
3627        if (!counting) {
3628                /*
3629                 * The single value limit and min/max limits use the same bytes
3630                 * in the
3631                 */
3632                bctl->data.limit = limit_data;
3633                bctl->meta.limit = limit_meta;
3634                bctl->sys.limit = limit_sys;
3635        }
3636        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3637        key.offset = (u64)-1;
3638        key.type = BTRFS_CHUNK_ITEM_KEY;
3639
3640        while (1) {
3641                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3642                    atomic_read(&fs_info->balance_cancel_req)) {
3643                        ret = -ECANCELED;
3644                        goto error;
3645                }
3646
3647                mutex_lock(&fs_info->delete_unused_bgs_mutex);
3648                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3649                if (ret < 0) {
3650                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3651                        goto error;
3652                }
3653
3654                /*
3655                 * this shouldn't happen, it means the last relocate
3656                 * failed
3657                 */
3658                if (ret == 0)
3659                        BUG(); /* FIXME break ? */
3660
3661                ret = btrfs_previous_item(chunk_root, path, 0,
3662                                          BTRFS_CHUNK_ITEM_KEY);
3663                if (ret) {
3664                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3665                        ret = 0;
3666                        break;
3667                }
3668
3669                leaf = path->nodes[0];
3670                slot = path->slots[0];
3671                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3672
3673                if (found_key.objectid != key.objectid) {
3674                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3675                        break;
3676                }
3677
3678                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3679                chunk_type = btrfs_chunk_type(leaf, chunk);
3680
3681                if (!counting) {
3682                        spin_lock(&fs_info->balance_lock);
3683                        bctl->stat.considered++;
3684                        spin_unlock(&fs_info->balance_lock);
3685                }
3686
3687                ret = should_balance_chunk(leaf, chunk, found_key.offset);
3688
3689                btrfs_release_path(path);
3690                if (!ret) {
3691                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3692                        goto loop;
3693                }
3694
3695                if (counting) {
3696                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3697                        spin_lock(&fs_info->balance_lock);
3698                        bctl->stat.expected++;
3699                        spin_unlock(&fs_info->balance_lock);
3700
3701                        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3702                                count_data++;
3703                        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3704                                count_sys++;
3705                        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3706                                count_meta++;
3707
3708                        goto loop;
3709                }
3710
3711                /*
3712                 * Apply limit_min filter, no need to check if the LIMITS
3713                 * filter is used, limit_min is 0 by default
3714                 */
3715                if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3716                                        count_data < bctl->data.limit_min)
3717                                || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3718                                        count_meta < bctl->meta.limit_min)
3719                                || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3720                                        count_sys < bctl->sys.limit_min)) {
3721                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3722                        goto loop;
3723                }
3724
3725                if (!chunk_reserved) {
3726                        /*
3727                         * We may be relocating the only data chunk we have,
3728                         * which could potentially end up with losing data's
3729                         * raid profile, so lets allocate an empty one in
3730                         * advance.
3731                         */
3732                        ret = btrfs_may_alloc_data_chunk(fs_info,
3733                                                         found_key.offset);
3734                        if (ret < 0) {
3735                                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3736                                goto error;
3737                        } else if (ret == 1) {
3738                                chunk_reserved = 1;
3739                        }
3740                }
3741
3742                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3743                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3744                if (ret == -ENOSPC) {
3745                        enospc_errors++;
3746                } else if (ret == -ETXTBSY) {
3747                        btrfs_info(fs_info,
3748           "skipping relocation of block group %llu due to active swapfile",
3749                                   found_key.offset);
3750                        ret = 0;
3751                } else if (ret) {
3752                        goto error;
3753                } else {
3754                        spin_lock(&fs_info->balance_lock);
3755                        bctl->stat.completed++;
3756                        spin_unlock(&fs_info->balance_lock);
3757                }
3758loop:
3759                if (found_key.offset == 0)
3760                        break;
3761                key.offset = found_key.offset - 1;
3762        }
3763
3764        if (counting) {
3765                btrfs_release_path(path);
3766                counting = false;
3767                goto again;
3768        }
3769error:
3770        btrfs_free_path(path);
3771        if (enospc_errors) {
3772                btrfs_info(fs_info, "%d enospc errors during balance",
3773                           enospc_errors);
3774                if (!ret)
3775                        ret = -ENOSPC;
3776        }
3777
3778        return ret;
3779}
3780
3781/**
3782 * alloc_profile_is_valid - see if a given profile is valid and reduced
3783 * @flags: profile to validate
3784 * @extended: if true @flags is treated as an extended profile
3785 */
3786static int alloc_profile_is_valid(u64 flags, int extended)
3787{
3788        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3789                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
3790
3791        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3792
3793        /* 1) check that all other bits are zeroed */
3794        if (flags & ~mask)
3795                return 0;
3796
3797        /* 2) see if profile is reduced */
3798        if (flags == 0)
3799                return !extended; /* "0" is valid for usual profiles */
3800
3801        return has_single_bit_set(flags);
3802}
3803
3804static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3805{
3806        /* cancel requested || normal exit path */
3807        return atomic_read(&fs_info->balance_cancel_req) ||
3808                (atomic_read(&fs_info->balance_pause_req) == 0 &&
3809                 atomic_read(&fs_info->balance_cancel_req) == 0);
3810}
3811
3812/*
3813 * Validate target profile against allowed profiles and return true if it's OK.
3814 * Otherwise print the error message and return false.
3815 */
3816static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3817                const struct btrfs_balance_args *bargs,
3818                u64 allowed, const char *type)
3819{
3820        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3821                return true;
3822
3823        /* Profile is valid and does not have bits outside of the allowed set */
3824        if (alloc_profile_is_valid(bargs->target, 1) &&
3825            (bargs->target & ~allowed) == 0)
3826                return true;
3827
3828        btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3829                        type, btrfs_bg_type_to_raid_name(bargs->target));
3830        return false;
3831}
3832
3833/*
3834 * Fill @buf with textual description of balance filter flags @bargs, up to
3835 * @size_buf including the terminating null. The output may be trimmed if it
3836 * does not fit into the provided buffer.
3837 */
3838static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3839                                 u32 size_buf)
3840{
3841        int ret;
3842        u32 size_bp = size_buf;
3843        char *bp = buf;
3844        u64 flags = bargs->flags;
3845        char tmp_buf[128] = {'\0'};
3846
3847        if (!flags)
3848                return;
3849
3850#define CHECK_APPEND_NOARG(a)                                           \
3851        do {                                                            \
3852                ret = snprintf(bp, size_bp, (a));                       \
3853                if (ret < 0 || ret >= size_bp)                          \
3854                        goto out_overflow;                              \
3855                size_bp -= ret;                                         \
3856                bp += ret;                                              \
3857        } while (0)
3858
3859#define CHECK_APPEND_1ARG(a, v1)                                        \
3860        do {                                                            \
3861                ret = snprintf(bp, size_bp, (a), (v1));                 \
3862                if (ret < 0 || ret >= size_bp)                          \
3863                        goto out_overflow;                              \
3864                size_bp -= ret;                                         \
3865                bp += ret;                                              \
3866        } while (0)
3867
3868#define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3869        do {                                                            \
3870                ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3871                if (ret < 0 || ret >= size_bp)                          \
3872                        goto out_overflow;                              \
3873                size_bp -= ret;                                         \
3874                bp += ret;                                              \
3875        } while (0)
3876
3877        if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3878                CHECK_APPEND_1ARG("convert=%s,",
3879                                  btrfs_bg_type_to_raid_name(bargs->target));
3880
3881        if (flags & BTRFS_BALANCE_ARGS_SOFT)
3882                CHECK_APPEND_NOARG("soft,");
3883
3884        if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3885                btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3886                                            sizeof(tmp_buf));
3887                CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3888        }
3889
3890        if (flags & BTRFS_BALANCE_ARGS_USAGE)
3891                CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3892
3893        if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3894                CHECK_APPEND_2ARG("usage=%u..%u,",
3895                                  bargs->usage_min, bargs->usage_max);
3896
3897        if (flags & BTRFS_BALANCE_ARGS_DEVID)
3898                CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3899
3900        if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3901                CHECK_APPEND_2ARG("drange=%llu..%llu,",
3902                                  bargs->pstart, bargs->pend);
3903
3904        if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3905                CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3906                                  bargs->vstart, bargs->vend);
3907
3908        if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3909                CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3910
3911        if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3912                CHECK_APPEND_2ARG("limit=%u..%u,",
3913                                bargs->limit_min, bargs->limit_max);
3914
3915        if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3916                CHECK_APPEND_2ARG("stripes=%u..%u,",
3917                                  bargs->stripes_min, bargs->stripes_max);
3918
3919#undef CHECK_APPEND_2ARG
3920#undef CHECK_APPEND_1ARG
3921#undef CHECK_APPEND_NOARG
3922
3923out_overflow:
3924
3925        if (size_bp < size_buf)
3926                buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3927        else
3928                buf[0] = '\0';
3929}
3930
3931static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3932{
3933        u32 size_buf = 1024;
3934        char tmp_buf[192] = {'\0'};
3935        char *buf;
3936        char *bp;
3937        u32 size_bp = size_buf;
3938        int ret;
3939        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3940
3941        buf = kzalloc(size_buf, GFP_KERNEL);
3942        if (!buf)
3943                return;
3944
3945        bp = buf;
3946
3947#define CHECK_APPEND_1ARG(a, v1)                                        \
3948        do {                                                            \
3949                ret = snprintf(bp, size_bp, (a), (v1));                 \
3950                if (ret < 0 || ret >= size_bp)                          \
3951                        goto out_overflow;                              \
3952                size_bp -= ret;                                         \
3953                bp += ret;                                              \
3954        } while (0)
3955
3956        if (bctl->flags & BTRFS_BALANCE_FORCE)
3957                CHECK_APPEND_1ARG("%s", "-f ");
3958
3959        if (bctl->flags & BTRFS_BALANCE_DATA) {
3960                describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
3961                CHECK_APPEND_1ARG("-d%s ", tmp_buf);
3962        }
3963
3964        if (bctl->flags & BTRFS_BALANCE_METADATA) {
3965                describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
3966                CHECK_APPEND_1ARG("-m%s ", tmp_buf);
3967        }
3968
3969        if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
3970                describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
3971                CHECK_APPEND_1ARG("-s%s ", tmp_buf);
3972        }
3973
3974#undef CHECK_APPEND_1ARG
3975
3976out_overflow:
3977
3978        if (size_bp < size_buf)
3979                buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
3980        btrfs_info(fs_info, "balance: %s %s",
3981                   (bctl->flags & BTRFS_BALANCE_RESUME) ?
3982                   "resume" : "start", buf);
3983
3984        kfree(buf);
3985}
3986
3987/*
3988 * Should be called with balance mutexe held
3989 */
3990int btrfs_balance(struct btrfs_fs_info *fs_info,
3991                  struct btrfs_balance_control *bctl,
3992                  struct btrfs_ioctl_balance_args *bargs)
3993{
3994        u64 meta_target, data_target;
3995        u64 allowed;
3996        int mixed = 0;
3997        int ret;
3998        u64 num_devices;
3999        unsigned seq;
4000        bool reducing_redundancy;
4001        int i;
4002
4003        if (btrfs_fs_closing(fs_info) ||
4004            atomic_read(&fs_info->balance_pause_req) ||
4005            btrfs_should_cancel_balance(fs_info)) {
4006                ret = -EINVAL;
4007                goto out;
4008        }
4009
4010        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4011        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4012                mixed = 1;
4013
4014        /*
4015         * In case of mixed groups both data and meta should be picked,
4016         * and identical options should be given for both of them.
4017         */
4018        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4019        if (mixed && (bctl->flags & allowed)) {
4020                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4021                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4022                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4023                        btrfs_err(fs_info,
4024          "balance: mixed groups data and metadata options must be the same");
4025                        ret = -EINVAL;
4026                        goto out;
4027                }
4028        }
4029
4030        /*
4031         * rw_devices will not change at the moment, device add/delete/replace
4032         * are excluded by EXCL_OP
4033         */
4034        num_devices = fs_info->fs_devices->rw_devices;
4035
4036        /*
4037         * SINGLE profile on-disk has no profile bit, but in-memory we have a
4038         * special bit for it, to make it easier to distinguish.  Thus we need
4039         * to set it manually, or balance would refuse the profile.
4040         */
4041        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4042        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4043                if (num_devices >= btrfs_raid_array[i].devs_min)
4044                        allowed |= btrfs_raid_array[i].bg_flag;
4045
4046        if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4047            !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4048            !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4049                ret = -EINVAL;
4050                goto out;
4051        }
4052
4053        /*
4054         * Allow to reduce metadata or system integrity only if force set for
4055         * profiles with redundancy (copies, parity)
4056         */
4057        allowed = 0;
4058        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4059                if (btrfs_raid_array[i].ncopies >= 2 ||
4060                    btrfs_raid_array[i].tolerated_failures >= 1)
4061                        allowed |= btrfs_raid_array[i].bg_flag;
4062        }
4063        do {
4064                seq = read_seqbegin(&fs_info->profiles_lock);
4065
4066                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4067                     (fs_info->avail_system_alloc_bits & allowed) &&
4068                     !(bctl->sys.target & allowed)) ||
4069                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4070                     (fs_info->avail_metadata_alloc_bits & allowed) &&
4071                     !(bctl->meta.target & allowed)))
4072                        reducing_redundancy = true;
4073                else
4074                        reducing_redundancy = false;
4075
4076                /* if we're not converting, the target field is uninitialized */
4077                meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4078                        bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4079                data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4080                        bctl->data.target : fs_info->avail_data_alloc_bits;
4081        } while (read_seqretry(&fs_info->profiles_lock, seq));
4082
4083        if (reducing_redundancy) {
4084                if (bctl->flags & BTRFS_BALANCE_FORCE) {
4085                        btrfs_info(fs_info,
4086                           "balance: force reducing metadata redundancy");
4087                } else {
4088                        btrfs_err(fs_info,
4089        "balance: reduces metadata redundancy, use --force if you want this");
4090                        ret = -EINVAL;
4091                        goto out;
4092                }
4093        }
4094
4095        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4096                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4097                btrfs_warn(fs_info,
4098        "balance: metadata profile %s has lower redundancy than data profile %s",
4099                                btrfs_bg_type_to_raid_name(meta_target),
4100                                btrfs_bg_type_to_raid_name(data_target));
4101        }
4102
4103        if (fs_info->send_in_progress) {
4104                btrfs_warn_rl(fs_info,
4105"cannot run balance while send operations are in progress (%d in progress)",
4106                              fs_info->send_in_progress);
4107                ret = -EAGAIN;
4108                goto out;
4109        }
4110
4111        ret = insert_balance_item(fs_info, bctl);
4112        if (ret && ret != -EEXIST)
4113                goto out;
4114
4115        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4116                BUG_ON(ret == -EEXIST);
4117                BUG_ON(fs_info->balance_ctl);
4118                spin_lock(&fs_info->balance_lock);
4119                fs_info->balance_ctl = bctl;
4120                spin_unlock(&fs_info->balance_lock);
4121        } else {
4122                BUG_ON(ret != -EEXIST);
4123                spin_lock(&fs_info->balance_lock);
4124                update_balance_args(bctl);
4125                spin_unlock(&fs_info->balance_lock);
4126        }
4127
4128        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4129        set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4130        describe_balance_start_or_resume(fs_info);
4131        mutex_unlock(&fs_info->balance_mutex);
4132
4133        ret = __btrfs_balance(fs_info);
4134
4135        mutex_lock(&fs_info->balance_mutex);
4136        if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4137                btrfs_info(fs_info, "balance: paused");
4138        else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
4139                btrfs_info(fs_info, "balance: canceled");
4140        else
4141                btrfs_info(fs_info, "balance: ended with status: %d", ret);
4142
4143        clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4144
4145        if (bargs) {
4146                memset(bargs, 0, sizeof(*bargs));
4147                btrfs_update_ioctl_balance_args(fs_info, bargs);
4148        }
4149
4150        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4151            balance_need_close(fs_info)) {
4152                reset_balance_state(fs_info);
4153                clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4154        }
4155
4156        wake_up(&fs_info->balance_wait_q);
4157
4158        return ret;
4159out:
4160        if (bctl->flags & BTRFS_BALANCE_RESUME)
4161                reset_balance_state(fs_info);
4162        else
4163                kfree(bctl);
4164        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4165
4166        return ret;
4167}
4168
4169static int balance_kthread(void *data)
4170{
4171        struct btrfs_fs_info *fs_info = data;
4172        int ret = 0;
4173
4174        mutex_lock(&fs_info->balance_mutex);
4175        if (fs_info->balance_ctl)
4176                ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4177        mutex_unlock(&fs_info->balance_mutex);
4178
4179        return ret;
4180}
4181
4182int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4183{
4184        struct task_struct *tsk;
4185
4186        mutex_lock(&fs_info->balance_mutex);
4187        if (!fs_info->balance_ctl) {
4188                mutex_unlock(&fs_info->balance_mutex);
4189                return 0;
4190        }
4191        mutex_unlock(&fs_info->balance_mutex);
4192
4193        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4194                btrfs_info(fs_info, "balance: resume skipped");
4195                return 0;
4196        }
4197
4198        /*
4199         * A ro->rw remount sequence should continue with the paused balance
4200         * regardless of who pauses it, system or the user as of now, so set
4201         * the resume flag.
4202         */
4203        spin_lock(&fs_info->balance_lock);
4204        fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4205        spin_unlock(&fs_info->balance_lock);
4206
4207        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4208        return PTR_ERR_OR_ZERO(tsk);
4209}
4210
4211int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4212{
4213        struct btrfs_balance_control *bctl;
4214        struct btrfs_balance_item *item;
4215        struct btrfs_disk_balance_args disk_bargs;
4216        struct btrfs_path *path;
4217        struct extent_buffer *leaf;
4218        struct btrfs_key key;
4219        int ret;
4220
4221        path = btrfs_alloc_path();
4222        if (!path)
4223                return -ENOMEM;
4224
4225        key.objectid = BTRFS_BALANCE_OBJECTID;
4226        key.type = BTRFS_TEMPORARY_ITEM_KEY;
4227        key.offset = 0;
4228
4229        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4230        if (ret < 0)
4231                goto out;
4232        if (ret > 0) { /* ret = -ENOENT; */
4233                ret = 0;
4234                goto out;
4235        }
4236
4237        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4238        if (!bctl) {
4239                ret = -ENOMEM;
4240                goto out;
4241        }
4242
4243        leaf = path->nodes[0];
4244        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4245
4246        bctl->flags = btrfs_balance_flags(leaf, item);
4247        bctl->flags |= BTRFS_BALANCE_RESUME;
4248
4249        btrfs_balance_data(leaf, item, &disk_bargs);
4250        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4251        btrfs_balance_meta(leaf, item, &disk_bargs);
4252        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4253        btrfs_balance_sys(leaf, item, &disk_bargs);
4254        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4255
4256        /*
4257         * This should never happen, as the paused balance state is recovered
4258         * during mount without any chance of other exclusive ops to collide.
4259         *
4260         * This gives the exclusive op status to balance and keeps in paused
4261         * state until user intervention (cancel or umount). If the ownership
4262         * cannot be assigned, show a message but do not fail. The balance
4263         * is in a paused state and must have fs_info::balance_ctl properly
4264         * set up.
4265         */
4266        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
4267                btrfs_warn(fs_info,
4268        "balance: cannot set exclusive op status, resume manually");
4269
4270        mutex_lock(&fs_info->balance_mutex);
4271        BUG_ON(fs_info->balance_ctl);
4272        spin_lock(&fs_info->balance_lock);
4273        fs_info->balance_ctl = bctl;
4274        spin_unlock(&fs_info->balance_lock);
4275        mutex_unlock(&fs_info->balance_mutex);
4276out:
4277        btrfs_free_path(path);
4278        return ret;
4279}
4280
4281int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4282{
4283        int ret = 0;
4284
4285        mutex_lock(&fs_info->balance_mutex);
4286        if (!fs_info->balance_ctl) {
4287                mutex_unlock(&fs_info->balance_mutex);
4288                return -ENOTCONN;
4289        }
4290
4291        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4292                atomic_inc(&fs_info->balance_pause_req);
4293                mutex_unlock(&fs_info->balance_mutex);
4294
4295                wait_event(fs_info->balance_wait_q,
4296                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4297
4298                mutex_lock(&fs_info->balance_mutex);
4299                /* we are good with balance_ctl ripped off from under us */
4300                BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4301                atomic_dec(&fs_info->balance_pause_req);
4302        } else {
4303                ret = -ENOTCONN;
4304        }
4305
4306        mutex_unlock(&fs_info->balance_mutex);
4307        return ret;
4308}
4309
4310int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4311{
4312        mutex_lock(&fs_info->balance_mutex);
4313        if (!fs_info->balance_ctl) {
4314                mutex_unlock(&fs_info->balance_mutex);
4315                return -ENOTCONN;
4316        }
4317
4318        /*
4319         * A paused balance with the item stored on disk can be resumed at
4320         * mount time if the mount is read-write. Otherwise it's still paused
4321         * and we must not allow cancelling as it deletes the item.
4322         */
4323        if (sb_rdonly(fs_info->sb)) {
4324                mutex_unlock(&fs_info->balance_mutex);
4325                return -EROFS;
4326        }
4327
4328        atomic_inc(&fs_info->balance_cancel_req);
4329        /*
4330         * if we are running just wait and return, balance item is
4331         * deleted in btrfs_balance in this case
4332         */
4333        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4334                mutex_unlock(&fs_info->balance_mutex);
4335                wait_event(fs_info->balance_wait_q,
4336                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4337                mutex_lock(&fs_info->balance_mutex);
4338        } else {
4339                mutex_unlock(&fs_info->balance_mutex);
4340                /*
4341                 * Lock released to allow other waiters to continue, we'll
4342                 * reexamine the status again.
4343                 */
4344                mutex_lock(&fs_info->balance_mutex);
4345
4346                if (fs_info->balance_ctl) {
4347                        reset_balance_state(fs_info);
4348                        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4349                        btrfs_info(fs_info, "balance: canceled");
4350                }
4351        }
4352
4353        BUG_ON(fs_info->balance_ctl ||
4354                test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4355        atomic_dec(&fs_info->balance_cancel_req);
4356        mutex_unlock(&fs_info->balance_mutex);
4357        return 0;
4358}
4359
4360int btrfs_uuid_scan_kthread(void *data)
4361{
4362        struct btrfs_fs_info *fs_info = data;
4363        struct btrfs_root *root = fs_info->tree_root;
4364        struct btrfs_key key;
4365        struct btrfs_path *path = NULL;
4366        int ret = 0;
4367        struct extent_buffer *eb;
4368        int slot;
4369        struct btrfs_root_item root_item;
4370        u32 item_size;
4371        struct btrfs_trans_handle *trans = NULL;
4372        bool closing = false;
4373
4374        path = btrfs_alloc_path();
4375        if (!path) {
4376                ret = -ENOMEM;
4377                goto out;
4378        }
4379
4380        key.objectid = 0;
4381        key.type = BTRFS_ROOT_ITEM_KEY;
4382        key.offset = 0;
4383
4384        while (1) {
4385                if (btrfs_fs_closing(fs_info)) {
4386                        closing = true;
4387                        break;
4388                }
4389                ret = btrfs_search_forward(root, &key, path,
4390                                BTRFS_OLDEST_GENERATION);
4391                if (ret) {
4392                        if (ret > 0)
4393                                ret = 0;
4394                        break;
4395                }
4396
4397                if (key.type != BTRFS_ROOT_ITEM_KEY ||
4398                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4399                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4400                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4401                        goto skip;
4402
4403                eb = path->nodes[0];
4404                slot = path->slots[0];
4405                item_size = btrfs_item_size_nr(eb, slot);
4406                if (item_size < sizeof(root_item))
4407                        goto skip;
4408
4409                read_extent_buffer(eb, &root_item,
4410                                   btrfs_item_ptr_offset(eb, slot),
4411                                   (int)sizeof(root_item));
4412                if (btrfs_root_refs(&root_item) == 0)
4413                        goto skip;
4414
4415                if (!btrfs_is_empty_uuid(root_item.uuid) ||
4416                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
4417                        if (trans)
4418                                goto update_tree;
4419
4420                        btrfs_release_path(path);
4421                        /*
4422                         * 1 - subvol uuid item
4423                         * 1 - received_subvol uuid item
4424                         */
4425                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4426                        if (IS_ERR(trans)) {
4427                                ret = PTR_ERR(trans);
4428                                break;
4429                        }
4430                        continue;
4431                } else {
4432                        goto skip;
4433                }
4434update_tree:
4435                if (!btrfs_is_empty_uuid(root_item.uuid)) {
4436                        ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4437                                                  BTRFS_UUID_KEY_SUBVOL,
4438                                                  key.objectid);
4439                        if (ret < 0) {
4440                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4441                                        ret);
4442                                break;
4443                        }
4444                }
4445
4446                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4447                        ret = btrfs_uuid_tree_add(trans,
4448                                                  root_item.received_uuid,
4449                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4450                                                  key.objectid);
4451                        if (ret < 0) {
4452                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4453                                        ret);
4454                                break;
4455                        }
4456                }
4457
4458skip:
4459                if (trans) {
4460                        ret = btrfs_end_transaction(trans);
4461                        trans = NULL;
4462                        if (ret)
4463                                break;
4464                }
4465
4466                btrfs_release_path(path);
4467                if (key.offset < (u64)-1) {
4468                        key.offset++;
4469                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4470                        key.offset = 0;
4471                        key.type = BTRFS_ROOT_ITEM_KEY;
4472                } else if (key.objectid < (u64)-1) {
4473                        key.offset = 0;
4474                        key.type = BTRFS_ROOT_ITEM_KEY;
4475                        key.objectid++;
4476                } else {
4477                        break;
4478                }
4479                cond_resched();
4480        }
4481
4482out:
4483        btrfs_free_path(path);
4484        if (trans && !IS_ERR(trans))
4485                btrfs_end_transaction(trans);
4486        if (ret)
4487                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4488        else if (!closing)
4489                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4490        up(&fs_info->uuid_tree_rescan_sem);
4491        return 0;
4492}
4493
4494int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4495{
4496        struct btrfs_trans_handle *trans;
4497        struct btrfs_root *tree_root = fs_info->tree_root;
4498        struct btrfs_root *uuid_root;
4499        struct task_struct *task;
4500        int ret;
4501
4502        /*
4503         * 1 - root node
4504         * 1 - root item
4505         */
4506        trans = btrfs_start_transaction(tree_root, 2);
4507        if (IS_ERR(trans))
4508                return PTR_ERR(trans);
4509
4510        uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4511        if (IS_ERR(uuid_root)) {
4512                ret = PTR_ERR(uuid_root);
4513                btrfs_abort_transaction(trans, ret);
4514                btrfs_end_transaction(trans);
4515                return ret;
4516        }
4517
4518        fs_info->uuid_root = uuid_root;
4519
4520        ret = btrfs_commit_transaction(trans);
4521        if (ret)
4522                return ret;
4523
4524        down(&fs_info->uuid_tree_rescan_sem);
4525        task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4526        if (IS_ERR(task)) {
4527                /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4528                btrfs_warn(fs_info, "failed to start uuid_scan task");
4529                up(&fs_info->uuid_tree_rescan_sem);
4530                return PTR_ERR(task);
4531        }
4532
4533        return 0;
4534}
4535
4536/*
4537 * shrinking a device means finding all of the device extents past
4538 * the new size, and then following the back refs to the chunks.
4539 * The chunk relocation code actually frees the device extent
4540 */
4541int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4542{
4543        struct btrfs_fs_info *fs_info = device->fs_info;
4544        struct btrfs_root *root = fs_info->dev_root;
4545        struct btrfs_trans_handle *trans;
4546        struct btrfs_dev_extent *dev_extent = NULL;
4547        struct btrfs_path *path;
4548        u64 length;
4549        u64 chunk_offset;
4550        int ret;
4551        int slot;
4552        int failed = 0;
4553        bool retried = false;
4554        struct extent_buffer *l;
4555        struct btrfs_key key;
4556        struct btrfs_super_block *super_copy = fs_info->super_copy;
4557        u64 old_total = btrfs_super_total_bytes(super_copy);
4558        u64 old_size = btrfs_device_get_total_bytes(device);
4559        u64 diff;
4560        u64 start;
4561
4562        new_size = round_down(new_size, fs_info->sectorsize);
4563        start = new_size;
4564        diff = round_down(old_size - new_size, fs_info->sectorsize);
4565
4566        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4567                return -EINVAL;
4568
4569        path = btrfs_alloc_path();
4570        if (!path)
4571                return -ENOMEM;
4572
4573        path->reada = READA_BACK;
4574
4575        trans = btrfs_start_transaction(root, 0);
4576        if (IS_ERR(trans)) {
4577                btrfs_free_path(path);
4578                return PTR_ERR(trans);
4579        }
4580
4581        mutex_lock(&fs_info->chunk_mutex);
4582
4583        btrfs_device_set_total_bytes(device, new_size);
4584        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4585                device->fs_devices->total_rw_bytes -= diff;
4586                atomic64_sub(diff, &fs_info->free_chunk_space);
4587        }
4588
4589        /*
4590         * Once the device's size has been set to the new size, ensure all
4591         * in-memory chunks are synced to disk so that the loop below sees them
4592         * and relocates them accordingly.
4593         */
4594        if (contains_pending_extent(device, &start, diff)) {
4595                mutex_unlock(&fs_info->chunk_mutex);
4596                ret = btrfs_commit_transaction(trans);
4597                if (ret)
4598                        goto done;
4599        } else {
4600                mutex_unlock(&fs_info->chunk_mutex);
4601                btrfs_end_transaction(trans);
4602        }
4603
4604again:
4605        key.objectid = device->devid;
4606        key.offset = (u64)-1;
4607        key.type = BTRFS_DEV_EXTENT_KEY;
4608
4609        do {
4610                mutex_lock(&fs_info->delete_unused_bgs_mutex);
4611                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4612                if (ret < 0) {
4613                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4614                        goto done;
4615                }
4616
4617                ret = btrfs_previous_item(root, path, 0, key.type);
4618                if (ret)
4619                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4620                if (ret < 0)
4621                        goto done;
4622                if (ret) {
4623                        ret = 0;
4624                        btrfs_release_path(path);
4625                        break;
4626                }
4627
4628                l = path->nodes[0];
4629                slot = path->slots[0];
4630                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4631
4632                if (key.objectid != device->devid) {
4633                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4634                        btrfs_release_path(path);
4635                        break;
4636                }
4637
4638                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4639                length = btrfs_dev_extent_length(l, dev_extent);
4640
4641                if (key.offset + length <= new_size) {
4642                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4643                        btrfs_release_path(path);
4644                        break;
4645                }
4646
4647                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4648                btrfs_release_path(path);
4649
4650                /*
4651                 * We may be relocating the only data chunk we have,
4652                 * which could potentially end up with losing data's
4653                 * raid profile, so lets allocate an empty one in
4654                 * advance.
4655                 */
4656                ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4657                if (ret < 0) {
4658                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4659                        goto done;
4660                }
4661
4662                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4663                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4664                if (ret == -ENOSPC) {
4665                        failed++;
4666                } else if (ret) {
4667                        if (ret == -ETXTBSY) {
4668                                btrfs_warn(fs_info,
4669                   "could not shrink block group %llu due to active swapfile",
4670                                           chunk_offset);
4671                        }
4672                        goto done;
4673                }
4674        } while (key.offset-- > 0);
4675
4676        if (failed && !retried) {
4677                failed = 0;
4678                retried = true;
4679                goto again;
4680        } else if (failed && retried) {
4681                ret = -ENOSPC;
4682                goto done;
4683        }
4684
4685        /* Shrinking succeeded, else we would be at "done". */
4686        trans = btrfs_start_transaction(root, 0);
4687        if (IS_ERR(trans)) {
4688                ret = PTR_ERR(trans);
4689                goto done;
4690        }
4691
4692        mutex_lock(&fs_info->chunk_mutex);
4693        btrfs_device_set_disk_total_bytes(device, new_size);
4694        if (list_empty(&device->post_commit_list))
4695                list_add_tail(&device->post_commit_list,
4696                              &trans->transaction->dev_update_list);
4697
4698        WARN_ON(diff > old_total);
4699        btrfs_set_super_total_bytes(super_copy,
4700                        round_down(old_total - diff, fs_info->sectorsize));
4701        mutex_unlock(&fs_info->chunk_mutex);
4702
4703        /* Now btrfs_update_device() will change the on-disk size. */
4704        ret = btrfs_update_device(trans, device);
4705        if (ret < 0) {
4706                btrfs_abort_transaction(trans, ret);
4707                btrfs_end_transaction(trans);
4708        } else {
4709                ret = btrfs_commit_transaction(trans);
4710        }
4711done:
4712        btrfs_free_path(path);
4713        if (ret) {
4714                mutex_lock(&fs_info->chunk_mutex);
4715                btrfs_device_set_total_bytes(device, old_size);
4716                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4717                        device->fs_devices->total_rw_bytes += diff;
4718                atomic64_add(diff, &fs_info->free_chunk_space);
4719                mutex_unlock(&fs_info->chunk_mutex);
4720        }
4721        return ret;
4722}
4723
4724static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4725                           struct btrfs_key *key,
4726                           struct btrfs_chunk *chunk, int item_size)
4727{
4728        struct btrfs_super_block *super_copy = fs_info->super_copy;
4729        struct btrfs_disk_key disk_key;
4730        u32 array_size;
4731        u8 *ptr;
4732
4733        mutex_lock(&fs_info->chunk_mutex);
4734        array_size = btrfs_super_sys_array_size(super_copy);
4735        if (array_size + item_size + sizeof(disk_key)
4736                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4737                mutex_unlock(&fs_info->chunk_mutex);
4738                return -EFBIG;
4739        }
4740
4741        ptr = super_copy->sys_chunk_array + array_size;
4742        btrfs_cpu_key_to_disk(&disk_key, key);
4743        memcpy(ptr, &disk_key, sizeof(disk_key));
4744        ptr += sizeof(disk_key);
4745        memcpy(ptr, chunk, item_size);
4746        item_size += sizeof(disk_key);
4747        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4748        mutex_unlock(&fs_info->chunk_mutex);
4749
4750        return 0;
4751}
4752
4753/*
4754 * sort the devices in descending order by max_avail, total_avail
4755 */
4756static int btrfs_cmp_device_info(const void *a, const void *b)
4757{
4758        const struct btrfs_device_info *di_a = a;
4759        const struct btrfs_device_info *di_b = b;
4760
4761        if (di_a->max_avail > di_b->max_avail)
4762                return -1;
4763        if (di_a->max_avail < di_b->max_avail)
4764                return 1;
4765        if (di_a->total_avail > di_b->total_avail)
4766                return -1;
4767        if (di_a->total_avail < di_b->total_avail)
4768                return 1;
4769        return 0;
4770}
4771
4772static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4773{
4774        if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4775                return;
4776
4777        btrfs_set_fs_incompat(info, RAID56);
4778}
4779
4780static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4781{
4782        if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4783                return;
4784
4785        btrfs_set_fs_incompat(info, RAID1C34);
4786}
4787
4788/*
4789 * Structure used internally for __btrfs_alloc_chunk() function.
4790 * Wraps needed parameters.
4791 */
4792struct alloc_chunk_ctl {
4793        u64 start;
4794        u64 type;
4795        /* Total number of stripes to allocate */
4796        int num_stripes;
4797        /* sub_stripes info for map */
4798        int sub_stripes;
4799        /* Stripes per device */
4800        int dev_stripes;
4801        /* Maximum number of devices to use */
4802        int devs_max;
4803        /* Minimum number of devices to use */
4804        int devs_min;
4805        /* ndevs has to be a multiple of this */
4806        int devs_increment;
4807        /* Number of copies */
4808        int ncopies;
4809        /* Number of stripes worth of bytes to store parity information */
4810        int nparity;
4811        u64 max_stripe_size;
4812        u64 max_chunk_size;
4813        u64 dev_extent_min;
4814        u64 stripe_size;
4815        u64 chunk_size;
4816        int ndevs;
4817};
4818
4819static void init_alloc_chunk_ctl_policy_regular(
4820                                struct btrfs_fs_devices *fs_devices,
4821                                struct alloc_chunk_ctl *ctl)
4822{
4823        u64 type = ctl->type;
4824
4825        if (type & BTRFS_BLOCK_GROUP_DATA) {
4826                ctl->max_stripe_size = SZ_1G;
4827                ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4828        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4829                /* For larger filesystems, use larger metadata chunks */
4830                if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4831                        ctl->max_stripe_size = SZ_1G;
4832                else
4833                        ctl->max_stripe_size = SZ_256M;
4834                ctl->max_chunk_size = ctl->max_stripe_size;
4835        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4836                ctl->max_stripe_size = SZ_32M;
4837                ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4838                ctl->devs_max = min_t(int, ctl->devs_max,
4839                                      BTRFS_MAX_DEVS_SYS_CHUNK);
4840        } else {
4841                BUG();
4842        }
4843
4844        /* We don't want a chunk larger than 10% of writable space */
4845        ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4846                                  ctl->max_chunk_size);
4847        ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4848}
4849
4850static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4851                                 struct alloc_chunk_ctl *ctl)
4852{
4853        int index = btrfs_bg_flags_to_raid_index(ctl->type);
4854
4855        ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4856        ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4857        ctl->devs_max = btrfs_raid_array[index].devs_max;
4858        if (!ctl->devs_max)
4859                ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4860        ctl->devs_min = btrfs_raid_array[index].devs_min;
4861        ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4862        ctl->ncopies = btrfs_raid_array[index].ncopies;
4863        ctl->nparity = btrfs_raid_array[index].nparity;
4864        ctl->ndevs = 0;
4865
4866        switch (fs_devices->chunk_alloc_policy) {
4867        case BTRFS_CHUNK_ALLOC_REGULAR:
4868                init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
4869                break;
4870        default:
4871                BUG();
4872        }
4873}
4874
4875static int gather_device_info(struct btrfs_fs_devices *fs_devices,
4876                              struct alloc_chunk_ctl *ctl,
4877                              struct btrfs_device_info *devices_info)
4878{
4879        struct btrfs_fs_info *info = fs_devices->fs_info;
4880        struct btrfs_device *device;
4881        u64 total_avail;
4882        u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
4883        int ret;
4884        int ndevs = 0;
4885        u64 max_avail;
4886        u64 dev_offset;
4887
4888        /*
4889         * in the first pass through the devices list, we gather information
4890         * about the available holes on each device.
4891         */
4892        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4893                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4894                        WARN(1, KERN_ERR
4895                               "BTRFS: read-only device in alloc_list\n");
4896                        continue;
4897                }
4898
4899                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
4900                                        &device->dev_state) ||
4901                    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4902                        continue;
4903
4904                if (device->total_bytes > device->bytes_used)
4905                        total_avail = device->total_bytes - device->bytes_used;
4906                else
4907                        total_avail = 0;
4908
4909                /* If there is no space on this device, skip it. */
4910                if (total_avail < ctl->dev_extent_min)
4911                        continue;
4912
4913                ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
4914                                           &max_avail);
4915                if (ret && ret != -ENOSPC)
4916                        return ret;
4917
4918                if (ret == 0)
4919                        max_avail = dev_extent_want;
4920
4921                if (max_avail < ctl->dev_extent_min) {
4922                        if (btrfs_test_opt(info, ENOSPC_DEBUG))
4923                                btrfs_debug(info,
4924                        "%s: devid %llu has no free space, have=%llu want=%llu",
4925                                            __func__, device->devid, max_avail,
4926                                            ctl->dev_extent_min);
4927                        continue;
4928                }
4929
4930                if (ndevs == fs_devices->rw_devices) {
4931                        WARN(1, "%s: found more than %llu devices\n",
4932                             __func__, fs_devices->rw_devices);
4933                        break;
4934                }
4935                devices_info[ndevs].dev_offset = dev_offset;
4936                devices_info[ndevs].max_avail = max_avail;
4937                devices_info[ndevs].total_avail = total_avail;
4938                devices_info[ndevs].dev = device;
4939                ++ndevs;
4940        }
4941        ctl->ndevs = ndevs;
4942
4943        /*
4944         * now sort the devices by hole size / available space
4945         */
4946        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4947             btrfs_cmp_device_info, NULL);
4948
4949        return 0;
4950}
4951
4952static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
4953                                      struct btrfs_device_info *devices_info)
4954{
4955        /* Number of stripes that count for block group size */
4956        int data_stripes;
4957
4958        /*
4959         * The primary goal is to maximize the number of stripes, so use as
4960         * many devices as possible, even if the stripes are not maximum sized.
4961         *
4962         * The DUP profile stores more than one stripe per device, the
4963         * max_avail is the total size so we have to adjust.
4964         */
4965        ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
4966                                   ctl->dev_stripes);
4967        ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
4968
4969        /* This will have to be fixed for RAID1 and RAID10 over more drives */
4970        data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
4971
4972        /*
4973         * Use the number of data stripes to figure out how big this chunk is
4974         * really going to be in terms of logical address space, and compare
4975         * that answer with the max chunk size. If it's higher, we try to
4976         * reduce stripe_size.
4977         */
4978        if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
4979                /*
4980                 * Reduce stripe_size, round it up to a 16MB boundary again and
4981                 * then use it, unless it ends up being even bigger than the
4982                 * previous value we had already.
4983                 */
4984                ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
4985                                                        data_stripes), SZ_16M),
4986                                       ctl->stripe_size);
4987        }
4988
4989        /* Align to BTRFS_STRIPE_LEN */
4990        ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
4991        ctl->chunk_size = ctl->stripe_size * data_stripes;
4992
4993        return 0;
4994}
4995
4996static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
4997                              struct alloc_chunk_ctl *ctl,
4998                              struct btrfs_device_info *devices_info)
4999{
5000        struct btrfs_fs_info *info = fs_devices->fs_info;
5001
5002        /*
5003         * Round down to number of usable stripes, devs_increment can be any
5004         * number so we can't use round_down() that requires power of 2, while
5005         * rounddown is safe.
5006         */
5007        ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5008
5009        if (ctl->ndevs < ctl->devs_min) {
5010                if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5011                        btrfs_debug(info,
5012        "%s: not enough devices with free space: have=%d minimum required=%d",
5013                                    __func__, ctl->ndevs, ctl->devs_min);
5014                }
5015                return -ENOSPC;
5016        }
5017
5018        ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5019
5020        switch (fs_devices->chunk_alloc_policy) {
5021        case BTRFS_CHUNK_ALLOC_REGULAR:
5022                return decide_stripe_size_regular(ctl, devices_info);
5023        default:
5024                BUG();
5025        }
5026}
5027
5028static int create_chunk(struct btrfs_trans_handle *trans,
5029                        struct alloc_chunk_ctl *ctl,
5030                        struct btrfs_device_info *devices_info)
5031{
5032        struct btrfs_fs_info *info = trans->fs_info;
5033        struct map_lookup *map = NULL;
5034        struct extent_map_tree *em_tree;
5035        struct extent_map *em;
5036        u64 start = ctl->start;
5037        u64 type = ctl->type;
5038        int ret;
5039        int i;
5040        int j;
5041
5042        map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5043        if (!map)
5044                return -ENOMEM;
5045        map->num_stripes = ctl->num_stripes;
5046
5047        for (i = 0; i < ctl->ndevs; ++i) {
5048                for (j = 0; j < ctl->dev_stripes; ++j) {
5049                        int s = i * ctl->dev_stripes + j;
5050                        map->stripes[s].dev = devices_info[i].dev;
5051                        map->stripes[s].physical = devices_info[i].dev_offset +
5052                                                   j * ctl->stripe_size;
5053                }
5054        }
5055        map->stripe_len = BTRFS_STRIPE_LEN;
5056        map->io_align = BTRFS_STRIPE_LEN;
5057        map->io_width = BTRFS_STRIPE_LEN;
5058        map->type = type;
5059        map->sub_stripes = ctl->sub_stripes;
5060
5061        trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5062
5063        em = alloc_extent_map();
5064        if (!em) {
5065                kfree(map);
5066                return -ENOMEM;
5067        }
5068        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5069        em->map_lookup = map;
5070        em->start = start;
5071        em->len = ctl->chunk_size;
5072        em->block_start = 0;
5073        em->block_len = em->len;
5074        em->orig_block_len = ctl->stripe_size;
5075
5076        em_tree = &info->mapping_tree;
5077        write_lock(&em_tree->lock);
5078        ret = add_extent_mapping(em_tree, em, 0);
5079        if (ret) {
5080                write_unlock(&em_tree->lock);
5081                free_extent_map(em);
5082                return ret;
5083        }
5084        write_unlock(&em_tree->lock);
5085
5086        ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5087        if (ret)
5088                goto error_del_extent;
5089
5090        for (i = 0; i < map->num_stripes; i++) {
5091                struct btrfs_device *dev = map->stripes[i].dev;
5092
5093                btrfs_device_set_bytes_used(dev,
5094                                            dev->bytes_used + ctl->stripe_size);
5095                if (list_empty(&dev->post_commit_list))
5096                        list_add_tail(&dev->post_commit_list,
5097                                      &trans->transaction->dev_update_list);
5098        }
5099
5100        atomic64_sub(ctl->stripe_size * map->num_stripes,
5101                     &info->free_chunk_space);
5102
5103        free_extent_map(em);
5104        check_raid56_incompat_flag(info, type);
5105        check_raid1c34_incompat_flag(info, type);
5106
5107        return 0;
5108
5109error_del_extent:
5110        write_lock(&em_tree->lock);
5111        remove_extent_mapping(em_tree, em);
5112        write_unlock(&em_tree->lock);
5113
5114        /* One for our allocation */
5115        free_extent_map(em);
5116        /* One for the tree reference */
5117        free_extent_map(em);
5118
5119        return ret;
5120}
5121
5122int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5123{
5124        struct btrfs_fs_info *info = trans->fs_info;
5125        struct btrfs_fs_devices *fs_devices = info->fs_devices;
5126        struct btrfs_device_info *devices_info = NULL;
5127        struct alloc_chunk_ctl ctl;
5128        int ret;
5129
5130        lockdep_assert_held(&info->chunk_mutex);
5131
5132        if (!alloc_profile_is_valid(type, 0)) {
5133                ASSERT(0);
5134                return -EINVAL;
5135        }
5136
5137        if (list_empty(&fs_devices->alloc_list)) {
5138                if (btrfs_test_opt(info, ENOSPC_DEBUG))
5139                        btrfs_debug(info, "%s: no writable device", __func__);
5140                return -ENOSPC;
5141        }
5142
5143        if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5144                btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5145                ASSERT(0);
5146                return -EINVAL;
5147        }
5148
5149        ctl.start = find_next_chunk(info);
5150        ctl.type = type;
5151        init_alloc_chunk_ctl(fs_devices, &ctl);
5152
5153        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5154                               GFP_NOFS);
5155        if (!devices_info)
5156                return -ENOMEM;
5157
5158        ret = gather_device_info(fs_devices, &ctl, devices_info);
5159        if (ret < 0)
5160                goto out;
5161
5162        ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5163        if (ret < 0)
5164                goto out;
5165
5166        ret = create_chunk(trans, &ctl, devices_info);
5167
5168out:
5169        kfree(devices_info);
5170        return ret;
5171}
5172
5173/*
5174 * Chunk allocation falls into two parts. The first part does work
5175 * that makes the new allocated chunk usable, but does not do any operation
5176 * that modifies the chunk tree. The second part does the work that
5177 * requires modifying the chunk tree. This division is important for the
5178 * bootstrap process of adding storage to a seed btrfs.
5179 */
5180int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5181                             u64 chunk_offset, u64 chunk_size)
5182{
5183        struct btrfs_fs_info *fs_info = trans->fs_info;
5184        struct btrfs_root *extent_root = fs_info->extent_root;
5185        struct btrfs_root *chunk_root = fs_info->chunk_root;
5186        struct btrfs_key key;
5187        struct btrfs_device *device;
5188        struct btrfs_chunk *chunk;
5189        struct btrfs_stripe *stripe;
5190        struct extent_map *em;
5191        struct map_lookup *map;
5192        size_t item_size;
5193        u64 dev_offset;
5194        u64 stripe_size;
5195        int i = 0;
5196        int ret = 0;
5197
5198        em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5199        if (IS_ERR(em))
5200                return PTR_ERR(em);
5201
5202        map = em->map_lookup;
5203        item_size = btrfs_chunk_item_size(map->num_stripes);
5204        stripe_size = em->orig_block_len;
5205
5206        chunk = kzalloc(item_size, GFP_NOFS);
5207        if (!chunk) {
5208                ret = -ENOMEM;
5209                goto out;
5210        }
5211
5212        /*
5213         * Take the device list mutex to prevent races with the final phase of
5214         * a device replace operation that replaces the device object associated
5215         * with the map's stripes, because the device object's id can change
5216         * at any time during that final phase of the device replace operation
5217         * (dev-replace.c:btrfs_dev_replace_finishing()).
5218         */
5219        mutex_lock(&fs_info->fs_devices->device_list_mutex);
5220        for (i = 0; i < map->num_stripes; i++) {
5221                device = map->stripes[i].dev;
5222                dev_offset = map->stripes[i].physical;
5223
5224                ret = btrfs_update_device(trans, device);
5225                if (ret)
5226                        break;
5227                ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5228                                             dev_offset, stripe_size);
5229                if (ret)
5230                        break;
5231        }
5232        if (ret) {
5233                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5234                goto out;
5235        }
5236
5237        stripe = &chunk->stripe;
5238        for (i = 0; i < map->num_stripes; i++) {
5239                device = map->stripes[i].dev;
5240                dev_offset = map->stripes[i].physical;
5241
5242                btrfs_set_stack_stripe_devid(stripe, device->devid);
5243                btrfs_set_stack_stripe_offset(stripe, dev_offset);
5244                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5245                stripe++;
5246        }
5247        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5248
5249        btrfs_set_stack_chunk_length(chunk, chunk_size);
5250        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5251        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5252        btrfs_set_stack_chunk_type(chunk, map->type);
5253        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5254        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5255        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5256        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5257        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5258
5259        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5260        key.type = BTRFS_CHUNK_ITEM_KEY;
5261        key.offset = chunk_offset;
5262
5263        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5264        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5265                /*
5266                 * TODO: Cleanup of inserted chunk root in case of
5267                 * failure.
5268                 */
5269                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5270        }
5271
5272out:
5273        kfree(chunk);
5274        free_extent_map(em);
5275        return ret;
5276}
5277
5278static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5279{
5280        struct btrfs_fs_info *fs_info = trans->fs_info;
5281        u64 alloc_profile;
5282        int ret;
5283
5284        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5285        ret = btrfs_alloc_chunk(trans, alloc_profile);
5286        if (ret)
5287                return ret;
5288
5289        alloc_profile = btrfs_system_alloc_profile(fs_info);
5290        ret = btrfs_alloc_chunk(trans, alloc_profile);
5291        return ret;
5292}
5293
5294static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5295{
5296        const int index = btrfs_bg_flags_to_raid_index(map->type);
5297
5298        return btrfs_raid_array[index].tolerated_failures;
5299}
5300
5301int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5302{
5303        struct extent_map *em;
5304        struct map_lookup *map;
5305        int readonly = 0;
5306        int miss_ndevs = 0;
5307        int i;
5308
5309        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5310        if (IS_ERR(em))
5311                return 1;
5312
5313        map = em->map_lookup;
5314        for (i = 0; i < map->num_stripes; i++) {
5315                if (test_bit(BTRFS_DEV_STATE_MISSING,
5316                                        &map->stripes[i].dev->dev_state)) {
5317                        miss_ndevs++;
5318                        continue;
5319                }
5320                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5321                                        &map->stripes[i].dev->dev_state)) {
5322                        readonly = 1;
5323                        goto end;
5324                }
5325        }
5326
5327        /*
5328         * If the number of missing devices is larger than max errors,
5329         * we can not write the data into that chunk successfully, so
5330         * set it readonly.
5331         */
5332        if (miss_ndevs > btrfs_chunk_max_errors(map))
5333                readonly = 1;
5334end:
5335        free_extent_map(em);
5336        return readonly;
5337}
5338
5339void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5340{
5341        struct extent_map *em;
5342
5343        while (1) {
5344                write_lock(&tree->lock);
5345                em = lookup_extent_mapping(tree, 0, (u64)-1);
5346                if (em)
5347                        remove_extent_mapping(tree, em);
5348                write_unlock(&tree->lock);
5349                if (!em)
5350                        break;
5351                /* once for us */
5352                free_extent_map(em);
5353                /* once for the tree */
5354                free_extent_map(em);
5355        }
5356}
5357
5358int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5359{
5360        struct extent_map *em;
5361        struct map_lookup *map;
5362        int ret;
5363
5364        em = btrfs_get_chunk_map(fs_info, logical, len);
5365        if (IS_ERR(em))
5366                /*
5367                 * We could return errors for these cases, but that could get
5368                 * ugly and we'd probably do the same thing which is just not do
5369                 * anything else and exit, so return 1 so the callers don't try
5370                 * to use other copies.
5371                 */
5372                return 1;
5373
5374        map = em->map_lookup;
5375        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5376                ret = map->num_stripes;
5377        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5378                ret = map->sub_stripes;
5379        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5380                ret = 2;
5381        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5382                /*
5383                 * There could be two corrupted data stripes, we need
5384                 * to loop retry in order to rebuild the correct data.
5385                 *
5386                 * Fail a stripe at a time on every retry except the
5387                 * stripe under reconstruction.
5388                 */
5389                ret = map->num_stripes;
5390        else
5391                ret = 1;
5392        free_extent_map(em);
5393
5394        down_read(&fs_info->dev_replace.rwsem);
5395        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5396            fs_info->dev_replace.tgtdev)
5397                ret++;
5398        up_read(&fs_info->dev_replace.rwsem);
5399
5400        return ret;
5401}
5402
5403unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5404                                    u64 logical)
5405{
5406        struct extent_map *em;
5407        struct map_lookup *map;
5408        unsigned long len = fs_info->sectorsize;
5409
5410        em = btrfs_get_chunk_map(fs_info, logical, len);
5411
5412        if (!WARN_ON(IS_ERR(em))) {
5413                map = em->map_lookup;
5414                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5415                        len = map->stripe_len * nr_data_stripes(map);
5416                free_extent_map(em);
5417        }
5418        return len;
5419}
5420
5421int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5422{
5423        struct extent_map *em;
5424        struct map_lookup *map;
5425        int ret = 0;
5426
5427        em = btrfs_get_chunk_map(fs_info, logical, len);
5428
5429        if(!WARN_ON(IS_ERR(em))) {
5430                map = em->map_lookup;
5431                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5432                        ret = 1;
5433                free_extent_map(em);
5434        }
5435        return ret;
5436}
5437
5438static int find_live_mirror(struct btrfs_fs_info *fs_info,
5439                            struct map_lookup *map, int first,
5440                            int dev_replace_is_ongoing)
5441{
5442        int i;
5443        int num_stripes;
5444        int preferred_mirror;
5445        int tolerance;
5446        struct btrfs_device *srcdev;
5447
5448        ASSERT((map->type &
5449                 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5450
5451        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5452                num_stripes = map->sub_stripes;
5453        else
5454                num_stripes = map->num_stripes;
5455
5456        preferred_mirror = first + current->pid % num_stripes;
5457
5458        if (dev_replace_is_ongoing &&
5459            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5460             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5461                srcdev = fs_info->dev_replace.srcdev;
5462        else
5463                srcdev = NULL;
5464
5465        /*
5466         * try to avoid the drive that is the source drive for a
5467         * dev-replace procedure, only choose it if no other non-missing
5468         * mirror is available
5469         */
5470        for (tolerance = 0; tolerance < 2; tolerance++) {
5471                if (map->stripes[preferred_mirror].dev->bdev &&
5472                    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5473                        return preferred_mirror;
5474                for (i = first; i < first + num_stripes; i++) {
5475                        if (map->stripes[i].dev->bdev &&
5476                            (tolerance || map->stripes[i].dev != srcdev))
5477                                return i;
5478                }
5479        }
5480
5481        /* we couldn't find one that doesn't fail.  Just return something
5482         * and the io error handling code will clean up eventually
5483         */
5484        return preferred_mirror;
5485}
5486
5487/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5488static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5489{
5490        int i;
5491        int again = 1;
5492
5493        while (again) {
5494                again = 0;
5495                for (i = 0; i < num_stripes - 1; i++) {
5496                        /* Swap if parity is on a smaller index */
5497                        if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5498                                swap(bbio->stripes[i], bbio->stripes[i + 1]);
5499                                swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5500                                again = 1;
5501                        }
5502                }
5503        }
5504}
5505
5506static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5507{
5508        struct btrfs_bio *bbio = kzalloc(
5509                 /* the size of the btrfs_bio */
5510                sizeof(struct btrfs_bio) +
5511                /* plus the variable array for the stripes */
5512                sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5513                /* plus the variable array for the tgt dev */
5514                sizeof(int) * (real_stripes) +
5515                /*
5516                 * plus the raid_map, which includes both the tgt dev
5517                 * and the stripes
5518                 */
5519                sizeof(u64) * (total_stripes),
5520                GFP_NOFS|__GFP_NOFAIL);
5521
5522        atomic_set(&bbio->error, 0);
5523        refcount_set(&bbio->refs, 1);
5524
5525        return bbio;
5526}
5527
5528void btrfs_get_bbio(struct btrfs_bio *bbio)
5529{
5530        WARN_ON(!refcount_read(&bbio->refs));
5531        refcount_inc(&bbio->refs);
5532}
5533
5534void btrfs_put_bbio(struct btrfs_bio *bbio)
5535{
5536        if (!bbio)
5537                return;
5538        if (refcount_dec_and_test(&bbio->refs))
5539                kfree(bbio);
5540}
5541
5542/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5543/*
5544 * Please note that, discard won't be sent to target device of device
5545 * replace.
5546 */
5547static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5548                                         u64 logical, u64 *length_ret,
5549                                         struct btrfs_bio **bbio_ret)
5550{
5551        struct extent_map *em;
5552        struct map_lookup *map;
5553        struct btrfs_bio *bbio;
5554        u64 length = *length_ret;
5555        u64 offset;
5556        u64 stripe_nr;
5557        u64 stripe_nr_end;
5558        u64 stripe_end_offset;
5559        u64 stripe_cnt;
5560        u64 stripe_len;
5561        u64 stripe_offset;
5562        u64 num_stripes;
5563        u32 stripe_index;
5564        u32 factor = 0;
5565        u32 sub_stripes = 0;
5566        u64 stripes_per_dev = 0;
5567        u32 remaining_stripes = 0;
5568        u32 last_stripe = 0;
5569        int ret = 0;
5570        int i;
5571
5572        /* discard always return a bbio */
5573        ASSERT(bbio_ret);
5574
5575        em = btrfs_get_chunk_map(fs_info, logical, length);
5576        if (IS_ERR(em))
5577                return PTR_ERR(em);
5578
5579        map = em->map_lookup;
5580        /* we don't discard raid56 yet */
5581        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5582                ret = -EOPNOTSUPP;
5583                goto out;
5584        }
5585
5586        offset = logical - em->start;
5587        length = min_t(u64, em->start + em->len - logical, length);
5588        *length_ret = length;
5589
5590        stripe_len = map->stripe_len;
5591        /*
5592         * stripe_nr counts the total number of stripes we have to stride
5593         * to get to this block
5594         */
5595        stripe_nr = div64_u64(offset, stripe_len);
5596
5597        /* stripe_offset is the offset of this block in its stripe */
5598        stripe_offset = offset - stripe_nr * stripe_len;
5599
5600        stripe_nr_end = round_up(offset + length, map->stripe_len);
5601        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5602        stripe_cnt = stripe_nr_end - stripe_nr;
5603        stripe_end_offset = stripe_nr_end * map->stripe_len -
5604                            (offset + length);
5605        /*
5606         * after this, stripe_nr is the number of stripes on this
5607         * device we have to walk to find the data, and stripe_index is
5608         * the number of our device in the stripe array
5609         */
5610        num_stripes = 1;
5611        stripe_index = 0;
5612        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5613                         BTRFS_BLOCK_GROUP_RAID10)) {
5614                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5615                        sub_stripes = 1;
5616                else
5617                        sub_stripes = map->sub_stripes;
5618
5619                factor = map->num_stripes / sub_stripes;
5620                num_stripes = min_t(u64, map->num_stripes,
5621                                    sub_stripes * stripe_cnt);
5622                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5623                stripe_index *= sub_stripes;
5624                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5625                                              &remaining_stripes);
5626                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5627                last_stripe *= sub_stripes;
5628        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5629                                BTRFS_BLOCK_GROUP_DUP)) {
5630                num_stripes = map->num_stripes;
5631        } else {
5632                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5633                                        &stripe_index);
5634        }
5635
5636        bbio = alloc_btrfs_bio(num_stripes, 0);
5637        if (!bbio) {
5638                ret = -ENOMEM;
5639                goto out;
5640        }
5641
5642        for (i = 0; i < num_stripes; i++) {
5643                bbio->stripes[i].physical =
5644                        map->stripes[stripe_index].physical +
5645                        stripe_offset + stripe_nr * map->stripe_len;
5646                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5647
5648                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5649                                 BTRFS_BLOCK_GROUP_RAID10)) {
5650                        bbio->stripes[i].length = stripes_per_dev *
5651                                map->stripe_len;
5652
5653                        if (i / sub_stripes < remaining_stripes)
5654                                bbio->stripes[i].length +=
5655                                        map->stripe_len;
5656
5657                        /*
5658                         * Special for the first stripe and
5659                         * the last stripe:
5660                         *
5661                         * |-------|...|-------|
5662                         *     |----------|
5663                         *    off     end_off
5664                         */
5665                        if (i < sub_stripes)
5666                                bbio->stripes[i].length -=
5667                                        stripe_offset;
5668
5669                        if (stripe_index >= last_stripe &&
5670                            stripe_index <= (last_stripe +
5671                                             sub_stripes - 1))
5672                                bbio->stripes[i].length -=
5673                                        stripe_end_offset;
5674
5675                        if (i == sub_stripes - 1)
5676                                stripe_offset = 0;
5677                } else {
5678                        bbio->stripes[i].length = length;
5679                }
5680
5681                stripe_index++;
5682                if (stripe_index == map->num_stripes) {
5683                        stripe_index = 0;
5684                        stripe_nr++;
5685                }
5686        }
5687
5688        *bbio_ret = bbio;
5689        bbio->map_type = map->type;
5690        bbio->num_stripes = num_stripes;
5691out:
5692        free_extent_map(em);
5693        return ret;
5694}
5695
5696/*
5697 * In dev-replace case, for repair case (that's the only case where the mirror
5698 * is selected explicitly when calling btrfs_map_block), blocks left of the
5699 * left cursor can also be read from the target drive.
5700 *
5701 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5702 * array of stripes.
5703 * For READ, it also needs to be supported using the same mirror number.
5704 *
5705 * If the requested block is not left of the left cursor, EIO is returned. This
5706 * can happen because btrfs_num_copies() returns one more in the dev-replace
5707 * case.
5708 */
5709static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5710                                         u64 logical, u64 length,
5711                                         u64 srcdev_devid, int *mirror_num,
5712                                         u64 *physical)
5713{
5714        struct btrfs_bio *bbio = NULL;
5715        int num_stripes;
5716        int index_srcdev = 0;
5717        int found = 0;
5718        u64 physical_of_found = 0;
5719        int i;
5720        int ret = 0;
5721
5722        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5723                                logical, &length, &bbio, 0, 0);
5724        if (ret) {
5725                ASSERT(bbio == NULL);
5726                return ret;
5727        }
5728
5729        num_stripes = bbio->num_stripes;
5730        if (*mirror_num > num_stripes) {
5731                /*
5732                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5733                 * that means that the requested area is not left of the left
5734                 * cursor
5735                 */
5736                btrfs_put_bbio(bbio);
5737                return -EIO;
5738        }
5739
5740        /*
5741         * process the rest of the function using the mirror_num of the source
5742         * drive. Therefore look it up first.  At the end, patch the device
5743         * pointer to the one of the target drive.
5744         */
5745        for (i = 0; i < num_stripes; i++) {
5746                if (bbio->stripes[i].dev->devid != srcdev_devid)
5747                        continue;
5748
5749                /*
5750                 * In case of DUP, in order to keep it simple, only add the
5751                 * mirror with the lowest physical address
5752                 */
5753                if (found &&
5754                    physical_of_found <= bbio->stripes[i].physical)
5755                        continue;
5756
5757                index_srcdev = i;
5758                found = 1;
5759                physical_of_found = bbio->stripes[i].physical;
5760        }
5761
5762        btrfs_put_bbio(bbio);
5763
5764        ASSERT(found);
5765        if (!found)
5766                return -EIO;
5767
5768        *mirror_num = index_srcdev + 1;
5769        *physical = physical_of_found;
5770        return ret;
5771}
5772
5773static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5774                                      struct btrfs_bio **bbio_ret,
5775                                      struct btrfs_dev_replace *dev_replace,
5776                                      int *num_stripes_ret, int *max_errors_ret)
5777{
5778        struct btrfs_bio *bbio = *bbio_ret;
5779        u64 srcdev_devid = dev_replace->srcdev->devid;
5780        int tgtdev_indexes = 0;
5781        int num_stripes = *num_stripes_ret;
5782        int max_errors = *max_errors_ret;
5783        int i;
5784
5785        if (op == BTRFS_MAP_WRITE) {
5786                int index_where_to_add;
5787
5788                /*
5789                 * duplicate the write operations while the dev replace
5790                 * procedure is running. Since the copying of the old disk to
5791                 * the new disk takes place at run time while the filesystem is
5792                 * mounted writable, the regular write operations to the old
5793                 * disk have to be duplicated to go to the new disk as well.
5794                 *
5795                 * Note that device->missing is handled by the caller, and that
5796                 * the write to the old disk is already set up in the stripes
5797                 * array.
5798                 */
5799                index_where_to_add = num_stripes;
5800                for (i = 0; i < num_stripes; i++) {
5801                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5802                                /* write to new disk, too */
5803                                struct btrfs_bio_stripe *new =
5804                                        bbio->stripes + index_where_to_add;
5805                                struct btrfs_bio_stripe *old =
5806                                        bbio->stripes + i;
5807
5808                                new->physical = old->physical;
5809                                new->length = old->length;
5810                                new->dev = dev_replace->tgtdev;
5811                                bbio->tgtdev_map[i] = index_where_to_add;
5812                                index_where_to_add++;
5813                                max_errors++;
5814                                tgtdev_indexes++;
5815                        }
5816                }
5817                num_stripes = index_where_to_add;
5818        } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5819                int index_srcdev = 0;
5820                int found = 0;
5821                u64 physical_of_found = 0;
5822
5823                /*
5824                 * During the dev-replace procedure, the target drive can also
5825                 * be used to read data in case it is needed to repair a corrupt
5826                 * block elsewhere. This is possible if the requested area is
5827                 * left of the left cursor. In this area, the target drive is a
5828                 * full copy of the source drive.
5829                 */
5830                for (i = 0; i < num_stripes; i++) {
5831                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5832                                /*
5833                                 * In case of DUP, in order to keep it simple,
5834                                 * only add the mirror with the lowest physical
5835                                 * address
5836                                 */
5837                                if (found &&
5838                                    physical_of_found <=
5839                                     bbio->stripes[i].physical)
5840                                        continue;
5841                                index_srcdev = i;
5842                                found = 1;
5843                                physical_of_found = bbio->stripes[i].physical;
5844                        }
5845                }
5846                if (found) {
5847                        struct btrfs_bio_stripe *tgtdev_stripe =
5848                                bbio->stripes + num_stripes;
5849
5850                        tgtdev_stripe->physical = physical_of_found;
5851                        tgtdev_stripe->length =
5852                                bbio->stripes[index_srcdev].length;
5853                        tgtdev_stripe->dev = dev_replace->tgtdev;
5854                        bbio->tgtdev_map[index_srcdev] = num_stripes;
5855
5856                        tgtdev_indexes++;
5857                        num_stripes++;
5858                }
5859        }
5860
5861        *num_stripes_ret = num_stripes;
5862        *max_errors_ret = max_errors;
5863        bbio->num_tgtdevs = tgtdev_indexes;
5864        *bbio_ret = bbio;
5865}
5866
5867static bool need_full_stripe(enum btrfs_map_op op)
5868{
5869        return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5870}
5871
5872/*
5873 * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
5874 *                     tuple. This information is used to calculate how big a
5875 *                     particular bio can get before it straddles a stripe.
5876 *
5877 * @fs_info - the filesystem
5878 * @logical - address that we want to figure out the geometry of
5879 * @len     - the length of IO we are going to perform, starting at @logical
5880 * @op      - type of operation - write or read
5881 * @io_geom - pointer used to return values
5882 *
5883 * Returns < 0 in case a chunk for the given logical address cannot be found,
5884 * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5885 */
5886int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5887                        u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5888{
5889        struct extent_map *em;
5890        struct map_lookup *map;
5891        u64 offset;
5892        u64 stripe_offset;
5893        u64 stripe_nr;
5894        u64 stripe_len;
5895        u64 raid56_full_stripe_start = (u64)-1;
5896        int data_stripes;
5897        int ret = 0;
5898
5899        ASSERT(op != BTRFS_MAP_DISCARD);
5900
5901        em = btrfs_get_chunk_map(fs_info, logical, len);
5902        if (IS_ERR(em))
5903                return PTR_ERR(em);
5904
5905        map = em->map_lookup;
5906        /* Offset of this logical address in the chunk */
5907        offset = logical - em->start;
5908        /* Len of a stripe in a chunk */
5909        stripe_len = map->stripe_len;
5910        /* Stripe wher this block falls in */
5911        stripe_nr = div64_u64(offset, stripe_len);
5912        /* Offset of stripe in the chunk */
5913        stripe_offset = stripe_nr * stripe_len;
5914        if (offset < stripe_offset) {
5915                btrfs_crit(fs_info,
5916"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5917                        stripe_offset, offset, em->start, logical, stripe_len);
5918                ret = -EINVAL;
5919                goto out;
5920        }
5921
5922        /* stripe_offset is the offset of this block in its stripe */
5923        stripe_offset = offset - stripe_offset;
5924        data_stripes = nr_data_stripes(map);
5925
5926        if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5927                u64 max_len = stripe_len - stripe_offset;
5928
5929                /*
5930                 * In case of raid56, we need to know the stripe aligned start
5931                 */
5932                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5933                        unsigned long full_stripe_len = stripe_len * data_stripes;
5934                        raid56_full_stripe_start = offset;
5935
5936                        /*
5937                         * Allow a write of a full stripe, but make sure we
5938                         * don't allow straddling of stripes
5939                         */
5940                        raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5941                                        full_stripe_len);
5942                        raid56_full_stripe_start *= full_stripe_len;
5943
5944                        /*
5945                         * For writes to RAID[56], allow a full stripeset across
5946                         * all disks. For other RAID types and for RAID[56]
5947                         * reads, just allow a single stripe (on a single disk).
5948                         */
5949                        if (op == BTRFS_MAP_WRITE) {
5950                                max_len = stripe_len * data_stripes -
5951                                          (offset - raid56_full_stripe_start);
5952                        }
5953                }
5954                len = min_t(u64, em->len - offset, max_len);
5955        } else {
5956                len = em->len - offset;
5957        }
5958
5959        io_geom->len = len;
5960        io_geom->offset = offset;
5961        io_geom->stripe_len = stripe_len;
5962        io_geom->stripe_nr = stripe_nr;
5963        io_geom->stripe_offset = stripe_offset;
5964        io_geom->raid56_stripe_offset = raid56_full_stripe_start;
5965
5966out:
5967        /* once for us */
5968        free_extent_map(em);
5969        return ret;
5970}
5971
5972static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5973                             enum btrfs_map_op op,
5974                             u64 logical, u64 *length,
5975                             struct btrfs_bio **bbio_ret,
5976                             int mirror_num, int need_raid_map)
5977{
5978        struct extent_map *em;
5979        struct map_lookup *map;
5980        u64 stripe_offset;
5981        u64 stripe_nr;
5982        u64 stripe_len;
5983        u32 stripe_index;
5984        int data_stripes;
5985        int i;
5986        int ret = 0;
5987        int num_stripes;
5988        int max_errors = 0;
5989        int tgtdev_indexes = 0;
5990        struct btrfs_bio *bbio = NULL;
5991        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5992        int dev_replace_is_ongoing = 0;
5993        int num_alloc_stripes;
5994        int patch_the_first_stripe_for_dev_replace = 0;
5995        u64 physical_to_patch_in_first_stripe = 0;
5996        u64 raid56_full_stripe_start = (u64)-1;
5997        struct btrfs_io_geometry geom;
5998
5999        ASSERT(bbio_ret);
6000        ASSERT(op != BTRFS_MAP_DISCARD);
6001
6002        ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6003        if (ret < 0)
6004                return ret;
6005
6006        em = btrfs_get_chunk_map(fs_info, logical, *length);
6007        ASSERT(!IS_ERR(em));
6008        map = em->map_lookup;
6009
6010        *length = geom.len;
6011        stripe_len = geom.stripe_len;
6012        stripe_nr = geom.stripe_nr;
6013        stripe_offset = geom.stripe_offset;
6014        raid56_full_stripe_start = geom.raid56_stripe_offset;
6015        data_stripes = nr_data_stripes(map);
6016
6017        down_read(&dev_replace->rwsem);
6018        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6019        /*
6020         * Hold the semaphore for read during the whole operation, write is
6021         * requested at commit time but must wait.
6022         */
6023        if (!dev_replace_is_ongoing)
6024                up_read(&dev_replace->rwsem);
6025
6026        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6027            !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6028                ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6029                                                    dev_replace->srcdev->devid,
6030                                                    &mirror_num,
6031                                            &physical_to_patch_in_first_stripe);
6032                if (ret)
6033                        goto out;
6034                else
6035                        patch_the_first_stripe_for_dev_replace = 1;
6036        } else if (mirror_num > map->num_stripes) {
6037                mirror_num = 0;
6038        }
6039
6040        num_stripes = 1;
6041        stripe_index = 0;
6042        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6043                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6044                                &stripe_index);
6045                if (!need_full_stripe(op))
6046                        mirror_num = 1;
6047        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6048                if (need_full_stripe(op))
6049                        num_stripes = map->num_stripes;
6050                else if (mirror_num)
6051                        stripe_index = mirror_num - 1;
6052                else {
6053                        stripe_index = find_live_mirror(fs_info, map, 0,
6054                                            dev_replace_is_ongoing);
6055                        mirror_num = stripe_index + 1;
6056                }
6057
6058        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6059                if (need_full_stripe(op)) {
6060                        num_stripes = map->num_stripes;
6061                } else if (mirror_num) {
6062                        stripe_index = mirror_num - 1;
6063                } else {
6064                        mirror_num = 1;
6065                }
6066
6067        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6068                u32 factor = map->num_stripes / map->sub_stripes;
6069
6070                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6071                stripe_index *= map->sub_stripes;
6072
6073                if (need_full_stripe(op))
6074                        num_stripes = map->sub_stripes;
6075                else if (mirror_num)
6076                        stripe_index += mirror_num - 1;
6077                else {
6078                        int old_stripe_index = stripe_index;
6079                        stripe_index = find_live_mirror(fs_info, map,
6080                                              stripe_index,
6081                                              dev_replace_is_ongoing);
6082                        mirror_num = stripe_index - old_stripe_index + 1;
6083                }
6084
6085        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6086                if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6087                        /* push stripe_nr back to the start of the full stripe */
6088                        stripe_nr = div64_u64(raid56_full_stripe_start,
6089                                        stripe_len * data_stripes);
6090
6091                        /* RAID[56] write or recovery. Return all stripes */
6092                        num_stripes = map->num_stripes;
6093                        max_errors = nr_parity_stripes(map);
6094
6095                        *length = map->stripe_len;
6096                        stripe_index = 0;
6097                        stripe_offset = 0;
6098                } else {
6099                        /*
6100                         * Mirror #0 or #1 means the original data block.
6101                         * Mirror #2 is RAID5 parity block.
6102                         * Mirror #3 is RAID6 Q block.
6103                         */
6104                        stripe_nr = div_u64_rem(stripe_nr,
6105                                        data_stripes, &stripe_index);
6106                        if (mirror_num > 1)
6107                                stripe_index = data_stripes + mirror_num - 2;
6108
6109                        /* We distribute the parity blocks across stripes */
6110                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6111                                        &stripe_index);
6112                        if (!need_full_stripe(op) && mirror_num <= 1)
6113                                mirror_num = 1;
6114                }
6115        } else {
6116                /*
6117                 * after this, stripe_nr is the number of stripes on this
6118                 * device we have to walk to find the data, and stripe_index is
6119                 * the number of our device in the stripe array
6120                 */
6121                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6122                                &stripe_index);
6123                mirror_num = stripe_index + 1;
6124        }
6125        if (stripe_index >= map->num_stripes) {
6126                btrfs_crit(fs_info,
6127                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6128                           stripe_index, map->num_stripes);
6129                ret = -EINVAL;
6130                goto out;
6131        }
6132
6133        num_alloc_stripes = num_stripes;
6134        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6135                if (op == BTRFS_MAP_WRITE)
6136                        num_alloc_stripes <<= 1;
6137                if (op == BTRFS_MAP_GET_READ_MIRRORS)
6138                        num_alloc_stripes++;
6139                tgtdev_indexes = num_stripes;
6140        }
6141
6142        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6143        if (!bbio) {
6144                ret = -ENOMEM;
6145                goto out;
6146        }
6147        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
6148                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6149
6150        /* build raid_map */
6151        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6152            (need_full_stripe(op) || mirror_num > 1)) {
6153                u64 tmp;
6154                unsigned rot;
6155
6156                bbio->raid_map = (u64 *)((void *)bbio->stripes +
6157                                 sizeof(struct btrfs_bio_stripe) *
6158                                 num_alloc_stripes +
6159                                 sizeof(int) * tgtdev_indexes);
6160
6161                /* Work out the disk rotation on this stripe-set */
6162                div_u64_rem(stripe_nr, num_stripes, &rot);
6163
6164                /* Fill in the logical address of each stripe */
6165                tmp = stripe_nr * data_stripes;
6166                for (i = 0; i < data_stripes; i++)
6167                        bbio->raid_map[(i+rot) % num_stripes] =
6168                                em->start + (tmp + i) * map->stripe_len;
6169
6170                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6171                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6172                        bbio->raid_map[(i+rot+1) % num_stripes] =
6173                                RAID6_Q_STRIPE;
6174        }
6175
6176
6177        for (i = 0; i < num_stripes; i++) {
6178                bbio->stripes[i].physical =
6179                        map->stripes[stripe_index].physical +
6180                        stripe_offset +
6181                        stripe_nr * map->stripe_len;
6182                bbio->stripes[i].dev =
6183                        map->stripes[stripe_index].dev;
6184                stripe_index++;
6185        }
6186
6187        if (need_full_stripe(op))
6188                max_errors = btrfs_chunk_max_errors(map);
6189
6190        if (bbio->raid_map)
6191                sort_parity_stripes(bbio, num_stripes);
6192
6193        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6194            need_full_stripe(op)) {
6195                handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6196                                          &max_errors);
6197        }
6198
6199        *bbio_ret = bbio;
6200        bbio->map_type = map->type;
6201        bbio->num_stripes = num_stripes;
6202        bbio->max_errors = max_errors;
6203        bbio->mirror_num = mirror_num;
6204
6205        /*
6206         * this is the case that REQ_READ && dev_replace_is_ongoing &&
6207         * mirror_num == num_stripes + 1 && dev_replace target drive is
6208         * available as a mirror
6209         */
6210        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6211                WARN_ON(num_stripes > 1);
6212                bbio->stripes[0].dev = dev_replace->tgtdev;
6213                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6214                bbio->mirror_num = map->num_stripes + 1;
6215        }
6216out:
6217        if (dev_replace_is_ongoing) {
6218                lockdep_assert_held(&dev_replace->rwsem);
6219                /* Unlock and let waiting writers proceed */
6220                up_read(&dev_replace->rwsem);
6221        }
6222        free_extent_map(em);
6223        return ret;
6224}
6225
6226int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6227                      u64 logical, u64 *length,
6228                      struct btrfs_bio **bbio_ret, int mirror_num)
6229{
6230        if (op == BTRFS_MAP_DISCARD)
6231                return __btrfs_map_block_for_discard(fs_info, logical,
6232                                                     length, bbio_ret);
6233
6234        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6235                                 mirror_num, 0);
6236}
6237
6238/* For Scrub/replace */
6239int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6240                     u64 logical, u64 *length,
6241                     struct btrfs_bio **bbio_ret)
6242{
6243        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6244}
6245
6246static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6247{
6248        bio->bi_private = bbio->private;
6249        bio->bi_end_io = bbio->end_io;
6250        bio_endio(bio);
6251
6252        btrfs_put_bbio(bbio);
6253}
6254
6255static void btrfs_end_bio(struct bio *bio)
6256{
6257        struct btrfs_bio *bbio = bio->bi_private;
6258        int is_orig_bio = 0;
6259
6260        if (bio->bi_status) {
6261                atomic_inc(&bbio->error);
6262                if (bio->bi_status == BLK_STS_IOERR ||
6263                    bio->bi_status == BLK_STS_TARGET) {
6264                        unsigned int stripe_index =
6265                                btrfs_io_bio(bio)->stripe_index;
6266                        struct btrfs_device *dev;
6267
6268                        BUG_ON(stripe_index >= bbio->num_stripes);
6269                        dev = bbio->stripes[stripe_index].dev;
6270                        if (dev->bdev) {
6271                                if (bio_op(bio) == REQ_OP_WRITE)
6272                                        btrfs_dev_stat_inc_and_print(dev,
6273                                                BTRFS_DEV_STAT_WRITE_ERRS);
6274                                else if (!(bio->bi_opf & REQ_RAHEAD))
6275                                        btrfs_dev_stat_inc_and_print(dev,
6276                                                BTRFS_DEV_STAT_READ_ERRS);
6277                                if (bio->bi_opf & REQ_PREFLUSH)
6278                                        btrfs_dev_stat_inc_and_print(dev,
6279                                                BTRFS_DEV_STAT_FLUSH_ERRS);
6280                        }
6281                }
6282        }
6283
6284        if (bio == bbio->orig_bio)
6285                is_orig_bio = 1;
6286
6287        btrfs_bio_counter_dec(bbio->fs_info);
6288
6289        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6290                if (!is_orig_bio) {
6291                        bio_put(bio);
6292                        bio = bbio->orig_bio;
6293                }
6294
6295                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6296                /* only send an error to the higher layers if it is
6297                 * beyond the tolerance of the btrfs bio
6298                 */
6299                if (atomic_read(&bbio->error) > bbio->max_errors) {
6300                        bio->bi_status = BLK_STS_IOERR;
6301                } else {
6302                        /*
6303                         * this bio is actually up to date, we didn't
6304                         * go over the max number of errors
6305                         */
6306                        bio->bi_status = BLK_STS_OK;
6307                }
6308
6309                btrfs_end_bbio(bbio, bio);
6310        } else if (!is_orig_bio) {
6311                bio_put(bio);
6312        }
6313}
6314
6315static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6316                              u64 physical, int dev_nr)
6317{
6318        struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
6319        struct btrfs_fs_info *fs_info = bbio->fs_info;
6320
6321        bio->bi_private = bbio;
6322        btrfs_io_bio(bio)->stripe_index = dev_nr;
6323        bio->bi_end_io = btrfs_end_bio;
6324        bio->bi_iter.bi_sector = physical >> 9;
6325        btrfs_debug_in_rcu(fs_info,
6326        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6327                bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6328                (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6329                dev->devid, bio->bi_iter.bi_size);
6330        bio_set_dev(bio, dev->bdev);
6331
6332        btrfs_bio_counter_inc_noblocked(fs_info);
6333
6334        btrfsic_submit_bio(bio);
6335}
6336
6337static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6338{
6339        atomic_inc(&bbio->error);
6340        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6341                /* Should be the original bio. */
6342                WARN_ON(bio != bbio->orig_bio);
6343
6344                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6345                bio->bi_iter.bi_sector = logical >> 9;
6346                if (atomic_read(&bbio->error) > bbio->max_errors)
6347                        bio->bi_status = BLK_STS_IOERR;
6348                else
6349                        bio->bi_status = BLK_STS_OK;
6350                btrfs_end_bbio(bbio, bio);
6351        }
6352}
6353
6354blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6355                           int mirror_num)
6356{
6357        struct btrfs_device *dev;
6358        struct bio *first_bio = bio;
6359        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6360        u64 length = 0;
6361        u64 map_length;
6362        int ret;
6363        int dev_nr;
6364        int total_devs;
6365        struct btrfs_bio *bbio = NULL;
6366
6367        length = bio->bi_iter.bi_size;
6368        map_length = length;
6369
6370        btrfs_bio_counter_inc_blocked(fs_info);
6371        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6372                                &map_length, &bbio, mirror_num, 1);
6373        if (ret) {
6374                btrfs_bio_counter_dec(fs_info);
6375                return errno_to_blk_status(ret);
6376        }
6377
6378        total_devs = bbio->num_stripes;
6379        bbio->orig_bio = first_bio;
6380        bbio->private = first_bio->bi_private;
6381        bbio->end_io = first_bio->bi_end_io;
6382        bbio->fs_info = fs_info;
6383        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6384
6385        if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6386            ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6387                /* In this case, map_length has been set to the length of
6388                   a single stripe; not the whole write */
6389                if (bio_op(bio) == REQ_OP_WRITE) {
6390                        ret = raid56_parity_write(fs_info, bio, bbio,
6391                                                  map_length);
6392                } else {
6393                        ret = raid56_parity_recover(fs_info, bio, bbio,
6394                                                    map_length, mirror_num, 1);
6395                }
6396
6397                btrfs_bio_counter_dec(fs_info);
6398                return errno_to_blk_status(ret);
6399        }
6400
6401        if (map_length < length) {
6402                btrfs_crit(fs_info,
6403                           "mapping failed logical %llu bio len %llu len %llu",
6404                           logical, length, map_length);
6405                BUG();
6406        }
6407
6408        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6409                dev = bbio->stripes[dev_nr].dev;
6410                if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6411                                                   &dev->dev_state) ||
6412                    (bio_op(first_bio) == REQ_OP_WRITE &&
6413                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6414                        bbio_error(bbio, first_bio, logical);
6415                        continue;
6416                }
6417
6418                if (dev_nr < total_devs - 1)
6419                        bio = btrfs_bio_clone(first_bio);
6420                else
6421                        bio = first_bio;
6422
6423                submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6424                                  dev_nr);
6425        }
6426        btrfs_bio_counter_dec(fs_info);
6427        return BLK_STS_OK;
6428}
6429
6430/*
6431 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6432 * return NULL.
6433 *
6434 * If devid and uuid are both specified, the match must be exact, otherwise
6435 * only devid is used.
6436 *
6437 * If @seed is true, traverse through the seed devices.
6438 */
6439struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6440                                       u64 devid, u8 *uuid, u8 *fsid,
6441                                       bool seed)
6442{
6443        struct btrfs_device *device;
6444
6445        while (fs_devices) {
6446                if (!fsid ||
6447                    !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6448                        list_for_each_entry(device, &fs_devices->devices,
6449                                            dev_list) {
6450                                if (device->devid == devid &&
6451                                    (!uuid || memcmp(device->uuid, uuid,
6452                                                     BTRFS_UUID_SIZE) == 0))
6453                                        return device;
6454                        }
6455                }
6456                if (seed)
6457                        fs_devices = fs_devices->seed;
6458                else
6459                        return NULL;
6460        }
6461        return NULL;
6462}
6463
6464static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6465                                            u64 devid, u8 *dev_uuid)
6466{
6467        struct btrfs_device *device;
6468
6469        device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6470        if (IS_ERR(device))
6471                return device;
6472
6473        list_add(&device->dev_list, &fs_devices->devices);
6474        device->fs_devices = fs_devices;
6475        fs_devices->num_devices++;
6476
6477        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6478        fs_devices->missing_devices++;
6479
6480        return device;
6481}
6482
6483/**
6484 * btrfs_alloc_device - allocate struct btrfs_device
6485 * @fs_info:    used only for generating a new devid, can be NULL if
6486 *              devid is provided (i.e. @devid != NULL).
6487 * @devid:      a pointer to devid for this device.  If NULL a new devid
6488 *              is generated.
6489 * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6490 *              is generated.
6491 *
6492 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6493 * on error.  Returned struct is not linked onto any lists and must be
6494 * destroyed with btrfs_free_device.
6495 */
6496struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6497                                        const u64 *devid,
6498                                        const u8 *uuid)
6499{
6500        struct btrfs_device *dev;
6501        u64 tmp;
6502
6503        if (WARN_ON(!devid && !fs_info))
6504                return ERR_PTR(-EINVAL);
6505
6506        dev = __alloc_device();
6507        if (IS_ERR(dev))
6508                return dev;
6509
6510        if (devid)
6511                tmp = *devid;
6512        else {
6513                int ret;
6514
6515                ret = find_next_devid(fs_info, &tmp);
6516                if (ret) {
6517                        btrfs_free_device(dev);
6518                        return ERR_PTR(ret);
6519                }
6520        }
6521        dev->devid = tmp;
6522
6523        if (uuid)
6524                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6525        else
6526                generate_random_uuid(dev->uuid);
6527
6528        return dev;
6529}
6530
6531static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6532                                        u64 devid, u8 *uuid, bool error)
6533{
6534        if (error)
6535                btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6536                              devid, uuid);
6537        else
6538                btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6539                              devid, uuid);
6540}
6541
6542static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6543{
6544        int index = btrfs_bg_flags_to_raid_index(type);
6545        int ncopies = btrfs_raid_array[index].ncopies;
6546        const int nparity = btrfs_raid_array[index].nparity;
6547        int data_stripes;
6548
6549        if (nparity)
6550                data_stripes = num_stripes - nparity;
6551        else
6552                data_stripes = num_stripes / ncopies;
6553
6554        return div_u64(chunk_len, data_stripes);
6555}
6556
6557static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6558                          struct btrfs_chunk *chunk)
6559{
6560        struct btrfs_fs_info *fs_info = leaf->fs_info;
6561        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6562        struct map_lookup *map;
6563        struct extent_map *em;
6564        u64 logical;
6565        u64 length;
6566        u64 devid;
6567        u8 uuid[BTRFS_UUID_SIZE];
6568        int num_stripes;
6569        int ret;
6570        int i;
6571
6572        logical = key->offset;
6573        length = btrfs_chunk_length(leaf, chunk);
6574        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6575
6576        /*
6577         * Only need to verify chunk item if we're reading from sys chunk array,
6578         * as chunk item in tree block is already verified by tree-checker.
6579         */
6580        if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6581                ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6582                if (ret)
6583                        return ret;
6584        }
6585
6586        read_lock(&map_tree->lock);
6587        em = lookup_extent_mapping(map_tree, logical, 1);
6588        read_unlock(&map_tree->lock);
6589
6590        /* already mapped? */
6591        if (em && em->start <= logical && em->start + em->len > logical) {
6592                free_extent_map(em);
6593                return 0;
6594        } else if (em) {
6595                free_extent_map(em);
6596        }
6597
6598        em = alloc_extent_map();
6599        if (!em)
6600                return -ENOMEM;
6601        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6602        if (!map) {
6603                free_extent_map(em);
6604                return -ENOMEM;
6605        }
6606
6607        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6608        em->map_lookup = map;
6609        em->start = logical;
6610        em->len = length;
6611        em->orig_start = 0;
6612        em->block_start = 0;
6613        em->block_len = em->len;
6614
6615        map->num_stripes = num_stripes;
6616        map->io_width = btrfs_chunk_io_width(leaf, chunk);
6617        map->io_align = btrfs_chunk_io_align(leaf, chunk);
6618        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6619        map->type = btrfs_chunk_type(leaf, chunk);
6620        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6621        map->verified_stripes = 0;
6622        em->orig_block_len = calc_stripe_length(map->type, em->len,
6623                                                map->num_stripes);
6624        for (i = 0; i < num_stripes; i++) {
6625                map->stripes[i].physical =
6626                        btrfs_stripe_offset_nr(leaf, chunk, i);
6627                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6628                read_extent_buffer(leaf, uuid, (unsigned long)
6629                                   btrfs_stripe_dev_uuid_nr(chunk, i),
6630                                   BTRFS_UUID_SIZE);
6631                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6632                                                        devid, uuid, NULL, true);
6633                if (!map->stripes[i].dev &&
6634                    !btrfs_test_opt(fs_info, DEGRADED)) {
6635                        free_extent_map(em);
6636                        btrfs_report_missing_device(fs_info, devid, uuid, true);
6637                        return -ENOENT;
6638                }
6639                if (!map->stripes[i].dev) {
6640                        map->stripes[i].dev =
6641                                add_missing_dev(fs_info->fs_devices, devid,
6642                                                uuid);
6643                        if (IS_ERR(map->stripes[i].dev)) {
6644                                free_extent_map(em);
6645                                btrfs_err(fs_info,
6646                                        "failed to init missing dev %llu: %ld",
6647                                        devid, PTR_ERR(map->stripes[i].dev));
6648                                return PTR_ERR(map->stripes[i].dev);
6649                        }
6650                        btrfs_report_missing_device(fs_info, devid, uuid, false);
6651                }
6652                set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6653                                &(map->stripes[i].dev->dev_state));
6654
6655        }
6656
6657        write_lock(&map_tree->lock);
6658        ret = add_extent_mapping(map_tree, em, 0);
6659        write_unlock(&map_tree->lock);
6660        if (ret < 0) {
6661                btrfs_err(fs_info,
6662                          "failed to add chunk map, start=%llu len=%llu: %d",
6663                          em->start, em->len, ret);
6664        }
6665        free_extent_map(em);
6666
6667        return ret;
6668}
6669
6670static void fill_device_from_item(struct extent_buffer *leaf,
6671                                 struct btrfs_dev_item *dev_item,
6672                                 struct btrfs_device *device)
6673{
6674        unsigned long ptr;
6675
6676        device->devid = btrfs_device_id(leaf, dev_item);
6677        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6678        device->total_bytes = device->disk_total_bytes;
6679        device->commit_total_bytes = device->disk_total_bytes;
6680        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6681        device->commit_bytes_used = device->bytes_used;
6682        device->type = btrfs_device_type(leaf, dev_item);
6683        device->io_align = btrfs_device_io_align(leaf, dev_item);
6684        device->io_width = btrfs_device_io_width(leaf, dev_item);
6685        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6686        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6687        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6688
6689        ptr = btrfs_device_uuid(dev_item);
6690        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6691}
6692
6693static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6694                                                  u8 *fsid)
6695{
6696        struct btrfs_fs_devices *fs_devices;
6697        int ret;
6698
6699        lockdep_assert_held(&uuid_mutex);
6700        ASSERT(fsid);
6701
6702        fs_devices = fs_info->fs_devices->seed;
6703        while (fs_devices) {
6704                if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6705                        return fs_devices;
6706
6707                fs_devices = fs_devices->seed;
6708        }
6709
6710        fs_devices = find_fsid(fsid, NULL);
6711        if (!fs_devices) {
6712                if (!btrfs_test_opt(fs_info, DEGRADED))
6713                        return ERR_PTR(-ENOENT);
6714
6715                fs_devices = alloc_fs_devices(fsid, NULL);
6716                if (IS_ERR(fs_devices))
6717                        return fs_devices;
6718
6719                fs_devices->seeding = true;
6720                fs_devices->opened = 1;
6721                return fs_devices;
6722        }
6723
6724        fs_devices = clone_fs_devices(fs_devices);
6725        if (IS_ERR(fs_devices))
6726                return fs_devices;
6727
6728        ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6729        if (ret) {
6730                free_fs_devices(fs_devices);
6731                fs_devices = ERR_PTR(ret);
6732                goto out;
6733        }
6734
6735        if (!fs_devices->seeding) {
6736                close_fs_devices(fs_devices);
6737                free_fs_devices(fs_devices);
6738                fs_devices = ERR_PTR(-EINVAL);
6739                goto out;
6740        }
6741
6742        fs_devices->seed = fs_info->fs_devices->seed;
6743        fs_info->fs_devices->seed = fs_devices;
6744out:
6745        return fs_devices;
6746}
6747
6748static int read_one_dev(struct extent_buffer *leaf,
6749                        struct btrfs_dev_item *dev_item)
6750{
6751        struct btrfs_fs_info *fs_info = leaf->fs_info;
6752        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6753        struct btrfs_device *device;
6754        u64 devid;
6755        int ret;
6756        u8 fs_uuid[BTRFS_FSID_SIZE];
6757        u8 dev_uuid[BTRFS_UUID_SIZE];
6758
6759        devid = btrfs_device_id(leaf, dev_item);
6760        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6761                           BTRFS_UUID_SIZE);
6762        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6763                           BTRFS_FSID_SIZE);
6764
6765        if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6766                fs_devices = open_seed_devices(fs_info, fs_uuid);
6767                if (IS_ERR(fs_devices))
6768                        return PTR_ERR(fs_devices);
6769        }
6770
6771        device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6772                                   fs_uuid, true);
6773        if (!device) {
6774                if (!btrfs_test_opt(fs_info, DEGRADED)) {
6775                        btrfs_report_missing_device(fs_info, devid,
6776                                                        dev_uuid, true);
6777                        return -ENOENT;
6778                }
6779
6780                device = add_missing_dev(fs_devices, devid, dev_uuid);
6781                if (IS_ERR(device)) {
6782                        btrfs_err(fs_info,
6783                                "failed to add missing dev %llu: %ld",
6784                                devid, PTR_ERR(device));
6785                        return PTR_ERR(device);
6786                }
6787                btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6788        } else {
6789                if (!device->bdev) {
6790                        if (!btrfs_test_opt(fs_info, DEGRADED)) {
6791                                btrfs_report_missing_device(fs_info,
6792                                                devid, dev_uuid, true);
6793                                return -ENOENT;
6794                        }
6795                        btrfs_report_missing_device(fs_info, devid,
6796                                                        dev_uuid, false);
6797                }
6798
6799                if (!device->bdev &&
6800                    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6801                        /*
6802                         * this happens when a device that was properly setup
6803                         * in the device info lists suddenly goes bad.
6804                         * device->bdev is NULL, and so we have to set
6805                         * device->missing to one here
6806                         */
6807                        device->fs_devices->missing_devices++;
6808                        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6809                }
6810
6811                /* Move the device to its own fs_devices */
6812                if (device->fs_devices != fs_devices) {
6813                        ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6814                                                        &device->dev_state));
6815
6816                        list_move(&device->dev_list, &fs_devices->devices);
6817                        device->fs_devices->num_devices--;
6818                        fs_devices->num_devices++;
6819
6820                        device->fs_devices->missing_devices--;
6821                        fs_devices->missing_devices++;
6822
6823                        device->fs_devices = fs_devices;
6824                }
6825        }
6826
6827        if (device->fs_devices != fs_info->fs_devices) {
6828                BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6829                if (device->generation !=
6830                    btrfs_device_generation(leaf, dev_item))
6831                        return -EINVAL;
6832        }
6833
6834        fill_device_from_item(leaf, dev_item, device);
6835        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6836        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6837           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6838                device->fs_devices->total_rw_bytes += device->total_bytes;
6839                atomic64_add(device->total_bytes - device->bytes_used,
6840                                &fs_info->free_chunk_space);
6841        }
6842        ret = 0;
6843        return ret;
6844}
6845
6846int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6847{
6848        struct btrfs_root *root = fs_info->tree_root;
6849        struct btrfs_super_block *super_copy = fs_info->super_copy;
6850        struct extent_buffer *sb;
6851        struct btrfs_disk_key *disk_key;
6852        struct btrfs_chunk *chunk;
6853        u8 *array_ptr;
6854        unsigned long sb_array_offset;
6855        int ret = 0;
6856        u32 num_stripes;
6857        u32 array_size;
6858        u32 len = 0;
6859        u32 cur_offset;
6860        u64 type;
6861        struct btrfs_key key;
6862
6863        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6864        /*
6865         * This will create extent buffer of nodesize, superblock size is
6866         * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6867         * overallocate but we can keep it as-is, only the first page is used.
6868         */
6869        sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6870        if (IS_ERR(sb))
6871                return PTR_ERR(sb);
6872        set_extent_buffer_uptodate(sb);
6873        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6874        /*
6875         * The sb extent buffer is artificial and just used to read the system array.
6876         * set_extent_buffer_uptodate() call does not properly mark all it's
6877         * pages up-to-date when the page is larger: extent does not cover the
6878         * whole page and consequently check_page_uptodate does not find all
6879         * the page's extents up-to-date (the hole beyond sb),
6880         * write_extent_buffer then triggers a WARN_ON.
6881         *
6882         * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
6883         * but sb spans only this function. Add an explicit SetPageUptodate call
6884         * to silence the warning eg. on PowerPC 64.
6885         */
6886        if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6887                SetPageUptodate(sb->pages[0]);
6888
6889        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6890        array_size = btrfs_super_sys_array_size(super_copy);
6891
6892        array_ptr = super_copy->sys_chunk_array;
6893        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6894        cur_offset = 0;
6895
6896        while (cur_offset < array_size) {
6897                disk_key = (struct btrfs_disk_key *)array_ptr;
6898                len = sizeof(*disk_key);
6899                if (cur_offset + len > array_size)
6900                        goto out_short_read;
6901
6902                btrfs_disk_key_to_cpu(&key, disk_key);
6903
6904                array_ptr += len;
6905                sb_array_offset += len;
6906                cur_offset += len;
6907
6908                if (key.type != BTRFS_CHUNK_ITEM_KEY) {
6909                        btrfs_err(fs_info,
6910                            "unexpected item type %u in sys_array at offset %u",
6911                                  (u32)key.type, cur_offset);
6912                        ret = -EIO;
6913                        break;
6914                }
6915
6916                chunk = (struct btrfs_chunk *)sb_array_offset;
6917                /*
6918                 * At least one btrfs_chunk with one stripe must be present,
6919                 * exact stripe count check comes afterwards
6920                 */
6921                len = btrfs_chunk_item_size(1);
6922                if (cur_offset + len > array_size)
6923                        goto out_short_read;
6924
6925                num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6926                if (!num_stripes) {
6927                        btrfs_err(fs_info,
6928                        "invalid number of stripes %u in sys_array at offset %u",
6929                                  num_stripes, cur_offset);
6930                        ret = -EIO;
6931                        break;
6932                }
6933
6934                type = btrfs_chunk_type(sb, chunk);
6935                if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6936                        btrfs_err(fs_info,
6937                        "invalid chunk type %llu in sys_array at offset %u",
6938                                  type, cur_offset);
6939                        ret = -EIO;
6940                        break;
6941                }
6942
6943                len = btrfs_chunk_item_size(num_stripes);
6944                if (cur_offset + len > array_size)
6945                        goto out_short_read;
6946
6947                ret = read_one_chunk(&key, sb, chunk);
6948                if (ret)
6949                        break;
6950
6951                array_ptr += len;
6952                sb_array_offset += len;
6953                cur_offset += len;
6954        }
6955        clear_extent_buffer_uptodate(sb);
6956        free_extent_buffer_stale(sb);
6957        return ret;
6958
6959out_short_read:
6960        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6961                        len, cur_offset);
6962        clear_extent_buffer_uptodate(sb);
6963        free_extent_buffer_stale(sb);
6964        return -EIO;
6965}
6966
6967/*
6968 * Check if all chunks in the fs are OK for read-write degraded mount
6969 *
6970 * If the @failing_dev is specified, it's accounted as missing.
6971 *
6972 * Return true if all chunks meet the minimal RW mount requirements.
6973 * Return false if any chunk doesn't meet the minimal RW mount requirements.
6974 */
6975bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
6976                                        struct btrfs_device *failing_dev)
6977{
6978        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6979        struct extent_map *em;
6980        u64 next_start = 0;
6981        bool ret = true;
6982
6983        read_lock(&map_tree->lock);
6984        em = lookup_extent_mapping(map_tree, 0, (u64)-1);
6985        read_unlock(&map_tree->lock);
6986        /* No chunk at all? Return false anyway */
6987        if (!em) {
6988                ret = false;
6989                goto out;
6990        }
6991        while (em) {
6992                struct map_lookup *map;
6993                int missing = 0;
6994                int max_tolerated;
6995                int i;
6996
6997                map = em->map_lookup;
6998                max_tolerated =
6999                        btrfs_get_num_tolerated_disk_barrier_failures(
7000                                        map->type);
7001                for (i = 0; i < map->num_stripes; i++) {
7002                        struct btrfs_device *dev = map->stripes[i].dev;
7003
7004                        if (!dev || !dev->bdev ||
7005                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7006                            dev->last_flush_error)
7007                                missing++;
7008                        else if (failing_dev && failing_dev == dev)
7009                                missing++;
7010                }
7011                if (missing > max_tolerated) {
7012                        if (!failing_dev)
7013                                btrfs_warn(fs_info,
7014        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7015                                   em->start, missing, max_tolerated);
7016                        free_extent_map(em);
7017                        ret = false;
7018                        goto out;
7019                }
7020                next_start = extent_map_end(em);
7021                free_extent_map(em);
7022
7023                read_lock(&map_tree->lock);
7024                em = lookup_extent_mapping(map_tree, next_start,
7025                                           (u64)(-1) - next_start);
7026                read_unlock(&map_tree->lock);
7027        }
7028out:
7029        return ret;
7030}
7031
7032int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7033{
7034        struct btrfs_root *root = fs_info->chunk_root;
7035        struct btrfs_path *path;
7036        struct extent_buffer *leaf;
7037        struct btrfs_key key;
7038        struct btrfs_key found_key;
7039        int ret;
7040        int slot;
7041        u64 total_dev = 0;
7042
7043        path = btrfs_alloc_path();
7044        if (!path)
7045                return -ENOMEM;
7046
7047        /*
7048         * uuid_mutex is needed only if we are mounting a sprout FS
7049         * otherwise we don't need it.
7050         */
7051        mutex_lock(&uuid_mutex);
7052        mutex_lock(&fs_info->chunk_mutex);
7053
7054        /*
7055         * It is possible for mount and umount to race in such a way that
7056         * we execute this code path, but open_fs_devices failed to clear
7057         * total_rw_bytes. We certainly want it cleared before reading the
7058         * device items, so clear it here.
7059         */
7060        fs_info->fs_devices->total_rw_bytes = 0;
7061
7062        /*
7063         * Read all device items, and then all the chunk items. All
7064         * device items are found before any chunk item (their object id
7065         * is smaller than the lowest possible object id for a chunk
7066         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7067         */
7068        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7069        key.offset = 0;
7070        key.type = 0;
7071        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7072        if (ret < 0)
7073                goto error;
7074        while (1) {
7075                leaf = path->nodes[0];
7076                slot = path->slots[0];
7077                if (slot >= btrfs_header_nritems(leaf)) {
7078                        ret = btrfs_next_leaf(root, path);
7079                        if (ret == 0)
7080                                continue;
7081                        if (ret < 0)
7082                                goto error;
7083                        break;
7084                }
7085                btrfs_item_key_to_cpu(leaf, &found_key, slot);
7086                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7087                        struct btrfs_dev_item *dev_item;
7088                        dev_item = btrfs_item_ptr(leaf, slot,
7089                                                  struct btrfs_dev_item);
7090                        ret = read_one_dev(leaf, dev_item);
7091                        if (ret)
7092                                goto error;
7093                        total_dev++;
7094                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7095                        struct btrfs_chunk *chunk;
7096                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7097                        ret = read_one_chunk(&found_key, leaf, chunk);
7098                        if (ret)
7099                                goto error;
7100                }
7101                path->slots[0]++;
7102        }
7103
7104        /*
7105         * After loading chunk tree, we've got all device information,
7106         * do another round of validation checks.
7107         */
7108        if (total_dev != fs_info->fs_devices->total_devices) {
7109                btrfs_err(fs_info,
7110           "super_num_devices %llu mismatch with num_devices %llu found here",
7111                          btrfs_super_num_devices(fs_info->super_copy),
7112                          total_dev);
7113                ret = -EINVAL;
7114                goto error;
7115        }
7116        if (btrfs_super_total_bytes(fs_info->super_copy) <
7117            fs_info->fs_devices->total_rw_bytes) {
7118                btrfs_err(fs_info,
7119        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7120                          btrfs_super_total_bytes(fs_info->super_copy),
7121                          fs_info->fs_devices->total_rw_bytes);
7122                ret = -EINVAL;
7123                goto error;
7124        }
7125        ret = 0;
7126error:
7127        mutex_unlock(&fs_info->chunk_mutex);
7128        mutex_unlock(&uuid_mutex);
7129
7130        btrfs_free_path(path);
7131        return ret;
7132}
7133
7134void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7135{
7136        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7137        struct btrfs_device *device;
7138
7139        while (fs_devices) {
7140                mutex_lock(&fs_devices->device_list_mutex);
7141                list_for_each_entry(device, &fs_devices->devices, dev_list)
7142                        device->fs_info = fs_info;
7143                mutex_unlock(&fs_devices->device_list_mutex);
7144
7145                fs_devices = fs_devices->seed;
7146        }
7147}
7148
7149static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7150                                 const struct btrfs_dev_stats_item *ptr,
7151                                 int index)
7152{
7153        u64 val;
7154
7155        read_extent_buffer(eb, &val,
7156                           offsetof(struct btrfs_dev_stats_item, values) +
7157                            ((unsigned long)ptr) + (index * sizeof(u64)),
7158                           sizeof(val));
7159        return val;
7160}
7161
7162static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7163                                      struct btrfs_dev_stats_item *ptr,
7164                                      int index, u64 val)
7165{
7166        write_extent_buffer(eb, &val,
7167                            offsetof(struct btrfs_dev_stats_item, values) +
7168                             ((unsigned long)ptr) + (index * sizeof(u64)),
7169                            sizeof(val));
7170}
7171
7172int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7173{
7174        struct btrfs_key key;
7175        struct btrfs_root *dev_root = fs_info->dev_root;
7176        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7177        struct extent_buffer *eb;
7178        int slot;
7179        int ret = 0;
7180        struct btrfs_device *device;
7181        struct btrfs_path *path = NULL;
7182        int i;
7183
7184        path = btrfs_alloc_path();
7185        if (!path)
7186                return -ENOMEM;
7187
7188        mutex_lock(&fs_devices->device_list_mutex);
7189        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7190                int item_size;
7191                struct btrfs_dev_stats_item *ptr;
7192
7193                key.objectid = BTRFS_DEV_STATS_OBJECTID;
7194                key.type = BTRFS_PERSISTENT_ITEM_KEY;
7195                key.offset = device->devid;
7196                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7197                if (ret) {
7198                        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7199                                btrfs_dev_stat_set(device, i, 0);
7200                        device->dev_stats_valid = 1;
7201                        btrfs_release_path(path);
7202                        continue;
7203                }
7204                slot = path->slots[0];
7205                eb = path->nodes[0];
7206                item_size = btrfs_item_size_nr(eb, slot);
7207
7208                ptr = btrfs_item_ptr(eb, slot,
7209                                     struct btrfs_dev_stats_item);
7210
7211                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7212                        if (item_size >= (1 + i) * sizeof(__le64))
7213                                btrfs_dev_stat_set(device, i,
7214                                        btrfs_dev_stats_value(eb, ptr, i));
7215                        else
7216                                btrfs_dev_stat_set(device, i, 0);
7217                }
7218
7219                device->dev_stats_valid = 1;
7220                btrfs_dev_stat_print_on_load(device);
7221                btrfs_release_path(path);
7222        }
7223        mutex_unlock(&fs_devices->device_list_mutex);
7224
7225        btrfs_free_path(path);
7226        return ret < 0 ? ret : 0;
7227}
7228
7229static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7230                                struct btrfs_device *device)
7231{
7232        struct btrfs_fs_info *fs_info = trans->fs_info;
7233        struct btrfs_root *dev_root = fs_info->dev_root;
7234        struct btrfs_path *path;
7235        struct btrfs_key key;
7236        struct extent_buffer *eb;
7237        struct btrfs_dev_stats_item *ptr;
7238        int ret;
7239        int i;
7240
7241        key.objectid = BTRFS_DEV_STATS_OBJECTID;
7242        key.type = BTRFS_PERSISTENT_ITEM_KEY;
7243        key.offset = device->devid;
7244
7245        path = btrfs_alloc_path();
7246        if (!path)
7247                return -ENOMEM;
7248        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7249        if (ret < 0) {
7250                btrfs_warn_in_rcu(fs_info,
7251                        "error %d while searching for dev_stats item for device %s",
7252                              ret, rcu_str_deref(device->name));
7253                goto out;
7254        }
7255
7256        if (ret == 0 &&
7257            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7258                /* need to delete old one and insert a new one */
7259                ret = btrfs_del_item(trans, dev_root, path);
7260                if (ret != 0) {
7261                        btrfs_warn_in_rcu(fs_info,
7262                                "delete too small dev_stats item for device %s failed %d",
7263                                      rcu_str_deref(device->name), ret);
7264                        goto out;
7265                }
7266                ret = 1;
7267        }
7268
7269        if (ret == 1) {
7270                /* need to insert a new item */
7271                btrfs_release_path(path);
7272                ret = btrfs_insert_empty_item(trans, dev_root, path,
7273                                              &key, sizeof(*ptr));
7274                if (ret < 0) {
7275                        btrfs_warn_in_rcu(fs_info,
7276                                "insert dev_stats item for device %s failed %d",
7277                                rcu_str_deref(device->name), ret);
7278                        goto out;
7279                }
7280        }
7281
7282        eb = path->nodes[0];
7283        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7284        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7285                btrfs_set_dev_stats_value(eb, ptr, i,
7286                                          btrfs_dev_stat_read(device, i));
7287        btrfs_mark_buffer_dirty(eb);
7288
7289out:
7290        btrfs_free_path(path);
7291        return ret;
7292}
7293
7294/*
7295 * called from commit_transaction. Writes all changed device stats to disk.
7296 */
7297int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7298{
7299        struct btrfs_fs_info *fs_info = trans->fs_info;
7300        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7301        struct btrfs_device *device;
7302        int stats_cnt;
7303        int ret = 0;
7304
7305        mutex_lock(&fs_devices->device_list_mutex);
7306        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7307                stats_cnt = atomic_read(&device->dev_stats_ccnt);
7308                if (!device->dev_stats_valid || stats_cnt == 0)
7309                        continue;
7310
7311
7312                /*
7313                 * There is a LOAD-LOAD control dependency between the value of
7314                 * dev_stats_ccnt and updating the on-disk values which requires
7315                 * reading the in-memory counters. Such control dependencies
7316                 * require explicit read memory barriers.
7317                 *
7318                 * This memory barriers pairs with smp_mb__before_atomic in
7319                 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7320                 * barrier implied by atomic_xchg in
7321                 * btrfs_dev_stats_read_and_reset
7322                 */
7323                smp_rmb();
7324
7325                ret = update_dev_stat_item(trans, device);
7326                if (!ret)
7327                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7328        }
7329        mutex_unlock(&fs_devices->device_list_mutex);
7330
7331        return ret;
7332}
7333
7334void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7335{
7336        btrfs_dev_stat_inc(dev, index);
7337        btrfs_dev_stat_print_on_error(dev);
7338}
7339
7340static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7341{
7342        if (!dev->dev_stats_valid)
7343                return;
7344        btrfs_err_rl_in_rcu(dev->fs_info,
7345                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7346                           rcu_str_deref(dev->name),
7347                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7348                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7349                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7350                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7351                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7352}
7353
7354static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7355{
7356        int i;
7357
7358        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7359                if (btrfs_dev_stat_read(dev, i) != 0)
7360                        break;
7361        if (i == BTRFS_DEV_STAT_VALUES_MAX)
7362                return; /* all values == 0, suppress message */
7363
7364        btrfs_info_in_rcu(dev->fs_info,
7365                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7366               rcu_str_deref(dev->name),
7367               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7368               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7369               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7370               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7371               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7372}
7373
7374int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7375                        struct btrfs_ioctl_get_dev_stats *stats)
7376{
7377        struct btrfs_device *dev;
7378        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7379        int i;
7380
7381        mutex_lock(&fs_devices->device_list_mutex);
7382        dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7383                                true);
7384        mutex_unlock(&fs_devices->device_list_mutex);
7385
7386        if (!dev) {
7387                btrfs_warn(fs_info, "get dev_stats failed, device not found");
7388                return -ENODEV;
7389        } else if (!dev->dev_stats_valid) {
7390                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7391                return -ENODEV;
7392        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7393                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7394                        if (stats->nr_items > i)
7395                                stats->values[i] =
7396                                        btrfs_dev_stat_read_and_reset(dev, i);
7397                        else
7398                                btrfs_dev_stat_set(dev, i, 0);
7399                }
7400                btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7401                           current->comm, task_pid_nr(current));
7402        } else {
7403                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7404                        if (stats->nr_items > i)
7405                                stats->values[i] = btrfs_dev_stat_read(dev, i);
7406        }
7407        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7408                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7409        return 0;
7410}
7411
7412/*
7413 * Update the size and bytes used for each device where it changed.  This is
7414 * delayed since we would otherwise get errors while writing out the
7415 * superblocks.
7416 *
7417 * Must be invoked during transaction commit.
7418 */
7419void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7420{
7421        struct btrfs_device *curr, *next;
7422
7423        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7424
7425        if (list_empty(&trans->dev_update_list))
7426                return;
7427
7428        /*
7429         * We don't need the device_list_mutex here.  This list is owned by the
7430         * transaction and the transaction must complete before the device is
7431         * released.
7432         */
7433        mutex_lock(&trans->fs_info->chunk_mutex);
7434        list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7435                                 post_commit_list) {
7436                list_del_init(&curr->post_commit_list);
7437                curr->commit_total_bytes = curr->disk_total_bytes;
7438                curr->commit_bytes_used = curr->bytes_used;
7439        }
7440        mutex_unlock(&trans->fs_info->chunk_mutex);
7441}
7442
7443void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7444{
7445        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7446        while (fs_devices) {
7447                fs_devices->fs_info = fs_info;
7448                fs_devices = fs_devices->seed;
7449        }
7450}
7451
7452void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7453{
7454        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7455        while (fs_devices) {
7456                fs_devices->fs_info = NULL;
7457                fs_devices = fs_devices->seed;
7458        }
7459}
7460
7461/*
7462 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7463 */
7464int btrfs_bg_type_to_factor(u64 flags)
7465{
7466        const int index = btrfs_bg_flags_to_raid_index(flags);
7467
7468        return btrfs_raid_array[index].ncopies;
7469}
7470
7471
7472
7473static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7474                                 u64 chunk_offset, u64 devid,
7475                                 u64 physical_offset, u64 physical_len)
7476{
7477        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7478        struct extent_map *em;
7479        struct map_lookup *map;
7480        struct btrfs_device *dev;
7481        u64 stripe_len;
7482        bool found = false;
7483        int ret = 0;
7484        int i;
7485
7486        read_lock(&em_tree->lock);
7487        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7488        read_unlock(&em_tree->lock);
7489
7490        if (!em) {
7491                btrfs_err(fs_info,
7492"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7493                          physical_offset, devid);
7494                ret = -EUCLEAN;
7495                goto out;
7496        }
7497
7498        map = em->map_lookup;
7499        stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7500        if (physical_len != stripe_len) {
7501                btrfs_err(fs_info,
7502"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7503                          physical_offset, devid, em->start, physical_len,
7504                          stripe_len);
7505                ret = -EUCLEAN;
7506                goto out;
7507        }
7508
7509        for (i = 0; i < map->num_stripes; i++) {
7510                if (map->stripes[i].dev->devid == devid &&
7511                    map->stripes[i].physical == physical_offset) {
7512                        found = true;
7513                        if (map->verified_stripes >= map->num_stripes) {
7514                                btrfs_err(fs_info,
7515                                "too many dev extents for chunk %llu found",
7516                                          em->start);
7517                                ret = -EUCLEAN;
7518                                goto out;
7519                        }
7520                        map->verified_stripes++;
7521                        break;
7522                }
7523        }
7524        if (!found) {
7525                btrfs_err(fs_info,
7526        "dev extent physical offset %llu devid %llu has no corresponding chunk",
7527                        physical_offset, devid);
7528                ret = -EUCLEAN;
7529        }
7530
7531        /* Make sure no dev extent is beyond device bondary */
7532        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7533        if (!dev) {
7534                btrfs_err(fs_info, "failed to find devid %llu", devid);
7535                ret = -EUCLEAN;
7536                goto out;
7537        }
7538
7539        /* It's possible this device is a dummy for seed device */
7540        if (dev->disk_total_bytes == 0) {
7541                dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
7542                                        NULL, false);
7543                if (!dev) {
7544                        btrfs_err(fs_info, "failed to find seed devid %llu",
7545                                  devid);
7546                        ret = -EUCLEAN;
7547                        goto out;
7548                }
7549        }
7550
7551        if (physical_offset + physical_len > dev->disk_total_bytes) {
7552                btrfs_err(fs_info,
7553"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7554                          devid, physical_offset, physical_len,
7555                          dev->disk_total_bytes);
7556                ret = -EUCLEAN;
7557                goto out;
7558        }
7559out:
7560        free_extent_map(em);
7561        return ret;
7562}
7563
7564static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7565{
7566        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7567        struct extent_map *em;
7568        struct rb_node *node;
7569        int ret = 0;
7570
7571        read_lock(&em_tree->lock);
7572        for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7573                em = rb_entry(node, struct extent_map, rb_node);
7574                if (em->map_lookup->num_stripes !=
7575                    em->map_lookup->verified_stripes) {
7576                        btrfs_err(fs_info,
7577                        "chunk %llu has missing dev extent, have %d expect %d",
7578                                  em->start, em->map_lookup->verified_stripes,
7579                                  em->map_lookup->num_stripes);
7580                        ret = -EUCLEAN;
7581                        goto out;
7582                }
7583        }
7584out:
7585        read_unlock(&em_tree->lock);
7586        return ret;
7587}
7588
7589/*
7590 * Ensure that all dev extents are mapped to correct chunk, otherwise
7591 * later chunk allocation/free would cause unexpected behavior.
7592 *
7593 * NOTE: This will iterate through the whole device tree, which should be of
7594 * the same size level as the chunk tree.  This slightly increases mount time.
7595 */
7596int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7597{
7598        struct btrfs_path *path;
7599        struct btrfs_root *root = fs_info->dev_root;
7600        struct btrfs_key key;
7601        u64 prev_devid = 0;
7602        u64 prev_dev_ext_end = 0;
7603        int ret = 0;
7604
7605        key.objectid = 1;
7606        key.type = BTRFS_DEV_EXTENT_KEY;
7607        key.offset = 0;
7608
7609        path = btrfs_alloc_path();
7610        if (!path)
7611                return -ENOMEM;
7612
7613        path->reada = READA_FORWARD;
7614        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7615        if (ret < 0)
7616                goto out;
7617
7618        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7619                ret = btrfs_next_item(root, path);
7620                if (ret < 0)
7621                        goto out;
7622                /* No dev extents at all? Not good */
7623                if (ret > 0) {
7624                        ret = -EUCLEAN;
7625                        goto out;
7626                }
7627        }
7628        while (1) {
7629                struct extent_buffer *leaf = path->nodes[0];
7630                struct btrfs_dev_extent *dext;
7631                int slot = path->slots[0];
7632                u64 chunk_offset;
7633                u64 physical_offset;
7634                u64 physical_len;
7635                u64 devid;
7636
7637                btrfs_item_key_to_cpu(leaf, &key, slot);
7638                if (key.type != BTRFS_DEV_EXTENT_KEY)
7639                        break;
7640                devid = key.objectid;
7641                physical_offset = key.offset;
7642
7643                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7644                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7645                physical_len = btrfs_dev_extent_length(leaf, dext);
7646
7647                /* Check if this dev extent overlaps with the previous one */
7648                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7649                        btrfs_err(fs_info,
7650"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7651                                  devid, physical_offset, prev_dev_ext_end);
7652                        ret = -EUCLEAN;
7653                        goto out;
7654                }
7655
7656                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7657                                            physical_offset, physical_len);
7658                if (ret < 0)
7659                        goto out;
7660                prev_devid = devid;
7661                prev_dev_ext_end = physical_offset + physical_len;
7662
7663                ret = btrfs_next_item(root, path);
7664                if (ret < 0)
7665                        goto out;
7666                if (ret > 0) {
7667                        ret = 0;
7668                        break;
7669                }
7670        }
7671
7672        /* Ensure all chunks have corresponding dev extents */
7673        ret = verify_chunk_dev_extent_mapping(fs_info);
7674out:
7675        btrfs_free_path(path);
7676        return ret;
7677}
7678
7679/*
7680 * Check whether the given block group or device is pinned by any inode being
7681 * used as a swapfile.
7682 */
7683bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7684{
7685        struct btrfs_swapfile_pin *sp;
7686        struct rb_node *node;
7687
7688        spin_lock(&fs_info->swapfile_pins_lock);
7689        node = fs_info->swapfile_pins.rb_node;
7690        while (node) {
7691                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7692                if (ptr < sp->ptr)
7693                        node = node->rb_left;
7694                else if (ptr > sp->ptr)
7695                        node = node->rb_right;
7696                else
7697                        break;
7698        }
7699        spin_unlock(&fs_info->swapfile_pins_lock);
7700        return node != NULL;
7701}
7702