linux/fs/btrfs/volumes.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <linux/list_sort.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "tree-checker.h"
#include "space-info.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid10",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
                .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "raid1",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
                .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
                .nparity        = 0,
                .raid_name      = "dup",
                .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "raid0",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 0,
                .raid_name      = "single",
                .bg_flag        = 0,
                .mindev_error   = 0,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 1,
                .raid_name      = "raid5",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
                .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 1,
                .nparity        = 2,
                .raid_name      = "raid6",
                .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
                .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
        },
};
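
/*
 * Illustrative sketch, not part of the original source: how the attribute
 * table above is typically consulted.  A block group profile flag is mapped
 * to its array index and the redundancy parameters are read off the entry.
 * btrfs_bg_flags_to_raid_index() comes from volumes.h; example_raid_attr_lookup
 * is a hypothetical helper added only for illustration.
 */
static void __maybe_unused example_raid_attr_lookup(u64 bg_flag)
{
        const int index = btrfs_bg_flags_to_raid_index(bg_flag);
        const struct btrfs_raid_attr *attr = &btrfs_raid_array[index];

        /* e.g. for BTRFS_BLOCK_GROUP_RAID1: ncopies=2, tolerated_failures=1 */
        pr_debug("profile %s: ncopies=%d nparity=%d tolerated_failures=%d\n",
                 attr->raid_name, attr->ncopies, attr->nparity,
                 attr->tolerated_failures);
}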

const char *btrfs_bg_type_to_raid_name(u64 flags)
{
        const int index = btrfs_bg_flags_to_raid_index(flags);

        if (index >= BTRFS_NR_RAID_TYPES)
                return NULL;

        return btrfs_raid_array[index].raid_name;
}

/*
 * Fill @buf with textual description of @bg_flags, no more than @size_buf
 * bytes including the terminating null byte.
 */
void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
{
        int i;
        int ret;
        char *bp = buf;
        u64 flags = bg_flags;
        u32 size_bp = size_buf;

        if (!flags) {
                strcpy(bp, "NONE");
                return;
        }

#define DESCRIBE_FLAG(flag, desc)                                       \
        do {                                                            \
                if (flags & (flag)) {                                   \
                        ret = snprintf(bp, size_bp, "%s|", (desc));     \
                        if (ret < 0 || ret >= size_bp)                  \
                                goto out_overflow;                      \
                        size_bp -= ret;                                 \
                        bp += ret;                                      \
                        flags &= ~(flag);                               \
                }                                                       \
        } while (0)

        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
        DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");

        DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
                DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
                              btrfs_raid_array[i].raid_name);
#undef DESCRIBE_FLAG

        if (flags) {
                ret = snprintf(bp, size_bp, "0x%llx|", flags);
                size_bp -= ret;
        }

        if (size_bp < size_buf)
                buf[size_buf - size_bp - 1] = '\0'; /* remove last | */

        /*
         * The text is trimmed, it's up to the caller to provide a
         * sufficiently large buffer
         */
out_overflow:;
}
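
/*
 * Illustrative usage sketch, not part of the original source: describing a
 * combined set of flags into a small stack buffer.  The helper above
 * truncates rather than overflowing, which is why the buffer size is passed
 * explicitly.  example_describe_bg is a hypothetical helper for illustration.
 */
static void __maybe_unused example_describe_bg(void)
{
        char buf[128];

        btrfs_describe_block_groups(BTRFS_BLOCK_GROUP_DATA |
                                    BTRFS_BLOCK_GROUP_RAID1, buf, sizeof(buf));
        /* buf now holds "data|raid1" */
        pr_debug("block group flags: %s\n", buf);
}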

static int init_first_rw_device(struct btrfs_trans_handle *trans);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

/*
 * Device locking
 * ==============
 *
 * There are several mutexes that protect manipulation of devices and low-level
 * structures like chunks but not block groups, extents or files
 *
 * uuid_mutex (global lock)
 * ------------------------
 * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 * device) or requested by the device= mount option
 *
 * the mutex can be very coarse and can cover long-running operations
 *
 * protects: updates to fs_devices counters like missing devices, rw devices,
 * seeding, structure cloning, opening/closing devices at mount/umount time
 *
 * global::fs_devs - add, remove, updates to the global list
 *
 * does not protect: manipulation of the fs_devices::devices list!
 *
 * btrfs_device::name - renames (write side), read is RCU
 *
 * fs_devices::device_list_mutex (per-fs, with RCU)
 * ------------------------------------------------
 * protects updates to fs_devices::devices, ie. adding and deleting
 *
 * simple list traversal with read-only actions can be done with RCU protection
 *
 * may be used to exclude some operations from running concurrently without any
 * modifications to the list (see write_all_supers)
 *
 * balance_mutex
 * -------------
 * protects balance structures (status, state) and context accessed from
 * several places (internally, ioctl)
 *
 * chunk_mutex
 * -----------
 * protects chunks, adding or removing during allocation, trim or when a new
 * device is added/removed. Additionally it also protects post_commit_list of
 * individual devices, since they can be added to the transaction's
 * post_commit_list only with chunk_mutex held.
 *
 * cleaner_mutex
 * -------------
 * a big lock that is held by the cleaner thread and prevents running subvolume
 * cleaning together with relocation or delayed iputs
 *
 *
 * Lock nesting
 * ============
 *
 * uuid_mutex
 *   volume_mutex
 *     device_list_mutex
 *       chunk_mutex
 *     balance_mutex
 *
 *
 * Exclusive operations, BTRFS_FS_EXCL_OP
 * ======================================
 *
 * Maintains the exclusivity of the following operations that apply to the
 * whole filesystem and cannot run in parallel.
 *
 * - Balance (*)
 * - Device add
 * - Device remove
 * - Device replace (*)
 * - Resize
 *
 * The device operations (as above) can be in one of the following states:
 *
 * - Running state
 * - Paused state
 * - Completed state
 *
 * Only device operations marked with (*) can go into the Paused state for the
 * following reasons:
 *
 * - ioctl (only Balance can be Paused through ioctl)
 * - filesystem remounted as read-only
 * - filesystem unmounted and mounted as read-only
 * - system power-cycle and filesystem mounted as read-only
 * - filesystem or device errors leading to forced read-only
 *
 * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations.
 * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set.
 * A device operation in Paused or Running state can be canceled or resumed
 * either by ioctl (Balance only) or when remounted as read-write.
 * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or
 * completed.
 */

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}
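
/*
 * Illustrative sketch, not part of the original source, of the nesting order
 * documented above: uuid_mutex is taken before a per-fs device_list_mutex and
 * the two are released in reverse order.  example_lock_nesting is a
 * hypothetical helper added only to demonstrate the ordering.
 */
static void __maybe_unused example_lock_nesting(struct btrfs_fs_devices *fs_devices)
{
        mutex_lock(&uuid_mutex);
        mutex_lock(&fs_devices->device_list_mutex);
        /* ... safely walk fs_devices->devices here ... */
        mutex_unlock(&fs_devices->device_list_mutex);
        mutex_unlock(&uuid_mutex);
}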

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
                                                 const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->fs_list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        if (metadata_fsid)
                memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
        else if (fsid)
                memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}
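
/*
 * Illustrative usage sketch, not part of the original source: callers must
 * observe the ERR_PTR() convention documented above, and until the result is
 * linked onto fs_uuids it can simply be freed again.  example_alloc is a
 * hypothetical helper for illustration only.
 */
static struct btrfs_fs_devices * __maybe_unused example_alloc(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs = alloc_fs_devices(fsid, NULL);

        if (IS_ERR(fs_devs))
                return fs_devs;         /* propagate -ENOMEM */
        /* not on any list yet; kfree(fs_devs) would be a valid undo here */
        return fs_devs;
}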

void btrfs_free_device(struct btrfs_device *device)
{
        WARN_ON(!list_empty(&device->post_commit_list));
        rcu_string_free(device->name);
        extent_io_tree_release(&device->alloc_state);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_free_device(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void __exit btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, fs_list);
                list_del(&fs_devices->fs_list);
                free_fs_devices(fs_devices);
        }
}

/*
 * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 * Returned struct is not linked onto any lists and must be destroyed using
 * btrfs_free_device.
 */
static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->post_commit_list);

        spin_lock_init(&dev->io_lock);

        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);

        return dev;
}

static noinline struct btrfs_fs_devices *find_fsid(
                const u8 *fsid, const u8 *metadata_fsid)
{
        struct btrfs_fs_devices *fs_devices;

        ASSERT(fsid);

        if (metadata_fsid) {
                /*
                 * Handle scanned device having completed its fsid change but
                 * belonging to a fs_devices that was created by first scanning
                 * a device which didn't have its fsid/metadata_uuid changed
                 * at all and the CHANGING_FSID_V2 flag set.
                 */
                list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                        if (fs_devices->fsid_change &&
                            memcmp(metadata_fsid, fs_devices->fsid,
                                   BTRFS_FSID_SIZE) == 0 &&
                            memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
                                   BTRFS_FSID_SIZE) == 0) {
                                return fs_devices;
                        }
                }
                /*
                 * Handle scanned device having completed its fsid change but
                 * belonging to a fs_devices that was created by a device that
                 * has an outdated pair of fsid/metadata_uuid and
                 * CHANGING_FSID_V2 flag set.
                 */
                list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                        if (fs_devices->fsid_change &&
                            memcmp(fs_devices->metadata_uuid,
                                   fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
                            memcmp(metadata_fsid, fs_devices->metadata_uuid,
                                   BTRFS_FSID_SIZE) == 0) {
                                return fs_devices;
                        }
                }
        }

        /* Handle non-split brain cases */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (metadata_fsid) {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
                            && memcmp(metadata_fsid, fs_devices->metadata_uuid,
                                      BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                } else {
                        if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                                return fs_devices;
                }
        }
        return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}
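
/*
 * Illustrative usage sketch, not part of the original source: on success the
 * caller owns both the block device and the superblock buffer_head and must
 * release them with blkdev_put() and brelse() respectively; on failure both
 * output pointers are NULL.  example_read_super is a hypothetical helper.
 */
static int __maybe_unused example_read_super(const char *path, fmode_t flags,
                                             void *holder)
{
        struct block_device *bdev;
        struct buffer_head *bh;
        int ret;

        ret = btrfs_get_bdev_and_sb(path, flags, holder, 1, &bdev, &bh);
        if (ret)
                return ret;

        /* ... inspect (struct btrfs_super_block *)bh->b_data here ... */

        brelse(bh);
        blkdev_put(bdev, flags);
        return 0;
}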

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

static bool device_path_matched(const char *path, struct btrfs_device *device)
{
        int found;

        rcu_read_lock();
        found = strcmp(rcu_str_deref(device->name), path);
        rcu_read_unlock();

        return found == 0;
}

/*
 *  Search and remove all stale devices (devices which are not mounted).
 *  When both inputs are NULL, it will search and release all stale devices.
 *  path:        Optional. When provided, it will release all unmounted
 *               devices matching this path only.
 *  skip_device: Optional. Will skip this device when searching for the stale
 *               devices.
 *  Return:      0 for success or if @path is NULL.
 *               -EBUSY if @path is a mounted device.
 *               -ENOENT if @path does not match any device in the list.
 */
static int btrfs_free_stale_devices(const char *path,
                                     struct btrfs_device *skip_device)
{
        struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
        struct btrfs_device *device, *tmp_device;
        int ret = 0;

        if (path)
                ret = -ENOENT;

        list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {

                mutex_lock(&fs_devices->device_list_mutex);
                list_for_each_entry_safe(device, tmp_device,
                                         &fs_devices->devices, dev_list) {
                        if (skip_device && skip_device == device)
                                continue;
                        if (path && !device->name)
                                continue;
                        if (path && !device_path_matched(path, device))
                                continue;
                        if (fs_devices->opened) {
                                /* for an already deleted device return 0 */
                                if (path && ret != 0)
                                        ret = -EBUSY;
                                break;
                        }

                        /* delete the stale device */
                        fs_devices->num_devices--;
                        list_del(&device->dev_list);
                        btrfs_free_device(device);

                        ret = 0;
                        if (fs_devices->num_devices == 0)
                                break;
                }
                mutex_unlock(&fs_devices->device_list_mutex);

                if (fs_devices->num_devices == 0) {
                        btrfs_sysfs_remove_fsid(fs_devices);
                        list_del(&fs_devices->fs_list);
                        free_fs_devices(fs_devices);
                }
        }

        return ret;
}
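
/*
 * Illustrative usage sketch, not part of the original source: exercising the
 * return codes documented above.  Callers hold uuid_mutex, as
 * btrfs_forget_devices() further below does.  example_forget_one is a
 * hypothetical helper for illustration.
 */
static int __maybe_unused example_forget_one(const char *path)
{
        int ret = btrfs_free_stale_devices(path, NULL);

        if (ret == -EBUSY)
                pr_debug("BTRFS: %s is mounted, not freed\n", path);
        else if (ret == -ENOENT)
                pr_debug("BTRFS: %s is not a registered device\n", path);
        return ret;
}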

static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
                        struct btrfs_device *device, fmode_t flags,
                        void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int ret;

        if (device->bdev)
                return -EINVAL;
        if (!device->name)
                return -EINVAL;

        ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                    &bdev, &bh);
        if (ret)
                return ret;

        disk_super = (struct btrfs_super_block *)bh->b_data;
        devid = btrfs_stack_device_id(&disk_super->dev_item);
        if (devid != device->devid)
                goto error_brelse;

        if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
                goto error_brelse;

        device->generation = btrfs_super_generation(disk_super);

        if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                if (btrfs_super_incompat_flags(disk_super) &
                    BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
                        pr_err(
                "BTRFS: Invalid seeding and uuid-changed device detected\n");
                        goto error_brelse;
                }

                clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                fs_devices->seeding = 1;
        } else {
                if (bdev_read_only(bdev))
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                else
                        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
        }

        q = bdev_get_queue(bdev);
        if (!blk_queue_nonrot(q))
                fs_devices->rotating = 1;

        device->bdev = bdev;
        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
        device->mode = flags;

        fs_devices->open_devices++;
        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                fs_devices->rw_devices++;
                list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
        }
        brelse(bh);

        return 0;

error_brelse:
        brelse(bh);
        blkdev_put(bdev, flags);

        return -EINVAL;
}

/*
 * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 * being created with a disk that has already completed its fsid change.
 */
static struct btrfs_fs_devices *find_fsid_inprogress(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->fsid,
                           BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
                        return fs_devices;
                }
        }

        return NULL;
}

static struct btrfs_fs_devices *find_fsid_changed(
                                        struct btrfs_super_block *disk_super)
{
        struct btrfs_fs_devices *fs_devices;

        /*
         * Handles the case where the scanned device is part of an fs that had
         * multiple successful changes of FSID but the currently scanned device
         * didn't observe it, meaning our fsid will be different than theirs.
         */
        list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
                if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
                           BTRFS_FSID_SIZE) != 0 &&
                    memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
                           BTRFS_FSID_SIZE) == 0 &&
                    memcmp(fs_devices->fsid, disk_super->fsid,
                           BTRFS_FSID_SIZE) != 0) {
                        return fs_devices;
                }
        }

        return NULL;
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * device pointer which was just added or updated when successful
 * error pointer when failed
 */
static noinline struct btrfs_device *device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           bool *new_device_added)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices = NULL;
        struct rcu_string *name;
        u64 found_transid = btrfs_super_generation(disk_super);
        u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
        bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
                BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
        bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
                                        BTRFS_SUPER_FLAG_CHANGING_FSID_V2);

        if (fsid_change_in_progress) {
                if (!has_metadata_uuid) {
                        /*
                         * When we have an image which has CHANGING_FSID_V2 set
                         * it might belong to either a filesystem which has
                         * disks with completed fsid change or it might belong
                         * to fs with no UUID changes in effect, handle both.
                         */
                        fs_devices = find_fsid_inprogress(disk_super);
                        if (!fs_devices)
                                fs_devices = find_fsid(disk_super->fsid, NULL);
                } else {
                        fs_devices = find_fsid_changed(disk_super);
                }
        } else if (has_metadata_uuid) {
                fs_devices = find_fsid(disk_super->fsid,
                                       disk_super->metadata_uuid);
        } else {
                fs_devices = find_fsid(disk_super->fsid, NULL);
        }

        if (!fs_devices) {
                if (has_metadata_uuid)
                        fs_devices = alloc_fs_devices(disk_super->fsid,
                                                      disk_super->metadata_uuid);
                else
                        fs_devices = alloc_fs_devices(disk_super->fsid, NULL);

                if (IS_ERR(fs_devices))
                        return ERR_CAST(fs_devices);

                fs_devices->fsid_change = fsid_change_in_progress;

                mutex_lock(&fs_devices->device_list_mutex);
                list_add(&fs_devices->fs_list, &fs_uuids);

                device = NULL;
        } else {
                mutex_lock(&fs_devices->device_list_mutex);
                device = btrfs_find_device(fs_devices, devid,
                                disk_super->dev_item.uuid, NULL, false);

                /*
                 * If this disk has been pulled into an fs devices created by
                 * a device which had the CHANGING_FSID_V2 flag then replace the
                 * metadata_uuid/fsid values of the fs_devices.
                 */
                if (has_metadata_uuid && fs_devices->fsid_change &&
                    found_transid > fs_devices->latest_generation) {
                        memcpy(fs_devices->fsid, disk_super->fsid,
                                        BTRFS_FSID_SIZE);
                        memcpy(fs_devices->metadata_uuid,
                                        disk_super->metadata_uuid, BTRFS_FSID_SIZE);

                        fs_devices->fsid_change = false;
                }
        }

        if (!device) {
                if (fs_devices->opened) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EBUSY);
                }

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        /* we can safely leave the fs_devices entry around */
                        return device;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        btrfs_free_device(device);
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_assign_pointer(device->name, name);

                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;

                device->fs_devices = fs_devices;
                *new_device_added = true;

                if (disk_super->label[0])
                        pr_info("BTRFS: device label %s devid %llu transid %llu %s\n",
                                disk_super->label, devid, found_transid, path);
                else
                        pr_info("BTRFS: device fsid %pU devid %llu transid %llu %s\n",
                                disk_super->fsid, devid, found_transid, path);

        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away, and
                 * in case of 2a the stale bdev has to be updated as well.
                 * 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan CLI after the FS has been mounted.  We're
                 * still tracking a problem where systems fail mount by
                 * subvolume id when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you are
                         * here, that means there is more than one disk with
                         * the same uuid and devid. We keep the one with the
                         * larger generation number or the last-in if the
                         * generations are equal.
                         */
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-EEXIST);
                }

                /*
                 * We are going to replace the device path for a given devid,
                 * make sure it's the same device if the device is mounted
                 */
                if (device->bdev) {
                        struct block_device *path_bdev;

                        path_bdev = lookup_bdev(path);
                        if (IS_ERR(path_bdev)) {
                                mutex_unlock(&fs_devices->device_list_mutex);
                                return ERR_CAST(path_bdev);
                        }

                        if (device->bdev != path_bdev) {
                                bdput(path_bdev);
                                mutex_unlock(&fs_devices->device_list_mutex);
                                btrfs_warn_in_rcu(device->fs_info,
                        "duplicate device fsid:devid for %pU:%llu old:%s new:%s",
                                        disk_super->fsid, devid,
                                        rcu_str_deref(device->name), path);
                                return ERR_PTR(-EEXIST);
                        }
                        bdput(path_bdev);
                        btrfs_info_in_rcu(device->fs_info,
                                "device fsid %pU devid %llu moved old:%s new:%s",
                                disk_super->fsid, devid,
                                rcu_str_deref(device->name), path);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        mutex_unlock(&fs_devices->device_list_mutex);
                        return ERR_PTR(-ENOMEM);
                }
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
                        fs_devices->missing_devices--;
                        clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but zeroes the
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened) {
                device->generation = found_transid;
                fs_devices->latest_generation = max_t(u64, found_transid,
                                                fs_devices->latest_generation);
        }

        fs_devices->total_devices = btrfs_super_num_devices(disk_super);

        mutex_unlock(&fs_devices->device_list_mutex);
        return device;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid, NULL);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without the RCU read lock held because we
                 * hold the uuid_mutex, so nothing we touch in here is going
                 * to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                btrfs_free_device(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

/*
 * After we have read the system tree and know devids belonging to this
 * filesystem, remove any device which does not belong there.
 */
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
                                                        &device->dev_state)) {
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                             &device->dev_state) &&
                             (!latest_dev ||
                              device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                                  &device->dev_state)) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        list_del_init(&device->dev_alloc_list);
                        clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
                        if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
                                      &device->dev_state))
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                btrfs_free_device(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (!device->bdev)
                return;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        blkdev_put(device->bdev, device->mode);
}

static void btrfs_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
                fs_devices->missing_devices--;

        btrfs_close_bdev(device);

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;

        synchronize_rcu();
        btrfs_free_device(device);
}

static int close_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_close_one_device(device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = close_fs_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                close_fs_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        return ret;
}

static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                /* Just open everything we can; ignore failures here */
                if (btrfs_open_one_device(fs_devices, device, flags, holder))
                        continue;

                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
{
        struct btrfs_device *dev1, *dev2;

        dev1 = list_entry(a, struct btrfs_device, dev_list);
        dev2 = list_entry(b, struct btrfs_device, dev_list);

        if (dev1->devid < dev2->devid)
                return -1;
        else if (dev1->devid > dev2->devid)
                return 1;
        return 0;
}
1376
1377int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1378                       fmode_t flags, void *holder)
1379{
1380        int ret;
1381
1382        lockdep_assert_held(&uuid_mutex);
1383
1384        mutex_lock(&fs_devices->device_list_mutex);
1385        if (fs_devices->opened) {
1386                fs_devices->opened++;
1387                ret = 0;
1388        } else {
1389                list_sort(NULL, &fs_devices->devices, devid_cmp);
1390                ret = open_fs_devices(fs_devices, flags, holder);
1391        }
1392        mutex_unlock(&fs_devices->device_list_mutex);
1393
1394        return ret;
1395}
1396
1397static void btrfs_release_disk_super(struct page *page)
1398{
1399        kunmap(page);
1400        put_page(page);
1401}
1402
1403static int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
1404                                 struct page **page,
1405                                 struct btrfs_super_block **disk_super)
1406{
1407        void *p;
1408        pgoff_t index;
1409
1410        /* make sure our super fits in the device */
1411        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1412                return 1;
1413
1414        /* make sure our super fits in the page */
1415        if (sizeof(**disk_super) > PAGE_SIZE)
1416                return 1;
1417
1418        /* make sure our super doesn't straddle pages on disk */
1419        index = bytenr >> PAGE_SHIFT;
1420        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
1421                return 1;
1422
1423        /* pull in the page with our super */
1424        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
1425                                   index, GFP_KERNEL);
1426
1427        if (IS_ERR_OR_NULL(*page))
1428                return 1;
1429
1430        p = kmap(*page);
1431
1432        /* align our pointer to the offset of the super block */
1433        *disk_super = p + offset_in_page(bytenr);
1434
1435        if (btrfs_super_bytenr(*disk_super) != bytenr ||
1436            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
1437                btrfs_release_disk_super(*page);
1438                return 1;
1439        }
1440
1441        if ((*disk_super)->label[0] &&
1442                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
1443                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';
1444
1445        return 0;
1446}
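
/*
 * Worked example (illustrative, not part of the original source),
 * assuming 4K pages: for the primary super at bytenr == 65536,
 * index = 65536 >> PAGE_SHIFT == 16, and since the on-disk super is no
 * larger than one page, (65536 + sizeof(**disk_super) - 1) >> PAGE_SHIFT
 * is still 16, so the checks above pass. The single cached page read at
 * index 16 plus offset_in_page(65536) == 0 then yields the mapped
 * pointer to the super block.
 */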
1447
1448int btrfs_forget_devices(const char *path)
1449{
1450        int ret;
1451
1452        mutex_lock(&uuid_mutex);
1453        ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1454        mutex_unlock(&uuid_mutex);
1455
1456        return ret;
1457}
1458
1459/*
1460 * Look for a btrfs signature on a device. This may be called out of the
1461 * mount path and we are not allowed to call set_blocksize during the scan.
1462 * The superblock is read via the pagecache.
1463 */
1464struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1465                                           void *holder)
1466{
1467        struct btrfs_super_block *disk_super;
1468        bool new_device_added = false;
1469        struct btrfs_device *device = NULL;
1470        struct block_device *bdev;
1471        struct page *page;
1472        u64 bytenr;
1473
1474        lockdep_assert_held(&uuid_mutex);
1475
1476        /*
1477         * We would like to check all the supers, but that would make a
1478         * btrfs mount succeed after a mkfs from a different FS. So, we
1479         * need to add a special mount option to scan for later supers,
1480         * using BTRFS_SUPER_MIRROR_MAX instead.
1481         */
1482        bytenr = btrfs_sb_offset(0);
1483        flags |= FMODE_EXCL;
1484
1485        bdev = blkdev_get_by_path(path, flags, holder);
1486        if (IS_ERR(bdev))
1487                return ERR_CAST(bdev);
1488
1489        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super)) {
1490                device = ERR_PTR(-EINVAL);
1491                goto error_bdev_put;
1492        }
1493
1494        device = device_list_add(path, disk_super, &new_device_added);
1495        if (!IS_ERR(device)) {
1496                if (new_device_added)
1497                        btrfs_free_stale_devices(path, device);
1498        }
1499
1500        btrfs_release_disk_super(page);
1501
1502error_bdev_put:
1503        blkdev_put(bdev, flags);
1504
1505        return device;
1506}
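
/*
 * Illustrative usage sketch (not part of the original source; the path
 * and holder below are hypothetical). The scan path is expected to look
 * roughly like:
 *
 *	mutex_lock(&uuid_mutex);
 *	device = btrfs_scan_one_device("/dev/sdb", FMODE_READ, holder);
 *	mutex_unlock(&uuid_mutex);
 *	if (IS_ERR(device))
 *		return PTR_ERR(device);
 *
 * On success the device is registered in the in-memory device lists so
 * that a later mount can assemble the complete multi-device filesystem.
 */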
1507
1508/*
1509 * Try to find a chunk that intersects the [start, start + len] range and,
1510 * when one is found, record the end of it in *start.
1511 */
1512static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1513                                    u64 len)
1514{
1515        u64 physical_start, physical_end;
1516
1517        lockdep_assert_held(&device->fs_info->chunk_mutex);
1518
1519        if (!find_first_extent_bit(&device->alloc_state, *start,
1520                                   &physical_start, &physical_end,
1521                                   CHUNK_ALLOCATED, NULL)) {
1522
1523                if (in_range(physical_start, *start, len) ||
1524                    in_range(*start, physical_start,
1525                             physical_end - physical_start)) {
1526                        *start = physical_end + 1;
1527                        return true;
1528                }
1529        }
1530        return false;
1531}
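
/*
 * Worked example (illustrative, not part of the original source):
 * suppose ->alloc_state marks [16M, 24M - 1] as CHUNK_ALLOCATED and a
 * caller probes *start == 20M, len == 2M. Since 20M falls inside the
 * allocated range, *start is advanced to physical_end + 1 == 24M and
 * true is returned, telling the caller to retry its hole search past
 * the pending chunk.
 */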
1532
1534/*
1535 * find_free_dev_extent_start - find free space in the specified device
1536 * @device:       the device in which we search for the free space
1537 * @num_bytes:    the size of the free space that we need
1538 * @search_start: the position from which to begin the search
1539 * @start:        store the start of the free space
1540 * @len:          the size of the free space that we find, or the size
1541 *                of the largest free space if we don't find suitable free space
1542 *
1543 * This uses a pretty simple search; the expectation is that it is
1544 * called very infrequently and that a given device has a small number
1545 * of extents.
1546 *
1547 * @start is used to store the start of the free space that we find. But if
1548 * we don't find suitable free space, it is used to store the start position
1549 * of the largest free space instead.
1550 *
1551 * @len is used to store the size of the free space that we find.
1552 * But if we don't find suitable free space, it is used to store the size of
1553 * the largest free space instead.
1554 */
1555int find_free_dev_extent_start(struct btrfs_device *device, u64 num_bytes,
1556                               u64 search_start, u64 *start, u64 *len)
1557{
1558        struct btrfs_fs_info *fs_info = device->fs_info;
1559        struct btrfs_root *root = fs_info->dev_root;
1560        struct btrfs_key key;
1561        struct btrfs_dev_extent *dev_extent;
1562        struct btrfs_path *path;
1563        u64 hole_size;
1564        u64 max_hole_start;
1565        u64 max_hole_size;
1566        u64 extent_end;
1567        u64 search_end = device->total_bytes;
1568        int ret;
1569        int slot;
1570        struct extent_buffer *l;
1571
1572        /*
1573         * We don't want to overwrite the superblock on the drive nor any area
1574         * used by the boot loader (grub for example), so we make sure to start
1575         * at an offset of at least 1MB.
1576         */
1577        search_start = max_t(u64, search_start, SZ_1M);
1578
1579        path = btrfs_alloc_path();
1580        if (!path)
1581                return -ENOMEM;
1582
1583        max_hole_start = search_start;
1584        max_hole_size = 0;
1585
1586again:
1587        if (search_start >= search_end ||
1588                test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1589                ret = -ENOSPC;
1590                goto out;
1591        }
1592
1593        path->reada = READA_FORWARD;
1594        path->search_commit_root = 1;
1595        path->skip_locking = 1;
1596
1597        key.objectid = device->devid;
1598        key.offset = search_start;
1599        key.type = BTRFS_DEV_EXTENT_KEY;
1600
1601        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1602        if (ret < 0)
1603                goto out;
1604        if (ret > 0) {
1605                ret = btrfs_previous_item(root, path, key.objectid, key.type);
1606                if (ret < 0)
1607                        goto out;
1608        }
1609
1610        while (1) {
1611                l = path->nodes[0];
1612                slot = path->slots[0];
1613                if (slot >= btrfs_header_nritems(l)) {
1614                        ret = btrfs_next_leaf(root, path);
1615                        if (ret == 0)
1616                                continue;
1617                        if (ret < 0)
1618                                goto out;
1619
1620                        break;
1621                }
1622                btrfs_item_key_to_cpu(l, &key, slot);
1623
1624                if (key.objectid < device->devid)
1625                        goto next;
1626
1627                if (key.objectid > device->devid)
1628                        break;
1629
1630                if (key.type != BTRFS_DEV_EXTENT_KEY)
1631                        goto next;
1632
1633                if (key.offset > search_start) {
1634                        hole_size = key.offset - search_start;
1635
1636                        /*
1637                         * Have to check before we set max_hole_start, otherwise
1638                         * we could end up sending back this offset anyway.
1639                         */
1640                        if (contains_pending_extent(device, &search_start,
1641                                                    hole_size)) {
1642                                if (key.offset >= search_start)
1643                                        hole_size = key.offset - search_start;
1644                                else
1645                                        hole_size = 0;
1646                        }
1647
1648                        if (hole_size > max_hole_size) {
1649                                max_hole_start = search_start;
1650                                max_hole_size = hole_size;
1651                        }
1652
1653                        /*
1654                         * If this free space is greater than what we need,
1655                         * it must be the largest free space that we have
1656                         * found so far, so max_hole_start must point to the
1657                         * start of this free space and the length of this
1658                         * free space is stored in max_hole_size. Thus, we
1659                         * return max_hole_start and max_hole_size and go
1660                         * back to the caller.
1661                         */
1662                        if (hole_size >= num_bytes) {
1663                                ret = 0;
1664                                goto out;
1665                        }
1666                }
1667
1668                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1669                extent_end = key.offset + btrfs_dev_extent_length(l,
1670                                                                  dev_extent);
1671                if (extent_end > search_start)
1672                        search_start = extent_end;
1673next:
1674                path->slots[0]++;
1675                cond_resched();
1676        }
1677
1678        /*
1679         * At this point, search_start should be the end of
1680         * allocated dev extents, and when shrinking the device,
1681         * search_end may be smaller than search_start.
1682         */
1683        if (search_end > search_start) {
1684                hole_size = search_end - search_start;
1685
1686                if (contains_pending_extent(device, &search_start, hole_size)) {
1687                        btrfs_release_path(path);
1688                        goto again;
1689                }
1690
1691                if (hole_size > max_hole_size) {
1692                        max_hole_start = search_start;
1693                        max_hole_size = hole_size;
1694                }
1695        }
1696
1697        /* See above. */
1698        if (max_hole_size < num_bytes)
1699                ret = -ENOSPC;
1700        else
1701                ret = 0;
1702
1703out:
1704        btrfs_free_path(path);
1705        *start = max_hole_start;
1706        if (len)
1707                *len = max_hole_size;
1708        return ret;
1709}
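
/*
 * Worked example (illustrative, not part of the original source):
 * assume dev extents occupy [1M, 5M) and [9M, 12M) on a 16M device and
 * the caller asks for num_bytes == 3M starting at offset 0. The search
 * start is first clamped to SZ_1M, the hole [5M, 9M) is 4M >= 3M, so
 * the function returns 0 with *start == 5M and *len == 4M. For an 8M
 * request no hole fits, and -ENOSPC is returned with *start and *len
 * describing the largest hole that was found instead.
 */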
1710
1711int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1712                         u64 *start, u64 *len)
1713{
1714        /* FIXME use last free of some kind */
1715        return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1716}
1717
1718static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1719                          struct btrfs_device *device,
1720                          u64 start, u64 *dev_extent_len)
1721{
1722        struct btrfs_fs_info *fs_info = device->fs_info;
1723        struct btrfs_root *root = fs_info->dev_root;
1724        int ret;
1725        struct btrfs_path *path;
1726        struct btrfs_key key;
1727        struct btrfs_key found_key;
1728        struct extent_buffer *leaf = NULL;
1729        struct btrfs_dev_extent *extent = NULL;
1730
1731        path = btrfs_alloc_path();
1732        if (!path)
1733                return -ENOMEM;
1734
1735        key.objectid = device->devid;
1736        key.offset = start;
1737        key.type = BTRFS_DEV_EXTENT_KEY;
1738again:
1739        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1740        if (ret > 0) {
1741                ret = btrfs_previous_item(root, path, key.objectid,
1742                                          BTRFS_DEV_EXTENT_KEY);
1743                if (ret)
1744                        goto out;
1745                leaf = path->nodes[0];
1746                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1747                extent = btrfs_item_ptr(leaf, path->slots[0],
1748                                        struct btrfs_dev_extent);
1749                BUG_ON(found_key.offset > start || found_key.offset +
1750                       btrfs_dev_extent_length(leaf, extent) < start);
1751                key = found_key;
1752                btrfs_release_path(path);
1753                goto again;
1754        } else if (ret == 0) {
1755                leaf = path->nodes[0];
1756                extent = btrfs_item_ptr(leaf, path->slots[0],
1757                                        struct btrfs_dev_extent);
1758        } else {
1759                btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1760                goto out;
1761        }
1762
1763        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1764
1765        ret = btrfs_del_item(trans, root, path);
1766        if (ret) {
1767                btrfs_handle_fs_error(fs_info, ret,
1768                                      "Failed to remove dev extent item");
1769        } else {
1770                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1771        }
1772out:
1773        btrfs_free_path(path);
1774        return ret;
1775}
1776
1777static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1778                                  struct btrfs_device *device,
1779                                  u64 chunk_offset, u64 start, u64 num_bytes)
1780{
1781        int ret;
1782        struct btrfs_path *path;
1783        struct btrfs_fs_info *fs_info = device->fs_info;
1784        struct btrfs_root *root = fs_info->dev_root;
1785        struct btrfs_dev_extent *extent;
1786        struct extent_buffer *leaf;
1787        struct btrfs_key key;
1788
1789        WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1790        WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1791        path = btrfs_alloc_path();
1792        if (!path)
1793                return -ENOMEM;
1794
1795        key.objectid = device->devid;
1796        key.offset = start;
1797        key.type = BTRFS_DEV_EXTENT_KEY;
1798        ret = btrfs_insert_empty_item(trans, root, path, &key,
1799                                      sizeof(*extent));
1800        if (ret)
1801                goto out;
1802
1803        leaf = path->nodes[0];
1804        extent = btrfs_item_ptr(leaf, path->slots[0],
1805                                struct btrfs_dev_extent);
1806        btrfs_set_dev_extent_chunk_tree(leaf, extent,
1807                                        BTRFS_CHUNK_TREE_OBJECTID);
1808        btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1809                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1810        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1811
1812        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1813        btrfs_mark_buffer_dirty(leaf);
1814out:
1815        btrfs_free_path(path);
1816        return ret;
1817}
1818
1819static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1820{
1821        struct extent_map_tree *em_tree;
1822        struct extent_map *em;
1823        struct rb_node *n;
1824        u64 ret = 0;
1825
1826        em_tree = &fs_info->mapping_tree;
1827        read_lock(&em_tree->lock);
1828        n = rb_last(&em_tree->map.rb_root);
1829        if (n) {
1830                em = rb_entry(n, struct extent_map, rb_node);
1831                ret = em->start + em->len;
1832        }
1833        read_unlock(&em_tree->lock);
1834
1835        return ret;
1836}
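
/*
 * Illustrative example (not part of the original source): the chunk
 * mapping tree is keyed by logical address, so rb_last() yields the
 * highest mapped extent. If that mapping has em->start == 1G and
 * em->len == 256M, the next chunk is placed at logical 1G + 256M; on
 * an empty tree the first chunk starts at logical 0.
 */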
1837
1838static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1839                                    u64 *devid_ret)
1840{
1841        int ret;
1842        struct btrfs_key key;
1843        struct btrfs_key found_key;
1844        struct btrfs_path *path;
1845
1846        path = btrfs_alloc_path();
1847        if (!path)
1848                return -ENOMEM;
1849
1850        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1851        key.type = BTRFS_DEV_ITEM_KEY;
1852        key.offset = (u64)-1;
1853
1854        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1855        if (ret < 0)
1856                goto error;
1857
1858        BUG_ON(ret == 0); /* Corruption */
1859
1860        ret = btrfs_previous_item(fs_info->chunk_root, path,
1861                                  BTRFS_DEV_ITEMS_OBJECTID,
1862                                  BTRFS_DEV_ITEM_KEY);
1863        if (ret) {
1864                *devid_ret = 1;
1865        } else {
1866                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1867                                      path->slots[0]);
1868                *devid_ret = found_key.offset + 1;
1869        }
1870        ret = 0;
1871error:
1872        btrfs_free_path(path);
1873        return ret;
1874}
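
/*
 * Worked example (illustrative, not part of the original source): dev
 * items are keyed (BTRFS_DEV_ITEMS_OBJECTID, BTRFS_DEV_ITEM_KEY,
 * devid). Searching for offset (u64)-1 lands just past the last dev
 * item, so btrfs_previous_item() finds, say, a key with offset 3 and
 * *devid_ret becomes 4. If no dev item exists at all, *devid_ret falls
 * back to 1.
 */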
1875
1876/*
1877 * The device information is stored in the chunk root.
1878 * The btrfs_device struct should be fully filled in.
1879 */
1880static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1881                            struct btrfs_device *device)
1882{
1883        int ret;
1884        struct btrfs_path *path;
1885        struct btrfs_dev_item *dev_item;
1886        struct extent_buffer *leaf;
1887        struct btrfs_key key;
1888        unsigned long ptr;
1889
1890        path = btrfs_alloc_path();
1891        if (!path)
1892                return -ENOMEM;
1893
1894        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1895        key.type = BTRFS_DEV_ITEM_KEY;
1896        key.offset = device->devid;
1897
1898        ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1899                                      &key, sizeof(*dev_item));
1900        if (ret)
1901                goto out;
1902
1903        leaf = path->nodes[0];
1904        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1905
1906        btrfs_set_device_id(leaf, dev_item, device->devid);
1907        btrfs_set_device_generation(leaf, dev_item, 0);
1908        btrfs_set_device_type(leaf, dev_item, device->type);
1909        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1910        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1911        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1912        btrfs_set_device_total_bytes(leaf, dev_item,
1913                                     btrfs_device_get_disk_total_bytes(device));
1914        btrfs_set_device_bytes_used(leaf, dev_item,
1915                                    btrfs_device_get_bytes_used(device));
1916        btrfs_set_device_group(leaf, dev_item, 0);
1917        btrfs_set_device_seek_speed(leaf, dev_item, 0);
1918        btrfs_set_device_bandwidth(leaf, dev_item, 0);
1919        btrfs_set_device_start_offset(leaf, dev_item, 0);
1920
1921        ptr = btrfs_device_uuid(dev_item);
1922        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1923        ptr = btrfs_device_fsid(dev_item);
1924        write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1925                            ptr, BTRFS_FSID_SIZE);
1926        btrfs_mark_buffer_dirty(leaf);
1927
1928        ret = 0;
1929out:
1930        btrfs_free_path(path);
1931        return ret;
1932}
1933
1934/*
1935 * Function to update ctime/mtime for a given device path.
1936 * Mainly used for ctime/mtime based probes like libblkid.
1937 */
1938static void update_dev_time(const char *path_name)
1939{
1940        struct file *filp;
1941
1942        filp = filp_open(path_name, O_RDWR, 0);
1943        if (IS_ERR(filp))
1944                return;
1945        file_update_time(filp);
1946        filp_close(filp, NULL);
1947}
1948
1949static int btrfs_rm_dev_item(struct btrfs_device *device)
1950{
1951        struct btrfs_root *root = device->fs_info->chunk_root;
1952        int ret;
1953        struct btrfs_path *path;
1954        struct btrfs_key key;
1955        struct btrfs_trans_handle *trans;
1956
1957        path = btrfs_alloc_path();
1958        if (!path)
1959                return -ENOMEM;
1960
1961        trans = btrfs_start_transaction(root, 0);
1962        if (IS_ERR(trans)) {
1963                btrfs_free_path(path);
1964                return PTR_ERR(trans);
1965        }
1966        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1967        key.type = BTRFS_DEV_ITEM_KEY;
1968        key.offset = device->devid;
1969
1970        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1971        if (ret) {
1972                if (ret > 0)
1973                        ret = -ENOENT;
1974                btrfs_abort_transaction(trans, ret);
1975                btrfs_end_transaction(trans);
1976                goto out;
1977        }
1978
1979        ret = btrfs_del_item(trans, root, path);
1980        if (ret) {
1981                btrfs_abort_transaction(trans, ret);
1982                btrfs_end_transaction(trans);
1983        }
1984
1985out:
1986        btrfs_free_path(path);
1987        if (!ret)
1988                ret = btrfs_commit_transaction(trans);
1989        return ret;
1990}
1991
1992/*
1993 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1994 * filesystem. It's up to the caller to adjust that number to account for,
1995 * e.g., device replace.
1996 */
1997static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1998                u64 num_devices)
1999{
2000        u64 all_avail;
2001        unsigned seq;
2002        int i;
2003
2004        do {
2005                seq = read_seqbegin(&fs_info->profiles_lock);
2006
2007                all_avail = fs_info->avail_data_alloc_bits |
2008                            fs_info->avail_system_alloc_bits |
2009                            fs_info->avail_metadata_alloc_bits;
2010        } while (read_seqretry(&fs_info->profiles_lock, seq));
2011
2012        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2013                if (!(all_avail & btrfs_raid_array[i].bg_flag))
2014                        continue;
2015
2016                if (num_devices < btrfs_raid_array[i].devs_min) {
2017                        int ret = btrfs_raid_array[i].mindev_error;
2018
2019                        if (ret)
2020                                return ret;
2021                }
2022        }
2023
2024        return 0;
2025}
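
/*
 * Illustrative example (not part of the original source): if any
 * allocated profile requires devs_min == 4 and the caller passes
 * num_devices == 3 because one device is about to be removed, that
 * profile's mindev_error is returned and the removal is refused before
 * any state has been changed.
 */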
2026
2027static struct btrfs_device *btrfs_find_next_active_device(
2028                struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2029{
2030        struct btrfs_device *next_device;
2031
2032        list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2033                if (next_device != device &&
2034                    !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2035                    && next_device->bdev)
2036                        return next_device;
2037        }
2038
2039        return NULL;
2040}
2041
2042/*
2043 * Helper function to check if the given device is part of s_bdev / latest_bdev
2044 * and replace it with the provided or the next active device. In the context
2045 * where this function is called, there should always be another device (or
2046 * this_dev) which is active.
2047 */
2048void btrfs_assign_next_active_device(struct btrfs_device *device,
2049                                     struct btrfs_device *this_dev)
2050{
2051        struct btrfs_fs_info *fs_info = device->fs_info;
2052        struct btrfs_device *next_device;
2053
2054        if (this_dev)
2055                next_device = this_dev;
2056        else
2057                next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2058                                                                device);
2059        ASSERT(next_device);
2060
2061        if (fs_info->sb->s_bdev &&
2062            fs_info->sb->s_bdev == device->bdev)
2063                fs_info->sb->s_bdev = next_device->bdev;
2064
2065        if (fs_info->fs_devices->latest_bdev == device->bdev)
2066                fs_info->fs_devices->latest_bdev = next_device->bdev;
2067}
2068
2069/*
2070 * Return btrfs_fs_devices::num_devices, excluding the device that is
2071 * currently being replaced.
2072 */
2073static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2074{
2075        u64 num_devices = fs_info->fs_devices->num_devices;
2076
2077        down_read(&fs_info->dev_replace.rwsem);
2078        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2079                ASSERT(num_devices > 1);
2080                num_devices--;
2081        }
2082        up_read(&fs_info->dev_replace.rwsem);
2083
2084        return num_devices;
2085}
2086
2087int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2088                u64 devid)
2089{
2090        struct btrfs_device *device;
2091        struct btrfs_fs_devices *cur_devices;
2092        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2093        u64 num_devices;
2094        int ret = 0;
2095
2096        mutex_lock(&uuid_mutex);
2097
2098        num_devices = btrfs_num_devices(fs_info);
2099
2100        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2101        if (ret)
2102                goto out;
2103
2104        device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2105
2106        if (IS_ERR(device)) {
2107                if (PTR_ERR(device) == -ENOENT &&
2108                    strcmp(device_path, "missing") == 0)
2109                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2110                else
2111                        ret = PTR_ERR(device);
2112                goto out;
2113        }
2114
2115        if (btrfs_pinned_by_swapfile(fs_info, device)) {
2116                btrfs_warn_in_rcu(fs_info,
2117                  "cannot remove device %s (devid %llu) due to active swapfile",
2118                                  rcu_str_deref(device->name), device->devid);
2119                ret = -ETXTBSY;
2120                goto out;
2121        }
2122
2123        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2124                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2125                goto out;
2126        }
2127
2128        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2129            fs_info->fs_devices->rw_devices == 1) {
2130                ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2131                goto out;
2132        }
2133
2134        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2135                mutex_lock(&fs_info->chunk_mutex);
2136                list_del_init(&device->dev_alloc_list);
2137                device->fs_devices->rw_devices--;
2138                mutex_unlock(&fs_info->chunk_mutex);
2139        }
2140
2141        mutex_unlock(&uuid_mutex);
2142        ret = btrfs_shrink_device(device, 0);
2143        mutex_lock(&uuid_mutex);
2144        if (ret)
2145                goto error_undo;
2146
2147        /*
2148         * TODO: the superblock still includes this device in its num_devices
2149         * counter although write_all_supers() is not locked out. This could
2150         * leave the filesystem in a state which requires a degraded mount.
2151         */
2152        ret = btrfs_rm_dev_item(device);
2153        if (ret)
2154                goto error_undo;
2155
2156        clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2157        btrfs_scrub_cancel_dev(device);
2158
2159        /*
2160         * The device list mutex makes sure that we don't change
2161         * the device list while someone else is writing out all
2162         * the device supers. Whoever is writing all supers should
2163         * lock the device list mutex before getting the number of
2164         * devices in the super block (super_copy). Conversely,
2165         * whoever updates the number of devices in the super block
2166         * (super_copy) should hold the device list mutex.
2167         */
2168
2169        /*
2170         * In normal cases cur_devices == fs_devices. But when deleting
2171         * a seed device, cur_devices should point to the seed's own
2172         * fs_devices, listed under fs_devices->seed.
2173         */
2174        cur_devices = device->fs_devices;
2175        mutex_lock(&fs_devices->device_list_mutex);
2176        list_del_rcu(&device->dev_list);
2177
2178        cur_devices->num_devices--;
2179        cur_devices->total_devices--;
2180        /* Update total_devices of the parent fs_devices if it's a seed */
2181        if (cur_devices != fs_devices)
2182                fs_devices->total_devices--;
2183
2184        if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2185                cur_devices->missing_devices--;
2186
2187        btrfs_assign_next_active_device(device, NULL);
2188
2189        if (device->bdev) {
2190                cur_devices->open_devices--;
2191                /* remove sysfs entry */
2192                btrfs_sysfs_rm_device_link(fs_devices, device);
2193        }
2194
2195        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2196        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2197        mutex_unlock(&fs_devices->device_list_mutex);
2198
2199        /*
2200         * at this point, the device is zero sized and detached from
2201         * the devices list.  All that's left is to zero out the old
2202         * supers and free the device.
2203         */
2204        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2205                btrfs_scratch_superblocks(device->bdev, device->name->str);
2206
2207        btrfs_close_bdev(device);
2208        synchronize_rcu();
2209        btrfs_free_device(device);
2210
2211        if (cur_devices->open_devices == 0) {
2212                while (fs_devices) {
2213                        if (fs_devices->seed == cur_devices) {
2214                                fs_devices->seed = cur_devices->seed;
2215                                break;
2216                        }
2217                        fs_devices = fs_devices->seed;
2218                }
2219                cur_devices->seed = NULL;
2220                close_fs_devices(cur_devices);
2221                free_fs_devices(cur_devices);
2222        }
2223
2224out:
2225        mutex_unlock(&uuid_mutex);
2226        return ret;
2227
2228error_undo:
2229        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2230                mutex_lock(&fs_info->chunk_mutex);
2231                list_add(&device->dev_alloc_list,
2232                         &fs_devices->alloc_list);
2233                device->fs_devices->rw_devices++;
2234                mutex_unlock(&fs_info->chunk_mutex);
2235        }
2236        goto out;
2237}
2238
2239void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2240{
2241        struct btrfs_fs_devices *fs_devices;
2242
2243        lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2244
2245        /*
2246         * In the case of a fs with no seed, srcdev->fs_devices will point
2247         * to the fs_devices of fs_info. However, when the dev being replaced
2248         * is a seed dev, it will point to the seed's local fs_devices. In
2249         * short, srcdev will have its correct fs_devices in both cases.
2250         */
2251        fs_devices = srcdev->fs_devices;
2252
2253        list_del_rcu(&srcdev->dev_list);
2254        list_del(&srcdev->dev_alloc_list);
2255        fs_devices->num_devices--;
2256        if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2257                fs_devices->missing_devices--;
2258
2259        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2260                fs_devices->rw_devices--;
2261
2262        if (srcdev->bdev)
2263                fs_devices->open_devices--;
2264}
2265
2266void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2267{
2268        struct btrfs_fs_info *fs_info = srcdev->fs_info;
2269        struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2270
2271        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state)) {
2272                /* zero out the old super if it is writable */
2273                btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2274        }
2275
2276        btrfs_close_bdev(srcdev);
2277        synchronize_rcu();
2278        btrfs_free_device(srcdev);
2279
2280        /* If there are no devices left, we'd rather delete the fs_devices */
2281        if (!fs_devices->num_devices) {
2282                struct btrfs_fs_devices *tmp_fs_devices;
2283
2284                /*
2285                 * On a mounted FS, num_devices can't be zero unless it's a
2286                 * seed. In case of a seed device being replaced, the replace
2287                 * target is added to the sprout FS, so there will be no
2288                 * devices left under the seed FS.
2289                 */
2290                ASSERT(fs_devices->seeding);
2291
2292                tmp_fs_devices = fs_info->fs_devices;
2293                while (tmp_fs_devices) {
2294                        if (tmp_fs_devices->seed == fs_devices) {
2295                                tmp_fs_devices->seed = fs_devices->seed;
2296                                break;
2297                        }
2298                        tmp_fs_devices = tmp_fs_devices->seed;
2299                }
2300                fs_devices->seed = NULL;
2301                close_fs_devices(fs_devices);
2302                free_fs_devices(fs_devices);
2303        }
2304}
2305
2306void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2307{
2308        struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2309
2310        WARN_ON(!tgtdev);
2311        mutex_lock(&fs_devices->device_list_mutex);
2312
2313        btrfs_sysfs_rm_device_link(fs_devices, tgtdev);
2314
2315        if (tgtdev->bdev)
2316                fs_devices->open_devices--;
2317
2318        fs_devices->num_devices--;
2319
2320        btrfs_assign_next_active_device(tgtdev, NULL);
2321
2322        list_del_rcu(&tgtdev->dev_list);
2323
2324        mutex_unlock(&fs_devices->device_list_mutex);
2325
2326        /*
2327         * The update_dev_time() within btrfs_scratch_superblocks()
2328         * may lead to a call to btrfs_show_devname() which will try
2329         * to hold device_list_mutex. Here, this device is already
2330         * out of the device list, so we don't have to hold the
2331         * device_list_mutex lock.
2332         */
2333        btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2334
2335        btrfs_close_bdev(tgtdev);
2336        synchronize_rcu();
2337        btrfs_free_device(tgtdev);
2338}
2339
2340static struct btrfs_device *btrfs_find_device_by_path(
2341                struct btrfs_fs_info *fs_info, const char *device_path)
2342{
2343        int ret = 0;
2344        struct btrfs_super_block *disk_super;
2345        u64 devid;
2346        u8 *dev_uuid;
2347        struct block_device *bdev;
2348        struct buffer_head *bh;
2349        struct btrfs_device *device;
2350
2351        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2352                                    fs_info->bdev_holder, 0, &bdev, &bh);
2353        if (ret)
2354                return ERR_PTR(ret);
2355        disk_super = (struct btrfs_super_block *)bh->b_data;
2356        devid = btrfs_stack_device_id(&disk_super->dev_item);
2357        dev_uuid = disk_super->dev_item.uuid;
2358        if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2359                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2360                                           disk_super->metadata_uuid, true);
2361        else
2362                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2363                                           disk_super->fsid, true);
2364
2365        brelse(bh);
2366        if (!device)
2367                device = ERR_PTR(-ENOENT);
2368        blkdev_put(bdev, FMODE_READ);
2369        return device;
2370}
2371
2372/*
2373 * Lookup a device given by device id, or the path if the id is 0.
2374 */
2375struct btrfs_device *btrfs_find_device_by_devspec(
2376                struct btrfs_fs_info *fs_info, u64 devid,
2377                const char *device_path)
2378{
2379        struct btrfs_device *device;
2380
2381        if (devid) {
2382                device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2383                                           NULL, true);
2384                if (!device)
2385                        return ERR_PTR(-ENOENT);
2386                return device;
2387        }
2388
2389        if (!device_path || !device_path[0])
2390                return ERR_PTR(-EINVAL);
2391
2392        if (strcmp(device_path, "missing") == 0) {
2393                /* Find first missing device */
2394                list_for_each_entry(device, &fs_info->fs_devices->devices,
2395                                    dev_list) {
2396                        if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2397                                     &device->dev_state) && !device->bdev)
2398                                return device;
2399                }
2400                return ERR_PTR(-ENOENT);
2401        }
2402
2403        return btrfs_find_device_by_path(fs_info, device_path);
2404}
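
/*
 * Illustrative usage (not part of the original source; the devid and
 * path values are hypothetical):
 *
 *	dev = btrfs_find_device_by_devspec(fs_info, 2, NULL);
 *	dev = btrfs_find_device_by_devspec(fs_info, 0, "missing");
 *	dev = btrfs_find_device_by_devspec(fs_info, 0, "/dev/sdb");
 *
 * The first call looks up devid 2 directly, the second returns the
 * first device present in the metadata but lacking a backing bdev, and
 * the third falls back to reading the super block at the given path.
 */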
2405
2406/*
2407 * Does all the dirty work required for changing the file system's UUID.
2408 */
2409static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2410{
2411        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2412        struct btrfs_fs_devices *old_devices;
2413        struct btrfs_fs_devices *seed_devices;
2414        struct btrfs_super_block *disk_super = fs_info->super_copy;
2415        struct btrfs_device *device;
2416        u64 super_flags;
2417
2418        lockdep_assert_held(&uuid_mutex);
2419        if (!fs_devices->seeding)
2420                return -EINVAL;
2421
2422        seed_devices = alloc_fs_devices(NULL, NULL);
2423        if (IS_ERR(seed_devices))
2424                return PTR_ERR(seed_devices);
2425
2426        old_devices = clone_fs_devices(fs_devices);
2427        if (IS_ERR(old_devices)) {
2428                kfree(seed_devices);
2429                return PTR_ERR(old_devices);
2430        }
2431
2432        list_add(&old_devices->fs_list, &fs_uuids);
2433
2434        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2435        seed_devices->opened = 1;
2436        INIT_LIST_HEAD(&seed_devices->devices);
2437        INIT_LIST_HEAD(&seed_devices->alloc_list);
2438        mutex_init(&seed_devices->device_list_mutex);
2439
2440        mutex_lock(&fs_devices->device_list_mutex);
2441        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2442                              synchronize_rcu);
2443        list_for_each_entry(device, &seed_devices->devices, dev_list)
2444                device->fs_devices = seed_devices;
2445
2446        mutex_lock(&fs_info->chunk_mutex);
2447        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2448        mutex_unlock(&fs_info->chunk_mutex);
2449
2450        fs_devices->seeding = 0;
2451        fs_devices->num_devices = 0;
2452        fs_devices->open_devices = 0;
2453        fs_devices->missing_devices = 0;
2454        fs_devices->rotating = 0;
2455        fs_devices->seed = seed_devices;
2456
2457        generate_random_uuid(fs_devices->fsid);
2458        memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2459        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2460        mutex_unlock(&fs_devices->device_list_mutex);
2461
2462        super_flags = btrfs_super_flags(disk_super) &
2463                      ~BTRFS_SUPER_FLAG_SEEDING;
2464        btrfs_set_super_flags(disk_super, super_flags);
2465
2466        return 0;
2467}
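
/*
 * Illustrative sketch (not part of the original source) of the pointer
 * shuffle above. Before sprouting, fs_devices (seeding) owns all the
 * devices; afterwards it carries a freshly generated fsid and the old
 * contents live on as its read-only seed:
 *
 *	before:  fs_devices { fsid A, seeding, all devices }
 *	after:   fs_devices { fsid B, rw, no devices yet }
 *	             `-> seed = seed_devices { fsid A, all devices }
 *
 * The device being added by the caller then becomes the first writable
 * member of the sprout.
 */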
2468
2469/*
2470 * Store the expected generation for seed devices in device items.
2471 */
2472static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2473{
2474        struct btrfs_fs_info *fs_info = trans->fs_info;
2475        struct btrfs_root *root = fs_info->chunk_root;
2476        struct btrfs_path *path;
2477        struct extent_buffer *leaf;
2478        struct btrfs_dev_item *dev_item;
2479        struct btrfs_device *device;
2480        struct btrfs_key key;
2481        u8 fs_uuid[BTRFS_FSID_SIZE];
2482        u8 dev_uuid[BTRFS_UUID_SIZE];
2483        u64 devid;
2484        int ret;
2485
2486        path = btrfs_alloc_path();
2487        if (!path)
2488                return -ENOMEM;
2489
2490        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2491        key.offset = 0;
2492        key.type = BTRFS_DEV_ITEM_KEY;
2493
2494        while (1) {
2495                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2496                if (ret < 0)
2497                        goto error;
2498
2499                leaf = path->nodes[0];
2500next_slot:
2501                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2502                        ret = btrfs_next_leaf(root, path);
2503                        if (ret > 0)
2504                                break;
2505                        if (ret < 0)
2506                                goto error;
2507                        leaf = path->nodes[0];
2508                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2509                        btrfs_release_path(path);
2510                        continue;
2511                }
2512
2513                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2514                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2515                    key.type != BTRFS_DEV_ITEM_KEY)
2516                        break;
2517
2518                dev_item = btrfs_item_ptr(leaf, path->slots[0],
2519                                          struct btrfs_dev_item);
2520                devid = btrfs_device_id(leaf, dev_item);
2521                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2522                                   BTRFS_UUID_SIZE);
2523                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2524                                   BTRFS_FSID_SIZE);
2525                device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2526                                           fs_uuid, true);
2527                BUG_ON(!device); /* Logic error */
2528
2529                if (device->fs_devices->seeding) {
2530                        btrfs_set_device_generation(leaf, dev_item,
2531                                                    device->generation);
2532                        btrfs_mark_buffer_dirty(leaf);
2533                }
2534
2535                path->slots[0]++;
2536                goto next_slot;
2537        }
2538        ret = 0;
2539error:
2540        btrfs_free_path(path);
2541        return ret;
2542}
2543
2544int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2545{
2546        struct btrfs_root *root = fs_info->dev_root;
2547        struct request_queue *q;
2548        struct btrfs_trans_handle *trans;
2549        struct btrfs_device *device;
2550        struct block_device *bdev;
2551        struct super_block *sb = fs_info->sb;
2552        struct rcu_string *name;
2553        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2554        u64 orig_super_total_bytes;
2555        u64 orig_super_num_devices;
2556        int seeding_dev = 0;
2557        int ret = 0;
2558        bool unlocked = false;
2559
2560        if (sb_rdonly(sb) && !fs_devices->seeding)
2561                return -EROFS;
2562
2563        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2564                                  fs_info->bdev_holder);
2565        if (IS_ERR(bdev))
2566                return PTR_ERR(bdev);
2567
2568        if (fs_devices->seeding) {
2569                seeding_dev = 1;
2570                down_write(&sb->s_umount);
2571                mutex_lock(&uuid_mutex);
2572        }
2573
2574        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2575
2576        mutex_lock(&fs_devices->device_list_mutex);
2577        list_for_each_entry(device, &fs_devices->devices, dev_list) {
2578                if (device->bdev == bdev) {
2579                        ret = -EEXIST;
2580                        mutex_unlock(&fs_devices->device_list_mutex);
2582                        goto error;
2583                }
2584        }
2585        mutex_unlock(&fs_devices->device_list_mutex);
2586
2587        device = btrfs_alloc_device(fs_info, NULL, NULL);
2588        if (IS_ERR(device)) {
2589                /* we can safely leave the fs_devices entry around */
2590                ret = PTR_ERR(device);
2591                goto error;
2592        }
2593
2594        name = rcu_string_strdup(device_path, GFP_KERNEL);
2595        if (!name) {
2596                ret = -ENOMEM;
2597                goto error_free_device;
2598        }
2599        rcu_assign_pointer(device->name, name);
2600
2601        trans = btrfs_start_transaction(root, 0);
2602        if (IS_ERR(trans)) {
2603                ret = PTR_ERR(trans);
2604                goto error_free_device;
2605        }
2606
2607        q = bdev_get_queue(bdev);
2608        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2609        device->generation = trans->transid;
2610        device->io_width = fs_info->sectorsize;
2611        device->io_align = fs_info->sectorsize;
2612        device->sector_size = fs_info->sectorsize;
2613        device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2614                                         fs_info->sectorsize);
2615        device->disk_total_bytes = device->total_bytes;
2616        device->commit_total_bytes = device->total_bytes;
2617        device->fs_info = fs_info;
2618        device->bdev = bdev;
2619        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2620        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2621        device->mode = FMODE_EXCL;
2622        device->dev_stats_valid = 1;
2623        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2624
2625        if (seeding_dev) {
2626                sb->s_flags &= ~SB_RDONLY;
2627                ret = btrfs_prepare_sprout(fs_info);
2628                if (ret) {
2629                        btrfs_abort_transaction(trans, ret);
2630                        goto error_trans;
2631                }
2632        }
2633
2634        device->fs_devices = fs_devices;
2635
2636        mutex_lock(&fs_devices->device_list_mutex);
2637        mutex_lock(&fs_info->chunk_mutex);
2638        list_add_rcu(&device->dev_list, &fs_devices->devices);
2639        list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2640        fs_devices->num_devices++;
2641        fs_devices->open_devices++;
2642        fs_devices->rw_devices++;
2643        fs_devices->total_devices++;
2644        fs_devices->total_rw_bytes += device->total_bytes;
2645
2646        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2647
2648        if (!blk_queue_nonrot(q))
2649                fs_devices->rotating = 1;
2650
2651        orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2652        btrfs_set_super_total_bytes(fs_info->super_copy,
2653                round_down(orig_super_total_bytes + device->total_bytes,
2654                           fs_info->sectorsize));
2655
2656        orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2657        btrfs_set_super_num_devices(fs_info->super_copy,
2658                                    orig_super_num_devices + 1);
2659
2660        /* add sysfs device entry */
2661        btrfs_sysfs_add_device_link(fs_devices, device);
2662
2663        /*
2664         * we've got more storage, clear any full flags on the space
2665         * infos
2666         */
2667        btrfs_clear_space_info_full(fs_info);
2668
2669        mutex_unlock(&fs_info->chunk_mutex);
2670        mutex_unlock(&fs_devices->device_list_mutex);
2671
2672        if (seeding_dev) {
2673                mutex_lock(&fs_info->chunk_mutex);
2674                ret = init_first_rw_device(trans);
2675                mutex_unlock(&fs_info->chunk_mutex);
2676                if (ret) {
2677                        btrfs_abort_transaction(trans, ret);
2678                        goto error_sysfs;
2679                }
2680        }
2681
2682        ret = btrfs_add_dev_item(trans, device);
2683        if (ret) {
2684                btrfs_abort_transaction(trans, ret);
2685                goto error_sysfs;
2686        }
2687
2688        if (seeding_dev) {
2689                char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2690
2691                ret = btrfs_finish_sprout(trans);
2692                if (ret) {
2693                        btrfs_abort_transaction(trans, ret);
2694                        goto error_sysfs;
2695                }
2696
2697                /* Sprouting would change the fsid of the mounted root,
2698                 * so rename the fsid in sysfs.
2699                 */
2700                snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2701                                                fs_info->fs_devices->fsid);
2702                if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf))
2703                        btrfs_warn(fs_info,
2704                                   "sysfs: failed to create fsid for sprout");
2705        }
2706
2707        ret = btrfs_commit_transaction(trans);
2708
2709        if (seeding_dev) {
2710                mutex_unlock(&uuid_mutex);
2711                up_write(&sb->s_umount);
2712                unlocked = true;
2713
2714                if (ret) /* transaction commit */
2715                        return ret;
2716
2717                ret = btrfs_relocate_sys_chunks(fs_info);
2718                if (ret < 0)
2719                        btrfs_handle_fs_error(fs_info, ret,
2720                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2721                trans = btrfs_attach_transaction(root);
2722                if (IS_ERR(trans)) {
2723                        if (PTR_ERR(trans) == -ENOENT)
2724                                return 0;
2725                        ret = PTR_ERR(trans);
2726                        trans = NULL;
2727                        goto error_sysfs;
2728                }
2729                ret = btrfs_commit_transaction(trans);
2730        }
2731
2732        /* Update ctime/mtime for libblkid */
2733        update_dev_time(device_path);
2734        return ret;
2735
2736error_sysfs:
2737        btrfs_sysfs_rm_device_link(fs_devices, device);
2738        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2739        mutex_lock(&fs_info->chunk_mutex);
2740        list_del_rcu(&device->dev_list);
2741        list_del(&device->dev_alloc_list);
2742        fs_info->fs_devices->num_devices--;
2743        fs_info->fs_devices->open_devices--;
2744        fs_info->fs_devices->rw_devices--;
2745        fs_info->fs_devices->total_devices--;
2746        fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2747        atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2748        btrfs_set_super_total_bytes(fs_info->super_copy,
2749                                    orig_super_total_bytes);
2750        btrfs_set_super_num_devices(fs_info->super_copy,
2751                                    orig_super_num_devices);
2752        mutex_unlock(&fs_info->chunk_mutex);
2753        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2754error_trans:
2755        if (seeding_dev)
2756                sb->s_flags |= SB_RDONLY;
2757        if (trans)
2758                btrfs_end_transaction(trans);
2759error_free_device:
2760        btrfs_free_device(device);
2761error:
2762        blkdev_put(bdev, FMODE_EXCL);
2763        if (seeding_dev && !unlocked) {
2764                mutex_unlock(&uuid_mutex);
2765                up_write(&sb->s_umount);
2766        }
2767        return ret;
2768}
2769
2770static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2771                                        struct btrfs_device *device)
2772{
2773        int ret;
2774        struct btrfs_path *path;
2775        struct btrfs_root *root = device->fs_info->chunk_root;
2776        struct btrfs_dev_item *dev_item;
2777        struct extent_buffer *leaf;
2778        struct btrfs_key key;
2779
2780        path = btrfs_alloc_path();
2781        if (!path)
2782                return -ENOMEM;
2783
2784        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2785        key.type = BTRFS_DEV_ITEM_KEY;
2786        key.offset = device->devid;
2787
2788        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2789        if (ret < 0)
2790                goto out;
2791
2792        if (ret > 0) {
2793                ret = -ENOENT;
2794                goto out;
2795        }
2796
2797        leaf = path->nodes[0];
2798        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2799
2800        btrfs_set_device_id(leaf, dev_item, device->devid);
2801        btrfs_set_device_type(leaf, dev_item, device->type);
2802        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2803        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2804        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2805        btrfs_set_device_total_bytes(leaf, dev_item,
2806                                     btrfs_device_get_disk_total_bytes(device));
2807        btrfs_set_device_bytes_used(leaf, dev_item,
2808                                    btrfs_device_get_bytes_used(device));
2809        btrfs_mark_buffer_dirty(leaf);
2810
2811out:
2812        btrfs_free_path(path);
2813        return ret;
2814}
2815
2816int btrfs_grow_device(struct btrfs_trans_handle *trans,
2817                      struct btrfs_device *device, u64 new_size)
2818{
2819        struct btrfs_fs_info *fs_info = device->fs_info;
2820        struct btrfs_super_block *super_copy = fs_info->super_copy;
2821        u64 old_total;
2822        u64 diff;
2823
2824        if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2825                return -EACCES;
2826
2827        new_size = round_down(new_size, fs_info->sectorsize);
2828
2829        mutex_lock(&fs_info->chunk_mutex);
2830        old_total = btrfs_super_total_bytes(super_copy);
2831        diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2832
2833        if (new_size <= device->total_bytes ||
2834            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2835                mutex_unlock(&fs_info->chunk_mutex);
2836                return -EINVAL;
2837        }
2838
2839        btrfs_set_super_total_bytes(super_copy,
2840                        round_down(old_total + diff, fs_info->sectorsize));
2841        device->fs_devices->total_rw_bytes += diff;
2842
2843        btrfs_device_set_total_bytes(device, new_size);
2844        btrfs_device_set_disk_total_bytes(device, new_size);
2845        btrfs_clear_space_info_full(device->fs_info);
2846        if (list_empty(&device->post_commit_list))
2847                list_add_tail(&device->post_commit_list,
2848                              &trans->transaction->dev_update_list);
2849        mutex_unlock(&fs_info->chunk_mutex);
2850
2851        return btrfs_update_device(trans, device);
2852}
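
/*
 * Worked example of the rounding above: with a 4KiB sectorsize, growing a
 * device of total_bytes 1073741824 (1GiB) to a requested new_size of
 * 2147485000 first rounds new_size down to 2147483648 (2GiB), giving
 * diff = 1073741824, which is what gets added to the super block total
 * and to fs_devices->total_rw_bytes.
 */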
2853
2854static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2855{
2856        struct btrfs_fs_info *fs_info = trans->fs_info;
2857        struct btrfs_root *root = fs_info->chunk_root;
2858        int ret;
2859        struct btrfs_path *path;
2860        struct btrfs_key key;
2861
2862        path = btrfs_alloc_path();
2863        if (!path)
2864                return -ENOMEM;
2865
2866        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2867        key.offset = chunk_offset;
2868        key.type = BTRFS_CHUNK_ITEM_KEY;
2869
2870        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2871        if (ret < 0)
2872                goto out;
2873        else if (ret > 0) { /* Logic error or corruption */
2874                btrfs_handle_fs_error(fs_info, -ENOENT,
2875                                      "Failed lookup while freeing chunk.");
2876                ret = -ENOENT;
2877                goto out;
2878        }
2879
2880        ret = btrfs_del_item(trans, root, path);
2881        if (ret < 0)
2882                btrfs_handle_fs_error(fs_info, ret,
2883                                      "Failed to delete chunk item.");
2884out:
2885        btrfs_free_path(path);
2886        return ret;
2887}
2888
2889static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2890{
2891        struct btrfs_super_block *super_copy = fs_info->super_copy;
2892        struct btrfs_disk_key *disk_key;
2893        struct btrfs_chunk *chunk;
2894        u8 *ptr;
2895        int ret = 0;
2896        u32 num_stripes;
2897        u32 array_size;
2898        u32 len = 0;
2899        u32 cur;
2900        struct btrfs_key key;
2901
2902        mutex_lock(&fs_info->chunk_mutex);
2903        array_size = btrfs_super_sys_array_size(super_copy);
2904
2905        ptr = super_copy->sys_chunk_array;
2906        cur = 0;
2907
2908        while (cur < array_size) {
2909                disk_key = (struct btrfs_disk_key *)ptr;
2910                btrfs_disk_key_to_cpu(&key, disk_key);
2911
2912                len = sizeof(*disk_key);
2913
2914                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2915                        chunk = (struct btrfs_chunk *)(ptr + len);
2916                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2917                        len += btrfs_chunk_item_size(num_stripes);
2918                } else {
2919                        ret = -EIO;
2920                        break;
2921                }
2922                if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2923                    key.offset == chunk_offset) {
2924                        memmove(ptr, ptr + len, array_size - (cur + len));
2925                        array_size -= len;
2926                        btrfs_set_super_sys_array_size(super_copy, array_size);
2927                } else {
2928                        ptr += len;
2929                        cur += len;
2930                }
2931        }
2932        mutex_unlock(&fs_info->chunk_mutex);
2933        return ret;
2934}
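
/*
 * Layout of the superblock sys_chunk_array walked above: back-to-back
 * (key, chunk) pairs with no padding, where each chunk item's size
 * depends on its stripe count:
 *
 *   [btrfs_disk_key][btrfs_chunk + N stripes][btrfs_disk_key][...]
 *
 * Removing an entry memmove()s the tail of the array down over it and
 * shrinks the size recorded in the super block.
 */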
2935
2936/*
2937 * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
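 * @fs_info: The filesystem.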
2938 * @logical: Logical block offset in bytes.
2939 * @length: Length of extent in bytes.
2940 *
2941 * Return: Chunk mapping or ERR_PTR.
2942 */
2943struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2944                                       u64 logical, u64 length)
2945{
2946        struct extent_map_tree *em_tree;
2947        struct extent_map *em;
2948
2949        em_tree = &fs_info->mapping_tree;
2950        read_lock(&em_tree->lock);
2951        em = lookup_extent_mapping(em_tree, logical, length);
2952        read_unlock(&em_tree->lock);
2953
2954        if (!em) {
2955                btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2956                           logical, length);
2957                return ERR_PTR(-EINVAL);
2958        }
2959
2960        if (em->start > logical || em->start + em->len < logical) {
2961                btrfs_crit(fs_info,
2962                           "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2963                           logical, logical + length, em->start, em->start + em->len);
2964                free_extent_map(em);
2965                return ERR_PTR(-EINVAL);
2966        }
2967
2968        /* callers are responsible for dropping em's ref. */
2969        return em;
2970}
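
/*
 * Minimal usage sketch for the helper above (see btrfs_remove_chunk()
 * below for a real caller); the reference must be dropped when done:
 *
 *      em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 *      if (IS_ERR(em))
 *              return PTR_ERR(em);
 *      map = em->map_lookup;
 *      // ... use map->type, map->num_stripes, map->stripes[i] ...
 *      free_extent_map(em);
 */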
2971
2972int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2973{
2974        struct btrfs_fs_info *fs_info = trans->fs_info;
2975        struct extent_map *em;
2976        struct map_lookup *map;
2977        u64 dev_extent_len = 0;
2978        int i, ret = 0;
2979        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2980
2981        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
2982        if (IS_ERR(em)) {
2983                /*
2984                 * This is a logic error, but we don't want to just rely on the
2985                 * user having built with ASSERT enabled, so if ASSERT doesn't
2986                 * do anything we still error out.
2987                 */
2988                ASSERT(0);
2989                return PTR_ERR(em);
2990        }
2991        map = em->map_lookup;
2992        mutex_lock(&fs_info->chunk_mutex);
2993        check_system_chunk(trans, map->type);
2994        mutex_unlock(&fs_info->chunk_mutex);
2995
2996        /*
2997         * Take the device list mutex to prevent races with the final phase of
2998         * a device replace operation that replaces the device object associated
2999         * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
3000         */
3001        mutex_lock(&fs_devices->device_list_mutex);
3002        for (i = 0; i < map->num_stripes; i++) {
3003                struct btrfs_device *device = map->stripes[i].dev;
3004                ret = btrfs_free_dev_extent(trans, device,
3005                                            map->stripes[i].physical,
3006                                            &dev_extent_len);
3007                if (ret) {
3008                        mutex_unlock(&fs_devices->device_list_mutex);
3009                        btrfs_abort_transaction(trans, ret);
3010                        goto out;
3011                }
3012
3013                if (device->bytes_used > 0) {
3014                        mutex_lock(&fs_info->chunk_mutex);
3015                        btrfs_device_set_bytes_used(device,
3016                                        device->bytes_used - dev_extent_len);
3017                        atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3018                        btrfs_clear_space_info_full(fs_info);
3019                        mutex_unlock(&fs_info->chunk_mutex);
3020                }
3021
3022                ret = btrfs_update_device(trans, device);
3023                if (ret) {
3024                        mutex_unlock(&fs_devices->device_list_mutex);
3025                        btrfs_abort_transaction(trans, ret);
3026                        goto out;
3027                }
3028        }
3029        mutex_unlock(&fs_devices->device_list_mutex);
3030
3031        ret = btrfs_free_chunk(trans, chunk_offset);
3032        if (ret) {
3033                btrfs_abort_transaction(trans, ret);
3034                goto out;
3035        }
3036
3037        trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3038
3039        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3040                ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3041                if (ret) {
3042                        btrfs_abort_transaction(trans, ret);
3043                        goto out;
3044                }
3045        }
3046
3047        ret = btrfs_remove_block_group(trans, chunk_offset, em);
3048        if (ret) {
3049                btrfs_abort_transaction(trans, ret);
3050                goto out;
3051        }
3052
3053out:
3054        /* once for us */
3055        free_extent_map(em);
3056        return ret;
3057}
3058
3059static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3060{
3061        struct btrfs_root *root = fs_info->chunk_root;
3062        struct btrfs_trans_handle *trans;
3063        int ret;
3064
3065        /*
3066         * Prevent races with automatic removal of unused block groups.
3067         * After we relocate and before we remove the chunk with offset
3068         * chunk_offset, automatic removal of the block group can kick in,
3069         * resulting in a failure when calling btrfs_remove_chunk() below.
3070         *
3071         * Make sure to acquire this mutex before doing a tree search (dev
3072         * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3073         * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3074         * we release the path used to search the chunk/dev tree and before
3075         * the current task acquires this mutex and calls us.
3076         */
3077        lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3078
3079        ret = btrfs_can_relocate(fs_info, chunk_offset);
3080        if (ret)
3081                return -ENOSPC;
3082
3083        /* step one, relocate all the extents inside this chunk */
3084        btrfs_scrub_pause(fs_info);
3085        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3086        btrfs_scrub_continue(fs_info);
3087        if (ret)
3088                return ret;
3089
3090        trans = btrfs_start_trans_remove_block_group(root->fs_info,
3091                                                     chunk_offset);
3092        if (IS_ERR(trans)) {
3093                ret = PTR_ERR(trans);
3094                btrfs_handle_fs_error(root->fs_info, ret, NULL);
3095                return ret;
3096        }
3097
3098        /*
3099         * step two, delete the device extents and the
3100         * chunk tree entries
3101         */
3102        ret = btrfs_remove_chunk(trans, chunk_offset);
3103        btrfs_end_transaction(trans);
3104        return ret;
3105}
3106
3107static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3108{
3109        struct btrfs_root *chunk_root = fs_info->chunk_root;
3110        struct btrfs_path *path;
3111        struct extent_buffer *leaf;
3112        struct btrfs_chunk *chunk;
3113        struct btrfs_key key;
3114        struct btrfs_key found_key;
3115        u64 chunk_type;
3116        bool retried = false;
3117        int failed = 0;
3118        int ret;
3119
3120        path = btrfs_alloc_path();
3121        if (!path)
3122                return -ENOMEM;
3123
3124again:
3125        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3126        key.offset = (u64)-1;
3127        key.type = BTRFS_CHUNK_ITEM_KEY;
3128
3129        while (1) {
3130                mutex_lock(&fs_info->delete_unused_bgs_mutex);
3131                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3132                if (ret < 0) {
3133                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3134                        goto error;
3135                }
3136                BUG_ON(ret == 0); /* Corruption */
3137
3138                ret = btrfs_previous_item(chunk_root, path, key.objectid,
3139                                          key.type);
3140                if (ret)
3141                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3142                if (ret < 0)
3143                        goto error;
3144                if (ret > 0)
3145                        break;
3146
3147                leaf = path->nodes[0];
3148                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3149
3150                chunk = btrfs_item_ptr(leaf, path->slots[0],
3151                                       struct btrfs_chunk);
3152                chunk_type = btrfs_chunk_type(leaf, chunk);
3153                btrfs_release_path(path);
3154
3155                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3156                        ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3157                        if (ret == -ENOSPC)
3158                                failed++;
3159                        else
3160                                BUG_ON(ret);
3161                }
3162                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3163
3164                if (found_key.offset == 0)
3165                        break;
3166                key.offset = found_key.offset - 1;
3167        }
3168        ret = 0;
3169        if (failed && !retried) {
3170                failed = 0;
3171                retried = true;
3172                goto again;
3173        } else if (WARN_ON(failed && retried)) {
3174                ret = -ENOSPC;
3175        }
3176error:
3177        btrfs_free_path(path);
3178        return ret;
3179}
3180
3181/*
3182 * return 1 : allocated a data chunk successfully,
3183 * return <0: error while allocating a data chunk,
3184 * return 0 : no need to allocate a data chunk.
3185 */
3186static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3187                                      u64 chunk_offset)
3188{
3189        struct btrfs_block_group_cache *cache;
3190        u64 bytes_used;
3191        u64 chunk_type;
3192
3193        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3194        ASSERT(cache);
3195        chunk_type = cache->flags;
3196        btrfs_put_block_group(cache);
3197
3198        if (chunk_type & BTRFS_BLOCK_GROUP_DATA) {
3199                spin_lock(&fs_info->data_sinfo->lock);
3200                bytes_used = fs_info->data_sinfo->bytes_used;
3201                spin_unlock(&fs_info->data_sinfo->lock);
3202
3203                if (!bytes_used) {
3204                        struct btrfs_trans_handle *trans;
3205                        int ret;
3206
3207                        trans = btrfs_join_transaction(fs_info->tree_root);
3208                        if (IS_ERR(trans))
3209                                return PTR_ERR(trans);
3210
3211                        ret = btrfs_force_chunk_alloc(trans,
3212                                                      BTRFS_BLOCK_GROUP_DATA);
3213                        btrfs_end_transaction(trans);
3214                        if (ret < 0)
3215                                return ret;
3216                        return 1;
3217                }
3218        }
3219        return 0;
3220}
3221
3222static int insert_balance_item(struct btrfs_fs_info *fs_info,
3223                               struct btrfs_balance_control *bctl)
3224{
3225        struct btrfs_root *root = fs_info->tree_root;
3226        struct btrfs_trans_handle *trans;
3227        struct btrfs_balance_item *item;
3228        struct btrfs_disk_balance_args disk_bargs;
3229        struct btrfs_path *path;
3230        struct extent_buffer *leaf;
3231        struct btrfs_key key;
3232        int ret, err;
3233
3234        path = btrfs_alloc_path();
3235        if (!path)
3236                return -ENOMEM;
3237
3238        trans = btrfs_start_transaction(root, 0);
3239        if (IS_ERR(trans)) {
3240                btrfs_free_path(path);
3241                return PTR_ERR(trans);
3242        }
3243
3244        key.objectid = BTRFS_BALANCE_OBJECTID;
3245        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3246        key.offset = 0;
3247
3248        ret = btrfs_insert_empty_item(trans, root, path, &key,
3249                                      sizeof(*item));
3250        if (ret)
3251                goto out;
3252
3253        leaf = path->nodes[0];
3254        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3255
3256        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3257
3258        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3259        btrfs_set_balance_data(leaf, item, &disk_bargs);
3260        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3261        btrfs_set_balance_meta(leaf, item, &disk_bargs);
3262        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3263        btrfs_set_balance_sys(leaf, item, &disk_bargs);
3264
3265        btrfs_set_balance_flags(leaf, item, bctl->flags);
3266
3267        btrfs_mark_buffer_dirty(leaf);
3268out:
3269        btrfs_free_path(path);
3270        err = btrfs_commit_transaction(trans);
3271        if (err && !ret)
3272                ret = err;
3273        return ret;
3274}
3275
3276static int del_balance_item(struct btrfs_fs_info *fs_info)
3277{
3278        struct btrfs_root *root = fs_info->tree_root;
3279        struct btrfs_trans_handle *trans;
3280        struct btrfs_path *path;
3281        struct btrfs_key key;
3282        int ret, err;
3283
3284        path = btrfs_alloc_path();
3285        if (!path)
3286                return -ENOMEM;
3287
3288        trans = btrfs_start_transaction(root, 0);
3289        if (IS_ERR(trans)) {
3290                btrfs_free_path(path);
3291                return PTR_ERR(trans);
3292        }
3293
3294        key.objectid = BTRFS_BALANCE_OBJECTID;
3295        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3296        key.offset = 0;
3297
3298        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3299        if (ret < 0)
3300                goto out;
3301        if (ret > 0) {
3302                ret = -ENOENT;
3303                goto out;
3304        }
3305
3306        ret = btrfs_del_item(trans, root, path);
3307out:
3308        btrfs_free_path(path);
3309        err = btrfs_commit_transaction(trans);
3310        if (err && !ret)
3311                ret = err;
3312        return ret;
3313}
3314
3315/*
3316 * This is a heuristic used to reduce the number of chunks balanced on
3317 * resume after balance was interrupted.
3318 */
3319static void update_balance_args(struct btrfs_balance_control *bctl)
3320{
3321        /*
3322         * Turn on soft mode for chunk types that were being converted.
3323         */
3324        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3325                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3326        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3327                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3328        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3329                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3330
3331        /*
3332         * Turn on the usage filter if it is not already in use.  The idea is
3333         * that chunks that we have already balanced should be
3334         * reasonably full.  Don't do it for chunks that are being
3335         * converted - that will keep us from relocating unconverted
3336         * (albeit full) chunks.
3337         */
3338        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3339            !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3340            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3341                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3342                bctl->data.usage = 90;
3343        }
3344        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3345            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3346            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3347                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3348                bctl->sys.usage = 90;
3349        }
3350        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3351            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3352            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3353                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3354                bctl->meta.usage = 90;
3355        }
3356}
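
/*
 * E.g. an interrupted plain data balance resumes as if "usage=90" had
 * been given (chunks we already balanced should be close to full), while
 * an interrupted "-dconvert=raid1" resumes with "soft" set so chunks
 * already in the target profile are skipped.
 */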
3357
3358/*
3359 * Clear the balance status in fs_info and delete the balance item from disk.
3360 */
3361static void reset_balance_state(struct btrfs_fs_info *fs_info)
3362{
3363        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3364        int ret;
3365
3366        BUG_ON(!fs_info->balance_ctl);
3367
3368        spin_lock(&fs_info->balance_lock);
3369        fs_info->balance_ctl = NULL;
3370        spin_unlock(&fs_info->balance_lock);
3371
3372        kfree(bctl);
3373        ret = del_balance_item(fs_info);
3374        if (ret)
3375                btrfs_handle_fs_error(fs_info, ret, NULL);
3376}
3377
3378/*
3379 * Balance filters.  Return 1 if chunk should be filtered out
3380 * (should not be balanced).
3381 */
3382static int chunk_profiles_filter(u64 chunk_type,
3383                                 struct btrfs_balance_args *bargs)
3384{
3385        chunk_type = chunk_to_extended(chunk_type) &
3386                                BTRFS_EXTENDED_PROFILE_MASK;
3387
3388        if (bargs->profiles & chunk_type)
3389                return 0;
3390
3391        return 1;
3392}
3393
3394static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3395                              struct btrfs_balance_args *bargs)
3396{
3397        struct btrfs_block_group_cache *cache;
3398        u64 chunk_used;
3399        u64 user_thresh_min;
3400        u64 user_thresh_max;
3401        int ret = 1;
3402
3403        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3404        chunk_used = btrfs_block_group_used(&cache->item);
3405
3406        if (bargs->usage_min == 0)
3407                user_thresh_min = 0;
3408        else
3409                user_thresh_min = div_factor_fine(cache->key.offset,
3410                                        bargs->usage_min);
3411
3412        if (bargs->usage_max == 0)
3413                user_thresh_max = 1;
3414        else if (bargs->usage_max > 100)
3415                user_thresh_max = cache->key.offset;
3416        else
3417                user_thresh_max = div_factor_fine(cache->key.offset,
3418                                        bargs->usage_max);
3419
3420        if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3421                ret = 0;
3422
3423        btrfs_put_block_group(cache);
3424        return ret;
3425}
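
/*
 * Worked example, assuming div_factor_fine(num, f) computes num * f / 100
 * (see math.h): for a 1GiB chunk with usage_min=10 and usage_max=50 the
 * accepted window is [107374182, 536870912) bytes used; chunks outside it
 * are filtered out.
 */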
3426
3427static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3428                u64 chunk_offset, struct btrfs_balance_args *bargs)
3429{
3430        struct btrfs_block_group_cache *cache;
3431        u64 chunk_used, user_thresh;
3432        int ret = 1;
3433
3434        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3435        chunk_used = btrfs_block_group_used(&cache->item);
3436
3437        if (bargs->usage_min == 0)
3438                user_thresh = 1;
3439        else if (bargs->usage > 100)
3440                user_thresh = cache->key.offset;
3441        else
3442                user_thresh = div_factor_fine(cache->key.offset,
3443                                              bargs->usage);
3444
3445        if (chunk_used < user_thresh)
3446                ret = 0;
3447
3448        btrfs_put_block_group(cache);
3449        return ret;
3450}
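
/*
 * E.g. "usage=90" on a 1GiB chunk balances it only while fewer than
 * 966367641 bytes (90%) are used, and "usage=0" leaves the 1 byte
 * threshold so only completely empty chunks match.  Note that ->usage
 * and ->usage_min/usage_max overlay each other in a union in
 * btrfs_balance_args, which is why the usage_min check above also covers
 * the single-value form.
 */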
3451
3452static int chunk_devid_filter(struct extent_buffer *leaf,
3453                              struct btrfs_chunk *chunk,
3454                              struct btrfs_balance_args *bargs)
3455{
3456        struct btrfs_stripe *stripe;
3457        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3458        int i;
3459
3460        for (i = 0; i < num_stripes; i++) {
3461                stripe = btrfs_stripe_nr(chunk, i);
3462                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3463                        return 0;
3464        }
3465
3466        return 1;
3467}
3468
3469static u64 calc_data_stripes(u64 type, int num_stripes)
3470{
3471        const int index = btrfs_bg_flags_to_raid_index(type);
3472        const int ncopies = btrfs_raid_array[index].ncopies;
3473        const int nparity = btrfs_raid_array[index].nparity;
3474
3475        if (nparity)
3476                return num_stripes - nparity;
3477        else
3478                return num_stripes / ncopies;
3479}
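
/*
 * E.g. a 4-stripe RAID10 chunk has 4 / ncopies(2) = 2 data stripes and a
 * 6-stripe RAID6 chunk has 6 - nparity(2) = 4.
 */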
3480
3481/* [pstart, pend) */
3482static int chunk_drange_filter(struct extent_buffer *leaf,
3483                               struct btrfs_chunk *chunk,
3484                               struct btrfs_balance_args *bargs)
3485{
3486        struct btrfs_stripe *stripe;
3487        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3488        u64 stripe_offset;
3489        u64 stripe_length;
3490        u64 type;
3491        int factor;
3492        int i;
3493
3494        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3495                return 0;
3496
3497        type = btrfs_chunk_type(leaf, chunk);
3498        factor = calc_data_stripes(type, num_stripes);
3499
3500        for (i = 0; i < num_stripes; i++) {
3501                stripe = btrfs_stripe_nr(chunk, i);
3502                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3503                        continue;
3504
3505                stripe_offset = btrfs_stripe_offset(leaf, stripe);
3506                stripe_length = btrfs_chunk_length(leaf, chunk);
3507                stripe_length = div_u64(stripe_length, factor);
3508
3509                if (stripe_offset < bargs->pend &&
3510                    stripe_offset + stripe_length > bargs->pstart)
3511                        return 0;
3512        }
3513
3514        return 1;
3515}
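
/*
 * E.g. for a 2GiB RAID1 chunk (2 stripes, 1 data stripe) each stripe
 * occupies 2GiB of physical space on its device, so the stripe at
 * physical offset X overlaps [pstart, pend) iff X < pend and
 * X + 2GiB > pstart.
 */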
3516
3517/* [vstart, vend) */
3518static int chunk_vrange_filter(struct extent_buffer *leaf,
3519                               struct btrfs_chunk *chunk,
3520                               u64 chunk_offset,
3521                               struct btrfs_balance_args *bargs)
3522{
3523        if (chunk_offset < bargs->vend &&
3524            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3525                /* at least part of the chunk is inside this vrange */
3526                return 0;
3527
3528        return 1;
3529}
3530
3531static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3532                               struct btrfs_chunk *chunk,
3533                               struct btrfs_balance_args *bargs)
3534{
3535        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3536
3537        if (bargs->stripes_min <= num_stripes
3538                        && num_stripes <= bargs->stripes_max)
3539                return 0;
3540
3541        return 1;
3542}
3543
3544static int chunk_soft_convert_filter(u64 chunk_type,
3545                                     struct btrfs_balance_args *bargs)
3546{
3547        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3548                return 0;
3549
3550        chunk_type = chunk_to_extended(chunk_type) &
3551                                BTRFS_EXTENDED_PROFILE_MASK;
3552
3553        if (bargs->target == chunk_type)
3554                return 1;
3555
3556        return 0;
3557}
3558
3559static int should_balance_chunk(struct extent_buffer *leaf,
3560                                struct btrfs_chunk *chunk, u64 chunk_offset)
3561{
3562        struct btrfs_fs_info *fs_info = leaf->fs_info;
3563        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3564        struct btrfs_balance_args *bargs = NULL;
3565        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3566
3567        /* type filter */
3568        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3569              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3570                return 0;
3571        }
3572
3573        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3574                bargs = &bctl->data;
3575        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3576                bargs = &bctl->sys;
3577        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3578                bargs = &bctl->meta;
3579
3580        /* profiles filter */
3581        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3582            chunk_profiles_filter(chunk_type, bargs)) {
3583                return 0;
3584        }
3585
3586        /* usage filter */
3587        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3588            chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3589                return 0;
3590        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3591            chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3592                return 0;
3593        }
3594
3595        /* devid filter */
3596        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3597            chunk_devid_filter(leaf, chunk, bargs)) {
3598                return 0;
3599        }
3600
3601        /* drange filter, makes sense only with devid filter */
3602        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3603            chunk_drange_filter(leaf, chunk, bargs)) {
3604                return 0;
3605        }
3606
3607        /* vrange filter */
3608        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3609            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3610                return 0;
3611        }
3612
3613        /* stripes filter */
3614        if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3615            chunk_stripes_range_filter(leaf, chunk, bargs)) {
3616                return 0;
3617        }
3618
3619        /* soft profile changing mode */
3620        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3621            chunk_soft_convert_filter(chunk_type, bargs)) {
3622                return 0;
3623        }
3624
3625        /*
3626         * limited by count, must be the last filter
3627         */
3628        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3629                if (bargs->limit == 0)
3630                        return 0;
3631                else
3632                        bargs->limit--;
3633        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3634                /*
3635                 * Same logic as the 'limit' filter; the minimum cannot be
3636                 * determined here because we do not have the global information
3637                 * about the count of all chunks that satisfy the filters.
3638                 */
3639                if (bargs->limit_max == 0)
3640                        return 0;
3641                else
3642                        bargs->limit_max--;
3643        }
3644
3645        return 1;
3646}
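
/*
 * Note that the limit filters above mutate bargs as a side effect, which
 * is why __btrfs_balance() below runs a counting pass first and restores
 * the saved limits before the pass that actually relocates chunks.
 */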
3647
3648static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3649{
3650        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3651        struct btrfs_root *chunk_root = fs_info->chunk_root;
3652        u64 chunk_type;
3653        struct btrfs_chunk *chunk;
3654        struct btrfs_path *path = NULL;
3655        struct btrfs_key key;
3656        struct btrfs_key found_key;
3657        struct extent_buffer *leaf;
3658        int slot;
3659        int ret;
3660        int enospc_errors = 0;
3661        bool counting = true;
3662        /* Save the limits; the single value and min/max forms share bytes in a union */
3663        u64 limit_data = bctl->data.limit;
3664        u64 limit_meta = bctl->meta.limit;
3665        u64 limit_sys = bctl->sys.limit;
3666        u32 count_data = 0;
3667        u32 count_meta = 0;
3668        u32 count_sys = 0;
3669        int chunk_reserved = 0;
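        /*
         * The limits saved above overlay each other in btrfs_balance_args
         * (include/uapi/linux/btrfs.h):
         *
         *      union {
         *              __u64 limit;
         *              struct {
         *                      __u32 limit_min;
         *                      __u32 limit_max;
         *              };
         *      };
         *
         * so saving and restoring ->limit preserves the range form too.
         */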
3670
3671        path = btrfs_alloc_path();
3672        if (!path) {
3673                ret = -ENOMEM;
3674                goto error;
3675        }
3676
3677        /* zero out stat counters */
3678        spin_lock(&fs_info->balance_lock);
3679        memset(&bctl->stat, 0, sizeof(bctl->stat));
3680        spin_unlock(&fs_info->balance_lock);
3681again:
3682        if (!counting) {
3683                /*
3684                 * The single value limit and min/max limits share the same
3685                 * bytes in the union; restore what the counting pass used up
3686                 */
3687                bctl->data.limit = limit_data;
3688                bctl->meta.limit = limit_meta;
3689                bctl->sys.limit = limit_sys;
3690        }
3691        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3692        key.offset = (u64)-1;
3693        key.type = BTRFS_CHUNK_ITEM_KEY;
3694
3695        while (1) {
3696                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3697                    atomic_read(&fs_info->balance_cancel_req)) {
3698                        ret = -ECANCELED;
3699                        goto error;
3700                }
3701
3702                mutex_lock(&fs_info->delete_unused_bgs_mutex);
3703                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3704                if (ret < 0) {
3705                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3706                        goto error;
3707                }
3708
3709                /*
3710                 * this shouldn't happen, it means the last relocate
3711                 * failed
3712                 */
3713                if (ret == 0)
3714                        BUG(); /* FIXME break ? */
3715
3716                ret = btrfs_previous_item(chunk_root, path, 0,
3717                                          BTRFS_CHUNK_ITEM_KEY);
3718                if (ret) {
3719                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3720                        ret = 0;
3721                        break;
3722                }
3723
3724                leaf = path->nodes[0];
3725                slot = path->slots[0];
3726                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3727
3728                if (found_key.objectid != key.objectid) {
3729                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3730                        break;
3731                }
3732
3733                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3734                chunk_type = btrfs_chunk_type(leaf, chunk);
3735
3736                if (!counting) {
3737                        spin_lock(&fs_info->balance_lock);
3738                        bctl->stat.considered++;
3739                        spin_unlock(&fs_info->balance_lock);
3740                }
3741
3742                ret = should_balance_chunk(leaf, chunk, found_key.offset);
3743
3744                btrfs_release_path(path);
3745                if (!ret) {
3746                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3747                        goto loop;
3748                }
3749
3750                if (counting) {
3751                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3752                        spin_lock(&fs_info->balance_lock);
3753                        bctl->stat.expected++;
3754                        spin_unlock(&fs_info->balance_lock);
3755
3756                        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3757                                count_data++;
3758                        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3759                                count_sys++;
3760                        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3761                                count_meta++;
3762
3763                        goto loop;
3764                }
3765
3766                /*
3767                 * Apply limit_min filter, no need to check if the LIMITS
3768                 * filter is used, limit_min is 0 by default
3769                 */
3770                if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3771                                        count_data < bctl->data.limit_min)
3772                                || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3773                                        count_meta < bctl->meta.limit_min)
3774                                || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3775                                        count_sys < bctl->sys.limit_min)) {
3776                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3777                        goto loop;
3778                }
3779
3780                if (!chunk_reserved) {
3781                        /*
3782                         * We may be relocating the only data chunk we have,
3783                         * which could potentially end up losing the data
3784                         * raid profile, so let's allocate an empty one in
3785                         * advance.
3786                         */
3787                        ret = btrfs_may_alloc_data_chunk(fs_info,
3788                                                         found_key.offset);
3789                        if (ret < 0) {
3790                                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3791                                goto error;
3792                        } else if (ret == 1) {
3793                                chunk_reserved = 1;
3794                        }
3795                }
3796
3797                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3798                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3799                if (ret == -ENOSPC) {
3800                        enospc_errors++;
3801                } else if (ret == -ETXTBSY) {
3802                        btrfs_info(fs_info,
3803           "skipping relocation of block group %llu due to active swapfile",
3804                                   found_key.offset);
3805                        ret = 0;
3806                } else if (ret) {
3807                        goto error;
3808                } else {
3809                        spin_lock(&fs_info->balance_lock);
3810                        bctl->stat.completed++;
3811                        spin_unlock(&fs_info->balance_lock);
3812                }
3813loop:
3814                if (found_key.offset == 0)
3815                        break;
3816                key.offset = found_key.offset - 1;
3817        }
3818
3819        if (counting) {
3820                btrfs_release_path(path);
3821                counting = false;
3822                goto again;
3823        }
3824error:
3825        btrfs_free_path(path);
3826        if (enospc_errors) {
3827                btrfs_info(fs_info, "%d enospc errors during balance",
3828                           enospc_errors);
3829                if (!ret)
3830                        ret = -ENOSPC;
3831        }
3832
3833        return ret;
3834}
3835
3836/**
3837 * alloc_profile_is_valid - see if a given profile is valid and reduced
3838 * @flags: profile to validate
3839 * @extended: if true @flags is treated as an extended profile
3840 */
3841static int alloc_profile_is_valid(u64 flags, int extended)
3842{
3843        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3844                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
3845
3846        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3847
3848        /* 1) check that all other bits are zeroed */
3849        if (flags & ~mask)
3850                return 0;
3851
3852        /* 2) see if profile is reduced */
3853        if (flags == 0)
3854                return !extended; /* "0" is valid for usual profiles */
3855
3856        /* true if exactly one bit set */
3857        return is_power_of_2(flags);
3858}
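
/*
 * E.g. BTRFS_BLOCK_GROUP_RAID1 alone is valid and reduced (exactly one
 * profile bit), RAID1 | RAID10 is not reduced, and 0 (implicit "single")
 * is valid only in the non-extended representation.
 */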
3859
3860static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3861{
3862        /* cancel requested || normal exit path */
3863        return atomic_read(&fs_info->balance_cancel_req) ||
3864                (atomic_read(&fs_info->balance_pause_req) == 0 &&
3865                 atomic_read(&fs_info->balance_cancel_req) == 0);
3866}
3867
3868/* Non-zero return value signifies invalidity */
3869static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3870                u64 allowed)
3871{
3872        return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3873                (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3874                 (bctl_arg->target & ~allowed)));
3875}
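
/*
 * E.g. with @allowed built from btrfs_raid_array as in btrfs_balance()
 * below, a convert target of raid10 on a two-device filesystem is
 * rejected: raid10 has devs_min == 4, so its bg_flag never enters
 * @allowed and the target & ~allowed test fires.
 */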
3876
3877/*
3878 * Fill @buf with textual description of balance filter flags @bargs, up to
3879 * @size_buf including the terminating null. The output may be trimmed if it
3880 * does not fit into the provided buffer.
3881 */
3882static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3883                                 u32 size_buf)
3884{
3885        int ret;
3886        u32 size_bp = size_buf;
3887        char *bp = buf;
3888        u64 flags = bargs->flags;
3889        char tmp_buf[128] = {'\0'};
3890
3891        if (!flags)
3892                return;
3893
3894#define CHECK_APPEND_NOARG(a)                                           \
3895        do {                                                            \
3896                ret = snprintf(bp, size_bp, (a));                       \
3897                if (ret < 0 || ret >= size_bp)                          \
3898                        goto out_overflow;                              \
3899                size_bp -= ret;                                         \
3900                bp += ret;                                              \
3901        } while (0)
3902
3903#define CHECK_APPEND_1ARG(a, v1)                                        \
3904        do {                                                            \
3905                ret = snprintf(bp, size_bp, (a), (v1));                 \
3906                if (ret < 0 || ret >= size_bp)                          \
3907                        goto out_overflow;                              \
3908                size_bp -= ret;                                         \
3909                bp += ret;                                              \
3910        } while (0)
3911
3912#define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3913        do {                                                            \
3914                ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3915                if (ret < 0 || ret >= size_bp)                          \
3916                        goto out_overflow;                              \
3917                size_bp -= ret;                                         \
3918                bp += ret;                                              \
3919        } while (0)
3920
3921        if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3922                CHECK_APPEND_1ARG("convert=%s,",
3923                                  btrfs_bg_type_to_raid_name(bargs->target));
3924
3925        if (flags & BTRFS_BALANCE_ARGS_SOFT)
3926                CHECK_APPEND_NOARG("soft,");
3927
3928        if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3929                btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3930                                            sizeof(tmp_buf));
3931                CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3932        }
3933
3934        if (flags & BTRFS_BALANCE_ARGS_USAGE)
3935                CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
3936
3937        if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
3938                CHECK_APPEND_2ARG("usage=%u..%u,",
3939                                  bargs->usage_min, bargs->usage_max);
3940
3941        if (flags & BTRFS_BALANCE_ARGS_DEVID)
3942                CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
3943
3944        if (flags & BTRFS_BALANCE_ARGS_DRANGE)
3945                CHECK_APPEND_2ARG("drange=%llu..%llu,",
3946                                  bargs->pstart, bargs->pend);
3947
3948        if (flags & BTRFS_BALANCE_ARGS_VRANGE)
3949                CHECK_APPEND_2ARG("vrange=%llu..%llu,",
3950                                  bargs->vstart, bargs->vend);
3951
3952        if (flags & BTRFS_BALANCE_ARGS_LIMIT)
3953                CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
3954
3955        if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
3956                CHECK_APPEND_2ARG("limit=%u..%u,",
3957                                bargs->limit_min, bargs->limit_max);
3958
3959        if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
3960                CHECK_APPEND_2ARG("stripes=%u..%u,",
3961                                  bargs->stripes_min, bargs->stripes_max);
3962
3963#undef CHECK_APPEND_2ARG
3964#undef CHECK_APPEND_1ARG
3965#undef CHECK_APPEND_NOARG
3966
3967out_overflow:
3968
3969        if (size_bp < size_buf)
3970                buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
3971        else
3972                buf[0] = '\0';
3973}
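
/*
 * E.g. the data args of "-dconvert=raid1,soft,usage=90" render back as
 * "convert=raid1,soft,usage=90" (the trailing comma is trimmed above).
 */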
3974
3975static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
3976{
3977        u32 size_buf = 1024;
3978        char tmp_buf[192] = {'\0'};
3979        char *buf;
3980        char *bp;
3981        u32 size_bp = size_buf;
3982        int ret;
3983        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3984
3985        buf = kzalloc(size_buf, GFP_KERNEL);
3986        if (!buf)
3987                return;
3988
3989        bp = buf;
3990
3991#define CHECK_APPEND_1ARG(a, v1)                                        \
3992        do {                                                            \
3993                ret = snprintf(bp, size_bp, (a), (v1));                 \
3994                if (ret < 0 || ret >= size_bp)                          \
3995                        goto out_overflow;                              \
3996                size_bp -= ret;                                         \
3997                bp += ret;                                              \
3998        } while (0)
3999
4000        if (bctl->flags & BTRFS_BALANCE_FORCE)
4001                CHECK_APPEND_1ARG("%s", "-f ");
4002
4003        if (bctl->flags & BTRFS_BALANCE_DATA) {
4004                describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4005                CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4006        }
4007
4008        if (bctl->flags & BTRFS_BALANCE_METADATA) {
4009                describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4010                CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4011        }
4012
4013        if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4014                describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4015                CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4016        }
4017
4018#undef CHECK_APPEND_1ARG
4019
4020out_overflow:
4021
4022        if (size_bp < size_buf)
4023                buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4024        btrfs_info(fs_info, "balance: %s %s",
4025                   (bctl->flags & BTRFS_BALANCE_RESUME) ?
4026                   "resume" : "start", buf);
4027
4028        kfree(buf);
4029}
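
/*
 * A complete log line then looks like, e.g.:
 *
 *   balance: start -f -dconvert=raid5 -mconvert=raid1 -sconvert=raid1
 */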
4030
4031/*
4032 * Should be called with the balance mutex held
4033 */
4034int btrfs_balance(struct btrfs_fs_info *fs_info,
4035                  struct btrfs_balance_control *bctl,
4036                  struct btrfs_ioctl_balance_args *bargs)
4037{
4038        u64 meta_target, data_target;
4039        u64 allowed;
4040        int mixed = 0;
4041        int ret;
4042        u64 num_devices;
4043        unsigned seq;
4044        bool reducing_integrity;
4045        int i;
4046
4047        if (btrfs_fs_closing(fs_info) ||
4048            atomic_read(&fs_info->balance_pause_req) ||
4049            atomic_read(&fs_info->balance_cancel_req)) {
4050                ret = -EINVAL;
4051                goto out;
4052        }
4053
4054        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4055        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4056                mixed = 1;
4057
4058        /*
4059         * In case of mixed groups both data and meta should be picked,
4060         * and identical options should be given for both of them.
4061         */
4062        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4063        if (mixed && (bctl->flags & allowed)) {
4064                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4065                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4066                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4067                        btrfs_err(fs_info,
4068          "balance: mixed groups data and metadata options must be the same");
4069                        ret = -EINVAL;
4070                        goto out;
4071                }
4072        }
4073
4074        num_devices = btrfs_num_devices(fs_info);
4075        allowed = 0;
4076        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4077                if (num_devices >= btrfs_raid_array[i].devs_min)
4078                        allowed |= btrfs_raid_array[i].bg_flag;
4079
4080        if (validate_convert_profile(&bctl->data, allowed)) {
4081                btrfs_err(fs_info,
4082                          "balance: invalid convert data profile %s",
4083                          btrfs_bg_type_to_raid_name(bctl->data.target));
4084                ret = -EINVAL;
4085                goto out;
4086        }
4087        if (validate_convert_profile(&bctl->meta, allowed)) {
4088                btrfs_err(fs_info,
4089                          "balance: invalid convert metadata profile %s",
4090                          btrfs_bg_type_to_raid_name(bctl->meta.target));
4091                ret = -EINVAL;
4092                goto out;
4093        }
4094        if (validate_convert_profile(&bctl->sys, allowed)) {
4095                btrfs_err(fs_info,
4096                          "balance: invalid convert system profile %s",
4097                          btrfs_bg_type_to_raid_name(bctl->sys.target));
4098                ret = -EINVAL;
4099                goto out;
4100        }
4101
4102        /*
4103         * Allow reducing metadata or system integrity only if force is
4104         * set for profiles with redundancy (copies, parity)
4105         */
4106        allowed = 0;
4107        for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4108                if (btrfs_raid_array[i].ncopies >= 2 ||
4109                    btrfs_raid_array[i].tolerated_failures >= 1)
4110                        allowed |= btrfs_raid_array[i].bg_flag;
4111        }
4112        do {
4113                seq = read_seqbegin(&fs_info->profiles_lock);
4114
4115                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4116                     (fs_info->avail_system_alloc_bits & allowed) &&
4117                     !(bctl->sys.target & allowed)) ||
4118                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4119                     (fs_info->avail_metadata_alloc_bits & allowed) &&
4120                     !(bctl->meta.target & allowed)))
4121                        reducing_integrity = true;
4122                else
4123                        reducing_integrity = false;
4124
4125                /* if we're not converting, the target field is uninitialized */
4126                meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4127                        bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4128                data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4129                        bctl->data.target : fs_info->avail_data_alloc_bits;
4130        } while (read_seqretry(&fs_info->profiles_lock, seq));
4131
4132        if (reducing_integrity) {
4133                if (bctl->flags & BTRFS_BALANCE_FORCE) {
4134                        btrfs_info(fs_info,
4135                                   "balance: force reducing metadata integrity");
4136                } else {
4137                        btrfs_err(fs_info,
4138          "balance: reduces metadata integrity, use --force if you want this");
4139                        ret = -EINVAL;
4140                        goto out;
4141                }
4142        }
4143
4144        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4145                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4146                btrfs_warn(fs_info,
4147        "balance: metadata profile %s has lower redundancy than data profile %s",
4148                                btrfs_bg_type_to_raid_name(meta_target),
4149                                btrfs_bg_type_to_raid_name(data_target));
4150        }
4151
4152        if (fs_info->send_in_progress) {
4153                btrfs_warn_rl(fs_info,
4154"cannot run balance while send operations are in progress (%d in progress)",
4155                              fs_info->send_in_progress);
4156                ret = -EAGAIN;
4157                goto out;
4158        }
4159
4160        ret = insert_balance_item(fs_info, bctl);
4161        if (ret && ret != -EEXIST)
4162                goto out;
4163
4164        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4165                BUG_ON(ret == -EEXIST);
4166                BUG_ON(fs_info->balance_ctl);
4167                spin_lock(&fs_info->balance_lock);
4168                fs_info->balance_ctl = bctl;
4169                spin_unlock(&fs_info->balance_lock);
4170        } else {
4171                BUG_ON(ret != -EEXIST);
4172                spin_lock(&fs_info->balance_lock);
4173                update_balance_args(bctl);
4174                spin_unlock(&fs_info->balance_lock);
4175        }
4176
4177        ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4178        set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4179        describe_balance_start_or_resume(fs_info);
4180        mutex_unlock(&fs_info->balance_mutex);
4181
4182        ret = __btrfs_balance(fs_info);
4183
4184        mutex_lock(&fs_info->balance_mutex);
4185        if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4186                btrfs_info(fs_info, "balance: paused");
4187        else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
4188                btrfs_info(fs_info, "balance: canceled");
4189        else
4190                btrfs_info(fs_info, "balance: ended with status: %d", ret);
4191
4192        clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4193
4194        if (bargs) {
4195                memset(bargs, 0, sizeof(*bargs));
4196                btrfs_update_ioctl_balance_args(fs_info, bargs);
4197        }
4198
4199        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4200            balance_need_close(fs_info)) {
4201                reset_balance_state(fs_info);
4202                clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4203        }
4204
4205        wake_up(&fs_info->balance_wait_q);
4206
4207        return ret;
4208out:
4209        if (bctl->flags & BTRFS_BALANCE_RESUME)
4210                reset_balance_state(fs_info);
4211        else
4212                kfree(bctl);
4213        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4214
4215        return ret;
4216}
4217
4218static int balance_kthread(void *data)
4219{
4220        struct btrfs_fs_info *fs_info = data;
4221        int ret = 0;
4222
4223        mutex_lock(&fs_info->balance_mutex);
4224        if (fs_info->balance_ctl)
4225                ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4226        mutex_unlock(&fs_info->balance_mutex);
4227
4228        return ret;
4229}
4230
4231int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4232{
4233        struct task_struct *tsk;
4234
4235        mutex_lock(&fs_info->balance_mutex);
4236        if (!fs_info->balance_ctl) {
4237                mutex_unlock(&fs_info->balance_mutex);
4238                return 0;
4239        }
4240        mutex_unlock(&fs_info->balance_mutex);
4241
4242        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4243                btrfs_info(fs_info, "balance: resume skipped");
4244                return 0;
4245        }
4246
4247        /*
4248         * A ro->rw remount sequence should continue with the paused balance
4249         * regardless of who paused it (the system or the user), so set
4250         * the resume flag.
4251         */
4252        spin_lock(&fs_info->balance_lock);
4253        fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4254        spin_unlock(&fs_info->balance_lock);
4255
4256        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4257        return PTR_ERR_OR_ZERO(tsk);
4258}
4259
4260int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4261{
4262        struct btrfs_balance_control *bctl;
4263        struct btrfs_balance_item *item;
4264        struct btrfs_disk_balance_args disk_bargs;
4265        struct btrfs_path *path;
4266        struct extent_buffer *leaf;
4267        struct btrfs_key key;
4268        int ret;
4269
4270        path = btrfs_alloc_path();
4271        if (!path)
4272                return -ENOMEM;
4273
4274        key.objectid = BTRFS_BALANCE_OBJECTID;
4275        key.type = BTRFS_TEMPORARY_ITEM_KEY;
4276        key.offset = 0;
4277
4278        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4279        if (ret < 0)
4280                goto out;
        if (ret > 0) { /* no balance item found, nothing to resume */
4282                ret = 0;
4283                goto out;
4284        }
4285
4286        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4287        if (!bctl) {
4288                ret = -ENOMEM;
4289                goto out;
4290        }
4291
4292        leaf = path->nodes[0];
4293        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4294
4295        bctl->flags = btrfs_balance_flags(leaf, item);
4296        bctl->flags |= BTRFS_BALANCE_RESUME;
4297
4298        btrfs_balance_data(leaf, item, &disk_bargs);
4299        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4300        btrfs_balance_meta(leaf, item, &disk_bargs);
4301        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4302        btrfs_balance_sys(leaf, item, &disk_bargs);
4303        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4304
        /*
         * This should never happen, as the paused balance state is recovered
         * during mount without any chance of other exclusive ops colliding.
         *
         * This gives the exclusive op status to balance and keeps it in the
         * paused state until user intervention (cancel or umount). If the
         * ownership cannot be assigned, show a message but do not fail. The
         * balance is in a paused state and must have fs_info::balance_ctl
         * properly set up.
         */
4315        if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags))
4316                btrfs_warn(fs_info,
4317        "balance: cannot set exclusive op status, resume manually");
4318
4319        mutex_lock(&fs_info->balance_mutex);
4320        BUG_ON(fs_info->balance_ctl);
4321        spin_lock(&fs_info->balance_lock);
4322        fs_info->balance_ctl = bctl;
4323        spin_unlock(&fs_info->balance_lock);
4324        mutex_unlock(&fs_info->balance_mutex);
4325out:
4326        btrfs_free_path(path);
4327        return ret;
4328}
4329
4330int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4331{
4332        int ret = 0;
4333
4334        mutex_lock(&fs_info->balance_mutex);
4335        if (!fs_info->balance_ctl) {
4336                mutex_unlock(&fs_info->balance_mutex);
4337                return -ENOTCONN;
4338        }
4339
4340        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4341                atomic_inc(&fs_info->balance_pause_req);
4342                mutex_unlock(&fs_info->balance_mutex);
4343
4344                wait_event(fs_info->balance_wait_q,
4345                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4346
4347                mutex_lock(&fs_info->balance_mutex);
                /* we are fine with balance_ctl ripped out from under us */
4349                BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4350                atomic_dec(&fs_info->balance_pause_req);
4351        } else {
4352                ret = -ENOTCONN;
4353        }
4354
4355        mutex_unlock(&fs_info->balance_mutex);
4356        return ret;
4357}
4358
4359int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4360{
4361        mutex_lock(&fs_info->balance_mutex);
4362        if (!fs_info->balance_ctl) {
4363                mutex_unlock(&fs_info->balance_mutex);
4364                return -ENOTCONN;
4365        }
4366
4367        /*
4368         * A paused balance with the item stored on disk can be resumed at
4369         * mount time if the mount is read-write. Otherwise it's still paused
4370         * and we must not allow cancelling as it deletes the item.
4371         */
4372        if (sb_rdonly(fs_info->sb)) {
4373                mutex_unlock(&fs_info->balance_mutex);
4374                return -EROFS;
4375        }
4376
4377        atomic_inc(&fs_info->balance_cancel_req);
        /*
         * If balance is running, just wait and return; the balance item
         * is deleted in btrfs_balance() in that case.
         */
4382        if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4383                mutex_unlock(&fs_info->balance_mutex);
4384                wait_event(fs_info->balance_wait_q,
4385                           !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4386                mutex_lock(&fs_info->balance_mutex);
4387        } else {
4388                mutex_unlock(&fs_info->balance_mutex);
                /*
                 * The lock was released to allow other waiters to
                 * continue; reexamine the status after reacquiring it.
                 */
4393                mutex_lock(&fs_info->balance_mutex);
4394
4395                if (fs_info->balance_ctl) {
4396                        reset_balance_state(fs_info);
4397                        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
4398                        btrfs_info(fs_info, "balance: canceled");
4399                }
4400        }
4401
4402        BUG_ON(fs_info->balance_ctl ||
4403                test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4404        atomic_dec(&fs_info->balance_cancel_req);
4405        mutex_unlock(&fs_info->balance_mutex);
4406        return 0;
4407}
4408
4409static int btrfs_uuid_scan_kthread(void *data)
4410{
4411        struct btrfs_fs_info *fs_info = data;
4412        struct btrfs_root *root = fs_info->tree_root;
4413        struct btrfs_key key;
4414        struct btrfs_path *path = NULL;
4415        int ret = 0;
4416        struct extent_buffer *eb;
4417        int slot;
4418        struct btrfs_root_item root_item;
4419        u32 item_size;
4420        struct btrfs_trans_handle *trans = NULL;
4421
4422        path = btrfs_alloc_path();
4423        if (!path) {
4424                ret = -ENOMEM;
4425                goto out;
4426        }
4427
4428        key.objectid = 0;
4429        key.type = BTRFS_ROOT_ITEM_KEY;
4430        key.offset = 0;
4431
4432        while (1) {
4433                ret = btrfs_search_forward(root, &key, path,
4434                                BTRFS_OLDEST_GENERATION);
4435                if (ret) {
4436                        if (ret > 0)
4437                                ret = 0;
4438                        break;
4439                }
4440
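                /*
                 * Only subvolume roots carry UUIDs worth indexing: the
                 * top-level FS tree and roots within the normal subvolume
                 * objectid range. Skip all other root items.
                 */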
4441                if (key.type != BTRFS_ROOT_ITEM_KEY ||
4442                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4443                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4444                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4445                        goto skip;
4446
4447                eb = path->nodes[0];
4448                slot = path->slots[0];
4449                item_size = btrfs_item_size_nr(eb, slot);
4450                if (item_size < sizeof(root_item))
4451                        goto skip;
4452
4453                read_extent_buffer(eb, &root_item,
4454                                   btrfs_item_ptr_offset(eb, slot),
4455                                   (int)sizeof(root_item));
4456                if (btrfs_root_refs(&root_item) == 0)
4457                        goto skip;
4458
4459                if (!btrfs_is_empty_uuid(root_item.uuid) ||
4460                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
4461                        if (trans)
4462                                goto update_tree;
4463
4464                        btrfs_release_path(path);
4465                        /*
4466                         * 1 - subvol uuid item
4467                         * 1 - received_subvol uuid item
4468                         */
4469                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4470                        if (IS_ERR(trans)) {
4471                                ret = PTR_ERR(trans);
4472                                break;
4473                        }
4474                        continue;
4475                } else {
4476                        goto skip;
4477                }
4478update_tree:
4479                if (!btrfs_is_empty_uuid(root_item.uuid)) {
4480                        ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4481                                                  BTRFS_UUID_KEY_SUBVOL,
4482                                                  key.objectid);
4483                        if (ret < 0) {
4484                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4485                                        ret);
4486                                break;
4487                        }
4488                }
4489
4490                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4491                        ret = btrfs_uuid_tree_add(trans,
4492                                                  root_item.received_uuid,
4493                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4494                                                  key.objectid);
4495                        if (ret < 0) {
4496                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4497                                        ret);
4498                                break;
4499                        }
4500                }
4501
4502skip:
4503                if (trans) {
4504                        ret = btrfs_end_transaction(trans);
4505                        trans = NULL;
4506                        if (ret)
4507                                break;
4508                }
4509
4510                btrfs_release_path(path);
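                /*
                 * Advance the search key past the item just processed:
                 * bump the offset, then the type, then the objectid, and
                 * stop once the key space is exhausted.
                 */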
4511                if (key.offset < (u64)-1) {
4512                        key.offset++;
4513                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4514                        key.offset = 0;
4515                        key.type = BTRFS_ROOT_ITEM_KEY;
4516                } else if (key.objectid < (u64)-1) {
4517                        key.offset = 0;
4518                        key.type = BTRFS_ROOT_ITEM_KEY;
4519                        key.objectid++;
4520                } else {
4521                        break;
4522                }
4523                cond_resched();
4524        }
4525
4526out:
4527        btrfs_free_path(path);
4528        if (trans && !IS_ERR(trans))
4529                btrfs_end_transaction(trans);
4530        if (ret)
4531                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4532        else
4533                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4534        up(&fs_info->uuid_tree_rescan_sem);
4535        return 0;
4536}
4537
4538/*
4539 * Callback for btrfs_uuid_tree_iterate().
4540 * returns:
4541 * 0    check succeeded, the entry is not outdated.
4542 * < 0  if an error occurred.
4543 * > 0  if the check failed, which means the caller shall remove the entry.
4544 */
4545static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4546                                       u8 *uuid, u8 type, u64 subid)
4547{
4548        struct btrfs_key key;
4549        int ret = 0;
4550        struct btrfs_root *subvol_root;
4551
4552        if (type != BTRFS_UUID_KEY_SUBVOL &&
4553            type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4554                goto out;
4555
4556        key.objectid = subid;
4557        key.type = BTRFS_ROOT_ITEM_KEY;
4558        key.offset = (u64)-1;
4559        subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4560        if (IS_ERR(subvol_root)) {
4561                ret = PTR_ERR(subvol_root);
4562                if (ret == -ENOENT)
4563                        ret = 1;
4564                goto out;
4565        }
4566
4567        switch (type) {
4568        case BTRFS_UUID_KEY_SUBVOL:
4569                if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4570                        ret = 1;
4571                break;
4572        case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4573                if (memcmp(uuid, subvol_root->root_item.received_uuid,
4574                           BTRFS_UUID_SIZE))
4575                        ret = 1;
4576                break;
4577        }
4578
4579out:
4580        return ret;
4581}
4582
4583static int btrfs_uuid_rescan_kthread(void *data)
4584{
4585        struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4586        int ret;
4587
4588        /*
4589         * 1st step is to iterate through the existing UUID tree and
4590         * to delete all entries that contain outdated data.
4591         * 2nd step is to add all missing entries to the UUID tree.
4592         */
4593        ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4594        if (ret < 0) {
4595                btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4596                up(&fs_info->uuid_tree_rescan_sem);
4597                return ret;
4598        }
4599        return btrfs_uuid_scan_kthread(data);
4600}
4601
4602int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4603{
4604        struct btrfs_trans_handle *trans;
4605        struct btrfs_root *tree_root = fs_info->tree_root;
4606        struct btrfs_root *uuid_root;
4607        struct task_struct *task;
4608        int ret;
4609
4610        /*
4611         * 1 - root node
4612         * 1 - root item
4613         */
4614        trans = btrfs_start_transaction(tree_root, 2);
4615        if (IS_ERR(trans))
4616                return PTR_ERR(trans);
4617
4618        uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4619        if (IS_ERR(uuid_root)) {
4620                ret = PTR_ERR(uuid_root);
4621                btrfs_abort_transaction(trans, ret);
4622                btrfs_end_transaction(trans);
4623                return ret;
4624        }
4625
4626        fs_info->uuid_root = uuid_root;
4627
4628        ret = btrfs_commit_transaction(trans);
4629        if (ret)
4630                return ret;
4631
4632        down(&fs_info->uuid_tree_rescan_sem);
4633        task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4634        if (IS_ERR(task)) {
                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4636                btrfs_warn(fs_info, "failed to start uuid_scan task");
4637                up(&fs_info->uuid_tree_rescan_sem);
4638                return PTR_ERR(task);
4639        }
4640
4641        return 0;
4642}
4643
4644int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4645{
4646        struct task_struct *task;
4647
4648        down(&fs_info->uuid_tree_rescan_sem);
4649        task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4650        if (IS_ERR(task)) {
                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4652                btrfs_warn(fs_info, "failed to start uuid_rescan task");
4653                up(&fs_info->uuid_tree_rescan_sem);
4654                return PTR_ERR(task);
4655        }
4656
4657        return 0;
4658}
4659
/*
 * Shrinking a device means finding all of the device extents past
 * the new size, and then following the back refs to the chunks.
 * The chunk relocation code actually frees the device extents.
 */
4665int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4666{
4667        struct btrfs_fs_info *fs_info = device->fs_info;
4668        struct btrfs_root *root = fs_info->dev_root;
4669        struct btrfs_trans_handle *trans;
4670        struct btrfs_dev_extent *dev_extent = NULL;
4671        struct btrfs_path *path;
4672        u64 length;
4673        u64 chunk_offset;
4674        int ret;
4675        int slot;
4676        int failed = 0;
4677        bool retried = false;
4678        struct extent_buffer *l;
4679        struct btrfs_key key;
4680        struct btrfs_super_block *super_copy = fs_info->super_copy;
4681        u64 old_total = btrfs_super_total_bytes(super_copy);
4682        u64 old_size = btrfs_device_get_total_bytes(device);
4683        u64 diff;
4684        u64 start;
4685
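        /*
         * Keep everything aligned to the sector size: round the new size
         * down and derive diff, the number of bytes being removed, from
         * the aligned values.
         */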
4686        new_size = round_down(new_size, fs_info->sectorsize);
4687        start = new_size;
4688        diff = round_down(old_size - new_size, fs_info->sectorsize);
4689
4690        if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4691                return -EINVAL;
4692
4693        path = btrfs_alloc_path();
4694        if (!path)
4695                return -ENOMEM;
4696
4697        path->reada = READA_BACK;
4698
4699        trans = btrfs_start_transaction(root, 0);
4700        if (IS_ERR(trans)) {
4701                btrfs_free_path(path);
4702                return PTR_ERR(trans);
4703        }
4704
4705        mutex_lock(&fs_info->chunk_mutex);
4706
4707        btrfs_device_set_total_bytes(device, new_size);
4708        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4709                device->fs_devices->total_rw_bytes -= diff;
4710                atomic64_sub(diff, &fs_info->free_chunk_space);
4711        }
4712
4713        /*
4714         * Once the device's size has been set to the new size, ensure all
4715         * in-memory chunks are synced to disk so that the loop below sees them
4716         * and relocates them accordingly.
4717         */
4718        if (contains_pending_extent(device, &start, diff)) {
4719                mutex_unlock(&fs_info->chunk_mutex);
4720                ret = btrfs_commit_transaction(trans);
4721                if (ret)
4722                        goto done;
4723        } else {
4724                mutex_unlock(&fs_info->chunk_mutex);
4725                btrfs_end_transaction(trans);
4726        }
4727
4728again:
4729        key.objectid = device->devid;
4730        key.offset = (u64)-1;
4731        key.type = BTRFS_DEV_EXTENT_KEY;
4732
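        /*
         * Walk the device extents backwards, starting from the highest
         * offset: relocate the chunk backing each extent that ends beyond
         * new_size, stepping key.offset down until an extent that fits
         * entirely below new_size is found.
         */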
4733        do {
4734                mutex_lock(&fs_info->delete_unused_bgs_mutex);
4735                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4736                if (ret < 0) {
4737                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4738                        goto done;
4739                }
4740
4741                ret = btrfs_previous_item(root, path, 0, key.type);
4742                if (ret)
4743                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4744                if (ret < 0)
4745                        goto done;
4746                if (ret) {
4747                        ret = 0;
4748                        btrfs_release_path(path);
4749                        break;
4750                }
4751
4752                l = path->nodes[0];
4753                slot = path->slots[0];
4754                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4755
4756                if (key.objectid != device->devid) {
4757                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4758                        btrfs_release_path(path);
4759                        break;
4760                }
4761
4762                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4763                length = btrfs_dev_extent_length(l, dev_extent);
4764
4765                if (key.offset + length <= new_size) {
4766                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4767                        btrfs_release_path(path);
4768                        break;
4769                }
4770
4771                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4772                btrfs_release_path(path);
4773
                /*
                 * We may be relocating the only data chunk we have,
                 * which could potentially end up losing the data's
                 * raid profile, so let's allocate an empty one in
                 * advance.
                 */
4780                ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4781                if (ret < 0) {
4782                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4783                        goto done;
4784                }
4785
4786                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4787                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4788                if (ret == -ENOSPC) {
4789                        failed++;
4790                } else if (ret) {
4791                        if (ret == -ETXTBSY) {
4792                                btrfs_warn(fs_info,
4793                   "could not shrink block group %llu due to active swapfile",
4794                                           chunk_offset);
4795                        }
4796                        goto done;
4797                }
4798        } while (key.offset-- > 0);
4799
4800        if (failed && !retried) {
4801                failed = 0;
4802                retried = true;
4803                goto again;
4804        } else if (failed && retried) {
4805                ret = -ENOSPC;
4806                goto done;
4807        }
4808
4809        /* Shrinking succeeded, else we would be at "done". */
4810        trans = btrfs_start_transaction(root, 0);
4811        if (IS_ERR(trans)) {
4812                ret = PTR_ERR(trans);
4813                goto done;
4814        }
4815
4816        mutex_lock(&fs_info->chunk_mutex);
4817        btrfs_device_set_disk_total_bytes(device, new_size);
4818        if (list_empty(&device->post_commit_list))
4819                list_add_tail(&device->post_commit_list,
4820                              &trans->transaction->dev_update_list);
4821
4822        WARN_ON(diff > old_total);
4823        btrfs_set_super_total_bytes(super_copy,
4824                        round_down(old_total - diff, fs_info->sectorsize));
4825        mutex_unlock(&fs_info->chunk_mutex);
4826
4827        /* Now btrfs_update_device() will change the on-disk size. */
4828        ret = btrfs_update_device(trans, device);
4829        if (ret < 0) {
4830                btrfs_abort_transaction(trans, ret);
4831                btrfs_end_transaction(trans);
4832        } else {
4833                ret = btrfs_commit_transaction(trans);
4834        }
4835done:
4836        btrfs_free_path(path);
        if (ret) {
                mutex_lock(&fs_info->chunk_mutex);
                btrfs_device_set_total_bytes(device, old_size);
                /* Undo the accounting done above only for writeable devices */
                if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
                        device->fs_devices->total_rw_bytes += diff;
                        atomic64_add(diff, &fs_info->free_chunk_space);
                }
                mutex_unlock(&fs_info->chunk_mutex);
        }
4845        return ret;
4846}
4847
4848static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4849                           struct btrfs_key *key,
4850                           struct btrfs_chunk *chunk, int item_size)
4851{
4852        struct btrfs_super_block *super_copy = fs_info->super_copy;
4853        struct btrfs_disk_key disk_key;
4854        u32 array_size;
4855        u8 *ptr;
4856
4857        mutex_lock(&fs_info->chunk_mutex);
4858        array_size = btrfs_super_sys_array_size(super_copy);
4859        if (array_size + item_size + sizeof(disk_key)
4860                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4861                mutex_unlock(&fs_info->chunk_mutex);
4862                return -EFBIG;
4863        }
4864
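        /*
         * The array stores back-to-back (struct btrfs_disk_key, chunk item)
         * pairs; append the new pair at the current end of the array.
         */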
4865        ptr = super_copy->sys_chunk_array + array_size;
4866        btrfs_cpu_key_to_disk(&disk_key, key);
4867        memcpy(ptr, &disk_key, sizeof(disk_key));
4868        ptr += sizeof(disk_key);
4869        memcpy(ptr, chunk, item_size);
4870        item_size += sizeof(disk_key);
4871        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4872        mutex_unlock(&fs_info->chunk_mutex);
4873
4874        return 0;
4875}
4876
4877/*
4878 * sort the devices in descending order by max_avail, total_avail
4879 */
4880static int btrfs_cmp_device_info(const void *a, const void *b)
4881{
4882        const struct btrfs_device_info *di_a = a;
4883        const struct btrfs_device_info *di_b = b;
4884
4885        if (di_a->max_avail > di_b->max_avail)
4886                return -1;
4887        if (di_a->max_avail < di_b->max_avail)
4888                return 1;
4889        if (di_a->total_avail > di_b->total_avail)
4890                return -1;
4891        if (di_a->total_avail < di_b->total_avail)
4892                return 1;
4893        return 0;
4894}
4895
4896static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4897{
4898        if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4899                return;
4900
4901        btrfs_set_fs_incompat(info, RAID56);
4902}
4903
4904static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4905                               u64 start, u64 type)
4906{
4907        struct btrfs_fs_info *info = trans->fs_info;
4908        struct btrfs_fs_devices *fs_devices = info->fs_devices;
4909        struct btrfs_device *device;
4910        struct map_lookup *map = NULL;
4911        struct extent_map_tree *em_tree;
4912        struct extent_map *em;
4913        struct btrfs_device_info *devices_info = NULL;
4914        u64 total_avail;
4915        int num_stripes;        /* total number of stripes to allocate */
4916        int data_stripes;       /* number of stripes that count for
4917                                   block group size */
4918        int sub_stripes;        /* sub_stripes info for map */
4919        int dev_stripes;        /* stripes per dev */
4920        int devs_max;           /* max devs to use */
4921        int devs_min;           /* min devs needed */
4922        int devs_increment;     /* ndevs has to be a multiple of this */
        int ncopies;            /* how many copies of the data */
4924        int nparity;            /* number of stripes worth of bytes to
4925                                   store parity information */
4926        int ret;
4927        u64 max_stripe_size;
4928        u64 max_chunk_size;
4929        u64 stripe_size;
4930        u64 chunk_size;
4931        int ndevs;
4932        int i;
4933        int j;
4934        int index;
4935
4936        BUG_ON(!alloc_profile_is_valid(type, 0));
4937
4938        if (list_empty(&fs_devices->alloc_list)) {
4939                if (btrfs_test_opt(info, ENOSPC_DEBUG))
4940                        btrfs_debug(info, "%s: no writable device", __func__);
4941                return -ENOSPC;
4942        }
4943
4944        index = btrfs_bg_flags_to_raid_index(type);
4945
4946        sub_stripes = btrfs_raid_array[index].sub_stripes;
4947        dev_stripes = btrfs_raid_array[index].dev_stripes;
4948        devs_max = btrfs_raid_array[index].devs_max;
4949        if (!devs_max)
4950                devs_max = BTRFS_MAX_DEVS(info);
4951        devs_min = btrfs_raid_array[index].devs_min;
4952        devs_increment = btrfs_raid_array[index].devs_increment;
4953        ncopies = btrfs_raid_array[index].ncopies;
4954        nparity = btrfs_raid_array[index].nparity;
4955
4956        if (type & BTRFS_BLOCK_GROUP_DATA) {
4957                max_stripe_size = SZ_1G;
4958                max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4959        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4960                /* for larger filesystems, use larger metadata chunks */
4961                if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4962                        max_stripe_size = SZ_1G;
4963                else
4964                        max_stripe_size = SZ_256M;
4965                max_chunk_size = max_stripe_size;
4966        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4967                max_stripe_size = SZ_32M;
4968                max_chunk_size = 2 * max_stripe_size;
4969        } else {
4970                btrfs_err(info, "invalid chunk type 0x%llx requested",
4971                       type);
4972                BUG();
4973        }
4974
4975        /* We don't want a chunk larger than 10% of writable space */
4976        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4977                             max_chunk_size);
4978
4979        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4980                               GFP_NOFS);
4981        if (!devices_info)
4982                return -ENOMEM;
4983
4984        /*
4985         * in the first pass through the devices list, we gather information
4986         * about the available holes on each device.
4987         */
4988        ndevs = 0;
4989        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4990                u64 max_avail;
4991                u64 dev_offset;
4992
4993                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4994                        WARN(1, KERN_ERR
4995                               "BTRFS: read-only device in alloc_list\n");
4996                        continue;
4997                }
4998
4999                if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5000                                        &device->dev_state) ||
5001                    test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5002                        continue;
5003
5004                if (device->total_bytes > device->bytes_used)
5005                        total_avail = device->total_bytes - device->bytes_used;
5006                else
5007                        total_avail = 0;
5008
5009                /* If there is no space on this device, skip it. */
5010                if (total_avail == 0)
5011                        continue;
5012
5013                ret = find_free_dev_extent(device,
5014                                           max_stripe_size * dev_stripes,
5015                                           &dev_offset, &max_avail);
5016                if (ret && ret != -ENOSPC)
5017                        goto error;
5018
5019                if (ret == 0)
5020                        max_avail = max_stripe_size * dev_stripes;
5021
5022                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) {
5023                        if (btrfs_test_opt(info, ENOSPC_DEBUG))
5024                                btrfs_debug(info,
5025                        "%s: devid %llu has no free space, have=%llu want=%u",
5026                                            __func__, device->devid, max_avail,
5027                                            BTRFS_STRIPE_LEN * dev_stripes);
5028                        continue;
5029                }
5030
5031                if (ndevs == fs_devices->rw_devices) {
5032                        WARN(1, "%s: found more than %llu devices\n",
5033                             __func__, fs_devices->rw_devices);
5034                        break;
5035                }
5036                devices_info[ndevs].dev_offset = dev_offset;
5037                devices_info[ndevs].max_avail = max_avail;
5038                devices_info[ndevs].total_avail = total_avail;
5039                devices_info[ndevs].dev = device;
5040                ++ndevs;
5041        }
5042
5043        /*
5044         * now sort the devices by hole size / available space
5045         */
5046        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5047             btrfs_cmp_device_info, NULL);
5048
5049        /* round down to number of usable stripes */
5050        ndevs = round_down(ndevs, devs_increment);
5051
5052        if (ndevs < devs_min) {
5053                ret = -ENOSPC;
5054                if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5055                        btrfs_debug(info,
5056        "%s: not enough devices with free space: have=%d minimum required=%d",
5057                                    __func__, ndevs, devs_min);
5058                }
5059                goto error;
5060        }
5061
5062        ndevs = min(ndevs, devs_max);
5063
5064        /*
5065         * The primary goal is to maximize the number of stripes, so use as
5066         * many devices as possible, even if the stripes are not maximum sized.
5067         *
5068         * The DUP profile stores more than one stripe per device, the
5069         * max_avail is the total size so we have to adjust.
5070         */
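        /*
         * devices_info was sorted in descending order of max_avail above,
         * so entry [ndevs - 1] has the smallest hole among the chosen
         * devices and bounds the stripe size usable on all of them.
         */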
5071        stripe_size = div_u64(devices_info[ndevs - 1].max_avail, dev_stripes);
5072        num_stripes = ndevs * dev_stripes;
5073
5074        /*
5075         * this will have to be fixed for RAID1 and RAID10 over
5076         * more drives
5077         */
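        /*
         * For example, with the btrfs_raid_array values: RAID10 over 8
         * stripes gives (8 - 0) / 2 = 4 data stripes, and RAID6 over 6
         * stripes gives (6 - 2) / 1 = 4 data stripes.
         */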
5078        data_stripes = (num_stripes - nparity) / ncopies;
5079
5080        /*
5081         * Use the number of data stripes to figure out how big this chunk
5082         * is really going to be in terms of logical address space,
5083         * and compare that answer with the max chunk size. If it's higher,
5084         * we try to reduce stripe_size.
5085         */
5086        if (stripe_size * data_stripes > max_chunk_size) {
5087                /*
5088                 * Reduce stripe_size, round it up to a 16MB boundary again and
5089                 * then use it, unless it ends up being even bigger than the
5090                 * previous value we had already.
5091                 */
5092                stripe_size = min(round_up(div_u64(max_chunk_size,
5093                                                   data_stripes), SZ_16M),
5094                                  stripe_size);
5095        }
5096
5097        /* align to BTRFS_STRIPE_LEN */
5098        stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
5099
5100        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
5101        if (!map) {
5102                ret = -ENOMEM;
5103                goto error;
5104        }
5105        map->num_stripes = num_stripes;
5106
5107        for (i = 0; i < ndevs; ++i) {
5108                for (j = 0; j < dev_stripes; ++j) {
5109                        int s = i * dev_stripes + j;
5110                        map->stripes[s].dev = devices_info[i].dev;
5111                        map->stripes[s].physical = devices_info[i].dev_offset +
5112                                                   j * stripe_size;
5113                }
5114        }
5115        map->stripe_len = BTRFS_STRIPE_LEN;
5116        map->io_align = BTRFS_STRIPE_LEN;
5117        map->io_width = BTRFS_STRIPE_LEN;
5118        map->type = type;
5119        map->sub_stripes = sub_stripes;
5120
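        /*
         * chunk_size is the logical size of the chunk; the raw bytes
         * consumed on disk are stripe_size * num_stripes.
         */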
5121        chunk_size = stripe_size * data_stripes;
5122
5123        trace_btrfs_chunk_alloc(info, map, start, chunk_size);
5124
5125        em = alloc_extent_map();
5126        if (!em) {
5127                kfree(map);
5128                ret = -ENOMEM;
5129                goto error;
5130        }
5131        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5132        em->map_lookup = map;
5133        em->start = start;
5134        em->len = chunk_size;
5135        em->block_start = 0;
5136        em->block_len = em->len;
5137        em->orig_block_len = stripe_size;
5138
5139        em_tree = &info->mapping_tree;
5140        write_lock(&em_tree->lock);
5141        ret = add_extent_mapping(em_tree, em, 0);
5142        if (ret) {
5143                write_unlock(&em_tree->lock);
5144                free_extent_map(em);
5145                goto error;
5146        }
5147        write_unlock(&em_tree->lock);
5148
5149        ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
5150        if (ret)
5151                goto error_del_extent;
5152
5153        for (i = 0; i < map->num_stripes; i++) {
5154                struct btrfs_device *dev = map->stripes[i].dev;
5155
5156                btrfs_device_set_bytes_used(dev, dev->bytes_used + stripe_size);
5157                if (list_empty(&dev->post_commit_list))
5158                        list_add_tail(&dev->post_commit_list,
5159                                      &trans->transaction->dev_update_list);
5160        }
5161
5162        atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
5163
5164        free_extent_map(em);
5165        check_raid56_incompat_flag(info, type);
5166
5167        kfree(devices_info);
5168        return 0;
5169
5170error_del_extent:
5171        write_lock(&em_tree->lock);
5172        remove_extent_mapping(em_tree, em);
5173        write_unlock(&em_tree->lock);
5174
5175        /* One for our allocation */
5176        free_extent_map(em);
5177        /* One for the tree reference */
5178        free_extent_map(em);
5179error:
5180        kfree(devices_info);
5181        return ret;
5182}
5183
5184int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5185                             u64 chunk_offset, u64 chunk_size)
5186{
5187        struct btrfs_fs_info *fs_info = trans->fs_info;
5188        struct btrfs_root *extent_root = fs_info->extent_root;
5189        struct btrfs_root *chunk_root = fs_info->chunk_root;
5190        struct btrfs_key key;
5191        struct btrfs_device *device;
5192        struct btrfs_chunk *chunk;
5193        struct btrfs_stripe *stripe;
5194        struct extent_map *em;
5195        struct map_lookup *map;
5196        size_t item_size;
5197        u64 dev_offset;
5198        u64 stripe_size;
5199        int i = 0;
5200        int ret = 0;
5201
5202        em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5203        if (IS_ERR(em))
5204                return PTR_ERR(em);
5205
5206        map = em->map_lookup;
5207        item_size = btrfs_chunk_item_size(map->num_stripes);
5208        stripe_size = em->orig_block_len;
5209
5210        chunk = kzalloc(item_size, GFP_NOFS);
5211        if (!chunk) {
5212                ret = -ENOMEM;
5213                goto out;
5214        }
5215
5216        /*
5217         * Take the device list mutex to prevent races with the final phase of
5218         * a device replace operation that replaces the device object associated
5219         * with the map's stripes, because the device object's id can change
5220         * at any time during that final phase of the device replace operation
5221         * (dev-replace.c:btrfs_dev_replace_finishing()).
5222         */
5223        mutex_lock(&fs_info->fs_devices->device_list_mutex);
5224        for (i = 0; i < map->num_stripes; i++) {
5225                device = map->stripes[i].dev;
5226                dev_offset = map->stripes[i].physical;
5227
5228                ret = btrfs_update_device(trans, device);
5229                if (ret)
5230                        break;
5231                ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5232                                             dev_offset, stripe_size);
5233                if (ret)
5234                        break;
5235        }
5236        if (ret) {
5237                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5238                goto out;
5239        }
5240
5241        stripe = &chunk->stripe;
5242        for (i = 0; i < map->num_stripes; i++) {
5243                device = map->stripes[i].dev;
5244                dev_offset = map->stripes[i].physical;
5245
5246                btrfs_set_stack_stripe_devid(stripe, device->devid);
5247                btrfs_set_stack_stripe_offset(stripe, dev_offset);
5248                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5249                stripe++;
5250        }
5251        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5252
5253        btrfs_set_stack_chunk_length(chunk, chunk_size);
5254        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5255        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5256        btrfs_set_stack_chunk_type(chunk, map->type);
5257        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5258        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5259        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5260        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5261        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5262
5263        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5264        key.type = BTRFS_CHUNK_ITEM_KEY;
5265        key.offset = chunk_offset;
5266
5267        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5268        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5269                /*
5270                 * TODO: Cleanup of inserted chunk root in case of
5271                 * failure.
5272                 */
5273                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5274        }
5275
5276out:
5277        kfree(chunk);
5278        free_extent_map(em);
5279        return ret;
5280}
5281
/*
 * Chunk allocation falls into two parts. The first part does work
 * that makes the newly allocated chunk usable, but does not do any operation
 * that modifies the chunk tree. The second part does the work that
 * requires modifying the chunk tree. This division is important for the
 * bootstrap process of adding storage to a seed btrfs.
 */
5289int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5290{
5291        u64 chunk_offset;
5292
5293        lockdep_assert_held(&trans->fs_info->chunk_mutex);
5294        chunk_offset = find_next_chunk(trans->fs_info);
5295        return __btrfs_alloc_chunk(trans, chunk_offset, type);
5296}
5297
5298static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5299{
5300        struct btrfs_fs_info *fs_info = trans->fs_info;
5301        u64 chunk_offset;
5302        u64 sys_chunk_offset;
5303        u64 alloc_profile;
5304        int ret;
5305
5306        chunk_offset = find_next_chunk(fs_info);
5307        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5308        ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
5309        if (ret)
5310                return ret;
5311
5312        sys_chunk_offset = find_next_chunk(fs_info);
5313        alloc_profile = btrfs_system_alloc_profile(fs_info);
5314        ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
5315        return ret;
5316}
5317
5318static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5319{
5320        const int index = btrfs_bg_flags_to_raid_index(map->type);
5321
5322        return btrfs_raid_array[index].tolerated_failures;
5323}
5324
5325int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5326{
5327        struct extent_map *em;
5328        struct map_lookup *map;
5329        int readonly = 0;
5330        int miss_ndevs = 0;
5331        int i;
5332
5333        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5334        if (IS_ERR(em))
5335                return 1;
5336
5337        map = em->map_lookup;
5338        for (i = 0; i < map->num_stripes; i++) {
5339                if (test_bit(BTRFS_DEV_STATE_MISSING,
5340                                        &map->stripes[i].dev->dev_state)) {
5341                        miss_ndevs++;
5342                        continue;
5343                }
5344                if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5345                                        &map->stripes[i].dev->dev_state)) {
5346                        readonly = 1;
5347                        goto end;
5348                }
5349        }
5350
        /*
         * If the number of missing devices is larger than max errors, we
         * cannot write data into that chunk successfully, so set it
         * read-only.
         */
5356        if (miss_ndevs > btrfs_chunk_max_errors(map))
5357                readonly = 1;
5358end:
5359        free_extent_map(em);
5360        return readonly;
5361}
5362
5363void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5364{
5365        struct extent_map *em;
5366
5367        while (1) {
5368                write_lock(&tree->lock);
5369                em = lookup_extent_mapping(tree, 0, (u64)-1);
5370                if (em)
5371                        remove_extent_mapping(tree, em);
5372                write_unlock(&tree->lock);
5373                if (!em)
5374                        break;
5375                /* once for us */
5376                free_extent_map(em);
5377                /* once for the tree */
5378                free_extent_map(em);
5379        }
5380}
5381
5382int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5383{
5384        struct extent_map *em;
5385        struct map_lookup *map;
5386        int ret;
5387
5388        em = btrfs_get_chunk_map(fs_info, logical, len);
5389        if (IS_ERR(em))
                /*
                 * We could return errors for these cases, but that could get
                 * ugly and we'd probably do the same thing anyway, which is
                 * to do nothing else and exit. Return 1 so the callers don't
                 * try to use other copies.
                 */
5396                return 1;
5397
5398        map = em->map_lookup;
5399        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5400                ret = map->num_stripes;
5401        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5402                ret = map->sub_stripes;
5403        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5404                ret = 2;
5405        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
                /*
                 * There could be two corrupted data stripes, so we need
                 * to retry in a loop in order to rebuild the correct data.
                 *
                 * Fail a stripe at a time on every retry except the
                 * stripe under reconstruction.
                 */
5413                ret = map->num_stripes;
5414        else
5415                ret = 1;
5416        free_extent_map(em);
5417
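        /*
         * While a device replace is running, the target device mirrors the
         * source device, which gives readers one extra copy (see
         * get_extra_mirror_from_replace()).
         */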
5418        down_read(&fs_info->dev_replace.rwsem);
5419        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5420            fs_info->dev_replace.tgtdev)
5421                ret++;
5422        up_read(&fs_info->dev_replace.rwsem);
5423
5424        return ret;
5425}
5426
5427unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5428                                    u64 logical)
5429{
5430        struct extent_map *em;
5431        struct map_lookup *map;
5432        unsigned long len = fs_info->sectorsize;
5433
5434        em = btrfs_get_chunk_map(fs_info, logical, len);
5435
5436        if (!WARN_ON(IS_ERR(em))) {
5437                map = em->map_lookup;
5438                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5439                        len = map->stripe_len * nr_data_stripes(map);
5440                free_extent_map(em);
5441        }
5442        return len;
5443}
5444
5445int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5446{
5447        struct extent_map *em;
5448        struct map_lookup *map;
5449        int ret = 0;
5450
5451        em = btrfs_get_chunk_map(fs_info, logical, len);
5452
        if (!WARN_ON(IS_ERR(em))) {
5454                map = em->map_lookup;
5455                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5456                        ret = 1;
5457                free_extent_map(em);
5458        }
5459        return ret;
5460}
5461
5462static int find_live_mirror(struct btrfs_fs_info *fs_info,
5463                            struct map_lookup *map, int first,
5464                            int dev_replace_is_ongoing)
5465{
5466        int i;
5467        int num_stripes;
5468        int preferred_mirror;
5469        int tolerance;
5470        struct btrfs_device *srcdev;
5471
5472        ASSERT((map->type &
5473                 (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5474
5475        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5476                num_stripes = map->sub_stripes;
5477        else
5478                num_stripes = map->num_stripes;
5479
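        /*
         * Spread reads across the available mirrors by deriving the
         * starting mirror from the caller's pid.
         */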
5480        preferred_mirror = first + current->pid % num_stripes;
5481
5482        if (dev_replace_is_ongoing &&
5483            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5484             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5485                srcdev = fs_info->dev_replace.srcdev;
5486        else
5487                srcdev = NULL;
5488
        /*
         * Try to avoid the drive that is the source drive for a
         * dev-replace procedure; only choose it if no other non-missing
         * mirror is available.
         */
5494        for (tolerance = 0; tolerance < 2; tolerance++) {
5495                if (map->stripes[preferred_mirror].dev->bdev &&
5496                    (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5497                        return preferred_mirror;
5498                for (i = first; i < first + num_stripes; i++) {
5499                        if (map->stripes[i].dev->bdev &&
5500                            (tolerance || map->stripes[i].dev != srcdev))
5501                                return i;
5502                }
5503        }
5504
        /* We couldn't find one that doesn't fail. Just return something
         * and the io error handling code will clean up eventually.
         */
5508        return preferred_mirror;
5509}
5510
5511static inline int parity_smaller(u64 a, u64 b)
5512{
5513        return a > b;
5514}
5515
5516/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5517static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5518{
5519        struct btrfs_bio_stripe s;
5520        int i;
5521        u64 l;
5522        int again = 1;
5523
5524        while (again) {
5525                again = 0;
5526                for (i = 0; i < num_stripes - 1; i++) {
5527                        if (parity_smaller(bbio->raid_map[i],
5528                                           bbio->raid_map[i+1])) {
5529                                s = bbio->stripes[i];
5530                                l = bbio->raid_map[i];
5531                                bbio->stripes[i] = bbio->stripes[i+1];
5532                                bbio->raid_map[i] = bbio->raid_map[i+1];
5533                                bbio->stripes[i+1] = s;
5534                                bbio->raid_map[i+1] = l;
5535
5536                                again = 1;
5537                        }
5538                }
5539        }
5540}
5541
5542static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5543{
5544        struct btrfs_bio *bbio = kzalloc(
5545                 /* the size of the btrfs_bio */
5546                sizeof(struct btrfs_bio) +
5547                /* plus the variable array for the stripes */
5548                sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5549                /* plus the variable array for the tgt dev */
5550                sizeof(int) * (real_stripes) +
5551                /*
5552                 * plus the raid_map, which includes both the tgt dev
5553                 * and the stripes
5554                 */
5555                sizeof(u64) * (total_stripes),
                GFP_NOFS | __GFP_NOFAIL);
5557
5558        atomic_set(&bbio->error, 0);
5559        refcount_set(&bbio->refs, 1);
5560
5561        return bbio;
5562}
5563
5564void btrfs_get_bbio(struct btrfs_bio *bbio)
5565{
5566        WARN_ON(!refcount_read(&bbio->refs));
5567        refcount_inc(&bbio->refs);
5568}
5569
5570void btrfs_put_bbio(struct btrfs_bio *bbio)
5571{
5572        if (!bbio)
5573                return;
5574        if (refcount_dec_and_test(&bbio->refs))
5575                kfree(bbio);
5576}
5577
/*
 * Can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE?
 *
 * Note that discard won't be sent to the target device of a device
 * replace.
 */
5583static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5584                                         u64 logical, u64 length,
5585                                         struct btrfs_bio **bbio_ret)
5586{
5587        struct extent_map *em;
5588        struct map_lookup *map;
5589        struct btrfs_bio *bbio;
5590        u64 offset;
5591        u64 stripe_nr;
5592        u64 stripe_nr_end;
5593        u64 stripe_end_offset;
5594        u64 stripe_cnt;
5595        u64 stripe_len;
5596        u64 stripe_offset;
5597        u64 num_stripes;
5598        u32 stripe_index;
5599        u32 factor = 0;
5600        u32 sub_stripes = 0;
5601        u64 stripes_per_dev = 0;
5602        u32 remaining_stripes = 0;
5603        u32 last_stripe = 0;
5604        int ret = 0;
5605        int i;
5606
        /* discard always returns a bbio */
5608        ASSERT(bbio_ret);
5609
5610        em = btrfs_get_chunk_map(fs_info, logical, length);
5611        if (IS_ERR(em))
5612                return PTR_ERR(em);
5613
5614        map = em->map_lookup;
5615        /* we don't discard raid56 yet */
5616        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5617                ret = -EOPNOTSUPP;
5618                goto out;
5619        }
5620
5621        offset = logical - em->start;
5622        length = min_t(u64, em->len - offset, length);
5623
5624        stripe_len = map->stripe_len;
5625        /*
5626         * stripe_nr counts the total number of stripes we have to stride
5627         * to get to this block
5628         */
5629        stripe_nr = div64_u64(offset, stripe_len);
5630
5631        /* stripe_offset is the offset of this block in its stripe */
5632        stripe_offset = offset - stripe_nr * stripe_len;
5633
5634        stripe_nr_end = round_up(offset + length, map->stripe_len);
5635        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5636        stripe_cnt = stripe_nr_end - stripe_nr;
5637        stripe_end_offset = stripe_nr_end * map->stripe_len -
5638                            (offset + length);
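        /*
         * stripe_end_offset is how far the end of the discard range falls
         * short of the end of the last stripe it touches.
         */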
5639        /*
5640         * after this, stripe_nr is the number of stripes on this
5641         * device we have to walk to find the data, and stripe_index is
5642         * the number of our device in the stripe array
5643         */
5644        num_stripes = 1;
5645        stripe_index = 0;
5646        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5647                         BTRFS_BLOCK_GROUP_RAID10)) {
5648                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5649                        sub_stripes = 1;
5650                else
5651                        sub_stripes = map->sub_stripes;
5652
5653                factor = map->num_stripes / sub_stripes;
5654                num_stripes = min_t(u64, map->num_stripes,
5655                                    sub_stripes * stripe_cnt);
5656                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5657                stripe_index *= sub_stripes;
5658                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5659                                              &remaining_stripes);
5660                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5661                last_stripe *= sub_stripes;
5662        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5663                                BTRFS_BLOCK_GROUP_DUP)) {
5664                num_stripes = map->num_stripes;
5665        } else {
5666                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5667                                        &stripe_index);
5668        }
5669
5670        bbio = alloc_btrfs_bio(num_stripes, 0);
5671        if (!bbio) {
5672                ret = -ENOMEM;
5673                goto out;
5674        }
5675
5676        for (i = 0; i < num_stripes; i++) {
5677                bbio->stripes[i].physical =
5678                        map->stripes[stripe_index].physical +
5679                        stripe_offset + stripe_nr * map->stripe_len;
5680                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5681
5682                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5683                                 BTRFS_BLOCK_GROUP_RAID10)) {
5684                        bbio->stripes[i].length = stripes_per_dev *
5685                                map->stripe_len;
5686
5687                        if (i / sub_stripes < remaining_stripes)
5688                                bbio->stripes[i].length +=
5689                                        map->stripe_len;
5690
5691                        /*
5692                         * Special for the first stripe and
5693                         * the last stripe:
5694                         *
5695                         * |-------|...|-------|
5696                         *     |----------|
5697                         *    off     end_off
5698                         */
5699                        if (i < sub_stripes)
5700                                bbio->stripes[i].length -=
5701                                        stripe_offset;
5702
5703                        if (stripe_index >= last_stripe &&
5704                            stripe_index <= (last_stripe +
5705                                             sub_stripes - 1))
5706                                bbio->stripes[i].length -=
5707                                        stripe_end_offset;
5708
5709                        if (i == sub_stripes - 1)
5710                                stripe_offset = 0;
5711                } else {
5712                        bbio->stripes[i].length = length;
5713                }
5714
5715                stripe_index++;
5716                if (stripe_index == map->num_stripes) {
5717                        stripe_index = 0;
5718                        stripe_nr++;
5719                }
5720        }
5721
5722        *bbio_ret = bbio;
5723        bbio->map_type = map->type;
5724        bbio->num_stripes = num_stripes;
5725out:
5726        free_extent_map(em);
5727        return ret;
5728}
5729
5730/*
5731 * In dev-replace case, for repair case (that's the only case where the mirror
5732 * is selected explicitly when calling btrfs_map_block), blocks left of the
5733 * left cursor can also be read from the target drive.
5734 *
5735 * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5736 * array of stripes.
5737 * For READ, it also needs to be supported using the same mirror number.
5738 *
5739 * If the requested block is not left of the left cursor, EIO is returned. This
5740 * can happen because btrfs_num_copies() returns one more in the dev-replace
5741 * case.
5742 */
5743static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5744                                         u64 logical, u64 length,
5745                                         u64 srcdev_devid, int *mirror_num,
5746                                         u64 *physical)
5747{
5748        struct btrfs_bio *bbio = NULL;
5749        int num_stripes;
5750        int index_srcdev = 0;
5751        int found = 0;
5752        u64 physical_of_found = 0;
5753        int i;
5754        int ret = 0;
5755
5756        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5757                                logical, &length, &bbio, 0, 0);
5758        if (ret) {
5759                ASSERT(bbio == NULL);
5760                return ret;
5761        }
5762
5763        num_stripes = bbio->num_stripes;
5764        if (*mirror_num > num_stripes) {
5765                /*
5766                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5767                 * that means that the requested area is not left of the left
5768                 * cursor
5769                 */
5770                btrfs_put_bbio(bbio);
5771                return -EIO;
5772        }
5773
5774        /*
5775         * Process the rest of the function using the mirror_num of the
5776         * source drive, so look it up first.  At the end, patch the device
5777         * pointer to point at the target drive instead.
5778         */
5779        for (i = 0; i < num_stripes; i++) {
5780                if (bbio->stripes[i].dev->devid != srcdev_devid)
5781                        continue;
5782
5783                /*
5784                 * In case of DUP, in order to keep it simple, only add the
5785                 * mirror with the lowest physical address
5786                 */
5787                if (found &&
5788                    physical_of_found <= bbio->stripes[i].physical)
5789                        continue;
5790
5791                index_srcdev = i;
5792                found = 1;
5793                physical_of_found = bbio->stripes[i].physical;
5794        }
5795
5796        btrfs_put_bbio(bbio);
5797
5798        ASSERT(found);
5799        if (!found)
5800                return -EIO;
5801
5802        *mirror_num = index_srcdev + 1;
5803        *physical = physical_of_found;
5804        return ret;
5805}
5806
5807static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5808                                      struct btrfs_bio **bbio_ret,
5809                                      struct btrfs_dev_replace *dev_replace,
5810                                      int *num_stripes_ret, int *max_errors_ret)
5811{
5812        struct btrfs_bio *bbio = *bbio_ret;
5813        u64 srcdev_devid = dev_replace->srcdev->devid;
5814        int tgtdev_indexes = 0;
5815        int num_stripes = *num_stripes_ret;
5816        int max_errors = *max_errors_ret;
5817        int i;
5818
5819        if (op == BTRFS_MAP_WRITE) {
5820                int index_where_to_add;
5821
5822                /*
5823                 * duplicate the write operations while the dev replace
5824                 * procedure is running. Since the copying of the old disk to
5825                 * the new disk takes place at run time while the filesystem is
5826                 * mounted writable, the regular write operations to the old
5827                 * disk have to be duplicated to go to the new disk as well.
5828                 *
5829                 * Note that device->missing is handled by the caller, and that
5830                 * the write to the old disk is already set up in the stripes
5831                 * array.
5832                 */
5833                index_where_to_add = num_stripes;
5834                for (i = 0; i < num_stripes; i++) {
5835                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5836                                /* write to new disk, too */
5837                                struct btrfs_bio_stripe *new =
5838                                        bbio->stripes + index_where_to_add;
5839                                struct btrfs_bio_stripe *old =
5840                                        bbio->stripes + i;
5841
5842                                new->physical = old->physical;
5843                                new->length = old->length;
5844                                new->dev = dev_replace->tgtdev;
5845                                bbio->tgtdev_map[i] = index_where_to_add;
5846                                index_where_to_add++;
5847                                max_errors++;
5848                                tgtdev_indexes++;
5849                        }
5850                }
5851                num_stripes = index_where_to_add;
5852        } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5853                int index_srcdev = 0;
5854                int found = 0;
5855                u64 physical_of_found = 0;
5856
5857                /*
5858                 * During the dev-replace procedure, the target drive can also
5859                 * be used to read data in case it is needed to repair a corrupt
5860                 * block elsewhere. This is possible if the requested area is
5861                 * left of the left cursor. In this area, the target drive is a
5862                 * full copy of the source drive.
5863                 */
5864                for (i = 0; i < num_stripes; i++) {
5865                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5866                                /*
5867                                 * In case of DUP, in order to keep it simple,
5868                                 * only add the mirror with the lowest physical
5869                                 * address
5870                                 */
5871                                if (found &&
5872                                    physical_of_found <=
5873                                     bbio->stripes[i].physical)
5874                                        continue;
5875                                index_srcdev = i;
5876                                found = 1;
5877                                physical_of_found = bbio->stripes[i].physical;
5878                        }
5879                }
5880                if (found) {
5881                        struct btrfs_bio_stripe *tgtdev_stripe =
5882                                bbio->stripes + num_stripes;
5883
5884                        tgtdev_stripe->physical = physical_of_found;
5885                        tgtdev_stripe->length =
5886                                bbio->stripes[index_srcdev].length;
5887                        tgtdev_stripe->dev = dev_replace->tgtdev;
5888                        bbio->tgtdev_map[index_srcdev] = num_stripes;
5889
5890                        tgtdev_indexes++;
5891                        num_stripes++;
5892                }
5893        }
5894
5895        *num_stripes_ret = num_stripes;
5896        *max_errors_ret = max_errors;
5897        bbio->num_tgtdevs = tgtdev_indexes;
5898        *bbio_ret = bbio;
5899}
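/*
 * An illustration of handle_ops_on_dev_replace() above, using a made-up DUP
 * layout: a BTRFS_MAP_WRITE whose two stripes both sit on the source device
 * gets two extra stripes appended for the target device, so num_stripes goes
 * from 2 to 4, max_errors is bumped by 2 and bbio->num_tgtdevs ends up as 2.
 */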
5900
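/*
 * Writes must reach every copy and BTRFS_MAP_GET_READ_MIRRORS must report
 * every copy, so both need the full set of stripes rather than the single
 * "best" mirror that a plain read uses.
 */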
5901static bool need_full_stripe(enum btrfs_map_op op)
5902{
5903        return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5904}
5905
5906/*
5907 * btrfs_get_io_geometry - calculates the geometry of a particular (address, len)
5908 *                     tuple. This information is used to calculate how big a
5909 *                     particular bio can get before it straddles a stripe.
5910 *
5911 * @fs_info - the filesystem
5912 * @logical - address that we want to figure out the geometry of
5913 * @len     - the length of IO we are going to perform, starting at @logical
5914 * @op      - type of operation - write or read
5915 * @io_geom - pointer used to return values
5916 *
5917 * Returns < 0 if a chunk for the given logical address cannot be found,
5918 * which usually shouldn't happen unless @logical is corrupted, 0 otherwise.
5919 */
5920int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5921                        u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
5922{
5923        struct extent_map *em;
5924        struct map_lookup *map;
5925        u64 offset;
5926        u64 stripe_offset;
5927        u64 stripe_nr;
5928        u64 stripe_len;
5929        u64 raid56_full_stripe_start = (u64)-1;
5930        int data_stripes;
5931        int ret = 0;
5932
5933        ASSERT(op != BTRFS_MAP_DISCARD);
5934
5935        em = btrfs_get_chunk_map(fs_info, logical, len);
5936        if (IS_ERR(em))
5937                return PTR_ERR(em);
5938
5939        map = em->map_lookup;
5940        /* Offset of this logical address in the chunk */
5941        offset = logical - em->start;
5942        /* Len of a stripe in a chunk */
5943        stripe_len = map->stripe_len;
5944        /* Stripe where this block falls in */
5945        stripe_nr = div64_u64(offset, stripe_len);
5946        /* Offset of stripe in the chunk */
5947        stripe_offset = stripe_nr * stripe_len;
5948        if (offset < stripe_offset) {
5949                btrfs_crit(fs_info,
5950"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
5951                        stripe_offset, offset, em->start, logical, stripe_len);
5952                ret = -EINVAL;
5953                goto out;
5954        }
5955
5956        /* stripe_offset is the offset of this block in its stripe */
5957        stripe_offset = offset - stripe_offset;
5958        data_stripes = nr_data_stripes(map);
5959
5960        if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5961                u64 max_len = stripe_len - stripe_offset;
5962
5963                /*
5964                 * In case of raid56, we need to know the stripe-aligned start
5965                 */
5966                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5967                        unsigned long full_stripe_len = stripe_len * data_stripes;
5968                        raid56_full_stripe_start = offset;
5969
5970                        /*
5971                         * Allow a write of a full stripe, but make sure we
5972                         * don't allow straddling of stripes
5973                         */
5974                        raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5975                                        full_stripe_len);
5976                        raid56_full_stripe_start *= full_stripe_len;
5977
5978                        /*
5979                         * For writes to RAID[56], allow a full stripeset across
5980                         * all disks. For other RAID types and for RAID[56]
5981                         * reads, just allow a single stripe (on a single disk).
5982                         */
5983                        if (op == BTRFS_MAP_WRITE) {
5984                                max_len = stripe_len * data_stripes -
5985                                          (offset - raid56_full_stripe_start);
5986                        }
5987                }
5988                len = min_t(u64, em->len - offset, max_len);
5989        } else {
5990                len = em->len - offset;
5991        }
5992
5993        io_geom->len = len;
5994        io_geom->offset = offset;
5995        io_geom->stripe_len = stripe_len;
5996        io_geom->stripe_nr = stripe_nr;
5997        io_geom->stripe_offset = stripe_offset;
5998        io_geom->raid56_stripe_offset = raid56_full_stripe_start;
5999
6000out:
6001        /* once for us */
6002        free_extent_map(em);
6003        return ret;
6004}
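/*
 * A worked example of btrfs_get_io_geometry() above, with hypothetical
 * numbers: for a RAID0 chunk with stripe_len = 64K and a @logical that is
 * 96K into the chunk, stripe_nr = 1 and stripe_offset = 32K, so len is
 * capped at 32K -- a bio at this address may only grow up to the next
 * stripe boundary.
 */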
6005
6006static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6007                             enum btrfs_map_op op,
6008                             u64 logical, u64 *length,
6009                             struct btrfs_bio **bbio_ret,
6010                             int mirror_num, int need_raid_map)
6011{
6012        struct extent_map *em;
6013        struct map_lookup *map;
6014        u64 offset;
6015        u64 stripe_offset;
6016        u64 stripe_nr;
6017        u64 stripe_len;
6018        u32 stripe_index;
6019        int data_stripes;
6020        int i;
6021        int ret = 0;
6022        int num_stripes;
6023        int max_errors = 0;
6024        int tgtdev_indexes = 0;
6025        struct btrfs_bio *bbio = NULL;
6026        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6027        int dev_replace_is_ongoing = 0;
6028        int num_alloc_stripes;
6029        int patch_the_first_stripe_for_dev_replace = 0;
6030        u64 physical_to_patch_in_first_stripe = 0;
6031        u64 raid56_full_stripe_start = (u64)-1;
6032        struct btrfs_io_geometry geom;
6033
6034        ASSERT(bbio_ret);
6035
6036        if (op == BTRFS_MAP_DISCARD)
6037                return __btrfs_map_block_for_discard(fs_info, logical,
6038                                                     *length, bbio_ret);
6039
6040        ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6041        if (ret < 0)
6042                return ret;
6043
6044        em = btrfs_get_chunk_map(fs_info, logical, *length);
6045        ASSERT(!IS_ERR(em));
6046        map = em->map_lookup;
6047
6048        *length = geom.len;
6049        offset = geom.offset;
6050        stripe_len = geom.stripe_len;
6051        stripe_nr = geom.stripe_nr;
6052        stripe_offset = geom.stripe_offset;
6053        raid56_full_stripe_start = geom.raid56_stripe_offset;
6054        data_stripes = nr_data_stripes(map);
6055
6056        down_read(&dev_replace->rwsem);
6057        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6058        /*
6059         * Hold the semaphore for read during the whole operation, write is
6060         * requested at commit time but must wait.
6061         */
6062        if (!dev_replace_is_ongoing)
6063                up_read(&dev_replace->rwsem);
6064
6065        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6066            !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6067                ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6068                                                    dev_replace->srcdev->devid,
6069                                                    &mirror_num,
6070                                            &physical_to_patch_in_first_stripe);
6071                if (ret)
6072                        goto out;
6073                else
6074                        patch_the_first_stripe_for_dev_replace = 1;
6075        } else if (mirror_num > map->num_stripes) {
6076                mirror_num = 0;
6077        }
6078
6079        num_stripes = 1;
6080        stripe_index = 0;
6081        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6082                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6083                                &stripe_index);
6084                if (!need_full_stripe(op))
6085                        mirror_num = 1;
6086        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6087                if (need_full_stripe(op))
6088                        num_stripes = map->num_stripes;
6089                else if (mirror_num)
6090                        stripe_index = mirror_num - 1;
6091                else {
6092                        stripe_index = find_live_mirror(fs_info, map, 0,
6093                                            dev_replace_is_ongoing);
6094                        mirror_num = stripe_index + 1;
6095                }
6096
6097        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6098                if (need_full_stripe(op)) {
6099                        num_stripes = map->num_stripes;
6100                } else if (mirror_num) {
6101                        stripe_index = mirror_num - 1;
6102                } else {
6103                        mirror_num = 1;
6104                }
6105
6106        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6107                u32 factor = map->num_stripes / map->sub_stripes;
6108
6109                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6110                stripe_index *= map->sub_stripes;
6111
6112                if (need_full_stripe(op))
6113                        num_stripes = map->sub_stripes;
6114                else if (mirror_num)
6115                        stripe_index += mirror_num - 1;
6116                else {
6117                        int old_stripe_index = stripe_index;
6118                        stripe_index = find_live_mirror(fs_info, map,
6119                                              stripe_index,
6120                                              dev_replace_is_ongoing);
6121                        mirror_num = stripe_index - old_stripe_index + 1;
6122                }
6123
6124        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6125                if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6126                        /* push stripe_nr back to the start of the full stripe */
6127                        stripe_nr = div64_u64(raid56_full_stripe_start,
6128                                        stripe_len * data_stripes);
6129
6130                        /* RAID[56] write or recovery. Return all stripes */
6131                        num_stripes = map->num_stripes;
6132                        max_errors = nr_parity_stripes(map);
6133
6134                        *length = map->stripe_len;
6135                        stripe_index = 0;
6136                        stripe_offset = 0;
6137                } else {
6138                        /*
6139                         * Mirror #0 or #1 means the original data block.
6140                         * Mirror #2 is RAID5 parity block.
6141                         * Mirror #3 is RAID6 Q block.
6142                         */
6143                        stripe_nr = div_u64_rem(stripe_nr,
6144                                        data_stripes, &stripe_index);
6145                        if (mirror_num > 1)
6146                                stripe_index = data_stripes + mirror_num - 2;
6147
6148                        /* We distribute the parity blocks across stripes */
6149                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6150                                        &stripe_index);
6151                        if (!need_full_stripe(op) && mirror_num <= 1)
6152                                mirror_num = 1;
6153                }
6154        } else {
6155                /*
6156                 * after this, stripe_nr is the number of stripes on this
6157                 * device we have to walk to find the data, and stripe_index is
6158                 * the number of our device in the stripe array
6159                 */
6160                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6161                                &stripe_index);
6162                mirror_num = stripe_index + 1;
6163        }
6164        if (stripe_index >= map->num_stripes) {
6165                btrfs_crit(fs_info,
6166                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6167                           stripe_index, map->num_stripes);
6168                ret = -EINVAL;
6169                goto out;
6170        }
6171
6172        num_alloc_stripes = num_stripes;
6173        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6174                if (op == BTRFS_MAP_WRITE)
6175                        num_alloc_stripes <<= 1;
6176                if (op == BTRFS_MAP_GET_READ_MIRRORS)
6177                        num_alloc_stripes++;
6178                tgtdev_indexes = num_stripes;
6179        }
6180
6181        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6182        if (!bbio) {
6183                ret = -ENOMEM;
6184                goto out;
6185        }
6186        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
6187                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
6188
6189        /* build raid_map */
6190        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6191            (need_full_stripe(op) || mirror_num > 1)) {
6192                u64 tmp;
6193                unsigned rot;
6194
6195                bbio->raid_map = (u64 *)((void *)bbio->stripes +
6196                                 sizeof(struct btrfs_bio_stripe) *
6197                                 num_alloc_stripes +
6198                                 sizeof(int) * tgtdev_indexes);
6199
6200                /* Work out the disk rotation on this stripe-set */
6201                div_u64_rem(stripe_nr, num_stripes, &rot);
6202
6203                /* Fill in the logical address of each stripe */
6204                tmp = stripe_nr * data_stripes;
6205                for (i = 0; i < data_stripes; i++)
6206                        bbio->raid_map[(i+rot) % num_stripes] =
6207                                em->start + (tmp + i) * map->stripe_len;
6208
6209                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6210                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6211                        bbio->raid_map[(i+rot+1) % num_stripes] =
6212                                RAID6_Q_STRIPE;
6213        }
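        /*
         * Rotation example with a hypothetical 3-device RAID5
         * (data_stripes == 2): for full stripe number 1, rot == 1, so the
         * two data stripes land in raid_map slots 1 and 2 while slot 0
         * holds RAID5_P_STRIPE -- parity rotates by one position for each
         * consecutive full stripe.
         */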
6214
6215
6216        for (i = 0; i < num_stripes; i++) {
6217                bbio->stripes[i].physical =
6218                        map->stripes[stripe_index].physical +
6219                        stripe_offset +
6220                        stripe_nr * map->stripe_len;
6221                bbio->stripes[i].dev =
6222                        map->stripes[stripe_index].dev;
6223                stripe_index++;
6224        }
6225
6226        if (need_full_stripe(op))
6227                max_errors = btrfs_chunk_max_errors(map);
6228
6229        if (bbio->raid_map)
6230                sort_parity_stripes(bbio, num_stripes);
6231
6232        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6233            need_full_stripe(op)) {
6234                handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6235                                          &max_errors);
6236        }
6237
6238        *bbio_ret = bbio;
6239        bbio->map_type = map->type;
6240        bbio->num_stripes = num_stripes;
6241        bbio->max_errors = max_errors;
6242        bbio->mirror_num = mirror_num;
6243
6244        /*
6245         * this is the case that REQ_READ && dev_replace_is_ongoing &&
6246         * mirror_num == num_stripes + 1 && dev_replace target drive is
6247         * available as a mirror
6248         */
6249        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6250                WARN_ON(num_stripes > 1);
6251                bbio->stripes[0].dev = dev_replace->tgtdev;
6252                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6253                bbio->mirror_num = map->num_stripes + 1;
6254        }
6255out:
6256        if (dev_replace_is_ongoing) {
6257                lockdep_assert_held(&dev_replace->rwsem);
6258                /* Unlock and let waiting writers proceed */
6259                up_read(&dev_replace->rwsem);
6260        }
6261        free_extent_map(em);
6262        return ret;
6263}
6264
6265int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6266                      u64 logical, u64 *length,
6267                      struct btrfs_bio **bbio_ret, int mirror_num)
6268{
6269        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6270                                 mirror_num, 0);
6271}
6272
6273/* For Scrub/replace */
6274int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6275                     u64 logical, u64 *length,
6276                     struct btrfs_bio **bbio_ret)
6277{
6278        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6279}
6280
6281int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
6282                     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
6283{
6284        struct extent_map *em;
6285        struct map_lookup *map;
6286        u64 *buf;
6287        u64 bytenr;
6288        u64 length;
6289        u64 stripe_nr;
6290        u64 rmap_len;
6291        int i, j, nr = 0;
6292
6293        em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
6294        if (IS_ERR(em))
6295                return -EIO;
6296
6297        map = em->map_lookup;
6298        length = em->len;
6299        rmap_len = map->stripe_len;
6300
6301        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
6302                length = div_u64(length, map->num_stripes / map->sub_stripes);
6303        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
6304                length = div_u64(length, map->num_stripes);
6305        else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6306                length = div_u64(length, nr_data_stripes(map));
6307                rmap_len = map->stripe_len * nr_data_stripes(map);
6308        }
6309
6310        buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
6311        BUG_ON(!buf); /* -ENOMEM */
6312
6313        for (i = 0; i < map->num_stripes; i++) {
6314                if (map->stripes[i].physical > physical ||
6315                    map->stripes[i].physical + length <= physical)
6316                        continue;
6317
6318                stripe_nr = physical - map->stripes[i].physical;
6319                stripe_nr = div64_u64(stripe_nr, map->stripe_len);
6320
6321                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6322                        stripe_nr = stripe_nr * map->num_stripes + i;
6323                        stripe_nr = div_u64(stripe_nr, map->sub_stripes);
6324                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6325                        stripe_nr = stripe_nr * map->num_stripes + i;
6326                }
6327                /* For RAID[56] we would multiply by nr_data_stripes(), but
6328                 * using rmap_len below instead of map->stripe_len covers that. */
6329
6330                bytenr = chunk_start + stripe_nr * rmap_len;
6331                WARN_ON(nr >= map->num_stripes);
6332                for (j = 0; j < nr; j++) {
6333                        if (buf[j] == bytenr)
6334                                break;
6335                }
6336                if (j == nr) {
6337                        WARN_ON(nr >= map->num_stripes);
6338                        buf[nr++] = bytenr;
6339                }
6340        }
6341
6342        *logical = buf;
6343        *naddrs = nr;
6344        *stripe_len = rmap_len;
6345
6346        free_extent_map(em);
6347        return 0;
6348}
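/*
 * A worked example of the reverse mapping above, with made-up numbers: for a
 * two-device RAID0 chunk with stripe_len = 64K, a @physical 128K past
 * map->stripes[0].physical gives stripe_nr = 2; for i = 0 this becomes
 * stripe_nr * num_stripes + i = 4, i.e. bytenr = chunk_start + 4 * 64K.
 */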
6349
6350static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6351{
6352        bio->bi_private = bbio->private;
6353        bio->bi_end_io = bbio->end_io;
6354        bio_endio(bio);
6355
6356        btrfs_put_bbio(bbio);
6357}
6358
6359static void btrfs_end_bio(struct bio *bio)
6360{
6361        struct btrfs_bio *bbio = bio->bi_private;
6362        int is_orig_bio = 0;
6363
6364        if (bio->bi_status) {
6365                atomic_inc(&bbio->error);
6366                if (bio->bi_status == BLK_STS_IOERR ||
6367                    bio->bi_status == BLK_STS_TARGET) {
6368                        unsigned int stripe_index =
6369                                btrfs_io_bio(bio)->stripe_index;
6370                        struct btrfs_device *dev;
6371
6372                        BUG_ON(stripe_index >= bbio->num_stripes);
6373                        dev = bbio->stripes[stripe_index].dev;
6374                        if (dev->bdev) {
6375                                if (bio_op(bio) == REQ_OP_WRITE)
6376                                        btrfs_dev_stat_inc_and_print(dev,
6377                                                BTRFS_DEV_STAT_WRITE_ERRS);
6378                                else if (!(bio->bi_opf & REQ_RAHEAD))
6379                                        btrfs_dev_stat_inc_and_print(dev,
6380                                                BTRFS_DEV_STAT_READ_ERRS);
6381                                if (bio->bi_opf & REQ_PREFLUSH)
6382                                        btrfs_dev_stat_inc_and_print(dev,
6383                                                BTRFS_DEV_STAT_FLUSH_ERRS);
6384                        }
6385                }
6386        }
6387
6388        if (bio == bbio->orig_bio)
6389                is_orig_bio = 1;
6390
6391        btrfs_bio_counter_dec(bbio->fs_info);
6392
6393        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6394                if (!is_orig_bio) {
6395                        bio_put(bio);
6396                        bio = bbio->orig_bio;
6397                }
6398
6399                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6400                /* Only send an error to the higher layers if it is
6401                 * beyond the tolerance of the btrfs bio.
6402                 */
6403                if (atomic_read(&bbio->error) > bbio->max_errors) {
6404                        bio->bi_status = BLK_STS_IOERR;
6405                } else {
6406                        /*
6407                         * this bio is actually up to date, we didn't
6408                         * go over the max number of errors
6409                         */
6410                        bio->bi_status = BLK_STS_OK;
6411                }
6412
6413                btrfs_end_bbio(bbio, bio);
6414        } else if (!is_orig_bio) {
6415                bio_put(bio);
6416        }
6417}
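/*
 * The tolerance check in btrfs_end_bio() above means that e.g. a RAID1 write
 * (max_errors == 1) still completes the original bio with BLK_STS_OK when one
 * of the two mirrors fails; only a second failure propagates BLK_STS_IOERR to
 * the upper layers.
 */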
6418
6419/*
6420 * see run_scheduled_bios for a description of why bios are collected for
6421 * async submit.
6422 *
6423 * This will add one bio to the pending list for a device and make sure
6424 * the work struct is scheduled.
6425 */
6426static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6427                                        struct bio *bio)
6428{
6429        struct btrfs_fs_info *fs_info = device->fs_info;
6430        int should_queue = 1;
6431        struct btrfs_pending_bios *pending_bios;
6432
6433        /* don't bother with additional async steps for reads, right now */
6434        if (bio_op(bio) == REQ_OP_READ) {
6435                btrfsic_submit_bio(bio);
6436                return;
6437        }
6438
6439        WARN_ON(bio->bi_next);
6440        bio->bi_next = NULL;
6441
6442        spin_lock(&device->io_lock);
6443        if (op_is_sync(bio->bi_opf))
6444                pending_bios = &device->pending_sync_bios;
6445        else
6446                pending_bios = &device->pending_bios;
6447
6448        if (pending_bios->tail)
6449                pending_bios->tail->bi_next = bio;
6450
6451        pending_bios->tail = bio;
6452        if (!pending_bios->head)
6453                pending_bios->head = bio;
6454        if (device->running_pending)
6455                should_queue = 0;
6456
6457        spin_unlock(&device->io_lock);
6458
6459        if (should_queue)
6460                btrfs_queue_work(fs_info->submit_workers, &device->work);
6461}
6462
6463static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6464                              u64 physical, int dev_nr, int async)
6465{
6466        struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
6467        struct btrfs_fs_info *fs_info = bbio->fs_info;
6468
6469        bio->bi_private = bbio;
6470        btrfs_io_bio(bio)->stripe_index = dev_nr;
6471        bio->bi_end_io = btrfs_end_bio;
6472        bio->bi_iter.bi_sector = physical >> 9;
6473        btrfs_debug_in_rcu(fs_info,
6474        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6475                bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6476                (u_long)dev->bdev->bd_dev, rcu_str_deref(dev->name), dev->devid,
6477                bio->bi_iter.bi_size);
6478        bio_set_dev(bio, dev->bdev);
6479
6480        btrfs_bio_counter_inc_noblocked(fs_info);
6481
6482        if (async)
6483                btrfs_schedule_bio(dev, bio);
6484        else
6485                btrfsic_submit_bio(bio);
6486}
6487
6488static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6489{
6490        atomic_inc(&bbio->error);
6491        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6492                /* Should be the original bio. */
6493                WARN_ON(bio != bbio->orig_bio);
6494
6495                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6496                bio->bi_iter.bi_sector = logical >> 9;
6497                if (atomic_read(&bbio->error) > bbio->max_errors)
6498                        bio->bi_status = BLK_STS_IOERR;
6499                else
6500                        bio->bi_status = BLK_STS_OK;
6501                btrfs_end_bbio(bbio, bio);
6502        }
6503}
6504
6505blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6506                           int mirror_num, int async_submit)
6507{
6508        struct btrfs_device *dev;
6509        struct bio *first_bio = bio;
6510        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6511        u64 length = 0;
6512        u64 map_length;
6513        int ret;
6514        int dev_nr;
6515        int total_devs;
6516        struct btrfs_bio *bbio = NULL;
6517
6518        length = bio->bi_iter.bi_size;
6519        map_length = length;
6520
6521        btrfs_bio_counter_inc_blocked(fs_info);
6522        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6523                                &map_length, &bbio, mirror_num, 1);
6524        if (ret) {
6525                btrfs_bio_counter_dec(fs_info);
6526                return errno_to_blk_status(ret);
6527        }
6528
6529        total_devs = bbio->num_stripes;
6530        bbio->orig_bio = first_bio;
6531        bbio->private = first_bio->bi_private;
6532        bbio->end_io = first_bio->bi_end_io;
6533        bbio->fs_info = fs_info;
6534        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6535
6536        if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6537            ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6538                /* In this case, map_length has been set to the length
6539                 * of a single stripe, not the whole write. */
6540                if (bio_op(bio) == REQ_OP_WRITE) {
6541                        ret = raid56_parity_write(fs_info, bio, bbio,
6542                                                  map_length);
6543                } else {
6544                        ret = raid56_parity_recover(fs_info, bio, bbio,
6545                                                    map_length, mirror_num, 1);
6546                }
6547
6548                btrfs_bio_counter_dec(fs_info);
6549                return errno_to_blk_status(ret);
6550        }
6551
6552        if (map_length < length) {
6553                btrfs_crit(fs_info,
6554                           "mapping failed logical %llu bio len %llu len %llu",
6555                           logical, length, map_length);
6556                BUG();
6557        }
6558
6559        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6560                dev = bbio->stripes[dev_nr].dev;
6561                if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6562                                                   &dev->dev_state) ||
6563                    (bio_op(first_bio) == REQ_OP_WRITE &&
6564                    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6565                        bbio_error(bbio, first_bio, logical);
6566                        continue;
6567                }
6568
6569                if (dev_nr < total_devs - 1)
6570                        bio = btrfs_bio_clone(first_bio);
6571                else
6572                        bio = first_bio;
6573
6574                submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6575                                  dev_nr, async_submit);
6576        }
6577        btrfs_bio_counter_dec(fs_info);
6578        return BLK_STS_OK;
6579}
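/*
 * E.g. a RAID1 data write maps to two stripes above: the loop submits a clone
 * of the original bio to the first device and the original bio itself to the
 * last one, and btrfs_end_bio() completes the original once both stripes (or
 * enough of them, per max_errors) have finished.
 */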
6580
6581/*
6582 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6583 * return NULL.
6584 *
6585 * If devid and uuid are both specified, the match must be exact, otherwise
6586 * only devid is used.
6587 *
6588 * If @seed is true, traverse through the seed devices.
6589 */
6590struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6591                                       u64 devid, u8 *uuid, u8 *fsid,
6592                                       bool seed)
6593{
6594        struct btrfs_device *device;
6595
6596        while (fs_devices) {
6597                if (!fsid ||
6598                    !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6599                        list_for_each_entry(device, &fs_devices->devices,
6600                                            dev_list) {
6601                                if (device->devid == devid &&
6602                                    (!uuid || memcmp(device->uuid, uuid,
6603                                                     BTRFS_UUID_SIZE) == 0))
6604                                        return device;
6605                        }
6606                }
6607                if (seed)
6608                        fs_devices = fs_devices->seed;
6609                else
6610                        return NULL;
6611        }
6612        return NULL;
6613}
6614
6615static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6616                                            u64 devid, u8 *dev_uuid)
6617{
6618        struct btrfs_device *device;
6619
6620        device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6621        if (IS_ERR(device))
6622                return device;
6623
6624        list_add(&device->dev_list, &fs_devices->devices);
6625        device->fs_devices = fs_devices;
6626        fs_devices->num_devices++;
6627
6628        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6629        fs_devices->missing_devices++;
6630
6631        return device;
6632}
6633
6634/**
6635 * btrfs_alloc_device - allocate struct btrfs_device
6636 * @fs_info:    used only for generating a new devid, can be NULL if
6637 *              devid is provided (i.e. @devid != NULL).
6638 * @devid:      a pointer to devid for this device.  If NULL a new devid
6639 *              is generated.
6640 * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6641 *              is generated.
6642 *
6643 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6644 * on error.  Returned struct is not linked onto any lists and must be
6645 * destroyed with btrfs_free_device.
6646 */
6647struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6648                                        const u64 *devid,
6649                                        const u8 *uuid)
6650{
6651        struct btrfs_device *dev;
6652        u64 tmp;
6653
6654        if (WARN_ON(!devid && !fs_info))
6655                return ERR_PTR(-EINVAL);
6656
6657        dev = __alloc_device();
6658        if (IS_ERR(dev))
6659                return dev;
6660
6661        if (devid)
6662                tmp = *devid;
6663        else {
6664                int ret;
6665
6666                ret = find_next_devid(fs_info, &tmp);
6667                if (ret) {
6668                        btrfs_free_device(dev);
6669                        return ERR_PTR(ret);
6670                }
6671        }
6672        dev->devid = tmp;
6673
6674        if (uuid)
6675                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6676        else
6677                generate_random_uuid(dev->uuid);
6678
6679        btrfs_init_work(&dev->work, btrfs_submit_helper,
6680                        pending_bios_fn, NULL, NULL);
6681
6682        return dev;
6683}
6684
6685static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6686                                        u64 devid, u8 *uuid, bool error)
6687{
6688        if (error)
6689                btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6690                              devid, uuid);
6691        else
6692                btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6693                              devid, uuid);
6694}
6695
6696static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6697{
6698        int index = btrfs_bg_flags_to_raid_index(type);
6699        int ncopies = btrfs_raid_array[index].ncopies;
6700        int data_stripes;
6701
6702        switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6703        case BTRFS_BLOCK_GROUP_RAID5:
6704                data_stripes = num_stripes - 1;
6705                break;
6706        case BTRFS_BLOCK_GROUP_RAID6:
6707                data_stripes = num_stripes - 2;
6708                break;
6709        default:
6710                data_stripes = num_stripes / ncopies;
6711                break;
6712        }
6713        return div_u64(chunk_len, data_stripes);
6714}
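/*
 * E.g. (illustrative sizes): a 1G RAID5 chunk over four devices has
 * data_stripes = 3 and thus a per-device length of 1G / 3, while a 1G RAID10
 * chunk over four devices (ncopies = 2) has data_stripes = 2 and a per-device
 * length of 512M.
 */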
6715
6716static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6717                          struct btrfs_chunk *chunk)
6718{
6719        struct btrfs_fs_info *fs_info = leaf->fs_info;
6720        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6721        struct map_lookup *map;
6722        struct extent_map *em;
6723        u64 logical;
6724        u64 length;
6725        u64 devid;
6726        u8 uuid[BTRFS_UUID_SIZE];
6727        int num_stripes;
6728        int ret;
6729        int i;
6730
6731        logical = key->offset;
6732        length = btrfs_chunk_length(leaf, chunk);
6733        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6734
6735        /*
6736         * Only need to verify chunk item if we're reading from sys chunk array,
6737         * as chunk item in tree block is already verified by tree-checker.
6738         */
6739        if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6740                ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6741                if (ret)
6742                        return ret;
6743        }
6744
6745        read_lock(&map_tree->lock);
6746        em = lookup_extent_mapping(map_tree, logical, 1);
6747        read_unlock(&map_tree->lock);
6748
6749        /* already mapped? */
6750        if (em && em->start <= logical && em->start + em->len > logical) {
6751                free_extent_map(em);
6752                return 0;
6753        } else if (em) {
6754                free_extent_map(em);
6755        }
6756
6757        em = alloc_extent_map();
6758        if (!em)
6759                return -ENOMEM;
6760        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6761        if (!map) {
6762                free_extent_map(em);
6763                return -ENOMEM;
6764        }
6765
6766        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6767        em->map_lookup = map;
6768        em->start = logical;
6769        em->len = length;
6770        em->orig_start = 0;
6771        em->block_start = 0;
6772        em->block_len = em->len;
6773
6774        map->num_stripes = num_stripes;
6775        map->io_width = btrfs_chunk_io_width(leaf, chunk);
6776        map->io_align = btrfs_chunk_io_align(leaf, chunk);
6777        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6778        map->type = btrfs_chunk_type(leaf, chunk);
6779        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6780        map->verified_stripes = 0;
6781        em->orig_block_len = calc_stripe_length(map->type, em->len,
6782                                                map->num_stripes);
6783        for (i = 0; i < num_stripes; i++) {
6784                map->stripes[i].physical =
6785                        btrfs_stripe_offset_nr(leaf, chunk, i);
6786                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6787                read_extent_buffer(leaf, uuid, (unsigned long)
6788                                   btrfs_stripe_dev_uuid_nr(chunk, i),
6789                                   BTRFS_UUID_SIZE);
6790                map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6791                                                        devid, uuid, NULL, true);
6792                if (!map->stripes[i].dev &&
6793                    !btrfs_test_opt(fs_info, DEGRADED)) {
6794                        free_extent_map(em);
6795                        btrfs_report_missing_device(fs_info, devid, uuid, true);
6796                        return -ENOENT;
6797                }
6798                if (!map->stripes[i].dev) {
6799                        map->stripes[i].dev =
6800                                add_missing_dev(fs_info->fs_devices, devid,
6801                                                uuid);
6802                        if (IS_ERR(map->stripes[i].dev)) {
6803                                free_extent_map(em);
6804                                btrfs_err(fs_info,
6805                                        "failed to init missing dev %llu: %ld",
6806                                        devid, PTR_ERR(map->stripes[i].dev));
6807                                return PTR_ERR(map->stripes[i].dev);
6808                        }
6809                        btrfs_report_missing_device(fs_info, devid, uuid, false);
6810                }
6811                set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6812                                &(map->stripes[i].dev->dev_state));
6813
6814        }
6815
6816        write_lock(&map_tree->lock);
6817        ret = add_extent_mapping(map_tree, em, 0);
6818        write_unlock(&map_tree->lock);
6819        if (ret < 0) {
6820                btrfs_err(fs_info,
6821                          "failed to add chunk map, start=%llu len=%llu: %d",
6822                          em->start, em->len, ret);
6823        }
6824        free_extent_map(em);
6825
6826        return ret;
6827}
6828
6829static void fill_device_from_item(struct extent_buffer *leaf,
6830                                 struct btrfs_dev_item *dev_item,
6831                                 struct btrfs_device *device)
6832{
6833        unsigned long ptr;
6834
6835        device->devid = btrfs_device_id(leaf, dev_item);
6836        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6837        device->total_bytes = device->disk_total_bytes;
6838        device->commit_total_bytes = device->disk_total_bytes;
6839        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6840        device->commit_bytes_used = device->bytes_used;
6841        device->type = btrfs_device_type(leaf, dev_item);
6842        device->io_align = btrfs_device_io_align(leaf, dev_item);
6843        device->io_width = btrfs_device_io_width(leaf, dev_item);
6844        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6845        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6846        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6847
6848        ptr = btrfs_device_uuid(dev_item);
6849        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6850}
6851
6852static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6853                                                  u8 *fsid)
6854{
6855        struct btrfs_fs_devices *fs_devices;
6856        int ret;
6857
6858        lockdep_assert_held(&uuid_mutex);
6859        ASSERT(fsid);
6860
6861        fs_devices = fs_info->fs_devices->seed;
6862        while (fs_devices) {
6863                if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6864                        return fs_devices;
6865
6866                fs_devices = fs_devices->seed;
6867        }
6868
6869        fs_devices = find_fsid(fsid, NULL);
6870        if (!fs_devices) {
6871                if (!btrfs_test_opt(fs_info, DEGRADED))
6872                        return ERR_PTR(-ENOENT);
6873
6874                fs_devices = alloc_fs_devices(fsid, NULL);
6875                if (IS_ERR(fs_devices))
6876                        return fs_devices;
6877
6878                fs_devices->seeding = 1;
6879                fs_devices->opened = 1;
6880                return fs_devices;
6881        }
6882
6883        fs_devices = clone_fs_devices(fs_devices);
6884        if (IS_ERR(fs_devices))
6885                return fs_devices;
6886
6887        ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6888        if (ret) {
6889                free_fs_devices(fs_devices);
6890                fs_devices = ERR_PTR(ret);
6891                goto out;
6892        }
6893
6894        if (!fs_devices->seeding) {
6895                close_fs_devices(fs_devices);
6896                free_fs_devices(fs_devices);
6897                fs_devices = ERR_PTR(-EINVAL);
6898                goto out;
6899        }
6900
6901        fs_devices->seed = fs_info->fs_devices->seed;
6902        fs_info->fs_devices->seed = fs_devices;
6903out:
6904        return fs_devices;
6905}
6906
6907static int read_one_dev(struct extent_buffer *leaf,
6908                        struct btrfs_dev_item *dev_item)
6909{
6910        struct btrfs_fs_info *fs_info = leaf->fs_info;
6911        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6912        struct btrfs_device *device;
6913        u64 devid;
6914        int ret;
6915        u8 fs_uuid[BTRFS_FSID_SIZE];
6916        u8 dev_uuid[BTRFS_UUID_SIZE];
6917
6918        devid = btrfs_device_id(leaf, dev_item);
6919        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6920                           BTRFS_UUID_SIZE);
6921        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6922                           BTRFS_FSID_SIZE);
6923
6924        if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6925                fs_devices = open_seed_devices(fs_info, fs_uuid);
6926                if (IS_ERR(fs_devices))
6927                        return PTR_ERR(fs_devices);
6928        }
6929
6930        device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6931                                   fs_uuid, true);
6932        if (!device) {
6933                if (!btrfs_test_opt(fs_info, DEGRADED)) {
6934                        btrfs_report_missing_device(fs_info, devid,
6935                                                        dev_uuid, true);
6936                        return -ENOENT;
6937                }
6938
6939                device = add_missing_dev(fs_devices, devid, dev_uuid);
6940                if (IS_ERR(device)) {
6941                        btrfs_err(fs_info,
6942                                "failed to add missing dev %llu: %ld",
6943                                devid, PTR_ERR(device));
6944                        return PTR_ERR(device);
6945                }
6946                btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6947        } else {
6948                if (!device->bdev) {
6949                        if (!btrfs_test_opt(fs_info, DEGRADED)) {
6950                                btrfs_report_missing_device(fs_info,
6951                                                devid, dev_uuid, true);
6952                                return -ENOENT;
6953                        }
6954                        btrfs_report_missing_device(fs_info, devid,
6955                                                        dev_uuid, false);
6956                }
6957
6958                if (!device->bdev &&
6959                    !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6960                        /*
6961                         * This happens when a device that was properly set up
6962                         * in the device info lists suddenly goes bad.
6963                         * device->bdev is NULL, so we have to set the
6964                         * BTRFS_DEV_STATE_MISSING bit here.
6965                         */
6966                        device->fs_devices->missing_devices++;
6967                        set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6968                }
6969
6970                /* Move the device to its own fs_devices */
6971                if (device->fs_devices != fs_devices) {
6972                        ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6973                                                        &device->dev_state));
6974
6975                        list_move(&device->dev_list, &fs_devices->devices);
6976                        device->fs_devices->num_devices--;
6977                        fs_devices->num_devices++;
6978
6979                        device->fs_devices->missing_devices--;
6980                        fs_devices->missing_devices++;
6981
6982                        device->fs_devices = fs_devices;
6983                }
6984        }
6985
6986        if (device->fs_devices != fs_info->fs_devices) {
6987                BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6988                if (device->generation !=
6989                    btrfs_device_generation(leaf, dev_item))
6990                        return -EINVAL;
6991        }
6992
6993        fill_device_from_item(leaf, dev_item, device);
6994        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6995        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6996           !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6997                device->fs_devices->total_rw_bytes += device->total_bytes;
6998                atomic64_add(device->total_bytes - device->bytes_used,
6999                                &fs_info->free_chunk_space);
7000        }
7001        ret = 0;
7002        return ret;
7003}
7004
7005int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
7006{
7007        struct btrfs_root *root = fs_info->tree_root;
7008        struct btrfs_super_block *super_copy = fs_info->super_copy;
7009        struct extent_buffer *sb;
7010        struct btrfs_disk_key *disk_key;
7011        struct btrfs_chunk *chunk;
7012        u8 *array_ptr;
7013        unsigned long sb_array_offset;
7014        int ret = 0;
7015        u32 num_stripes;
7016        u32 array_size;
7017        u32 len = 0;
7018        u32 cur_offset;
7019        u64 type;
7020        struct btrfs_key key;
7021
7022        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
7023        /*
7024         * This will create an extent buffer of nodesize; the superblock size is
7025         * fixed to BTRFS_SUPER_INFO_SIZE.  If nodesize > sb size, this will
7026         * overallocate, but we can keep it as-is since only the first page is used.
7027         */
7028        sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
7029        if (IS_ERR(sb))
7030                return PTR_ERR(sb);
7031        set_extent_buffer_uptodate(sb);
7032        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
7033        /*
7034         * The sb extent buffer is artificial and just used to read the system array.
7035 * The set_extent_buffer_uptodate() call does not properly mark all its
7036         * pages up-to-date when the page is larger: extent does not cover the
7037         * whole page and consequently check_page_uptodate does not find all
7038         * the page's extents up-to-date (the hole beyond sb),
7039         * write_extent_buffer then triggers a WARN_ON.
7040         *
7041         * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
7042         * but sb spans only this function. Add an explicit SetPageUptodate call
7043         * to silence the warning e.g. on PowerPC 64.
7044         */
7045        if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7046                SetPageUptodate(sb->pages[0]);
7047
7048        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7049        array_size = btrfs_super_sys_array_size(super_copy);
7050
7051        array_ptr = super_copy->sys_chunk_array;
7052        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7053        cur_offset = 0;
7054
7055        while (cur_offset < array_size) {
7056                disk_key = (struct btrfs_disk_key *)array_ptr;
7057                len = sizeof(*disk_key);
7058                if (cur_offset + len > array_size)
7059                        goto out_short_read;
7060
7061                btrfs_disk_key_to_cpu(&key, disk_key);
7062
7063                array_ptr += len;
7064                sb_array_offset += len;
7065                cur_offset += len;
7066
7067                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
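                        /*
                         * Note: sb_array_offset, not array_ptr, is cast here;
                         * the extent buffer accessors used on chunk below
                         * treat the struct pointer as an offset into the sb
                         * extent buffer.
                         */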
7068                        chunk = (struct btrfs_chunk *)sb_array_offset;
7069                        /*
7070                         * At least one btrfs_chunk with one stripe must be
7071                         * present, exact stripe count check comes afterwards
7072                         */
7073                        len = btrfs_chunk_item_size(1);
7074                        if (cur_offset + len > array_size)
7075                                goto out_short_read;
7076
7077                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7078                        if (!num_stripes) {
7079                                btrfs_err(fs_info,
7080                                        "invalid number of stripes %u in sys_array at offset %u",
7081                                        num_stripes, cur_offset);
7082                                ret = -EIO;
7083                                break;
7084                        }
7085
7086                        type = btrfs_chunk_type(sb, chunk);
7087                        if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7088                                btrfs_err(fs_info,
7089                            "invalid chunk type %llu in sys_array at offset %u",
7090                                        type, cur_offset);
7091                                ret = -EIO;
7092                                break;
7093                        }
7094
7095                        len = btrfs_chunk_item_size(num_stripes);
7096                        if (cur_offset + len > array_size)
7097                                goto out_short_read;
7098
7099                        ret = read_one_chunk(&key, sb, chunk);
7100                        if (ret)
7101                                break;
7102                } else {
7103                        btrfs_err(fs_info,
7104                            "unexpected item type %u in sys_array at offset %u",
7105                                  (u32)key.type, cur_offset);
7106                        ret = -EIO;
7107                        break;
7108                }
7109                array_ptr += len;
7110                sb_array_offset += len;
7111                cur_offset += len;
7112        }
7113        clear_extent_buffer_uptodate(sb);
7114        free_extent_buffer_stale(sb);
7115        return ret;
7116
7117out_short_read:
7118        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7119                        len, cur_offset);
7120        clear_extent_buffer_uptodate(sb);
7121        free_extent_buffer_stale(sb);
7122        return -EIO;
7123}
7124
7125/*
7126 * Check if all chunks in the fs are OK for read-write degraded mount
7127 *
7128 * If the @failing_dev is specified, it's accounted as missing.
7129 *
7130 * Return true if all chunks meet the minimal RW mount requirements.
7131 * Return false if any chunk doesn't meet the minimal RW mount requirements.
7132 */
7133bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7134                                        struct btrfs_device *failing_dev)
7135{
7136        struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7137        struct extent_map *em;
7138        u64 next_start = 0;
7139        bool ret = true;
7140
7141        read_lock(&map_tree->lock);
7142        em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7143        read_unlock(&map_tree->lock);
7144        /* No chunk at all? Return false anyway */
7145        if (!em) {
7146                ret = false;
7147                goto out;
7148        }
7149        while (em) {
7150                struct map_lookup *map;
7151                int missing = 0;
7152                int max_tolerated;
7153                int i;
7154
7155                map = em->map_lookup;
7156                max_tolerated =
7157                        btrfs_get_num_tolerated_disk_barrier_failures(
7158                                        map->type);
7159                for (i = 0; i < map->num_stripes; i++) {
7160                        struct btrfs_device *dev = map->stripes[i].dev;
7161
7162                        if (!dev || !dev->bdev ||
7163                            test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7164                            dev->last_flush_error)
7165                                missing++;
7166                        else if (failing_dev && failing_dev == dev)
7167                                missing++;
7168                }
7169                if (missing > max_tolerated) {
7170                        if (!failing_dev)
7171                                btrfs_warn(fs_info,
7172        "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7173                                   em->start, missing, max_tolerated);
7174                        free_extent_map(em);
7175                        ret = false;
7176                        goto out;
7177                }
7178                next_start = extent_map_end(em);
7179                free_extent_map(em);
7180
7181                read_lock(&map_tree->lock);
7182                em = lookup_extent_mapping(map_tree, next_start,
7183                                           (u64)(-1) - next_start);
7184                read_unlock(&map_tree->lock);
7185        }
7186out:
7187        return ret;
7188}
7189
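/*
 * Read the whole chunk tree: every device item first, then every chunk item,
 * followed by a round of validation against the superblock's device counts
 * and sizes.
 */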
7190int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7191{
7192        struct btrfs_root *root = fs_info->chunk_root;
7193        struct btrfs_path *path;
7194        struct extent_buffer *leaf;
7195        struct btrfs_key key;
7196        struct btrfs_key found_key;
7197        int ret;
7198        int slot;
7199        u64 total_dev = 0;
7200
7201        path = btrfs_alloc_path();
7202        if (!path)
7203                return -ENOMEM;
7204
7205        /*
7206         * uuid_mutex is needed only if we are mounting a sprout FS,
7207         * in which case read_one_dev() can move devices between fs_devices.
7208         */
7209        mutex_lock(&uuid_mutex);
7210        mutex_lock(&fs_info->chunk_mutex);
7211
7212        /*
7213         * Read all device items, and then all the chunk items. All
7214         * device items are found before any chunk item (their object id
7215         * is smaller than the lowest possible object id for a chunk
7216         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7217         */
7218        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7219        key.offset = 0;
7220        key.type = 0;
7221        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7222        if (ret < 0)
7223                goto error;
7224        while (1) {
7225                leaf = path->nodes[0];
7226                slot = path->slots[0];
7227                if (slot >= btrfs_header_nritems(leaf)) {
7228                        ret = btrfs_next_leaf(root, path);
7229                        if (ret == 0)
7230                                continue;
7231                        if (ret < 0)
7232                                goto error;
7233                        break;
7234                }
7235                btrfs_item_key_to_cpu(leaf, &found_key, slot);
7236                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7237                        struct btrfs_dev_item *dev_item;
7238                        dev_item = btrfs_item_ptr(leaf, slot,
7239                                                  struct btrfs_dev_item);
7240                        ret = read_one_dev(leaf, dev_item);
7241                        if (ret)
7242                                goto error;
7243                        total_dev++;
7244                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7245                        struct btrfs_chunk *chunk;
7246                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7247                        ret = read_one_chunk(&found_key, leaf, chunk);
7248                        if (ret)
7249                                goto error;
7250                }
7251                path->slots[0]++;
7252        }
7253
7254        /*
7255         * After loading chunk tree, we've got all device information,
7256         * do another round of validation checks.
7257         */
7258        if (total_dev != fs_info->fs_devices->total_devices) {
7259                btrfs_err(fs_info,
7260           "super_num_devices %llu mismatch with num_devices %llu found here",
7261                          btrfs_super_num_devices(fs_info->super_copy),
7262                          total_dev);
7263                ret = -EINVAL;
7264                goto error;
7265        }
7266        if (btrfs_super_total_bytes(fs_info->super_copy) <
7267            fs_info->fs_devices->total_rw_bytes) {
7268                btrfs_err(fs_info,
7269        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7270                          btrfs_super_total_bytes(fs_info->super_copy),
7271                          fs_info->fs_devices->total_rw_bytes);
7272                ret = -EINVAL;
7273                goto error;
7274        }
7275        ret = 0;
7276error:
7277        mutex_unlock(&fs_info->chunk_mutex);
7278        mutex_unlock(&uuid_mutex);
7279
7280        btrfs_free_path(path);
7281        return ret;
7282}
7283
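/*
 * Point every device, including those of seed filesystems, at the fs_info
 * that now owns them.
 */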
7284void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7285{
7286        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7287        struct btrfs_device *device;
7288
7289        while (fs_devices) {
7290                mutex_lock(&fs_devices->device_list_mutex);
7291                list_for_each_entry(device, &fs_devices->devices, dev_list)
7292                        device->fs_info = fs_info;
7293                mutex_unlock(&fs_devices->device_list_mutex);
7294
7295                fs_devices = fs_devices->seed;
7296        }
7297}
7298
7299static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
7300{
7301        int i;
7302
7303        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7304                btrfs_dev_stat_reset(dev, i);
7305}
7306
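/*
 * Load the persistent per-device error statistics from the device tree.
 * A device without a stats item simply starts with all counters at zero.
 */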
7307int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7308{
7309        struct btrfs_key key;
7310        struct btrfs_key found_key;
7311        struct btrfs_root *dev_root = fs_info->dev_root;
7312        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7313        struct extent_buffer *eb;
7314        int slot;
7315        int ret = 0;
7316        struct btrfs_device *device;
7317        struct btrfs_path *path = NULL;
7318        int i;
7319
7320        path = btrfs_alloc_path();
7321        if (!path) {
7322                ret = -ENOMEM;
7323                goto out;
7324        }
7325
7326        mutex_lock(&fs_devices->device_list_mutex);
7327        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7328                int item_size;
7329                struct btrfs_dev_stats_item *ptr;
7330
7331                key.objectid = BTRFS_DEV_STATS_OBJECTID;
7332                key.type = BTRFS_PERSISTENT_ITEM_KEY;
7333                key.offset = device->devid;
7334                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
7335                if (ret) {
7336                        __btrfs_reset_dev_stats(device);
7337                        device->dev_stats_valid = 1;
7338                        btrfs_release_path(path);
7339                        continue;
7340                }
7341                slot = path->slots[0];
7342                eb = path->nodes[0];
7343                btrfs_item_key_to_cpu(eb, &found_key, slot);
7344                item_size = btrfs_item_size_nr(eb, slot);
7345
7346                ptr = btrfs_item_ptr(eb, slot,
7347                                     struct btrfs_dev_stats_item);
7348
7349                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7350                        if (item_size >= (1 + i) * sizeof(__le64))
7351                                btrfs_dev_stat_set(device, i,
7352                                        btrfs_dev_stats_value(eb, ptr, i));
7353                        else
7354                                btrfs_dev_stat_reset(device, i);
7355                }
7356
7357                device->dev_stats_valid = 1;
7358                btrfs_dev_stat_print_on_load(device);
7359                btrfs_release_path(path);
7360        }
7361        mutex_unlock(&fs_devices->device_list_mutex);
7362
7363out:
7364        btrfs_free_path(path);
7365        return ret < 0 ? ret : 0;
7366}
7367
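/*
 * Write the in-memory counters of @device to its dev_stats item in the
 * device tree, recreating the item if an existing one is too small to hold
 * all BTRFS_DEV_STAT_VALUES_MAX values.
 */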
7368static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7369                                struct btrfs_device *device)
7370{
7371        struct btrfs_fs_info *fs_info = trans->fs_info;
7372        struct btrfs_root *dev_root = fs_info->dev_root;
7373        struct btrfs_path *path;
7374        struct btrfs_key key;
7375        struct extent_buffer *eb;
7376        struct btrfs_dev_stats_item *ptr;
7377        int ret;
7378        int i;
7379
7380        key.objectid = BTRFS_DEV_STATS_OBJECTID;
7381        key.type = BTRFS_PERSISTENT_ITEM_KEY;
7382        key.offset = device->devid;
7383
7384        path = btrfs_alloc_path();
7385        if (!path)
7386                return -ENOMEM;
7387        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7388        if (ret < 0) {
7389                btrfs_warn_in_rcu(fs_info,
7390                        "error %d while searching for dev_stats item for device %s",
7391                              ret, rcu_str_deref(device->name));
7392                goto out;
7393        }
7394
7395        if (ret == 0 &&
7396            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7397                /* need to delete old one and insert a new one */
7398                ret = btrfs_del_item(trans, dev_root, path);
7399                if (ret != 0) {
7400                        btrfs_warn_in_rcu(fs_info,
7401                                "delete too small dev_stats item for device %s failed %d",
7402                                      rcu_str_deref(device->name), ret);
7403                        goto out;
7404                }
7405                ret = 1;
7406        }
7407
7408        if (ret == 1) {
7409                /* need to insert a new item */
7410                btrfs_release_path(path);
7411                ret = btrfs_insert_empty_item(trans, dev_root, path,
7412                                              &key, sizeof(*ptr));
7413                if (ret < 0) {
7414                        btrfs_warn_in_rcu(fs_info,
7415                                "insert dev_stats item for device %s failed %d",
7416                                rcu_str_deref(device->name), ret);
7417                        goto out;
7418                }
7419        }
7420
7421        eb = path->nodes[0];
7422        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7423        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7424                btrfs_set_dev_stats_value(eb, ptr, i,
7425                                          btrfs_dev_stat_read(device, i));
7426        btrfs_mark_buffer_dirty(eb);
7427
7428out:
7429        btrfs_free_path(path);
7430        return ret;
7431}
7432
7433/*
7434 * Called from commit_transaction. Writes all changed device stats to disk.
7435 */
7436int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7437{
7438        struct btrfs_fs_info *fs_info = trans->fs_info;
7439        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7440        struct btrfs_device *device;
7441        int stats_cnt;
7442        int ret = 0;
7443
7444        mutex_lock(&fs_devices->device_list_mutex);
7445        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7446                stats_cnt = atomic_read(&device->dev_stats_ccnt);
7447                if (!device->dev_stats_valid || stats_cnt == 0)
7448                        continue;
7449
7451                /*
7452                 * There is a LOAD-LOAD control dependency between the value of
7453                 * dev_stats_ccnt and updating the on-disk values which requires
7454                 * reading the in-memory counters. Such control dependencies
7455                 * require explicit read memory barriers.
7456                 *
7457                 * This memory barrier pairs with smp_mb__before_atomic in
7458                 * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7459                 * barrier implied by atomic_xchg in
7460                 * btrfs_dev_stat_read_and_reset().
7461                 */
7462                smp_rmb();
7463
7464                ret = update_dev_stat_item(trans, device);
7465                if (!ret)
7466                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7467        }
7468        mutex_unlock(&fs_devices->device_list_mutex);
7469
7470        return ret;
7471}
7472
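/* Bump one error counter and print the (ratelimited) per-device summary */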
7473void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7474{
7475        btrfs_dev_stat_inc(dev, index);
7476        btrfs_dev_stat_print_on_error(dev);
7477}
7478
7479static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7480{
7481        if (!dev->dev_stats_valid)
7482                return;
7483        btrfs_err_rl_in_rcu(dev->fs_info,
7484                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7485                           rcu_str_deref(dev->name),
7486                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7487                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7488                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7489                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7490                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7491}
7492
7493static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7494{
7495        int i;
7496
7497        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7498                if (btrfs_dev_stat_read(dev, i) != 0)
7499                        break;
7500        if (i == BTRFS_DEV_STAT_VALUES_MAX)
7501                return; /* all values == 0, suppress message */
7502
7503        btrfs_info_in_rcu(dev->fs_info,
7504                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7505               rcu_str_deref(dev->name),
7506               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7507               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7508               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7509               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7510               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7511}
7512
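/*
 * Back end of the BTRFS_IOC_GET_DEV_STATS ioctl: copy the counters of one
 * device to @stats and, if BTRFS_DEV_STATS_RESET is set, reset them.
 */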
7513int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7514                        struct btrfs_ioctl_get_dev_stats *stats)
7515{
7516        struct btrfs_device *dev;
7517        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7518        int i;
7519
7520        mutex_lock(&fs_devices->device_list_mutex);
7521        dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7522                                true);
7523        mutex_unlock(&fs_devices->device_list_mutex);
7524
7525        if (!dev) {
7526                btrfs_warn(fs_info, "get dev_stats failed, device not found");
7527                return -ENODEV;
7528        } else if (!dev->dev_stats_valid) {
7529                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7530                return -ENODEV;
7531        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7532                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7533                        if (stats->nr_items > i)
7534                                stats->values[i] =
7535                                        btrfs_dev_stat_read_and_reset(dev, i);
7536                        else
7537                                btrfs_dev_stat_reset(dev, i);
7538                }
7539        } else {
7540                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7541                        if (stats->nr_items > i)
7542                                stats->values[i] = btrfs_dev_stat_read(dev, i);
7543        }
7544        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7545                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7546        return 0;
7547}
7548
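/*
 * Wipe the magic of every superblock copy on @bdev so the device no longer
 * scans as a btrfs device, then let udev and libblkid know it changed.
 */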
7549void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7550{
7551        struct buffer_head *bh;
7552        struct btrfs_super_block *disk_super;
7553        int copy_num;
7554
7555        if (!bdev)
7556                return;
7557
7558        for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7559                copy_num++) {
7561                if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7562                        continue;
7563
7564                disk_super = (struct btrfs_super_block *)bh->b_data;
7565
7566                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7567                set_buffer_dirty(bh);
7568                sync_dirty_buffer(bh);
7569                brelse(bh);
7570        }
7571
7572        /* Notify udev that device has changed */
7573        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7574
7575        /* Update ctime/mtime for device path for libblkid */
7576        update_dev_time(device_path);
7577}
7578
7579/*
7580 * Update the size and bytes used for each device where it changed.  This is
7581 * delayed since we would otherwise get errors while writing out the
7582 * superblocks.
7583 *
7584 * Must be invoked during transaction commit.
7585 */
7586void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7587{
7588        struct btrfs_device *curr, *next;
7589
7590        ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7591
7592        if (list_empty(&trans->dev_update_list))
7593                return;
7594
7595        /*
7596         * We don't need the device_list_mutex here.  This list is owned by the
7597         * transaction and the transaction must complete before the device is
7598         * released.
7599         */
7600        mutex_lock(&trans->fs_info->chunk_mutex);
7601        list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7602                                 post_commit_list) {
7603                list_del_init(&curr->post_commit_list);
7604                curr->commit_total_bytes = curr->disk_total_bytes;
7605                curr->commit_bytes_used = curr->bytes_used;
7606        }
7607        mutex_unlock(&trans->fs_info->chunk_mutex);
7608}
7609
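/* Set the fs_info back pointer on every fs_devices in the seed chain */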
7610void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7611{
7612        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

7613        while (fs_devices) {
7614                fs_devices->fs_info = fs_info;
7615                fs_devices = fs_devices->seed;
7616        }
7617}
7618
7619void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7620{
7621        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;

7622        while (fs_devices) {
7623                fs_devices->fs_info = NULL;
7624                fs_devices = fs_devices->seed;
7625        }
7626}
7627
7628/*
7629 * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7630 */
7631int btrfs_bg_type_to_factor(u64 flags)
7632{
7633        const int index = btrfs_bg_flags_to_raid_index(flags);
7634
7635        return btrfs_raid_array[index].ncopies;
7636}
7637
7638
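/*
 * Sanity check one dev extent against the chunk mapping: the chunk must
 * exist, the length must match the chunk's stripe length, the (devid,
 * physical offset) pair must be one of the chunk's stripes, and the extent
 * must not run past the end of the device.
 */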
7640static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7641                                 u64 chunk_offset, u64 devid,
7642                                 u64 physical_offset, u64 physical_len)
7643{
7644        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7645        struct extent_map *em;
7646        struct map_lookup *map;
7647        struct btrfs_device *dev;
7648        u64 stripe_len;
7649        bool found = false;
7650        int ret = 0;
7651        int i;
7652
7653        read_lock(&em_tree->lock);
7654        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7655        read_unlock(&em_tree->lock);
7656
7657        if (!em) {
7658                btrfs_err(fs_info,
7659"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7660                          physical_offset, devid);
7661                ret = -EUCLEAN;
7662                goto out;
7663        }
7664
7665        map = em->map_lookup;
7666        stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7667        if (physical_len != stripe_len) {
7668                btrfs_err(fs_info,
7669"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7670                          physical_offset, devid, em->start, physical_len,
7671                          stripe_len);
7672                ret = -EUCLEAN;
7673                goto out;
7674        }
7675
7676        for (i = 0; i < map->num_stripes; i++) {
7677                if (map->stripes[i].dev->devid == devid &&
7678                    map->stripes[i].physical == physical_offset) {
7679                        found = true;
7680                        if (map->verified_stripes >= map->num_stripes) {
7681                                btrfs_err(fs_info,
7682                                "too many dev extents for chunk %llu found",
7683                                          em->start);
7684                                ret = -EUCLEAN;
7685                                goto out;
7686                        }
7687                        map->verified_stripes++;
7688                        break;
7689                }
7690        }
7691        if (!found) {
7692                btrfs_err(fs_info,
7693        "dev extent physical offset %llu devid %llu has no corresponding chunk",
7694                        physical_offset, devid);
7695                ret = -EUCLEAN;
7696        }
7697
7698        /* Make sure no dev extent is beyond device boundary */
7699        dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7700        if (!dev) {
7701                btrfs_err(fs_info, "failed to find devid %llu", devid);
7702                ret = -EUCLEAN;
7703                goto out;
7704        }
7705
7706        /* It's possible this device is a dummy for a seed device */
7707        if (dev->disk_total_bytes == 0) {
7708                dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL,
7709                                        NULL, false);
7710                if (!dev) {
7711                        btrfs_err(fs_info, "failed to find seed devid %llu",
7712                                  devid);
7713                        ret = -EUCLEAN;
7714                        goto out;
7715                }
7716        }
7717
7718        if (physical_offset + physical_len > dev->disk_total_bytes) {
7719                btrfs_err(fs_info,
7720"dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7721                          devid, physical_offset, physical_len,
7722                          dev->disk_total_bytes);
7723                ret = -EUCLEAN;
7724                goto out;
7725        }
7726out:
7727        free_extent_map(em);
7728        return ret;
7729}
7730
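/*
 * Verify that every stripe of every cached chunk mapping was matched by a
 * dev extent in verify_one_dev_extent().
 */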
7731static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7732{
7733        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7734        struct extent_map *em;
7735        struct rb_node *node;
7736        int ret = 0;
7737
7738        read_lock(&em_tree->lock);
7739        for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7740                em = rb_entry(node, struct extent_map, rb_node);
7741                if (em->map_lookup->num_stripes !=
7742                    em->map_lookup->verified_stripes) {
7743                        btrfs_err(fs_info,
7744                        "chunk %llu has missing dev extent, have %d expect %d",
7745                                  em->start, em->map_lookup->verified_stripes,
7746                                  em->map_lookup->num_stripes);
7747                        ret = -EUCLEAN;
7748                        goto out;
7749                }
7750        }
7751out:
7752        read_unlock(&em_tree->lock);
7753        return ret;
7754}
7755
7756/*
7757 * Ensure that all dev extents are mapped to the correct chunk, otherwise
7758 * later chunk allocation/free would cause unexpected behavior.
7759 *
7760 * NOTE: This will iterate through the whole device tree, which should be
7761 * about the same size as the chunk tree.  This slightly increases mount time.
7762 */
7763int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7764{
7765        struct btrfs_path *path;
7766        struct btrfs_root *root = fs_info->dev_root;
7767        struct btrfs_key key;
7768        u64 prev_devid = 0;
7769        u64 prev_dev_ext_end = 0;
7770        int ret = 0;
7771
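        /*
         * Dev extents are keyed by (devid, BTRFS_DEV_EXTENT_KEY,
         * physical offset); start the walk at devid 1, offset 0.
         */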
7772        key.objectid = 1;
7773        key.type = BTRFS_DEV_EXTENT_KEY;
7774        key.offset = 0;
7775
7776        path = btrfs_alloc_path();
7777        if (!path)
7778                return -ENOMEM;
7779
7780        path->reada = READA_FORWARD;
7781        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7782        if (ret < 0)
7783                goto out;
7784
7785        if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7786                ret = btrfs_next_item(root, path);
7787                if (ret < 0)
7788                        goto out;
7789                /* No dev extents at all? Not good */
7790                if (ret > 0) {
7791                        ret = -EUCLEAN;
7792                        goto out;
7793                }
7794        }
7795        while (1) {
7796                struct extent_buffer *leaf = path->nodes[0];
7797                struct btrfs_dev_extent *dext;
7798                int slot = path->slots[0];
7799                u64 chunk_offset;
7800                u64 physical_offset;
7801                u64 physical_len;
7802                u64 devid;
7803
7804                btrfs_item_key_to_cpu(leaf, &key, slot);
7805                if (key.type != BTRFS_DEV_EXTENT_KEY)
7806                        break;
7807                devid = key.objectid;
7808                physical_offset = key.offset;
7809
7810                dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7811                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7812                physical_len = btrfs_dev_extent_length(leaf, dext);
7813
7814                /* Check if this dev extent overlaps with the previous one */
7815                if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7816                        btrfs_err(fs_info,
7817"dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7818                                  devid, physical_offset, prev_dev_ext_end);
7819                        ret = -EUCLEAN;
7820                        goto out;
7821                }
7822
7823                ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7824                                            physical_offset, physical_len);
7825                if (ret < 0)
7826                        goto out;
7827                prev_devid = devid;
7828                prev_dev_ext_end = physical_offset + physical_len;
7829
7830                ret = btrfs_next_item(root, path);
7831                if (ret < 0)
7832                        goto out;
7833                if (ret > 0) {
7834                        ret = 0;
7835                        break;
7836                }
7837        }
7838
7839        /* Ensure all chunks have corresponding dev extents */
7840        ret = verify_chunk_dev_extent_mapping(fs_info);
7841out:
7842        btrfs_free_path(path);
7843        return ret;
7844}
7845
7846/*
7847 * Check whether the given block group or device is pinned by any inode being
7848 * used as a swapfile.
7849 */
7850bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7851{
7852        struct btrfs_swapfile_pin *sp;
7853        struct rb_node *node;
7854
7855        spin_lock(&fs_info->swapfile_pins_lock);
7856        node = fs_info->swapfile_pins.rb_node;
7857        while (node) {
7858                sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7859                if (ptr < sp->ptr)
7860                        node = node->rb_left;
7861                else if (ptr > sp->ptr)
7862                        node = node->rb_right;
7863                else
7864                        break;
7865        }
7866        spin_unlock(&fs_info->swapfile_pins_lock);
7867        return node != NULL;
7868}
7869