linux/fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <linux/uuid.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = {
                .sub_stripes    = 2,
                .dev_stripes    = 1,
                .devs_max       = 0,    /* 0 == as many as possible */
                .devs_min       = 4,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID1] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 2,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 2,
                .ncopies        = 2,
        },
        [BTRFS_RAID_DUP] = {
                .sub_stripes    = 1,
                .dev_stripes    = 2,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID0] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_SINGLE] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 1,
                .devs_min       = 1,
                .tolerated_failures = 0,
                .devs_increment = 1,
                .ncopies        = 1,
        },
        [BTRFS_RAID_RAID5] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 2,
                .tolerated_failures = 1,
                .devs_increment = 1,
                .ncopies        = 2,
        },
        [BTRFS_RAID_RAID6] = {
                .sub_stripes    = 1,
                .dev_stripes    = 1,
                .devs_max       = 0,
                .devs_min       = 3,
                .tolerated_failures = 2,
                .devs_increment = 1,
                .ncopies        = 3,
        },
};

const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
        [BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
        [BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
        [BTRFS_RAID_RAID0]  = BTRFS_BLOCK_GROUP_RAID0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_BLOCK_GROUP_RAID5,
        [BTRFS_RAID_RAID6]  = BTRFS_BLOCK_GROUP_RAID6,
};

/*
 * Table to convert BTRFS_RAID_* to the error code if the minimum number of
 * devices condition is not met. Zero means there's no corresponding
 * BTRFS_ERROR_DEV_*_NOT_MET value.
 */
const int btrfs_raid_mindev_error[BTRFS_NR_RAID_TYPES] = {
        [BTRFS_RAID_RAID10] = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
        [BTRFS_RAID_RAID1]  = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
        [BTRFS_RAID_DUP]    = 0,
        [BTRFS_RAID_RAID0]  = 0,
        [BTRFS_RAID_SINGLE] = 0,
        [BTRFS_RAID_RAID5]  = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
        [BTRFS_RAID_RAID6]  = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
};
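
/*
 * Example (illustrative sketch, not from the original source; the helper
 * name is hypothetical): callers combine the tables above when validating
 * a profile. A device-removal path can check the minimum device count
 * along these lines:
 *
 *      static int example_check_min_devs(u64 num_devices, int raid_index)
 *      {
 *              if (num_devices < btrfs_raid_array[raid_index].devs_min)
 *                      return btrfs_raid_mindev_error[raid_index];
 *              return 0;
 *      }
 *
 * where a non-zero return is one of the BTRFS_ERROR_DEV_*_NOT_MET codes
 * reported back to user space.
 */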

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_fs_info *fs_info);
static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
                             enum btrfs_map_op op,
                             u64 logical, u64 *length,
                             struct btrfs_bio **bbio_ret,
                             int mirror_num, int need_raid_map);

DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);
struct list_head *btrfs_get_fs_uuids(void)
{
        return &fs_uuids;
}

/*
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       if not NULL, copy the uuid to fs_devices::fsid
 *
 * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 * The returned struct is not linked onto any lists and can be destroyed with
 * kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->list);
        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);

        return fs_devs;
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                rcu_string_free(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_KERNEL);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        /*
         * Preallocate a bio that's always going to be used for flushing device
         * barriers and matches the device lifespan
         */
        dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
        if (!dev->flush_bio) {
                kfree(dev);
                return ERR_PTR(-ENOMEM);
        }
        bio_get(dev->flush_bio);

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        btrfs_device_data_ordered_init(dev);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);

        return dev;
}

/*
 * Find a device specified by @devid or @uuid in the list of @fs_devices, or
 * return NULL.
 *
 * If devid and uuid are both specified, the match must be exact, otherwise
 * only devid is used.
 */
static struct btrfs_device *find_device(struct btrfs_fs_devices *fs_devices,
                u64 devid, const u8 *uuid)
{
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (IS_ERR(*bh)) {
                ret = PTR_ERR(*bh);
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}
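
/*
 * Example (illustrative sketch, not from the original source): the usual
 * pattern around btrfs_get_bdev_and_sb() is open, use the super block,
 * then release both handles:
 *
 *      struct block_device *bdev;
 *      struct buffer_head *bh;
 *      struct btrfs_super_block *sb;
 *
 *      if (!btrfs_get_bdev_and_sb(path, FMODE_READ | FMODE_EXCL, holder,
 *                                 1, &bdev, &bh)) {
 *              sb = (struct btrfs_super_block *)bh->b_data;
 *              (inspect sb here)
 *              brelse(bh);
 *              blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 *      }
 *
 * On failure the helper has already put the bdev and NULLed both output
 * pointers, so the caller only cleans up on success.
 */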

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * we try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, setup a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = device->bdev->bd_bdi;
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                /*
                 * atomic_dec_return implies a barrier for waitqueue_active
                 */
                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->__bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur);
                num_run++;
                batch_run++;

                cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things, it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

void btrfs_free_stale_device(struct btrfs_device *cur_dev)
{
        struct btrfs_fs_devices *fs_devs;
        struct btrfs_device *dev;

        if (!cur_dev->name)
                return;

        list_for_each_entry(fs_devs, &fs_uuids, list) {
                int del = 1;

                if (fs_devs->opened)
                        continue;
                if (fs_devs->seeding)
                        continue;

                list_for_each_entry(dev, &fs_devs->devices, dev_list) {

                        if (dev == cur_dev)
                                continue;
                        if (!dev->name)
                                continue;

                        /*
                         * Todo: this won't be enough. What if the same device
                         * comes back (with a new uuid) under its mapper path?
                         * But for now this does help, as mostly an admin will
                         * use either the mapper or the non-mapper path
                         * throughout.
                         */
                        rcu_read_lock();
                        del = strcmp(rcu_str_deref(dev->name),
                                                rcu_str_deref(cur_dev->name));
                        rcu_read_unlock();
                        if (!del)
                                break;
                }

                if (!del) {
                        /* delete the stale device */
                        if (fs_devs->num_devices == 1) {
                                btrfs_sysfs_remove_fsid(fs_devs);
                                list_del(&fs_devs->list);
                                free_fs_devices(fs_devs);
                        } else {
                                fs_devs->num_devices--;
                                list_del(&dev->dev_list);
                                rcu_string_free(dev->name);
                                kfree(dev);
                        }
                        break;
                }
        }
}

/*
 * Add a new device to the list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        int ret = 0;
        u64 found_transid = btrfs_super_generation(disk_super);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return PTR_ERR(fs_devices);

                list_add(&fs_devices->list, &fs_uuids);

                device = NULL;
        } else {
                device = find_device(fs_devices, devid,
                                disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return PTR_ERR(device);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        return -ENOMEM;
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                ret = 1;
                device->fs_devices = fs_devices;
        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When the FS is already mounted:
                 * 1. If you are here and if the device->name is NULL, that
                 *    means this device was missing at the time of the FS
                 *    mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path', that means either
                 *      a. The same device disappeared and reappeared with a
                 *         different name, or
                 *      b. The missing-disk-which-was-replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away,
                 * and in case of 2a the stale bdev has to be updated as
                 * well. 2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow updates to btrfs_fs_device through
                 * the btrfs dev scan cli after the FS has been mounted.
                 * We're still tracking a problem where systems fail mount
                 * by subvolume id when we reject replacement on a mounted
                 * FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you
                         * are here, that means there is more than one
                         * disk with the same uuid and devid. We keep the
                         * one with the larger generation number or the
                         * last-in if the generations are equal.
                         */
                        return -EEXIST;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero the
         * generation along with most of the other members. So just update
         * it back. We need it to pick the disk with the largest generation
         * (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        /*
         * if there is new btrfs on an already registered device,
         * then remove the stale device entry.
         */
        if (ret > 0)
                btrfs_free_stale_device(device);

        *fs_devices_ret = fs_devices;

        return ret;
}
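
/*
 * Example (illustrative sketch, not from the original source): a caller of
 * device_list_add() distinguishes the three documented outcomes:
 *
 *      ret = device_list_add(path, disk_super, devid, &fs_devices);
 *      if (ret > 0)
 *              (first time this device is seen: report/register it)
 *      else if (ret == 0)
 *              (device already known: nothing new to report)
 *      else
 *              (error such as -EBUSY, -EEXIST or -ENOMEM: propagate)
 *
 * btrfs_scan_one_device() below follows this pattern.
 */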

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We hold the volume lock; it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without the RCU read lock held because
                 * we hold the uuid_mutex, so nothing we touch in here is
                 * going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str,
                                        GFP_KERNEL);
                        if (!name) {
                                kfree(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path, it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!device->is_tgtdev_for_dev_replace &&
                            (!latest_dev ||
                             device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || device->is_tgtdev_for_dev_replace) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        if (!device->is_tgtdev_for_dev_replace)
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                rcu_string_free(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);
        rcu_string_free(device->name);
        bio_put(device->flush_bio);
        kfree(device);
}

static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

static void btrfs_close_bdev(struct btrfs_device *device)
{
        if (device->bdev && device->writeable) {
                sync_blockdev(device->bdev);
                invalidate_bdev(device->bdev);
        }

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);
}

static void btrfs_prepare_close_one_device(struct btrfs_device *device)
{
        struct btrfs_fs_devices *fs_devices = device->fs_devices;
        struct btrfs_device *new_device;
        struct rcu_string *name;

        if (device->bdev)
                fs_devices->open_devices--;

        if (device->writeable &&
            device->devid != BTRFS_DEV_REPLACE_DEVID) {
                list_del_init(&device->dev_alloc_list);
                fs_devices->rw_devices--;
        }

        if (device->missing)
                fs_devices->missing_devices--;

        new_device = btrfs_alloc_device(NULL, &device->devid,
                                        device->uuid);
        BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

        /* Safe because we are under uuid_mutex */
        if (device->name) {
                name = rcu_string_strdup(device->name->str, GFP_NOFS);
                BUG_ON(!name); /* -ENOMEM */
                rcu_assign_pointer(new_device->name, name);
        }

        list_replace_rcu(&device->dev_list, &new_device->dev_list);
        new_device->fs_devices = device->fs_devices;
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device, *tmp;
        struct list_head pending_put;

        INIT_LIST_HEAD(&pending_put);

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) {
                btrfs_prepare_close_one_device(device);
                list_add(&device->dev_list, &pending_put);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        /*
         * btrfs_show_devname() takes the device_list_mutex, and a call to
         * blkdev_put() can sometimes lead the VFS back into this function.
         * So, for now, do the put outside of the device_list_mutex.
         */
        while (!list_empty(&pending_put)) {
                device = list_first_entry(&pending_put,
                                struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                btrfs_close_bdev(device);
                call_rcu(&device->rcu, free_device);
        }

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        /*
         * Wait for the rcu workers started under __btrfs_close_devices to
         * finish all the blkdev_puts so the devices are really free when
         * the umount is done.
         */
        rcu_barrier();
        return ret;
}

static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                /* Just open everything we can; ignore failures here */
                if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                            &bdev, &bh))
                        continue;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q))
                        device->can_discard = 1;
                if (!blk_queue_nonrot(q))
                        fs_devices->rotating = 1;

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                fs_devices->open_devices++;
                if (device->writeable &&
                    device->devid != BTRFS_DEV_REPLACE_DEVID) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
                blkdev_put(bdev, flags);
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

void btrfs_release_disk_super(struct page *page)
{
        kunmap(page);
        put_page(page);
}

int btrfs_read_disk_super(struct block_device *bdev, u64 bytenr,
                struct page **page, struct btrfs_super_block **disk_super)
{
        void *p;
        pgoff_t index;

        /* make sure our super fits in the device */
        if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
                return 1;

        /* make sure our super fits in the page */
        if (sizeof(**disk_super) > PAGE_SIZE)
                return 1;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_SHIFT;
        if ((bytenr + sizeof(**disk_super) - 1) >> PAGE_SHIFT != index)
                return 1;

        /* pull in the page with our super */
        *page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_KERNEL);

        if (IS_ERR_OR_NULL(*page))
                return 1;

        p = kmap(*page);

        /* align our pointer to the offset of the super block */
        *disk_super = p + (bytenr & ~PAGE_MASK);

        if (btrfs_super_bytenr(*disk_super) != bytenr ||
            btrfs_super_magic(*disk_super) != BTRFS_MAGIC) {
                btrfs_release_disk_super(*page);
                return 1;
        }

        if ((*disk_super)->label[0] &&
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1])
                (*disk_super)->label[BTRFS_LABEL_SIZE - 1] = '\0';

        return 0;
}
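
/*
 * Worked example (not from the original source): with 4K pages and the
 * primary super block at bytenr 65536, index is 65536 >> 12 == 16, the
 * offset within the page is 65536 & ~PAGE_MASK == 0, and bytenr plus
 * sizeof(struct btrfs_super_block) - 1 still lands in page 16, so the
 * straddle check above passes and *disk_super points at the start of
 * that page.
 */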

/*
 * Look for a btrfs signature on a device. This may be called outside of
 * the mount path and we are not allowed to call set_blocksize during the
 * scan. The superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct page *page;
        int ret = -EINVAL;
        u64 devid;
        u64 transid;
        u64 total_devices;
        u64 bytenr;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);

        bdev = blkdev_get_by_path(path, flags, holder);
        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        if (btrfs_read_disk_super(bdev, bytenr, &page, &disk_super))
                goto error_bdev_put;

        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);

        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (ret > 0) {
                if (disk_super->label[0]) {
                        pr_info("BTRFS: device label %s ", disk_super->label);
                } else {
                        pr_info("BTRFS: device fsid %pU ", disk_super->fsid);
                }

                pr_cont("devid %llu transid %llu %s\n", devid, transid, path);
                ret = 0;
        }
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;

        btrfs_release_disk_super(page);

error_bdev_put:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}
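
/*
 * Example (illustrative sketch, not from the original source): callers
 * such as the mount path and the device-scan ioctl invoke this as,
 * roughly,
 *
 *      ret = btrfs_scan_one_device(path, FMODE_READ, holder, &fs_devices);
 *
 * where the holder argument serves as the owner token for the exclusive
 * open; FMODE_EXCL is added internally before blkdev_get_by_path().
 */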

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->fs_info->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = READA_FORWARD;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
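
/*
 * Worked example (not from the original source): for a query range
 * [start=100, end=199] the loop above accounts each dev extent
 * [key.offset, extent_end) as follows:
 *
 *      [50, 300)  covers the whole range -> *length = 100, stop
 *      [50, 150)  overlaps the front     -> *length += 50
 *      [120, 180) lies fully inside      -> *length += 60
 *      [150, 300) overlaps the tail      -> *length += 50, stop
 *      [250, ...) starts past end        -> stop
 */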

static int contains_pending_extent(struct btrfs_transaction *transaction,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct extent_map *em;
        struct list_head *search_list = &fs_info->pinned_chunks;
        int ret = 0;
        u64 physical_start = *start;

        if (transaction)
                search_list = &transaction->pending_chunks;
again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = em->map_lookup;
                for (i = 0; i < map->num_stripes; i++) {
                        u64 end;

                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= physical_start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            physical_start)
                                continue;
                        /*
                         * Make sure that while processing the pinned list we do
                         * not override our *start with a lower value, because
                         * we can have pinned chunks that fall within this
                         * device hole and that have lower physical addresses
                         * than the pending chunks we processed before. If we
                         * do not take this special care we can end up getting
                         * 2 pending chunks that start at the same physical
                         * device offsets because the end offset of a pinned
                         * chunk can be equal to the start offset of some
                         * pending chunk.
                         */
                        end = map->stripes[i].physical + em->orig_block_len;
                        if (end > *start) {
                                *start = end;
                                ret = 1;
                        }
                }
        }
        if (search_list != &fs_info->pinned_chunks) {
                search_list = &fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent_start - find free space in the specified device
 * @device:       the device which we search the free space in
 * @num_bytes:    the size of the free space that we need
 * @search_start: the position from which to begin the search
 * @start:        store the start of the free space
 * @len:          the size of the free space that we find, or the size
 *                of the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find it. But
 * if we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent_start(struct btrfs_transaction *transaction,
                               struct btrfs_device *device, u64 num_bytes,
                               u64 search_start, u64 *start, u64 *len)
{
        struct btrfs_fs_info *fs_info = device->fs_info;
        struct btrfs_root *root = fs_info->dev_root;
        struct btrfs_key key;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /*
         * We don't want to overwrite the superblock on the drive nor any area
         * used by the boot loader (grub for example), so we make sure to start
         * at an offset of at least 1MB.
         */
        search_start = max_t(u64, search_start, SZ_1M);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        max_hole_start = search_start;
        max_hole_size = 0;

again:
        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = READA_FORWARD;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(transaction, device,
                                                    &search_start,
                                                    hole_size)) {
                                if (key.offset >= search_start) {
                                        hole_size = key.offset - search_start;
                                } else {
                                        WARN_ON_ONCE(1);
                                        hole_size = 0;
                                }
                        }

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the start
                         * of this free space and the length of this free space
                         * is stored in max_hole_size. Thus, we return
                         * max_hole_start and max_hole_size and go back to the
                         * caller.
                         */
1454                        if (hole_size >= num_bytes) {
1455                                ret = 0;
1456                                goto out;
1457                        }
1458                }
1459
1460                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1461                extent_end = key.offset + btrfs_dev_extent_length(l,
1462                                                                  dev_extent);
1463                if (extent_end > search_start)
1464                        search_start = extent_end;
1465next:
1466                path->slots[0]++;
1467                cond_resched();
1468        }
1469
1470        /*
1471         * At this point, search_start should be the end of
1472         * allocated dev extents, and when shrinking the device,
1473         * search_end may be smaller than search_start.
1474         */
1475        if (search_end > search_start) {
1476                hole_size = search_end - search_start;
1477
1478                if (contains_pending_extent(transaction, device, &search_start,
1479                                            hole_size)) {
1480                        btrfs_release_path(path);
1481                        goto again;
1482                }
1483
1484                if (hole_size > max_hole_size) {
1485                        max_hole_start = search_start;
1486                        max_hole_size = hole_size;
1487                }
1488        }
1489
1490        /* See above. */
1491        if (max_hole_size < num_bytes)
1492                ret = -ENOSPC;
1493        else
1494                ret = 0;
1495
1496out:
1497        btrfs_free_path(path);
1498        *start = max_hole_start;
1499        if (len)
1500                *len = max_hole_size;
1501        return ret;
1502}
1503
1504int find_free_dev_extent(struct btrfs_trans_handle *trans,
1505                         struct btrfs_device *device, u64 num_bytes,
1506                         u64 *start, u64 *len)
1507{
1508        /* FIXME use last free of some kind */
1509        return find_free_dev_extent_start(trans->transaction, device,
1510                                          num_bytes, 0, start, len);
1511}
1512
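    /*
     * Find and delete the dev extent item that covers @start on @device,
     * returning the extent's length in @dev_extent_len. If the key lookup
     * lands past the extent, step back to the previous item and verify that
     * it actually covers @start before deleting it.
     */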
1513static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1514                          struct btrfs_device *device,
1515                          u64 start, u64 *dev_extent_len)
1516{
1517        struct btrfs_fs_info *fs_info = device->fs_info;
1518        struct btrfs_root *root = fs_info->dev_root;
1519        int ret;
1520        struct btrfs_path *path;
1521        struct btrfs_key key;
1522        struct btrfs_key found_key;
1523        struct extent_buffer *leaf = NULL;
1524        struct btrfs_dev_extent *extent = NULL;
1525
1526        path = btrfs_alloc_path();
1527        if (!path)
1528                return -ENOMEM;
1529
1530        key.objectid = device->devid;
1531        key.offset = start;
1532        key.type = BTRFS_DEV_EXTENT_KEY;
1533again:
1534        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1535        if (ret > 0) {
1536                ret = btrfs_previous_item(root, path, key.objectid,
1537                                          BTRFS_DEV_EXTENT_KEY);
1538                if (ret)
1539                        goto out;
1540                leaf = path->nodes[0];
1541                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1542                extent = btrfs_item_ptr(leaf, path->slots[0],
1543                                        struct btrfs_dev_extent);
1544                BUG_ON(found_key.offset > start || found_key.offset +
1545                       btrfs_dev_extent_length(leaf, extent) < start);
1546                key = found_key;
1547                btrfs_release_path(path);
1548                goto again;
1549        } else if (ret == 0) {
1550                leaf = path->nodes[0];
1551                extent = btrfs_item_ptr(leaf, path->slots[0],
1552                                        struct btrfs_dev_extent);
1553        } else {
1554                btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1555                goto out;
1556        }
1557
1558        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1559
1560        ret = btrfs_del_item(trans, root, path);
1561        if (ret) {
1562                btrfs_handle_fs_error(fs_info, ret,
1563                                      "Failed to remove dev extent item");
1564        } else {
1565                set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1566        }
1567out:
1568        btrfs_free_path(path);
1569        return ret;
1570}
1571
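    /*
     * Insert a dev extent item for @device covering [@start, @start + @num_bytes)
     * into the dev tree and point it back at the chunk at @chunk_offset.
     */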
1572static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1573                                  struct btrfs_device *device,
1574                                  u64 chunk_offset, u64 start, u64 num_bytes)
1575{
1576        int ret;
1577        struct btrfs_path *path;
1578        struct btrfs_fs_info *fs_info = device->fs_info;
1579        struct btrfs_root *root = fs_info->dev_root;
1580        struct btrfs_dev_extent *extent;
1581        struct extent_buffer *leaf;
1582        struct btrfs_key key;
1583
1584        WARN_ON(!device->in_fs_metadata);
1585        WARN_ON(device->is_tgtdev_for_dev_replace);
1586        path = btrfs_alloc_path();
1587        if (!path)
1588                return -ENOMEM;
1589
1590        key.objectid = device->devid;
1591        key.offset = start;
1592        key.type = BTRFS_DEV_EXTENT_KEY;
1593        ret = btrfs_insert_empty_item(trans, root, path, &key,
1594                                      sizeof(*extent));
1595        if (ret)
1596                goto out;
1597
1598        leaf = path->nodes[0];
1599        extent = btrfs_item_ptr(leaf, path->slots[0],
1600                                struct btrfs_dev_extent);
1601        btrfs_set_dev_extent_chunk_tree(leaf, extent,
1602                                        BTRFS_CHUNK_TREE_OBJECTID);
1603        btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1604                                            BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1605        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1606
1607        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1608        btrfs_mark_buffer_dirty(leaf);
1609out:
1610        btrfs_free_path(path);
1611        return ret;
1612}
1613
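    /*
     * Return the logical offset right after the last chunk in the mapping
     * tree, which is where the next chunk will be placed.
     */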
1614static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1615{
1616        struct extent_map_tree *em_tree;
1617        struct extent_map *em;
1618        struct rb_node *n;
1619        u64 ret = 0;
1620
1621        em_tree = &fs_info->mapping_tree.map_tree;
1622        read_lock(&em_tree->lock);
1623        n = rb_last(&em_tree->map);
1624        if (n) {
1625                em = rb_entry(n, struct extent_map, rb_node);
1626                ret = em->start + em->len;
1627        }
1628        read_unlock(&em_tree->lock);
1629
1630        return ret;
1631}
1632
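    /*
     * Find the next available device id: look up the last dev item in the
     * chunk tree and return its devid + 1 (or 1 if there is none).
     */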
1633static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1634                                    u64 *devid_ret)
1635{
1636        int ret;
1637        struct btrfs_key key;
1638        struct btrfs_key found_key;
1639        struct btrfs_path *path;
1640
1641        path = btrfs_alloc_path();
1642        if (!path)
1643                return -ENOMEM;
1644
1645        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1646        key.type = BTRFS_DEV_ITEM_KEY;
1647        key.offset = (u64)-1;
1648
1649        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1650        if (ret < 0)
1651                goto error;
1652
1653        BUG_ON(ret == 0); /* Corruption */
1654
1655        ret = btrfs_previous_item(fs_info->chunk_root, path,
1656                                  BTRFS_DEV_ITEMS_OBJECTID,
1657                                  BTRFS_DEV_ITEM_KEY);
1658        if (ret) {
1659                *devid_ret = 1;
1660        } else {
1661                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1662                                      path->slots[0]);
1663                *devid_ret = found_key.offset + 1;
1664        }
1665        ret = 0;
1666error:
1667        btrfs_free_path(path);
1668        return ret;
1669}
1670
1671/*
1672 * The device information is stored in the chunk root.
1673 * The btrfs_device struct should be fully filled in before calling this.
1674 */
1675static int btrfs_add_device(struct btrfs_trans_handle *trans,
1676                            struct btrfs_fs_info *fs_info,
1677                            struct btrfs_device *device)
1678{
1679        struct btrfs_root *root = fs_info->chunk_root;
1680        int ret;
1681        struct btrfs_path *path;
1682        struct btrfs_dev_item *dev_item;
1683        struct extent_buffer *leaf;
1684        struct btrfs_key key;
1685        unsigned long ptr;
1686
1687        path = btrfs_alloc_path();
1688        if (!path)
1689                return -ENOMEM;
1690
1691        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1692        key.type = BTRFS_DEV_ITEM_KEY;
1693        key.offset = device->devid;
1694
1695        ret = btrfs_insert_empty_item(trans, root, path, &key,
1696                                      sizeof(*dev_item));
1697        if (ret)
1698                goto out;
1699
1700        leaf = path->nodes[0];
1701        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1702
1703        btrfs_set_device_id(leaf, dev_item, device->devid);
1704        btrfs_set_device_generation(leaf, dev_item, 0);
1705        btrfs_set_device_type(leaf, dev_item, device->type);
1706        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1707        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1708        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1709        btrfs_set_device_total_bytes(leaf, dev_item,
1710                                     btrfs_device_get_disk_total_bytes(device));
1711        btrfs_set_device_bytes_used(leaf, dev_item,
1712                                    btrfs_device_get_bytes_used(device));
1713        btrfs_set_device_group(leaf, dev_item, 0);
1714        btrfs_set_device_seek_speed(leaf, dev_item, 0);
1715        btrfs_set_device_bandwidth(leaf, dev_item, 0);
1716        btrfs_set_device_start_offset(leaf, dev_item, 0);
1717
1718        ptr = btrfs_device_uuid(dev_item);
1719        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1720        ptr = btrfs_device_fsid(dev_item);
1721        write_extent_buffer(leaf, fs_info->fsid, ptr, BTRFS_FSID_SIZE);
1722        btrfs_mark_buffer_dirty(leaf);
1723
1724        ret = 0;
1725out:
1726        btrfs_free_path(path);
1727        return ret;
1728}
1729
1730/*
1731 * Update ctime/mtime for a given device path.
1732 * Mainly used by ctime/mtime based probes such as libblkid.
1733 */
1734static void update_dev_time(const char *path_name)
1735{
1736        struct file *filp;
1737
1738        filp = filp_open(path_name, O_RDWR, 0);
1739        if (IS_ERR(filp))
1740                return;
1741        file_update_time(filp);
1742        filp_close(filp, NULL);
1743}
1744
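    /*
     * Delete the dev item for @device from the chunk tree, in a transaction
     * of its own.
     */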
1745static int btrfs_rm_dev_item(struct btrfs_fs_info *fs_info,
1746                             struct btrfs_device *device)
1747{
1748        struct btrfs_root *root = fs_info->chunk_root;
1749        int ret;
1750        struct btrfs_path *path;
1751        struct btrfs_key key;
1752        struct btrfs_trans_handle *trans;
1753
1754        path = btrfs_alloc_path();
1755        if (!path)
1756                return -ENOMEM;
1757
1758        trans = btrfs_start_transaction(root, 0);
1759        if (IS_ERR(trans)) {
1760                btrfs_free_path(path);
1761                return PTR_ERR(trans);
1762        }
1763        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1764        key.type = BTRFS_DEV_ITEM_KEY;
1765        key.offset = device->devid;
1766
1767        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1768        if (ret < 0)
1769                goto out;
1770
1771        if (ret > 0) {
1772                ret = -ENOENT;
1773                goto out;
1774        }
1775
1776        ret = btrfs_del_item(trans, root, path);
1777        if (ret)
1778                goto out;
1779out:
1780        btrfs_free_path(path);
1781        btrfs_commit_transaction(trans);
1782        return ret;
1783}
1784
1785/*
1786 * Verify that @num_devices satisfies the RAID profile constraints in the whole
1787 * filesystem. It's up to the caller to adjust that number for special cases,
1788 * e.g. an ongoing device replace.
1789 */
1790static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
1791                u64 num_devices)
1792{
1793        u64 all_avail;
1794        unsigned seq;
1795        int i;
1796
1797        do {
1798                seq = read_seqbegin(&fs_info->profiles_lock);
1799
1800                all_avail = fs_info->avail_data_alloc_bits |
1801                            fs_info->avail_system_alloc_bits |
1802                            fs_info->avail_metadata_alloc_bits;
1803        } while (read_seqretry(&fs_info->profiles_lock, seq));
1804
1805        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
1806                if (!(all_avail & btrfs_raid_group[i]))
1807                        continue;
1808
1809                if (num_devices < btrfs_raid_array[i].devs_min) {
1810                        int ret = btrfs_raid_mindev_error[i];
1811
1812                        if (ret)
1813                                return ret;
1814                }
1815        }
1816
1817        return 0;
1818}
1819
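    /*
     * Return the first device in @fs_devs that is not @device, is not missing
     * and has an open bdev, or NULL if no such device exists.
     */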
1820struct btrfs_device *btrfs_find_next_active_device(struct btrfs_fs_devices *fs_devs,
1821                                        struct btrfs_device *device)
1822{
1823        struct btrfs_device *next_device;
1824
1825        list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
1826                if (next_device != device &&
1827                        !next_device->missing && next_device->bdev)
1828                        return next_device;
1829        }
1830
1831        return NULL;
1832}
1833
1834/*
1835 * Helper function to check if the given device is part of s_bdev / latest_bdev
1836 * and replace it with the provided or the next active device. In the context
1837 * where this function is called, there should always be another device (or
1838 * this_dev) which is active.
1839 */
1840void btrfs_assign_next_active_device(struct btrfs_fs_info *fs_info,
1841                struct btrfs_device *device, struct btrfs_device *this_dev)
1842{
1843        struct btrfs_device *next_device;
1844
1845        if (this_dev)
1846                next_device = this_dev;
1847        else
1848                next_device = btrfs_find_next_active_device(fs_info->fs_devices,
1849                                                                device);
1850        ASSERT(next_device);
1851
1852        if (fs_info->sb->s_bdev &&
1853                        (fs_info->sb->s_bdev == device->bdev))
1854                fs_info->sb->s_bdev = next_device->bdev;
1855
1856        if (fs_info->fs_devices->latest_bdev == device->bdev)
1857                fs_info->fs_devices->latest_bdev = next_device->bdev;
1858}
1859
1860int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
1861                u64 devid)
1862{
1863        struct btrfs_device *device;
1864        struct btrfs_fs_devices *cur_devices;
1865        u64 num_devices;
1866        int ret = 0;
1867
1868        mutex_lock(&uuid_mutex);
1869
1870        num_devices = fs_info->fs_devices->num_devices;
1871        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
1872        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
1873                WARN_ON(num_devices < 1);
1874                num_devices--;
1875        }
1876        btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
1877
1878        ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
1879        if (ret)
1880                goto out;
1881
1882        ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
1883                                           &device);
1884        if (ret)
1885                goto out;
1886
1887        if (device->is_tgtdev_for_dev_replace) {
1888                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1889                goto out;
1890        }
1891
1892        if (device->writeable && fs_info->fs_devices->rw_devices == 1) {
1893                ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1894                goto out;
1895        }
1896
1897        if (device->writeable) {
1898                mutex_lock(&fs_info->chunk_mutex);
1899                list_del_init(&device->dev_alloc_list);
1900                device->fs_devices->rw_devices--;
1901                mutex_unlock(&fs_info->chunk_mutex);
1902        }
1903
1904        mutex_unlock(&uuid_mutex);
1905        ret = btrfs_shrink_device(device, 0);
1906        mutex_lock(&uuid_mutex);
1907        if (ret)
1908                goto error_undo;
1909
1910        /*
1911         * TODO: the superblock still includes this device in its num_devices
1912         * counter although write_all_supers() is not locked out. This
1913         * could give a filesystem state which requires a degraded mount.
1914         */
1915        ret = btrfs_rm_dev_item(fs_info, device);
1916        if (ret)
1917                goto error_undo;
1918
1919        device->in_fs_metadata = 0;
1920        btrfs_scrub_cancel_dev(fs_info, device);
1921
1922        /*
1923         * The device list mutex makes sure that we don't change
1924         * the device list while someone else is writing out all
1925         * the device supers. Whoever is writing all supers should
1926         * lock the device list mutex before getting the number of
1927         * devices in the super block (super_copy). Conversely,
1928         * whoever updates the number of devices in the super block
1929         * (super_copy) should hold the device list mutex.
1930         */
1931
1932        cur_devices = device->fs_devices;
1933        mutex_lock(&fs_info->fs_devices->device_list_mutex);
1934        list_del_rcu(&device->dev_list);
1935
1936        device->fs_devices->num_devices--;
1937        device->fs_devices->total_devices--;
1938
1939        if (device->missing)
1940                device->fs_devices->missing_devices--;
1941
1942        btrfs_assign_next_active_device(fs_info, device, NULL);
1943
1944        if (device->bdev) {
1945                device->fs_devices->open_devices--;
1946                /* remove sysfs entry */
1947                btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
1948        }
1949
1950        num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
1951        btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
1952        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1953
1954        /*
1955         * at this point, the device is zero sized and detached from
1956         * the devices list.  All that's left is to zero out the old
1957         * supers and free the device.
1958         */
1959        if (device->writeable)
1960                btrfs_scratch_superblocks(device->bdev, device->name->str);
1961
1962        btrfs_close_bdev(device);
1963        call_rcu(&device->rcu, free_device);
1964
1965        if (cur_devices->open_devices == 0) {
1966                struct btrfs_fs_devices *fs_devices;
1967                fs_devices = fs_info->fs_devices;
1968                while (fs_devices) {
1969                        if (fs_devices->seed == cur_devices) {
1970                                fs_devices->seed = cur_devices->seed;
1971                                break;
1972                        }
1973                        fs_devices = fs_devices->seed;
1974                }
1975                cur_devices->seed = NULL;
1976                __btrfs_close_devices(cur_devices);
1977                free_fs_devices(cur_devices);
1978        }
1979
1980out:
1981        mutex_unlock(&uuid_mutex);
1982        return ret;
1983
1984error_undo:
1985        if (device->writeable) {
1986                mutex_lock(&fs_info->chunk_mutex);
1987                list_add(&device->dev_alloc_list,
1988                         &fs_info->fs_devices->alloc_list);
1989                device->fs_devices->rw_devices++;
1990                mutex_unlock(&fs_info->chunk_mutex);
1991        }
1992        goto out;
1993}
1994
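    /*
     * Unlink the replace source device from its fs_devices lists and adjust
     * the device counters. The caller must hold device_list_mutex.
     */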
1995void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1996                                        struct btrfs_device *srcdev)
1997{
1998        struct btrfs_fs_devices *fs_devices;
1999
2000        WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
2001
2002        /*
2003         * In case of a fs with no seed, srcdev->fs_devices will point
2004         * to the fs_devices of fs_info. However, when the dev being replaced
2005         * is a seed dev it will point to the seed's local fs_devices. In
2006         * short, srcdev will have its correct fs_devices in both cases.
2007         */
2008        fs_devices = srcdev->fs_devices;
2009
2010        list_del_rcu(&srcdev->dev_list);
2011        list_del_rcu(&srcdev->dev_alloc_list);
2012        fs_devices->num_devices--;
2013        if (srcdev->missing)
2014                fs_devices->missing_devices--;
2015
2016        if (srcdev->writeable)
2017                fs_devices->rw_devices--;
2018
2019        if (srcdev->bdev)
2020                fs_devices->open_devices--;
2021}
2022
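    /*
     * Release the source device after a device replace has finished: scratch
     * its superblock copies if it was writable, close the bdev and, if this
     * was the last device of a (seed) fs_devices, unchain and free that
     * fs_devices as well.
     */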
2023void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
2024                                      struct btrfs_device *srcdev)
2025{
2026        struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2027
2028        if (srcdev->writeable) {
2029                /* zero out the old super if it is writable */
2030                btrfs_scratch_superblocks(srcdev->bdev, srcdev->name->str);
2031        }
2032
2033        btrfs_close_bdev(srcdev);
2034
2035        call_rcu(&srcdev->rcu, free_device);
2036
2037        /*
2038         * Unless fs_devices is a seed fs, num_devices shouldn't drop
2039         * to zero.
2040         */
2041        BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
2042
2043        /* if there are no more devices, delete the fs_devices */
2044        if (!fs_devices->num_devices) {
2045                struct btrfs_fs_devices *tmp_fs_devices;
2046
2047                tmp_fs_devices = fs_info->fs_devices;
2048                while (tmp_fs_devices) {
2049                        if (tmp_fs_devices->seed == fs_devices) {
2050                                tmp_fs_devices->seed = fs_devices->seed;
2051                                break;
2052                        }
2053                        tmp_fs_devices = tmp_fs_devices->seed;
2054                }
2055                fs_devices->seed = NULL;
2056                __btrfs_close_devices(fs_devices);
2057                free_fs_devices(fs_devices);
2058        }
2059}
2060
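    /*
     * Tear down the replace target device: drop it from the device list and
     * sysfs, pick a new active device if needed, scratch its superblocks and
     * close it.
     */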
2061void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2062                                      struct btrfs_device *tgtdev)
2063{
2064        mutex_lock(&uuid_mutex);
2065        WARN_ON(!tgtdev);
2066        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2067
2068        btrfs_sysfs_rm_device_link(fs_info->fs_devices, tgtdev);
2069
2070        if (tgtdev->bdev)
2071                fs_info->fs_devices->open_devices--;
2072
2073        fs_info->fs_devices->num_devices--;
2074
2075        btrfs_assign_next_active_device(fs_info, tgtdev, NULL);
2076
2077        list_del_rcu(&tgtdev->dev_list);
2078
2079        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2080        mutex_unlock(&uuid_mutex);
2081
2082        /*
2083         * The update_dev_time() within btrfs_scratch_superblocks()
2084         * may lead to a call to btrfs_show_devname() which will try
2085         * to hold device_list_mutex. At this point the device
2086         * is already out of the device list, so we don't have to hold
2087         * the device_list_mutex lock.
2088         */
2089        btrfs_scratch_superblocks(tgtdev->bdev, tgtdev->name->str);
2090
2091        btrfs_close_bdev(tgtdev);
2092        call_rcu(&tgtdev->rcu, free_device);
2093}
2094
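    /*
     * Read the superblock from @device_path and look up the corresponding
     * btrfs_device by the devid, device uuid and fsid found there.
     */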
2095static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
2096                                     const char *device_path,
2097                                     struct btrfs_device **device)
2098{
2099        int ret = 0;
2100        struct btrfs_super_block *disk_super;
2101        u64 devid;
2102        u8 *dev_uuid;
2103        struct block_device *bdev;
2104        struct buffer_head *bh;
2105
2106        *device = NULL;
2107        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2108                                    fs_info->bdev_holder, 0, &bdev, &bh);
2109        if (ret)
2110                return ret;
2111        disk_super = (struct btrfs_super_block *)bh->b_data;
2112        devid = btrfs_stack_device_id(&disk_super->dev_item);
2113        dev_uuid = disk_super->dev_item.uuid;
2114        *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
2115        brelse(bh);
2116        if (!*device)
2117                ret = -ENOENT;
2118        blkdev_put(bdev, FMODE_READ);
2119        return ret;
2120}
2121
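    /*
     * Like btrfs_find_device_by_path(), except that the special path string
     * "missing" selects the first device that is present in the metadata but
     * has no usable bdev.
     */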
2122int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
2123                                         const char *device_path,
2124                                         struct btrfs_device **device)
2125{
2126        *device = NULL;
2127        if (strcmp(device_path, "missing") == 0) {
2128                struct list_head *devices;
2129                struct btrfs_device *tmp;
2130
2131                devices = &fs_info->fs_devices->devices;
2132                /*
2133                 * It is safe to read the devices since the volume_mutex
2134                 * is held by the caller.
2135                 */
2136                list_for_each_entry(tmp, devices, dev_list) {
2137                        if (tmp->in_fs_metadata && !tmp->bdev) {
2138                                *device = tmp;
2139                                break;
2140                        }
2141                }
2142
2143                if (!*device)
2144                        return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2145
2146                return 0;
2147        } else {
2148                return btrfs_find_device_by_path(fs_info, device_path, device);
2149        }
2150}
2151
2152/*
2153 * Lookup a device given by device id, or the path if the id is 0.
2154 */
2155int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
2156                                 const char *devpath,
2157                                 struct btrfs_device **device)
2158{
2159        int ret;
2160
2161        if (devid) {
2162                ret = 0;
2163                *device = btrfs_find_device(fs_info, devid, NULL, NULL);
2164                if (!*device)
2165                        ret = -ENOENT;
2166        } else {
2167                if (!devpath || !devpath[0])
2168                        return -EINVAL;
2169
2170                ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
2171                                                           device);
2172        }
2173        return ret;
2174}
2175
2176/*
2177 * Does all the dirty work required for changing the filesystem's UUID.
2178 */
2179static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2180{
2181        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2182        struct btrfs_fs_devices *old_devices;
2183        struct btrfs_fs_devices *seed_devices;
2184        struct btrfs_super_block *disk_super = fs_info->super_copy;
2185        struct btrfs_device *device;
2186        u64 super_flags;
2187
2188        BUG_ON(!mutex_is_locked(&uuid_mutex));
2189        if (!fs_devices->seeding)
2190                return -EINVAL;
2191
2192        seed_devices = alloc_fs_devices(NULL);
2193        if (IS_ERR(seed_devices))
2194                return PTR_ERR(seed_devices);
2195
2196        old_devices = clone_fs_devices(fs_devices);
2197        if (IS_ERR(old_devices)) {
2198                kfree(seed_devices);
2199                return PTR_ERR(old_devices);
2200        }
2201
2202        list_add(&old_devices->list, &fs_uuids);
2203
2204        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2205        seed_devices->opened = 1;
2206        INIT_LIST_HEAD(&seed_devices->devices);
2207        INIT_LIST_HEAD(&seed_devices->alloc_list);
2208        mutex_init(&seed_devices->device_list_mutex);
2209
2210        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2211        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2212                              synchronize_rcu);
2213        list_for_each_entry(device, &seed_devices->devices, dev_list)
2214                device->fs_devices = seed_devices;
2215
2216        mutex_lock(&fs_info->chunk_mutex);
2217        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
2218        mutex_unlock(&fs_info->chunk_mutex);
2219
2220        fs_devices->seeding = 0;
2221        fs_devices->num_devices = 0;
2222        fs_devices->open_devices = 0;
2223        fs_devices->missing_devices = 0;
2224        fs_devices->rotating = 0;
2225        fs_devices->seed = seed_devices;
2226
2227        generate_random_uuid(fs_devices->fsid);
2228        memcpy(fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2229        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2230        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2231
2232        super_flags = btrfs_super_flags(disk_super) &
2233                      ~BTRFS_SUPER_FLAG_SEEDING;
2234        btrfs_set_super_flags(disk_super, super_flags);
2235
2236        return 0;
2237}
2238
2239/*
2240 * Store the expected generation for seed devices in device items.
2241 */
2242static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2243                               struct btrfs_fs_info *fs_info)
2244{
2245        struct btrfs_root *root = fs_info->chunk_root;
2246        struct btrfs_path *path;
2247        struct extent_buffer *leaf;
2248        struct btrfs_dev_item *dev_item;
2249        struct btrfs_device *device;
2250        struct btrfs_key key;
2251        u8 fs_uuid[BTRFS_FSID_SIZE];
2252        u8 dev_uuid[BTRFS_UUID_SIZE];
2253        u64 devid;
2254        int ret;
2255
2256        path = btrfs_alloc_path();
2257        if (!path)
2258                return -ENOMEM;
2259
2260        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2261        key.offset = 0;
2262        key.type = BTRFS_DEV_ITEM_KEY;
2263
2264        while (1) {
2265                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2266                if (ret < 0)
2267                        goto error;
2268
2269                leaf = path->nodes[0];
2270next_slot:
2271                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2272                        ret = btrfs_next_leaf(root, path);
2273                        if (ret > 0)
2274                                break;
2275                        if (ret < 0)
2276                                goto error;
2277                        leaf = path->nodes[0];
2278                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2279                        btrfs_release_path(path);
2280                        continue;
2281                }
2282
2283                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2284                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2285                    key.type != BTRFS_DEV_ITEM_KEY)
2286                        break;
2287
2288                dev_item = btrfs_item_ptr(leaf, path->slots[0],
2289                                          struct btrfs_dev_item);
2290                devid = btrfs_device_id(leaf, dev_item);
2291                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2292                                   BTRFS_UUID_SIZE);
2293                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2294                                   BTRFS_FSID_SIZE);
2295                device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
2296                BUG_ON(!device); /* Logic error */
2297
2298                if (device->fs_devices->seeding) {
2299                        btrfs_set_device_generation(leaf, dev_item,
2300                                                    device->generation);
2301                        btrfs_mark_buffer_dirty(leaf);
2302                }
2303
2304                path->slots[0]++;
2305                goto next_slot;
2306        }
2307        ret = 0;
2308error:
2309        btrfs_free_path(path);
2310        return ret;
2311}
2312
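    /*
     * Add the block device at @device_path as a new device of the mounted
     * filesystem. If the fs is currently a seed filesystem, this sprouts a
     * new writable filesystem on top of it, which may require relocating the
     * system chunks afterwards.
     */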
2313int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2314{
2315        struct btrfs_root *root = fs_info->dev_root;
2316        struct request_queue *q;
2317        struct btrfs_trans_handle *trans;
2318        struct btrfs_device *device;
2319        struct block_device *bdev;
2320        struct list_head *devices;
2321        struct super_block *sb = fs_info->sb;
2322        struct rcu_string *name;
2323        u64 tmp;
2324        int seeding_dev = 0;
2325        int ret = 0;
2326
2327        if (sb_rdonly(sb) && !fs_info->fs_devices->seeding)
2328                return -EROFS;
2329
2330        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2331                                  fs_info->bdev_holder);
2332        if (IS_ERR(bdev))
2333                return PTR_ERR(bdev);
2334
2335        if (fs_info->fs_devices->seeding) {
2336                seeding_dev = 1;
2337                down_write(&sb->s_umount);
2338                mutex_lock(&uuid_mutex);
2339        }
2340
2341        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2342
2343        devices = &fs_info->fs_devices->devices;
2344
2345        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2346        list_for_each_entry(device, devices, dev_list) {
2347                if (device->bdev == bdev) {
2348                        ret = -EEXIST;
2349                        mutex_unlock(
2350                                &fs_info->fs_devices->device_list_mutex);
2351                        goto error;
2352                }
2353        }
2354        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2355
2356        device = btrfs_alloc_device(fs_info, NULL, NULL);
2357        if (IS_ERR(device)) {
2358                /* we can safely leave the fs_devices entry around */
2359                ret = PTR_ERR(device);
2360                goto error;
2361        }
2362
2363        name = rcu_string_strdup(device_path, GFP_KERNEL);
2364        if (!name) {
2365                kfree(device);
2366                ret = -ENOMEM;
2367                goto error;
2368        }
2369        rcu_assign_pointer(device->name, name);
2370
2371        trans = btrfs_start_transaction(root, 0);
2372        if (IS_ERR(trans)) {
2373                rcu_string_free(device->name);
2374                kfree(device);
2375                ret = PTR_ERR(trans);
2376                goto error;
2377        }
2378
2379        q = bdev_get_queue(bdev);
2380        if (blk_queue_discard(q))
2381                device->can_discard = 1;
2382        device->writeable = 1;
2383        device->generation = trans->transid;
2384        device->io_width = fs_info->sectorsize;
2385        device->io_align = fs_info->sectorsize;
2386        device->sector_size = fs_info->sectorsize;
2387        device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2388                                         fs_info->sectorsize);
2389        device->disk_total_bytes = device->total_bytes;
2390        device->commit_total_bytes = device->total_bytes;
2391        device->fs_info = fs_info;
2392        device->bdev = bdev;
2393        device->in_fs_metadata = 1;
2394        device->is_tgtdev_for_dev_replace = 0;
2395        device->mode = FMODE_EXCL;
2396        device->dev_stats_valid = 1;
2397        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2398
2399        if (seeding_dev) {
2400                sb->s_flags &= ~MS_RDONLY;
2401                ret = btrfs_prepare_sprout(fs_info);
2402                BUG_ON(ret); /* -ENOMEM */
2403        }
2404
2405        device->fs_devices = fs_info->fs_devices;
2406
2407        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2408        mutex_lock(&fs_info->chunk_mutex);
2409        list_add_rcu(&device->dev_list, &fs_info->fs_devices->devices);
2410        list_add(&device->dev_alloc_list,
2411                 &fs_info->fs_devices->alloc_list);
2412        fs_info->fs_devices->num_devices++;
2413        fs_info->fs_devices->open_devices++;
2414        fs_info->fs_devices->rw_devices++;
2415        fs_info->fs_devices->total_devices++;
2416        fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2417
2418        atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2419
2420        if (!blk_queue_nonrot(q))
2421                fs_info->fs_devices->rotating = 1;
2422
2423        tmp = btrfs_super_total_bytes(fs_info->super_copy);
2424        btrfs_set_super_total_bytes(fs_info->super_copy,
2425                round_down(tmp + device->total_bytes, fs_info->sectorsize));
2426
2427        tmp = btrfs_super_num_devices(fs_info->super_copy);
2428        btrfs_set_super_num_devices(fs_info->super_copy, tmp + 1);
2429
2430        /* add sysfs device entry */
2431        btrfs_sysfs_add_device_link(fs_info->fs_devices, device);
2432
2433        /*
2434         * we've got more storage, clear any full flags on the space
2435         * infos
2436         */
2437        btrfs_clear_space_info_full(fs_info);
2438
2439        mutex_unlock(&fs_info->chunk_mutex);
2440        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2441
2442        if (seeding_dev) {
2443                mutex_lock(&fs_info->chunk_mutex);
2444                ret = init_first_rw_device(trans, fs_info);
2445                mutex_unlock(&fs_info->chunk_mutex);
2446                if (ret) {
2447                        btrfs_abort_transaction(trans, ret);
2448                        goto error_trans;
2449                }
2450        }
2451
2452        ret = btrfs_add_device(trans, fs_info, device);
2453        if (ret) {
2454                btrfs_abort_transaction(trans, ret);
2455                goto error_trans;
2456        }
2457
2458        if (seeding_dev) {
2459                char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2460
2461                ret = btrfs_finish_sprout(trans, fs_info);
2462                if (ret) {
2463                        btrfs_abort_transaction(trans, ret);
2464                        goto error_trans;
2465                }
2466
2467                /* Sprouting would change the fsid of the mounted root,
2468                 * so rename the fsid directory in sysfs as well.
2469                 */
2470                snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2471                                                fs_info->fsid);
2472                if (kobject_rename(&fs_info->fs_devices->fsid_kobj, fsid_buf))
2473                        btrfs_warn(fs_info,
2474                                   "sysfs: failed to create fsid for sprout");
2475        }
2476
2477        ret = btrfs_commit_transaction(trans);
2478
2479        if (seeding_dev) {
2480                mutex_unlock(&uuid_mutex);
2481                up_write(&sb->s_umount);
2482
2483                if (ret) /* transaction commit */
2484                        return ret;
2485
2486                ret = btrfs_relocate_sys_chunks(fs_info);
2487                if (ret < 0)
2488                        btrfs_handle_fs_error(fs_info, ret,
2489                                    "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2490                trans = btrfs_attach_transaction(root);
2491                if (IS_ERR(trans)) {
2492                        if (PTR_ERR(trans) == -ENOENT)
2493                                return 0;
2494                        return PTR_ERR(trans);
2495                }
2496                ret = btrfs_commit_transaction(trans);
2497        }
2498
2499        /* Update ctime/mtime for libblkid */
2500        update_dev_time(device_path);
2501        return ret;
2502
2503error_trans:
2504        btrfs_end_transaction(trans);
2505        rcu_string_free(device->name);
2506        btrfs_sysfs_rm_device_link(fs_info->fs_devices, device);
2507        kfree(device);
2508error:
2509        blkdev_put(bdev, FMODE_EXCL);
2510        if (seeding_dev) {
2511                mutex_unlock(&uuid_mutex);
2512                up_write(&sb->s_umount);
2513        }
2514        return ret;
2515}
2516
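    /*
     * Open @device_path and set it up as the target device of a device
     * replace, mirroring the size and byte counters of @srcdev.
     */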
2517int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
2518                                  const char *device_path,
2519                                  struct btrfs_device *srcdev,
2520                                  struct btrfs_device **device_out)
2521{
2522        struct request_queue *q;
2523        struct btrfs_device *device;
2524        struct block_device *bdev;
2525        struct list_head *devices;
2526        struct rcu_string *name;
2527        u64 devid = BTRFS_DEV_REPLACE_DEVID;
2528        int ret = 0;
2529
2530        *device_out = NULL;
2531        if (fs_info->fs_devices->seeding) {
2532                btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2533                return -EINVAL;
2534        }
2535
2536        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2537                                  fs_info->bdev_holder);
2538        if (IS_ERR(bdev)) {
2539                btrfs_err(fs_info, "target device %s is invalid!", device_path);
2540                return PTR_ERR(bdev);
2541        }
2542
2543        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2544
2545        devices = &fs_info->fs_devices->devices;
2546        list_for_each_entry(device, devices, dev_list) {
2547                if (device->bdev == bdev) {
2548                        btrfs_err(fs_info,
2549                                  "target device is in the filesystem!");
2550                        ret = -EEXIST;
2551                        goto error;
2552                }
2553        }
2554
2555
2556        if (i_size_read(bdev->bd_inode) <
2557            btrfs_device_get_total_bytes(srcdev)) {
2558                btrfs_err(fs_info,
2559                          "target device is smaller than source device!");
2560                ret = -EINVAL;
2561                goto error;
2562        }
2563
2564
2565        device = btrfs_alloc_device(NULL, &devid, NULL);
2566        if (IS_ERR(device)) {
2567                ret = PTR_ERR(device);
2568                goto error;
2569        }
2570
2571        name = rcu_string_strdup(device_path, GFP_KERNEL);
2572        if (!name) {
2573                kfree(device);
2574                ret = -ENOMEM;
2575                goto error;
2576        }
2577        rcu_assign_pointer(device->name, name);
2578
2579        q = bdev_get_queue(bdev);
2580        if (blk_queue_discard(q))
2581                device->can_discard = 1;
2582        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2583        device->writeable = 1;
2584        device->generation = 0;
2585        device->io_width = fs_info->sectorsize;
2586        device->io_align = fs_info->sectorsize;
2587        device->sector_size = fs_info->sectorsize;
2588        device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2589        device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2590        device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2591        ASSERT(list_empty(&srcdev->resized_list));
2592        device->commit_total_bytes = srcdev->commit_total_bytes;
2593        device->commit_bytes_used = device->bytes_used;
2594        device->fs_info = fs_info;
2595        device->bdev = bdev;
2596        device->in_fs_metadata = 1;
2597        device->is_tgtdev_for_dev_replace = 1;
2598        device->mode = FMODE_EXCL;
2599        device->dev_stats_valid = 1;
2600        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2601        device->fs_devices = fs_info->fs_devices;
2602        list_add(&device->dev_list, &fs_info->fs_devices->devices);
2603        fs_info->fs_devices->num_devices++;
2604        fs_info->fs_devices->open_devices++;
2605        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2606
2607        *device_out = device;
2608        return ret;
2609
2610error:
2611        blkdev_put(bdev, FMODE_EXCL);
2612        return ret;
2613}
2614
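    /*
     * Refresh the fields of a replace target device when a suspended device
     * replace is resumed, presumably after a remount.
     */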
2615void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2616                                              struct btrfs_device *tgtdev)
2617{
2618        u32 sectorsize = fs_info->sectorsize;
2619
2620        WARN_ON(fs_info->fs_devices->rw_devices == 0);
2621        tgtdev->io_width = sectorsize;
2622        tgtdev->io_align = sectorsize;
2623        tgtdev->sector_size = sectorsize;
2624        tgtdev->fs_info = fs_info;
2625        tgtdev->in_fs_metadata = 1;
2626}
2627
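    /*
     * Write the in-memory state of @device back into its dev item in the
     * chunk tree.
     */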
2628static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2629                                        struct btrfs_device *device)
2630{
2631        int ret;
2632        struct btrfs_path *path;
2633        struct btrfs_root *root = device->fs_info->chunk_root;
2634        struct btrfs_dev_item *dev_item;
2635        struct extent_buffer *leaf;
2636        struct btrfs_key key;
2637
2638        path = btrfs_alloc_path();
2639        if (!path)
2640                return -ENOMEM;
2641
2642        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2643        key.type = BTRFS_DEV_ITEM_KEY;
2644        key.offset = device->devid;
2645
2646        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2647        if (ret < 0)
2648                goto out;
2649
2650        if (ret > 0) {
2651                ret = -ENOENT;
2652                goto out;
2653        }
2654
2655        leaf = path->nodes[0];
2656        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2657
2658        btrfs_set_device_id(leaf, dev_item, device->devid);
2659        btrfs_set_device_type(leaf, dev_item, device->type);
2660        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2661        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2662        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2663        btrfs_set_device_total_bytes(leaf, dev_item,
2664                                     btrfs_device_get_disk_total_bytes(device));
2665        btrfs_set_device_bytes_used(leaf, dev_item,
2666                                    btrfs_device_get_bytes_used(device));
2667        btrfs_mark_buffer_dirty(leaf);
2668
2669out:
2670        btrfs_free_path(path);
2671        return ret;
2672}
2673
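    /*
     * Grow @device to @new_size (rounded down to the sector size), adjusting
     * the total byte counts in the super copy and queueing the device on the
     * resized list so the new size gets committed, then update its dev item.
     */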
2674int btrfs_grow_device(struct btrfs_trans_handle *trans,
2675                      struct btrfs_device *device, u64 new_size)
2676{
2677        struct btrfs_fs_info *fs_info = device->fs_info;
2678        struct btrfs_super_block *super_copy = fs_info->super_copy;
2679        struct btrfs_fs_devices *fs_devices;
2680        u64 old_total;
2681        u64 diff;
2682
2683        if (!device->writeable)
2684                return -EACCES;
2685
2686        new_size = round_down(new_size, fs_info->sectorsize);
2687
2688        mutex_lock(&fs_info->chunk_mutex);
2689        old_total = btrfs_super_total_bytes(super_copy);
2690        diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2691
2692        if (new_size <= device->total_bytes ||
2693            device->is_tgtdev_for_dev_replace) {
2694                mutex_unlock(&fs_info->chunk_mutex);
2695                return -EINVAL;
2696        }
2697
2698        fs_devices = fs_info->fs_devices;
2699
2700        btrfs_set_super_total_bytes(super_copy,
2701                        round_down(old_total + diff, fs_info->sectorsize));
2702        device->fs_devices->total_rw_bytes += diff;
2703
2704        btrfs_device_set_total_bytes(device, new_size);
2705        btrfs_device_set_disk_total_bytes(device, new_size);
2706        btrfs_clear_space_info_full(device->fs_info);
2707        if (list_empty(&device->resized_list))
2708                list_add_tail(&device->resized_list,
2709                              &fs_devices->resized_devices);
2710        mutex_unlock(&fs_info->chunk_mutex);
2711
2712        return btrfs_update_device(trans, device);
2713}
2714
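    /*
     * Delete the chunk item at @chunk_offset from the chunk tree.
     */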
2715static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2716                            struct btrfs_fs_info *fs_info, u64 chunk_offset)
2717{
2718        struct btrfs_root *root = fs_info->chunk_root;
2719        int ret;
2720        struct btrfs_path *path;
2721        struct btrfs_key key;
2722
2723        path = btrfs_alloc_path();
2724        if (!path)
2725                return -ENOMEM;
2726
2727        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2728        key.offset = chunk_offset;
2729        key.type = BTRFS_CHUNK_ITEM_KEY;
2730
2731        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2732        if (ret < 0)
2733                goto out;
2734        else if (ret > 0) { /* Logic error or corruption */
2735                btrfs_handle_fs_error(fs_info, -ENOENT,
2736                                      "Failed lookup while freeing chunk.");
2737                ret = -ENOENT;
2738                goto out;
2739        }
2740
2741        ret = btrfs_del_item(trans, root, path);
2742        if (ret < 0)
2743                btrfs_handle_fs_error(fs_info, ret,
2744                                      "Failed to delete chunk item.");
2745out:
2746        btrfs_free_path(path);
2747        return ret;
2748}
2749
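    /*
     * Remove the chunk at @chunk_offset from the superblock's sys_chunk_array
     * by memmove()ing the remaining entries over it and shrinking the
     * recorded array size.
     */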
2750static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2751{
2752        struct btrfs_super_block *super_copy = fs_info->super_copy;
2753        struct btrfs_disk_key *disk_key;
2754        struct btrfs_chunk *chunk;
2755        u8 *ptr;
2756        int ret = 0;
2757        u32 num_stripes;
2758        u32 array_size;
2759        u32 len = 0;
2760        u32 cur;
2761        struct btrfs_key key;
2762
2763        mutex_lock(&fs_info->chunk_mutex);
2764        array_size = btrfs_super_sys_array_size(super_copy);
2765
2766        ptr = super_copy->sys_chunk_array;
2767        cur = 0;
2768
2769        while (cur < array_size) {
2770                disk_key = (struct btrfs_disk_key *)ptr;
2771                btrfs_disk_key_to_cpu(&key, disk_key);
2772
2773                len = sizeof(*disk_key);
2774
2775                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2776                        chunk = (struct btrfs_chunk *)(ptr + len);
2777                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2778                        len += btrfs_chunk_item_size(num_stripes);
2779                } else {
2780                        ret = -EIO;
2781                        break;
2782                }
2783                if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2784                    key.offset == chunk_offset) {
2785                        memmove(ptr, ptr + len, array_size - (cur + len));
2786                        array_size -= len;
2787                        btrfs_set_super_sys_array_size(super_copy, array_size);
2788                } else {
2789                        ptr += len;
2790                        cur += len;
2791                }
2792        }
2793        mutex_unlock(&fs_info->chunk_mutex);
2794        return ret;
2795}
2796
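    /*
     * Look up the extent map of the chunk that covers @logical / @length and
     * sanity check it; on failure report the bad mapping and return an
     * ERR_PTR.
     */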
2797static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info,
2798                                        u64 logical, u64 length)
2799{
2800        struct extent_map_tree *em_tree;
2801        struct extent_map *em;
2802
2803        em_tree = &fs_info->mapping_tree.map_tree;
2804        read_lock(&em_tree->lock);
2805        em = lookup_extent_mapping(em_tree, logical, length);
2806        read_unlock(&em_tree->lock);
2807
2808        if (!em) {
2809                btrfs_crit(fs_info, "unable to find logical %llu length %llu",
2810                           logical, length);
2811                return ERR_PTR(-EINVAL);
2812        }
2813
2814        if (em->start > logical || em->start + em->len < logical) {
2815                btrfs_crit(fs_info,
2816                           "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
2817                           logical, length, em->start, em->start + em->len);
2818                free_extent_map(em);
2819                return ERR_PTR(-EINVAL);
2820        }
2821
2822        /* callers are responsible for dropping em's ref. */
2823        return em;
2824}
2825
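    /*
     * Remove the chunk at @chunk_offset: free all its dev extents, delete the
     * chunk item (and its sys_chunk_array copy for SYSTEM chunks) and remove
     * the now empty block group.
     */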
2826int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2827                       struct btrfs_fs_info *fs_info, u64 chunk_offset)
2828{
2829        struct extent_map *em;
2830        struct map_lookup *map;
2831        u64 dev_extent_len = 0;
2832        int i, ret = 0;
2833        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2834
2835        em = get_chunk_map(fs_info, chunk_offset, 1);
2836        if (IS_ERR(em)) {
2837                /*
2838                 * This is a logic error, but we don't want to just rely on the
2839                 * user having built with ASSERT enabled, so if ASSERT doesn't
2840                 * do anything we still error out.
2841                 */
2842                ASSERT(0);
2843                return PTR_ERR(em);
2844        }
2845        map = em->map_lookup;
2846        mutex_lock(&fs_info->chunk_mutex);
2847        check_system_chunk(trans, fs_info, map->type);
2848        mutex_unlock(&fs_info->chunk_mutex);
2849
2850        /*
2851         * Take the device list mutex to prevent races with the final phase of
2852         * a device replace operation that replaces the device object associated
2853         * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
2854         */
2855        mutex_lock(&fs_devices->device_list_mutex);
2856        for (i = 0; i < map->num_stripes; i++) {
2857                struct btrfs_device *device = map->stripes[i].dev;
2858                ret = btrfs_free_dev_extent(trans, device,
2859                                            map->stripes[i].physical,
2860                                            &dev_extent_len);
2861                if (ret) {
2862                        mutex_unlock(&fs_devices->device_list_mutex);
2863                        btrfs_abort_transaction(trans, ret);
2864                        goto out;
2865                }
2866
2867                if (device->bytes_used > 0) {
2868                        mutex_lock(&fs_info->chunk_mutex);
2869                        btrfs_device_set_bytes_used(device,
2870                                        device->bytes_used - dev_extent_len);
2871                        atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
2872                        btrfs_clear_space_info_full(fs_info);
2873                        mutex_unlock(&fs_info->chunk_mutex);
2874                }
2875
2876                if (map->stripes[i].dev) {
2877                        ret = btrfs_update_device(trans, map->stripes[i].dev);
2878                        if (ret) {
2879                                mutex_unlock(&fs_devices->device_list_mutex);
2880                                btrfs_abort_transaction(trans, ret);
2881                                goto out;
2882                        }
2883                }
2884        }
2885        mutex_unlock(&fs_devices->device_list_mutex);
2886
2887        ret = btrfs_free_chunk(trans, fs_info, chunk_offset);
2888        if (ret) {
2889                btrfs_abort_transaction(trans, ret);
2890                goto out;
2891        }
2892
2893        trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
2894
2895        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2896                ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
2897                if (ret) {
2898                        btrfs_abort_transaction(trans, ret);
2899                        goto out;
2900                }
2901        }
2902
2903        ret = btrfs_remove_block_group(trans, fs_info, chunk_offset, em);
2904        if (ret) {
2905                btrfs_abort_transaction(trans, ret);
2906                goto out;
2907        }
2908
2909out:
2910        /* once for us */
2911        free_extent_map(em);
2912        return ret;
2913}
2914
2915static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2916{
2917        struct btrfs_root *root = fs_info->chunk_root;
2918        struct btrfs_trans_handle *trans;
2919        int ret;
2920
2921        /*
2922         * Prevent races with automatic removal of unused block groups.
2923         * After we relocate and before we remove the chunk with offset
2924         * chunk_offset, automatic removal of the block group can kick in,
2925         * resulting in a failure when calling btrfs_remove_chunk() below.
2926         *
2927         * Make sure to acquire this mutex before doing a tree search (dev
2928         * or chunk trees) to find chunks. Otherwise the cleaner kthread might
2929         * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
2930         * we release the path used to search the chunk/dev tree and before
2931         * the current task acquires this mutex and calls us.
2932         */
2933        ASSERT(mutex_is_locked(&fs_info->delete_unused_bgs_mutex));
2934
2935        ret = btrfs_can_relocate(fs_info, chunk_offset);
2936        if (ret)
2937                return -ENOSPC;
2938
2939        /* step one, relocate all the extents inside this chunk */
2940        btrfs_scrub_pause(fs_info);
2941        ret = btrfs_relocate_block_group(fs_info, chunk_offset);
2942        btrfs_scrub_continue(fs_info);
2943        if (ret)
2944                return ret;
2945
2946        trans = btrfs_start_trans_remove_block_group(root->fs_info,
2947                                                     chunk_offset);
2948        if (IS_ERR(trans)) {
2949                ret = PTR_ERR(trans);
2950                btrfs_handle_fs_error(root->fs_info, ret, NULL);
2951                return ret;
2952        }
2953
2954        /*
2955         * step two, delete the device extents and the
2956         * chunk tree entries
2957         */
2958        ret = btrfs_remove_chunk(trans, fs_info, chunk_offset);
2959        btrfs_end_transaction(trans);
2960        return ret;
2961}
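
/*
 * Illustrative sketch, not part of the original file: the shape of
 * btrfs_relocate_chunk() above.  Scrub is paused only around the long
 * relocation step, while the short metadata removal runs in its own
 * transaction.  Plain C with hypothetical callback parameters.
 */
static int example_paused_section(void (*pause)(void), void (*resume)(void),
                                  int (*work)(void))
{
        int ret;

        pause();                /* cf. btrfs_scrub_pause() */
        ret = work();           /* long-running relocation */
        resume();               /* cf. btrfs_scrub_continue() */
        return ret;
}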
2962
2963static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
2964{
2965        struct btrfs_root *chunk_root = fs_info->chunk_root;
2966        struct btrfs_path *path;
2967        struct extent_buffer *leaf;
2968        struct btrfs_chunk *chunk;
2969        struct btrfs_key key;
2970        struct btrfs_key found_key;
2971        u64 chunk_type;
2972        bool retried = false;
2973        int failed = 0;
2974        int ret;
2975
2976        path = btrfs_alloc_path();
2977        if (!path)
2978                return -ENOMEM;
2979
2980again:
2981        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2982        key.offset = (u64)-1;
2983        key.type = BTRFS_CHUNK_ITEM_KEY;
2984
2985        while (1) {
2986                mutex_lock(&fs_info->delete_unused_bgs_mutex);
2987                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2988                if (ret < 0) {
2989                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2990                        goto error;
2991                }
2992                BUG_ON(ret == 0); /* Corruption */
2993
2994                ret = btrfs_previous_item(chunk_root, path, key.objectid,
2995                                          key.type);
2996                if (ret)
2997                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
2998                if (ret < 0)
2999                        goto error;
3000                if (ret > 0)
3001                        break;
3002
3003                leaf = path->nodes[0];
3004                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3005
3006                chunk = btrfs_item_ptr(leaf, path->slots[0],
3007                                       struct btrfs_chunk);
3008                chunk_type = btrfs_chunk_type(leaf, chunk);
3009                btrfs_release_path(path);
3010
3011                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3012                        ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3013                        if (ret == -ENOSPC)
3014                                failed++;
3015                        else
3016                                BUG_ON(ret);
3017                }
3018                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3019
3020                if (found_key.offset == 0)
3021                        break;
3022                key.offset = found_key.offset - 1;
3023        }
3024        ret = 0;
3025        if (failed && !retried) {
3026                failed = 0;
3027                retried = true;
3028                goto again;
3029        } else if (WARN_ON(failed && retried)) {
3030                ret = -ENOSPC;
3031        }
3032error:
3033        btrfs_free_path(path);
3034        return ret;
3035}
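
/*
 * Illustrative sketch, not part of the original file: the reverse
 * chunk-tree walk used above (and again in __btrfs_balance() and
 * btrfs_shrink_device()).  Seeding key.offset with (u64)-1 and stepping
 * to the previous item visits chunks from the highest offset downwards;
 * key.offset = found_key.offset - 1 resumes strictly below the chunk
 * just processed, and offset 0 ends the walk.  Modeled on a sorted
 * array instead of a btree.
 */
static void example_reverse_walk(const unsigned long long *sorted, int n)
{
        unsigned long long key = (unsigned long long)-1;
        int i;

        while (1) {
                /* "previous item": the largest offset <= key, if any */
                for (i = n - 1; i >= 0 && sorted[i] > key; i--)
                        ;
                if (i < 0)
                        break;                  /* nothing at or below key */
                /* ... relocate the chunk at sorted[i] here ... */
                if (sorted[i] == 0)
                        break;                  /* lowest possible offset */
                key = sorted[i] - 1;            /* resume strictly below it */
        }
}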
3036
3037static int insert_balance_item(struct btrfs_fs_info *fs_info,
3038                               struct btrfs_balance_control *bctl)
3039{
3040        struct btrfs_root *root = fs_info->tree_root;
3041        struct btrfs_trans_handle *trans;
3042        struct btrfs_balance_item *item;
3043        struct btrfs_disk_balance_args disk_bargs;
3044        struct btrfs_path *path;
3045        struct extent_buffer *leaf;
3046        struct btrfs_key key;
3047        int ret, err;
3048
3049        path = btrfs_alloc_path();
3050        if (!path)
3051                return -ENOMEM;
3052
3053        trans = btrfs_start_transaction(root, 0);
3054        if (IS_ERR(trans)) {
3055                btrfs_free_path(path);
3056                return PTR_ERR(trans);
3057        }
3058
3059        key.objectid = BTRFS_BALANCE_OBJECTID;
3060        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3061        key.offset = 0;
3062
3063        ret = btrfs_insert_empty_item(trans, root, path, &key,
3064                                      sizeof(*item));
3065        if (ret)
3066                goto out;
3067
3068        leaf = path->nodes[0];
3069        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3070
3071        memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3072
3073        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3074        btrfs_set_balance_data(leaf, item, &disk_bargs);
3075        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3076        btrfs_set_balance_meta(leaf, item, &disk_bargs);
3077        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3078        btrfs_set_balance_sys(leaf, item, &disk_bargs);
3079
3080        btrfs_set_balance_flags(leaf, item, bctl->flags);
3081
3082        btrfs_mark_buffer_dirty(leaf);
3083out:
3084        btrfs_free_path(path);
3085        err = btrfs_commit_transaction(trans);
3086        if (err && !ret)
3087                ret = err;
3088        return ret;
3089}
3090
3091static int del_balance_item(struct btrfs_fs_info *fs_info)
3092{
3093        struct btrfs_root *root = fs_info->tree_root;
3094        struct btrfs_trans_handle *trans;
3095        struct btrfs_path *path;
3096        struct btrfs_key key;
3097        int ret, err;
3098
3099        path = btrfs_alloc_path();
3100        if (!path)
3101                return -ENOMEM;
3102
3103        trans = btrfs_start_transaction(root, 0);
3104        if (IS_ERR(trans)) {
3105                btrfs_free_path(path);
3106                return PTR_ERR(trans);
3107        }
3108
3109        key.objectid = BTRFS_BALANCE_OBJECTID;
3110        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3111        key.offset = 0;
3112
3113        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3114        if (ret < 0)
3115                goto out;
3116        if (ret > 0) {
3117                ret = -ENOENT;
3118                goto out;
3119        }
3120
3121        ret = btrfs_del_item(trans, root, path);
3122out:
3123        btrfs_free_path(path);
3124        err = btrfs_commit_transaction(trans);
3125        if (err && !ret)
3126                ret = err;
3127        return ret;
3128}
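
/*
 * Illustrative sketch, not part of the original file: the ret/err idiom
 * shared by insert_balance_item() and del_balance_item() above.  The
 * first failure wins: a commit error is surfaced only when the item
 * operation itself succeeded.
 */
static int example_first_error_wins(int op_ret, int commit_ret)
{
        return op_ret ? op_ret : commit_ret;
}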
3129
3130/*
3131 * This is a heuristic used to reduce the number of chunks balanced on
3132 * resume after balance was interrupted.
3133 */
3134static void update_balance_args(struct btrfs_balance_control *bctl)
3135{
3136        /*
3137         * Turn on soft mode for chunk types that were being converted.
3138         */
3139        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3140                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3141        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3142                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3143        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3144                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3145
3146        /*
3147         * Turn on the usage filter if it is not already used.  The idea is
3148         * that chunks that we have already balanced should be
3149         * reasonably full.  Don't do it for chunks that are being
3150         * converted - that will keep us from relocating unconverted
3151         * (albeit full) chunks.
3152         */
3153        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3154            !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3155            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3156                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3157                bctl->data.usage = 90;
3158        }
3159        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3160            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3161            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3162                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3163                bctl->sys.usage = 90;
3164        }
3165        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3166            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3167            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3168                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3169                bctl->meta.usage = 90;
3170        }
3171}
3172
3173/*
3174 * Should be called with both balance and volume mutexes held to
3175 * serialize other volume operations (add_dev/rm_dev/resize) with
3176 * the restriper.  Same goes for unset_balance_control.
3177 */
3178static void set_balance_control(struct btrfs_balance_control *bctl)
3179{
3180        struct btrfs_fs_info *fs_info = bctl->fs_info;
3181
3182        BUG_ON(fs_info->balance_ctl);
3183
3184        spin_lock(&fs_info->balance_lock);
3185        fs_info->balance_ctl = bctl;
3186        spin_unlock(&fs_info->balance_lock);
3187}
3188
3189static void unset_balance_control(struct btrfs_fs_info *fs_info)
3190{
3191        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3192
3193        BUG_ON(!fs_info->balance_ctl);
3194
3195        spin_lock(&fs_info->balance_lock);
3196        fs_info->balance_ctl = NULL;
3197        spin_unlock(&fs_info->balance_lock);
3198
3199        kfree(bctl);
3200}
3201
3202/*
3203 * Balance filters.  Return 1 if chunk should be filtered out
3204 * (should not be balanced).
3205 */
3206static int chunk_profiles_filter(u64 chunk_type,
3207                                 struct btrfs_balance_args *bargs)
3208{
3209        chunk_type = chunk_to_extended(chunk_type) &
3210                                BTRFS_EXTENDED_PROFILE_MASK;
3211
3212        if (bargs->profiles & chunk_type)
3213                return 0;
3214
3215        return 1;
3216}
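
/*
 * Illustrative sketch, not part of the original file: what
 * chunk_to_extended() (volumes.h) does for the filter above.  The
 * "single" profile has no profile bits on disk, so it is mapped to a
 * dedicated bit to make every profile testable with a plain mask; the
 * mask and bit are parameters here instead of the kernel constants.
 */
static unsigned long long example_to_extended(unsigned long long flags,
                                              unsigned long long profile_mask,
                                              unsigned long long single_bit)
{
        if ((flags & profile_mask) == 0)
                flags |= single_bit;    /* "single" becomes testable */
        return flags;
}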
3217
3218static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3219                              struct btrfs_balance_args *bargs)
3220{
3221        struct btrfs_block_group_cache *cache;
3222        u64 chunk_used;
3223        u64 user_thresh_min;
3224        u64 user_thresh_max;
3225        int ret = 1;
3226
3227        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3228        chunk_used = btrfs_block_group_used(&cache->item);
3229
3230        if (bargs->usage_min == 0)
3231                user_thresh_min = 0;
3232        else
3233                user_thresh_min = div_factor_fine(cache->key.offset,
3234                                        bargs->usage_min);
3235
3236        if (bargs->usage_max == 0)
3237                user_thresh_max = 1;
3238        else if (bargs->usage_max > 100)
3239                user_thresh_max = cache->key.offset;
3240        else
3241                user_thresh_max = div_factor_fine(cache->key.offset,
3242                                        bargs->usage_max);
3243
3244        if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3245                ret = 0;
3246
3247        btrfs_put_block_group(cache);
3248        return ret;
3249}
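
/*
 * Illustrative sketch, not part of the original file: div_factor_fine()
 * (math.h) computes a percentage, num * factor / 100, so with
 * usage_min = 10 the lower threshold above is 10% of the block group
 * size.  Self-contained version with unsigned long long standing in
 * for u64.
 */
static unsigned long long example_div_factor_fine(unsigned long long num,
                                                  int factor)
{
        if (factor == 100)
                return num;
        num *= factor;
        return num / 100;               /* the kernel uses do_div() here */
}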
3250
3251static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3252                u64 chunk_offset, struct btrfs_balance_args *bargs)
3253{
3254        struct btrfs_block_group_cache *cache;
3255        u64 chunk_used, user_thresh;
3256        int ret = 1;
3257
3258        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3259        chunk_used = btrfs_block_group_used(&cache->item);
3260
3261        if (bargs->usage_min == 0)
3262                user_thresh = 1;
3263        else if (bargs->usage > 100)
3264                user_thresh = cache->key.offset;
3265        else
3266                user_thresh = div_factor_fine(cache->key.offset,
3267                                              bargs->usage);
3268
3269        if (chunk_used < user_thresh)
3270                ret = 0;
3271
3272        btrfs_put_block_group(cache);
3273        return ret;
3274}
3275
3276static int chunk_devid_filter(struct extent_buffer *leaf,
3277                              struct btrfs_chunk *chunk,
3278                              struct btrfs_balance_args *bargs)
3279{
3280        struct btrfs_stripe *stripe;
3281        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3282        int i;
3283
3284        for (i = 0; i < num_stripes; i++) {
3285                stripe = btrfs_stripe_nr(chunk, i);
3286                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3287                        return 0;
3288        }
3289
3290        return 1;
3291}
3292
3293/* [pstart, pend) */
3294static int chunk_drange_filter(struct extent_buffer *leaf,
3295                               struct btrfs_chunk *chunk,
3296                               struct btrfs_balance_args *bargs)
3297{
3298        struct btrfs_stripe *stripe;
3299        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3300        u64 stripe_offset;
3301        u64 stripe_length;
3302        int factor;
3303        int i;
3304
3305        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3306                return 0;
3307
3308        if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3309             BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3310                factor = num_stripes / 2;
3311        } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3312                factor = num_stripes - 1;
3313        } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3314                factor = num_stripes - 2;
3315        } else {
3316                factor = num_stripes;
3317        }
3318
3319        for (i = 0; i < num_stripes; i++) {
3320                stripe = btrfs_stripe_nr(chunk, i);
3321                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3322                        continue;
3323
3324                stripe_offset = btrfs_stripe_offset(leaf, stripe);
3325                stripe_length = btrfs_chunk_length(leaf, chunk);
3326                stripe_length = div_u64(stripe_length, factor);
3327
3328                if (stripe_offset < bargs->pend &&
3329                    stripe_offset + stripe_length > bargs->pstart)
3330                        return 0;
3331        }
3332
3333        return 1;
3334}
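
/*
 * Illustrative sketch, not part of the original file: how the factor
 * computed above turns the logical chunk length into the length of one
 * device extent.  DUP/RAID1/RAID10 store two copies (num_stripes / 2
 * independent columns), RAID5 spends one stripe on parity and RAID6
 * two; everything else stripes plain data across all devices.
 */
static unsigned long long example_stripe_length(unsigned long long chunk_len,
                                                int factor)
{
        /* e.g. a 2 GiB RAID10 chunk with 4 stripes: factor 2, 1 GiB each */
        return chunk_len / factor;
}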
3335
3336/* [vstart, vend) */
3337static int chunk_vrange_filter(struct extent_buffer *leaf,
3338                               struct btrfs_chunk *chunk,
3339                               u64 chunk_offset,
3340                               struct btrfs_balance_args *bargs)
3341{
3342        if (chunk_offset < bargs->vend &&
3343            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3344                /* at least part of the chunk is inside this vrange */
3345                return 0;
3346
3347        return 1;
3348}
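
/*
 * Illustrative sketch, not part of the original file: the half-open
 * interval test used by the vrange filter above and by the drange
 * filter.  [a_start, a_end) intersects [b_start, b_end) exactly when
 * each range starts before the other one ends.
 */
static int example_ranges_overlap(unsigned long long a_start,
                                  unsigned long long a_end,
                                  unsigned long long b_start,
                                  unsigned long long b_end)
{
        return a_start < b_end && b_start < a_end;
}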
3349
3350static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3351                               struct btrfs_chunk *chunk,
3352                               struct btrfs_balance_args *bargs)
3353{
3354        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3355
3356        if (bargs->stripes_min <= num_stripes
3357                        && num_stripes <= bargs->stripes_max)
3358                return 0;
3359
3360        return 1;
3361}
3362
3363static int chunk_soft_convert_filter(u64 chunk_type,
3364                                     struct btrfs_balance_args *bargs)
3365{
3366        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3367                return 0;
3368
3369        chunk_type = chunk_to_extended(chunk_type) &
3370                                BTRFS_EXTENDED_PROFILE_MASK;
3371
3372        if (bargs->target == chunk_type)
3373                return 1;
3374
3375        return 0;
3376}
3377
3378static int should_balance_chunk(struct btrfs_fs_info *fs_info,
3379                                struct extent_buffer *leaf,
3380                                struct btrfs_chunk *chunk, u64 chunk_offset)
3381{
3382        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3383        struct btrfs_balance_args *bargs = NULL;
3384        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3385
3386        /* type filter */
3387        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3388              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3389                return 0;
3390        }
3391
3392        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3393                bargs = &bctl->data;
3394        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3395                bargs = &bctl->sys;
3396        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3397                bargs = &bctl->meta;
3398
3399        /* profiles filter */
3400        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3401            chunk_profiles_filter(chunk_type, bargs)) {
3402                return 0;
3403        }
3404
3405        /* usage filter */
3406        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3407            chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3408                return 0;
3409        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3410            chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3411                return 0;
3412        }
3413
3414        /* devid filter */
3415        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3416            chunk_devid_filter(leaf, chunk, bargs)) {
3417                return 0;
3418        }
3419
3420        /* drange filter, makes sense only with devid filter */
3421        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3422            chunk_drange_filter(leaf, chunk, bargs)) {
3423                return 0;
3424        }
3425
3426        /* vrange filter */
3427        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3428            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3429                return 0;
3430        }
3431
3432        /* stripes filter */
3433        if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3434            chunk_stripes_range_filter(leaf, chunk, bargs)) {
3435                return 0;
3436        }
3437
3438        /* soft profile changing mode */
3439        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3440            chunk_soft_convert_filter(chunk_type, bargs)) {
3441                return 0;
3442        }
3443
3444        /*
3445         * Limited by count; must be the last filter since it consumes limits.
3446         */
3447        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3448                if (bargs->limit == 0)
3449                        return 0;
3450                else
3451                        bargs->limit--;
3452        } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3453                /*
3454                 * Same logic as the 'limit' filter; the minimum cannot be
3455                 * determined here because we do not have the global information
3456                 * about the count of all chunks that satisfy the filters.
3457                 */
3458                if (bargs->limit_max == 0)
3459                        return 0;
3460                else
3461                        bargs->limit_max--;
3462        }
3463
3464        return 1;
3465}
3466
3467static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3468{
3469        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3470        struct btrfs_root *chunk_root = fs_info->chunk_root;
3471        struct btrfs_root *dev_root = fs_info->dev_root;
3472        struct list_head *devices;
3473        struct btrfs_device *device;
3474        u64 old_size;
3475        u64 size_to_free;
3476        u64 chunk_type;
3477        struct btrfs_chunk *chunk;
3478        struct btrfs_path *path = NULL;
3479        struct btrfs_key key;
3480        struct btrfs_key found_key;
3481        struct btrfs_trans_handle *trans;
3482        struct extent_buffer *leaf;
3483        int slot;
3484        int ret;
3485        int enospc_errors = 0;
3486        bool counting = true;
3487        /* The single value limit and min/max limits share union bytes; save the originals */
3488        u64 limit_data = bctl->data.limit;
3489        u64 limit_meta = bctl->meta.limit;
3490        u64 limit_sys = bctl->sys.limit;
3491        u32 count_data = 0;
3492        u32 count_meta = 0;
3493        u32 count_sys = 0;
3494        int chunk_reserved = 0;
3495        u64 bytes_used = 0;
3496
3497        /* step one, make some room on all the devices */
3498        devices = &fs_info->fs_devices->devices;
3499        list_for_each_entry(device, devices, dev_list) {
3500                old_size = btrfs_device_get_total_bytes(device);
3501                size_to_free = div_factor(old_size, 1);
3502                size_to_free = min_t(u64, size_to_free, SZ_1M);
3503                if (!device->writeable ||
3504                    btrfs_device_get_total_bytes(device) -
3505                    btrfs_device_get_bytes_used(device) > size_to_free ||
3506                    device->is_tgtdev_for_dev_replace)
3507                        continue;
3508
3509                ret = btrfs_shrink_device(device, old_size - size_to_free);
3510                if (ret == -ENOSPC)
3511                        break;
3512                if (ret) {
3513                        /* btrfs_shrink_device never returns ret > 0 */
3514                        WARN_ON(ret > 0);
3515                        goto error;
3516                }
3517
3518                trans = btrfs_start_transaction(dev_root, 0);
3519                if (IS_ERR(trans)) {
3520                        ret = PTR_ERR(trans);
3521                        btrfs_info_in_rcu(fs_info,
3522                 "resize: unable to start transaction after shrinking device %s (error %d), old size %llu, new size %llu",
3523                                          rcu_str_deref(device->name), ret,
3524                                          old_size, old_size - size_to_free);
3525                        goto error;
3526                }
3527
3528                ret = btrfs_grow_device(trans, device, old_size);
3529                if (ret) {
3530                        btrfs_end_transaction(trans);
3531                        /* btrfs_grow_device never returns ret > 0 */
3532                        WARN_ON(ret > 0);
3533                        btrfs_info_in_rcu(fs_info,
3534                 "resize: unable to grow device after shrinking device %s (error %d), old size %llu, new size %llu",
3535                                          rcu_str_deref(device->name), ret,
3536                                          old_size, old_size - size_to_free);
3537                        goto error;
3538                }
3539
3540                btrfs_end_transaction(trans);
3541        }
3542
3543        /* step two, relocate all the chunks */
3544        path = btrfs_alloc_path();
3545        if (!path) {
3546                ret = -ENOMEM;
3547                goto error;
3548        }
3549
3550        /* zero out stat counters */
3551        spin_lock(&fs_info->balance_lock);
3552        memset(&bctl->stat, 0, sizeof(bctl->stat));
3553        spin_unlock(&fs_info->balance_lock);
3554again:
3555        if (!counting) {
3556                /*
3557                 * The single value limit and the min/max limits share union
3558                 * bytes; restore what the counting pass consumed.
3559                 */
3560                bctl->data.limit = limit_data;
3561                bctl->meta.limit = limit_meta;
3562                bctl->sys.limit = limit_sys;
3563        }
3564        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3565        key.offset = (u64)-1;
3566        key.type = BTRFS_CHUNK_ITEM_KEY;
3567
3568        while (1) {
3569                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3570                    atomic_read(&fs_info->balance_cancel_req)) {
3571                        ret = -ECANCELED;
3572                        goto error;
3573                }
3574
3575                mutex_lock(&fs_info->delete_unused_bgs_mutex);
3576                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3577                if (ret < 0) {
3578                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3579                        goto error;
3580                }
3581
3582                /*
3583                 * This shouldn't happen: it means the last relocate
3584                 * failed.
3585                 */
3586                if (ret == 0)
3587                        BUG(); /* FIXME break ? */
3588
3589                ret = btrfs_previous_item(chunk_root, path, 0,
3590                                          BTRFS_CHUNK_ITEM_KEY);
3591                if (ret) {
3592                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3593                        ret = 0;
3594                        break;
3595                }
3596
3597                leaf = path->nodes[0];
3598                slot = path->slots[0];
3599                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3600
3601                if (found_key.objectid != key.objectid) {
3602                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3603                        break;
3604                }
3605
3606                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3607                chunk_type = btrfs_chunk_type(leaf, chunk);
3608
3609                if (!counting) {
3610                        spin_lock(&fs_info->balance_lock);
3611                        bctl->stat.considered++;
3612                        spin_unlock(&fs_info->balance_lock);
3613                }
3614
3615                ret = should_balance_chunk(fs_info, leaf, chunk,
3616                                           found_key.offset);
3617
3618                btrfs_release_path(path);
3619                if (!ret) {
3620                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3621                        goto loop;
3622                }
3623
3624                if (counting) {
3625                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3626                        spin_lock(&fs_info->balance_lock);
3627                        bctl->stat.expected++;
3628                        spin_unlock(&fs_info->balance_lock);
3629
3630                        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3631                                count_data++;
3632                        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3633                                count_sys++;
3634                        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3635                                count_meta++;
3636
3637                        goto loop;
3638                }
3639
3640                /*
3641                 * Apply the limit_min filter.  There is no need to check
3642                 * whether the LIMITS filter is used: limit_min is 0 by default.
3643                 */
3644                if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3645                                        count_data < bctl->data.limit_min)
3646                                || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3647                                        count_meta < bctl->meta.limit_min)
3648                                || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3649                                        count_sys < bctl->sys.limit_min)) {
3650                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3651                        goto loop;
3652                }
3653
3654                ASSERT(fs_info->data_sinfo);
3655                spin_lock(&fs_info->data_sinfo->lock);
3656                bytes_used = fs_info->data_sinfo->bytes_used;
3657                spin_unlock(&fs_info->data_sinfo->lock);
3658
3659                if ((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3660                    !chunk_reserved && !bytes_used) {
3661                        trans = btrfs_start_transaction(chunk_root, 0);
3662                        if (IS_ERR(trans)) {
3663                                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3664                                ret = PTR_ERR(trans);
3665                                goto error;
3666                        }
3667
3668                        ret = btrfs_force_chunk_alloc(trans, fs_info,
3669                                                      BTRFS_BLOCK_GROUP_DATA);
3670                        btrfs_end_transaction(trans);
3671                        if (ret < 0) {
3672                                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3673                                goto error;
3674                        }
3675                        chunk_reserved = 1;
3676                }
3677
3678                ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3679                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3680                if (ret && ret != -ENOSPC)
3681                        goto error;
3682                if (ret == -ENOSPC) {
3683                        enospc_errors++;
3684                } else {
3685                        spin_lock(&fs_info->balance_lock);
3686                        bctl->stat.completed++;
3687                        spin_unlock(&fs_info->balance_lock);
3688                }
3689loop:
3690                if (found_key.offset == 0)
3691                        break;
3692                key.offset = found_key.offset - 1;
3693        }
3694
3695        if (counting) {
3696                btrfs_release_path(path);
3697                counting = false;
3698                goto again;
3699        }
3700error:
3701        btrfs_free_path(path);
3702        if (enospc_errors) {
3703                btrfs_info(fs_info, "%d enospc errors during balance",
3704                           enospc_errors);
3705                if (!ret)
3706                        ret = -ENOSPC;
3707        }
3708
3709        return ret;
3710}
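
/*
 * Illustrative sketch, not part of the original file: the two-pass
 * shape of __btrfs_balance() above.  The counting pass only fills in
 * bctl->stat.expected and the per-type counters (and consumes the
 * limit counters, which is why the saved limits are restored before
 * the second pass); the second pass actually relocates.  Simplified
 * to a filter callback that is stable across passes.
 */
static void example_two_pass(int (*want)(int chunk), int nchunks,
                             int *expected, int *completed)
{
        int pass, i;

        *expected = *completed = 0;
        for (pass = 0; pass < 2; pass++) {
                for (i = nchunks - 1; i >= 0; i--) {    /* reverse walk */
                        if (!want(i))
                                continue;
                        if (pass == 0)
                                (*expected)++;          /* counting pass */
                        else
                                (*completed)++;         /* relocation pass */
                }
        }
}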
3711
3712/**
3713 * alloc_profile_is_valid - see if a given profile is valid and reduced
3714 * @flags: profile to validate
3715 * @extended: if true @flags is treated as an extended profile
3716 */
3717static int alloc_profile_is_valid(u64 flags, int extended)
3718{
3719        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3720                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
3721
3722        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3723
3724        /* 1) check that all other bits are zeroed */
3725        if (flags & ~mask)
3726                return 0;
3727
3728        /* 2) see if profile is reduced */
3729        if (flags == 0)
3730                return !extended; /* "0" is valid for usual profiles */
3731
3732        /* true if exactly one bit set */
3733        return (flags & (flags - 1)) == 0;
3734}
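
/*
 * Illustrative sketch, not part of the original file: the final check
 * above is the classic "at most one bit set" test.  Clearing the
 * lowest set bit with flags & (flags - 1) yields zero exactly for 0
 * and for powers of two, i.e. for a single (reduced) profile bit.
 */
static int example_at_most_one_bit(unsigned long long flags)
{
        return (flags & (flags - 1)) == 0;      /* true for 0, 1, 2, 4, ... */
}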
3735
3736static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3737{
3738        /* cancel requested || normal exit path */
3739        return atomic_read(&fs_info->balance_cancel_req) ||
3740                (atomic_read(&fs_info->balance_pause_req) == 0 &&
3741                 atomic_read(&fs_info->balance_cancel_req) == 0);
3742}
3743
3744static void __cancel_balance(struct btrfs_fs_info *fs_info)
3745{
3746        int ret;
3747
3748        unset_balance_control(fs_info);
3749        ret = del_balance_item(fs_info);
3750        if (ret)
3751                btrfs_handle_fs_error(fs_info, ret, NULL);
3752
3753        clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3754}
3755
3756/* Non-zero return value signifies invalidity */
3757static inline int validate_convert_profile(struct btrfs_balance_args *bctl_arg,
3758                u64 allowed)
3759{
3760        return ((bctl_arg->flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3761                (!alloc_profile_is_valid(bctl_arg->target, 1) ||
3762                 (bctl_arg->target & ~allowed)));
3763}
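
/*
 * Illustrative sketch, not part of the original file: the subset test
 * inside validate_convert_profile() above.  target & ~allowed is
 * non-zero exactly when the target sets a profile bit that the current
 * device count cannot support.
 */
static int example_target_is_allowed(unsigned long long target,
                                     unsigned long long allowed)
{
        return (target & ~allowed) == 0;
}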
3764
3765/*
3766 * Should be called with both balance and volume mutexes held
3767 */
3768int btrfs_balance(struct btrfs_balance_control *bctl,
3769                  struct btrfs_ioctl_balance_args *bargs)
3770{
3771        struct btrfs_fs_info *fs_info = bctl->fs_info;
3772        u64 meta_target, data_target;
3773        u64 allowed;
3774        int mixed = 0;
3775        int ret;
3776        u64 num_devices;
3777        unsigned seq;
3778
3779        if (btrfs_fs_closing(fs_info) ||
3780            atomic_read(&fs_info->balance_pause_req) ||
3781            atomic_read(&fs_info->balance_cancel_req)) {
3782                ret = -EINVAL;
3783                goto out;
3784        }
3785
3786        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3787        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3788                mixed = 1;
3789
3790        /*
3791         * In case of mixed groups both data and meta should be picked,
3792         * and identical options should be given for both of them.
3793         */
3794        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3795        if (mixed && (bctl->flags & allowed)) {
3796                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3797                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3798                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3799                        btrfs_err(fs_info,
3800                                  "with mixed groups data and metadata balance options must be the same");
3801                        ret = -EINVAL;
3802                        goto out;
3803                }
3804        }
3805
3806        num_devices = fs_info->fs_devices->num_devices;
3807        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
3808        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3809                BUG_ON(num_devices < 1);
3810                num_devices--;
3811        }
3812        btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
3813        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
3814        if (num_devices > 1)
3815                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3816        if (num_devices > 2)
3817                allowed |= BTRFS_BLOCK_GROUP_RAID5;
3818        if (num_devices > 3)
3819                allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3820                            BTRFS_BLOCK_GROUP_RAID6);
3821        if (validate_convert_profile(&bctl->data, allowed)) {
3822                btrfs_err(fs_info,
3823                          "unable to start balance with target data profile %llu",
3824                          bctl->data.target);
3825                ret = -EINVAL;
3826                goto out;
3827        }
3828        if (validate_convert_profile(&bctl->meta, allowed)) {
3829                btrfs_err(fs_info,
3830                          "unable to start balance with target metadata profile %llu",
3831                          bctl->meta.target);
3832                ret = -EINVAL;
3833                goto out;
3834        }
3835        if (validate_convert_profile(&bctl->sys, allowed)) {
3836                btrfs_err(fs_info,
3837                          "unable to start balance with target system profile %llu",
3838                          bctl->sys.target);
3839                ret = -EINVAL;
3840                goto out;
3841        }
3842
3843        /* allow reducing metadata or system integrity only if force is set */
3844        allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3845                        BTRFS_BLOCK_GROUP_RAID10 |
3846                        BTRFS_BLOCK_GROUP_RAID5 |
3847                        BTRFS_BLOCK_GROUP_RAID6;
3848        do {
3849                seq = read_seqbegin(&fs_info->profiles_lock);
3850
3851                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3852                     (fs_info->avail_system_alloc_bits & allowed) &&
3853                     !(bctl->sys.target & allowed)) ||
3854                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3855                     (fs_info->avail_metadata_alloc_bits & allowed) &&
3856                     !(bctl->meta.target & allowed))) {
3857                        if (bctl->flags & BTRFS_BALANCE_FORCE) {
3858                                btrfs_info(fs_info,
3859                                           "force reducing metadata integrity");
3860                        } else {
3861                                btrfs_err(fs_info,
3862                                          "balance will reduce metadata integrity, use force if you want this");
3863                                ret = -EINVAL;
3864                                goto out;
3865                        }
3866                }
3867        } while (read_seqretry(&fs_info->profiles_lock, seq));
3868
3869        /* if we're not converting, the target field is uninitialized */
3870        meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3871                bctl->meta.target : fs_info->avail_metadata_alloc_bits;
3872        data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
3873                bctl->data.target : fs_info->avail_data_alloc_bits;
3874        if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
3875                btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
3876                btrfs_warn(fs_info,
3877                           "metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
3878                           meta_target, data_target);
3879        }
3880
3881        ret = insert_balance_item(fs_info, bctl);
3882        if (ret && ret != -EEXIST)
3883                goto out;
3884
3885        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3886                BUG_ON(ret == -EEXIST);
3887                set_balance_control(bctl);
3888        } else {
3889                BUG_ON(ret != -EEXIST);
3890                spin_lock(&fs_info->balance_lock);
3891                update_balance_args(bctl);
3892                spin_unlock(&fs_info->balance_lock);
3893        }
3894
3895        atomic_inc(&fs_info->balance_running);
3896        mutex_unlock(&fs_info->balance_mutex);
3897
3898        ret = __btrfs_balance(fs_info);
3899
3900        mutex_lock(&fs_info->balance_mutex);
3901        atomic_dec(&fs_info->balance_running);
3902
3903        if (bargs) {
3904                memset(bargs, 0, sizeof(*bargs));
3905                update_ioctl_balance_args(fs_info, 0, bargs);
3906        }
3907
3908        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3909            balance_need_close(fs_info)) {
3910                __cancel_balance(fs_info);
3911        }
3912
3913        wake_up(&fs_info->balance_wait_q);
3914
3915        return ret;
3916out:
3917        if (bctl->flags & BTRFS_BALANCE_RESUME)
3918                __cancel_balance(fs_info);
3919        else {
3920                kfree(bctl);
3921                clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags);
3922        }
3923        return ret;
3924}
3925
3926static int balance_kthread(void *data)
3927{
3928        struct btrfs_fs_info *fs_info = data;
3929        int ret = 0;
3930
3931        mutex_lock(&fs_info->volume_mutex);
3932        mutex_lock(&fs_info->balance_mutex);
3933
3934        if (fs_info->balance_ctl) {
3935                btrfs_info(fs_info, "continuing balance");
3936                ret = btrfs_balance(fs_info->balance_ctl, NULL);
3937        }
3938
3939        mutex_unlock(&fs_info->balance_mutex);
3940        mutex_unlock(&fs_info->volume_mutex);
3941
3942        return ret;
3943}
3944
3945int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3946{
3947        struct task_struct *tsk;
3948
3949        spin_lock(&fs_info->balance_lock);
3950        if (!fs_info->balance_ctl) {
3951                spin_unlock(&fs_info->balance_lock);
3952                return 0;
3953        }
3954        spin_unlock(&fs_info->balance_lock);
3955
3956        if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
3957                btrfs_info(fs_info, "force skipping balance");
3958                return 0;
3959        }
3960
3961        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3962        return PTR_ERR_OR_ZERO(tsk);
3963}
3964
3965int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3966{
3967        struct btrfs_balance_control *bctl;
3968        struct btrfs_balance_item *item;
3969        struct btrfs_disk_balance_args disk_bargs;
3970        struct btrfs_path *path;
3971        struct extent_buffer *leaf;
3972        struct btrfs_key key;
3973        int ret;
3974
3975        path = btrfs_alloc_path();
3976        if (!path)
3977                return -ENOMEM;
3978
3979        key.objectid = BTRFS_BALANCE_OBJECTID;
3980        key.type = BTRFS_TEMPORARY_ITEM_KEY;
3981        key.offset = 0;
3982
3983        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3984        if (ret < 0)
3985                goto out;
3986        if (ret > 0) { /* no balance item found, nothing to resume */
3987                ret = 0;
3988                goto out;
3989        }
3990
3991        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3992        if (!bctl) {
3993                ret = -ENOMEM;
3994                goto out;
3995        }
3996
3997        leaf = path->nodes[0];
3998        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3999
4000        bctl->fs_info = fs_info;
4001        bctl->flags = btrfs_balance_flags(leaf, item);
4002        bctl->flags |= BTRFS_BALANCE_RESUME;
4003
4004        btrfs_balance_data(leaf, item, &disk_bargs);
4005        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4006        btrfs_balance_meta(leaf, item, &disk_bargs);
4007        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4008        btrfs_balance_sys(leaf, item, &disk_bargs);
4009        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4010
4011        WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags));
4012
4013        mutex_lock(&fs_info->volume_mutex);
4014        mutex_lock(&fs_info->balance_mutex);
4015
4016        set_balance_control(bctl);
4017
4018        mutex_unlock(&fs_info->balance_mutex);
4019        mutex_unlock(&fs_info->volume_mutex);
4020out:
4021        btrfs_free_path(path);
4022        return ret;
4023}
4024
4025int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4026{
4027        int ret = 0;
4028
4029        mutex_lock(&fs_info->balance_mutex);
4030        if (!fs_info->balance_ctl) {
4031                mutex_unlock(&fs_info->balance_mutex);
4032                return -ENOTCONN;
4033        }
4034
4035        if (atomic_read(&fs_info->balance_running)) {
4036                atomic_inc(&fs_info->balance_pause_req);
4037                mutex_unlock(&fs_info->balance_mutex);
4038
4039                wait_event(fs_info->balance_wait_q,
4040                           atomic_read(&fs_info->balance_running) == 0);
4041
4042                mutex_lock(&fs_info->balance_mutex);
4043                /* we are good with balance_ctl ripped off from under us */
4044                BUG_ON(atomic_read(&fs_info->balance_running));
4045                atomic_dec(&fs_info->balance_pause_req);
4046        } else {
4047                ret = -ENOTCONN;
4048        }
4049
4050        mutex_unlock(&fs_info->balance_mutex);
4051        return ret;
4052}
4053
4054int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4055{
4056        if (sb_rdonly(fs_info->sb))
4057                return -EROFS;
4058
4059        mutex_lock(&fs_info->balance_mutex);
4060        if (!fs_info->balance_ctl) {
4061                mutex_unlock(&fs_info->balance_mutex);
4062                return -ENOTCONN;
4063        }
4064
4065        atomic_inc(&fs_info->balance_cancel_req);
4066        /*
4067         * If a balance is running, just wait and return; the balance item
4068         * is deleted by btrfs_balance() in that case.
4069         */
4070        if (atomic_read(&fs_info->balance_running)) {
4071                mutex_unlock(&fs_info->balance_mutex);
4072                wait_event(fs_info->balance_wait_q,
4073                           atomic_read(&fs_info->balance_running) == 0);
4074                mutex_lock(&fs_info->balance_mutex);
4075        } else {
4076                /* __cancel_balance needs volume_mutex */
4077                mutex_unlock(&fs_info->balance_mutex);
4078                mutex_lock(&fs_info->volume_mutex);
4079                mutex_lock(&fs_info->balance_mutex);
4080
4081                if (fs_info->balance_ctl)
4082                        __cancel_balance(fs_info);
4083
4084                mutex_unlock(&fs_info->volume_mutex);
4085        }
4086
4087        BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
4088        atomic_dec(&fs_info->balance_cancel_req);
4089        mutex_unlock(&fs_info->balance_mutex);
4090        return 0;
4091}
4092
4093static int btrfs_uuid_scan_kthread(void *data)
4094{
4095        struct btrfs_fs_info *fs_info = data;
4096        struct btrfs_root *root = fs_info->tree_root;
4097        struct btrfs_key key;
4098        struct btrfs_path *path = NULL;
4099        int ret = 0;
4100        struct extent_buffer *eb;
4101        int slot;
4102        struct btrfs_root_item root_item;
4103        u32 item_size;
4104        struct btrfs_trans_handle *trans = NULL;
4105
4106        path = btrfs_alloc_path();
4107        if (!path) {
4108                ret = -ENOMEM;
4109                goto out;
4110        }
4111
4112        key.objectid = 0;
4113        key.type = BTRFS_ROOT_ITEM_KEY;
4114        key.offset = 0;
4115
4116        while (1) {
4117                ret = btrfs_search_forward(root, &key, path, 0);
4118                if (ret) {
4119                        if (ret > 0)
4120                                ret = 0;
4121                        break;
4122                }
4123
4124                if (key.type != BTRFS_ROOT_ITEM_KEY ||
4125                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4126                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4127                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
4128                        goto skip;
4129
4130                eb = path->nodes[0];
4131                slot = path->slots[0];
4132                item_size = btrfs_item_size_nr(eb, slot);
4133                if (item_size < sizeof(root_item))
4134                        goto skip;
4135
4136                read_extent_buffer(eb, &root_item,
4137                                   btrfs_item_ptr_offset(eb, slot),
4138                                   (int)sizeof(root_item));
4139                if (btrfs_root_refs(&root_item) == 0)
4140                        goto skip;
4141
4142                if (!btrfs_is_empty_uuid(root_item.uuid) ||
4143                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
4144                        if (trans)
4145                                goto update_tree;
4146
4147                        btrfs_release_path(path);
4148                        /*
4149                         * 1 - subvol uuid item
4150                         * 1 - received_subvol uuid item
4151                         */
4152                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4153                        if (IS_ERR(trans)) {
4154                                ret = PTR_ERR(trans);
4155                                break;
4156                        }
4157                        continue;
4158                } else {
4159                        goto skip;
4160                }
4161update_tree:
4162                if (!btrfs_is_empty_uuid(root_item.uuid)) {
4163                        ret = btrfs_uuid_tree_add(trans, fs_info,
4164                                                  root_item.uuid,
4165                                                  BTRFS_UUID_KEY_SUBVOL,
4166                                                  key.objectid);
4167                        if (ret < 0) {
4168                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4169                                        ret);
4170                                break;
4171                        }
4172                }
4173
4174                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4175                        ret = btrfs_uuid_tree_add(trans, fs_info,
4176                                                  root_item.received_uuid,
4177                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4178                                                  key.objectid);
4179                        if (ret < 0) {
4180                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
4181                                        ret);
4182                                break;
4183                        }
4184                }
4185
4186skip:
4187                if (trans) {
4188                        ret = btrfs_end_transaction(trans);
4189                        trans = NULL;
4190                        if (ret)
4191                                break;
4192                }
4193
4194                btrfs_release_path(path);
4195                if (key.offset < (u64)-1) {
4196                        key.offset++;
4197                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4198                        key.offset = 0;
4199                        key.type = BTRFS_ROOT_ITEM_KEY;
4200                } else if (key.objectid < (u64)-1) {
4201                        key.offset = 0;
4202                        key.type = BTRFS_ROOT_ITEM_KEY;
4203                        key.objectid++;
4204                } else {
4205                        break;
4206                }
4207                cond_resched();
4208        }
4209
4210out:
4211        btrfs_free_path(path);
4212        if (trans && !IS_ERR(trans))
4213                btrfs_end_transaction(trans);
4214        if (ret)
4215                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4216        else
4217                set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4218        up(&fs_info->uuid_tree_rescan_sem);
4219        return 0;
4220}
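
/*
 * Illustrative sketch, not part of the original file: the key stepping
 * at the bottom of the scan loop computes the lexicographic successor
 * of (objectid, type, offset) so that btrfs_search_forward() never
 * revisits an item.  Reduced to two components here.
 */
struct example_key {
        unsigned long long objectid;
        unsigned long long offset;
};

static int example_key_advance(struct example_key *k)
{
        if (k->offset < (unsigned long long)-1) {
                k->offset++;
        } else if (k->objectid < (unsigned long long)-1) {
                k->offset = 0;
                k->objectid++;
        } else {
                return 0;               /* already at the maximum key */
        }
        return 1;
}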
4221
4222/*
4223 * Callback for btrfs_uuid_tree_iterate().
4224 * returns:
4225 * 0    check succeeded, the entry is not outdated.
4226 * < 0  if an error occurred.
4227 * > 0  if the check failed, which means the caller shall remove the entry.
4228 */
4229static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
4230                                       u8 *uuid, u8 type, u64 subid)
4231{
4232        struct btrfs_key key;
4233        int ret = 0;
4234        struct btrfs_root *subvol_root;
4235
4236        if (type != BTRFS_UUID_KEY_SUBVOL &&
4237            type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
4238                goto out;
4239
4240        key.objectid = subid;
4241        key.type = BTRFS_ROOT_ITEM_KEY;
4242        key.offset = (u64)-1;
4243        subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
4244        if (IS_ERR(subvol_root)) {
4245                ret = PTR_ERR(subvol_root);
4246                if (ret == -ENOENT)
4247                        ret = 1;
4248                goto out;
4249        }
4250
4251        switch (type) {
4252        case BTRFS_UUID_KEY_SUBVOL:
4253                if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
4254                        ret = 1;
4255                break;
4256        case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
4257                if (memcmp(uuid, subvol_root->root_item.received_uuid,
4258                           BTRFS_UUID_SIZE))
4259                        ret = 1;
4260                break;
4261        }
4262
4263out:
4264        return ret;
4265}
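
/*
 * Illustrative sketch, not part of the original file: how a caller
 * such as btrfs_uuid_tree_iterate() is expected to consume the
 * tri-state convention documented above (0 keep, > 0 remove,
 * < 0 error).
 */
static int example_apply_check(int check_ret, int *remove_entry)
{
        if (check_ret < 0)
                return check_ret;       /* propagate the error */
        *remove_entry = (check_ret > 0);
        return 0;
}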
4266
4267static int btrfs_uuid_rescan_kthread(void *data)
4268{
4269        struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
4270        int ret;
4271
4272        /*
4273         * 1st step is to iterate through the existing UUID tree and
4274         * to delete all entries that contain outdated data.
4275         * 2nd step is to add all missing entries to the UUID tree.
4276         */
4277        ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
4278        if (ret < 0) {
4279                btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
4280                up(&fs_info->uuid_tree_rescan_sem);
4281                return ret;
4282        }
4283        return btrfs_uuid_scan_kthread(data);
4284}
4285
4286int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4287{
4288        struct btrfs_trans_handle *trans;
4289        struct btrfs_root *tree_root = fs_info->tree_root;
4290        struct btrfs_root *uuid_root;
4291        struct task_struct *task;
4292        int ret;
4293
4294        /*
4295         * 1 - root node
4296         * 1 - root item
4297         */
4298        trans = btrfs_start_transaction(tree_root, 2);
4299        if (IS_ERR(trans))
4300                return PTR_ERR(trans);
4301
4302        uuid_root = btrfs_create_tree(trans, fs_info,
4303                                      BTRFS_UUID_TREE_OBJECTID);
4304        if (IS_ERR(uuid_root)) {
4305                ret = PTR_ERR(uuid_root);
4306                btrfs_abort_transaction(trans, ret);
4307                btrfs_end_transaction(trans);
4308                return ret;
4309        }
4310
4311        fs_info->uuid_root = uuid_root;
4312
4313        ret = btrfs_commit_transaction(trans);
4314        if (ret)
4315                return ret;
4316
4317        down(&fs_info->uuid_tree_rescan_sem);
4318        task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4319        if (IS_ERR(task)) {
4320                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4321                btrfs_warn(fs_info, "failed to start uuid_scan task");
4322                up(&fs_info->uuid_tree_rescan_sem);
4323                return PTR_ERR(task);
4324        }
4325
4326        return 0;
4327}
4328
4329int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
4330{
4331        struct task_struct *task;
4332
4333        down(&fs_info->uuid_tree_rescan_sem);
4334        task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
4335        if (IS_ERR(task)) {
4336                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
4337                btrfs_warn(fs_info, "failed to start uuid_rescan task");
4338                up(&fs_info->uuid_tree_rescan_sem);
4339                return PTR_ERR(task);
4340        }
4341
4342        return 0;
4343}
4344
4345/*
4346 * Shrinking a device means finding all of the device extents past
4347 * the new size, and then following the back refs to the chunks.
4348 * The chunk relocation code actually frees the device extents.
4349 */
4350int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4351{
4352        struct btrfs_fs_info *fs_info = device->fs_info;
4353        struct btrfs_root *root = fs_info->dev_root;
4354        struct btrfs_trans_handle *trans;
4355        struct btrfs_dev_extent *dev_extent = NULL;
4356        struct btrfs_path *path;
4357        u64 length;
4358        u64 chunk_offset;
4359        int ret;
4360        int slot;
4361        int failed = 0;
4362        bool retried = false;
4363        bool checked_pending_chunks = false;
4364        struct extent_buffer *l;
4365        struct btrfs_key key;
4366        struct btrfs_super_block *super_copy = fs_info->super_copy;
4367        u64 old_total = btrfs_super_total_bytes(super_copy);
4368        u64 old_size = btrfs_device_get_total_bytes(device);
4369        u64 diff;
4370
4371        new_size = round_down(new_size, fs_info->sectorsize);
4372        diff = round_down(old_size - new_size, fs_info->sectorsize);
4373
4374        if (device->is_tgtdev_for_dev_replace)
4375                return -EINVAL;
4376
4377        path = btrfs_alloc_path();
4378        if (!path)
4379                return -ENOMEM;
4380
4381        path->reada = READA_FORWARD;
4382
4383        mutex_lock(&fs_info->chunk_mutex);
4384
4385        btrfs_device_set_total_bytes(device, new_size);
4386        if (device->writeable) {
4387                device->fs_devices->total_rw_bytes -= diff;
4388                atomic64_sub(diff, &fs_info->free_chunk_space);
4389        }
4390        mutex_unlock(&fs_info->chunk_mutex);
4391
4392again:
4393        key.objectid = device->devid;
4394        key.offset = (u64)-1;
4395        key.type = BTRFS_DEV_EXTENT_KEY;
4396
4397        do {
4398                mutex_lock(&fs_info->delete_unused_bgs_mutex);
4399                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4400                if (ret < 0) {
4401                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4402                        goto done;
4403                }
4404
4405                ret = btrfs_previous_item(root, path, 0, key.type);
4406                if (ret)
4407                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4408                if (ret < 0)
4409                        goto done;
4410                if (ret) {
4411                        ret = 0;
4412                        btrfs_release_path(path);
4413                        break;
4414                }
4415
4416                l = path->nodes[0];
4417                slot = path->slots[0];
4418                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4419
4420                if (key.objectid != device->devid) {
4421                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4422                        btrfs_release_path(path);
4423                        break;
4424                }
4425
4426                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4427                length = btrfs_dev_extent_length(l, dev_extent);
4428
4429                if (key.offset + length <= new_size) {
4430                        mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4431                        btrfs_release_path(path);
4432                        break;
4433                }
4434
4435                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4436                btrfs_release_path(path);
4437
4438                ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4439                mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4440                if (ret && ret != -ENOSPC)
4441                        goto done;
4442                if (ret == -ENOSPC)
4443                        failed++;
4444        } while (key.offset-- > 0);
4445
4446        if (failed && !retried) {
4447                failed = 0;
4448                retried = true;
4449                goto again;
4450        } else if (failed && retried) {
4451                ret = -ENOSPC;
4452                goto done;
4453        }
4454
4455        /* Shrinking succeeded, else we would be at "done". */
4456        trans = btrfs_start_transaction(root, 0);
4457        if (IS_ERR(trans)) {
4458                ret = PTR_ERR(trans);
4459                goto done;
4460        }
4461
4462        mutex_lock(&fs_info->chunk_mutex);
4463
4464        /*
4465         * We checked in the above loop all device extents that were already in
4466         * the device tree. However, before we have updated the device's
4467         * total_bytes to the new size, we might have had chunk allocations that
4468         * have not completed yet (new block groups attached to transaction
4469         * handles), and therefore their device extents were not yet in the
4470         * device tree and we missed them in the loop above. So if we have any
4471         * pending chunk using a device extent that overlaps the device range
4472         * that we cannot use anymore, commit the current transaction and
4473         * repeat the search on the device tree - this way we guarantee we will
4474         * not have chunks using device extents that end beyond 'new_size'.
4475         */
4476        if (!checked_pending_chunks) {
4477                u64 start = new_size;
4478                u64 len = old_size - new_size;
4479
4480                if (contains_pending_extent(trans->transaction, device,
4481                                            &start, len)) {
4482                        mutex_unlock(&fs_info->chunk_mutex);
4483                        checked_pending_chunks = true;
4484                        failed = 0;
4485                        retried = false;
4486                        ret = btrfs_commit_transaction(trans);
4487                        if (ret)
4488                                goto done;
4489                        goto again;
4490                }
4491        }
4492
4493        btrfs_device_set_disk_total_bytes(device, new_size);
4494        if (list_empty(&device->resized_list))
4495                list_add_tail(&device->resized_list,
4496                              &fs_info->fs_devices->resized_devices);
4497
4498        WARN_ON(diff > old_total);
4499        btrfs_set_super_total_bytes(super_copy,
4500                        round_down(old_total - diff, fs_info->sectorsize));
4501        mutex_unlock(&fs_info->chunk_mutex);
4502
4503        /* Now btrfs_update_device() will change the on-disk size. */
4504        ret = btrfs_update_device(trans, device);
4505        btrfs_end_transaction(trans);
4506done:
4507        btrfs_free_path(path);
4508        if (ret) {
4509                mutex_lock(&fs_info->chunk_mutex);
4510                btrfs_device_set_total_bytes(device, old_size);
4511                if (device->writeable) {
4512                        device->fs_devices->total_rw_bytes += diff;
4513                        atomic64_add(diff, &fs_info->free_chunk_space);
                        }
4514                mutex_unlock(&fs_info->chunk_mutex);
4515        }
4516        return ret;
4517}
4518
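    /*
     * Append one (key, chunk item) pair to the superblock's sys_chunk_array:
     * a struct btrfs_disk_key immediately followed by the variable-sized
     * chunk item. Returns -EFBIG if the pair would overflow
     * BTRFS_SYSTEM_CHUNK_ARRAY_SIZE.
     */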
4519static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4520                           struct btrfs_key *key,
4521                           struct btrfs_chunk *chunk, int item_size)
4522{
4523        struct btrfs_super_block *super_copy = fs_info->super_copy;
4524        struct btrfs_disk_key disk_key;
4525        u32 array_size;
4526        u8 *ptr;
4527
4528        mutex_lock(&fs_info->chunk_mutex);
4529        array_size = btrfs_super_sys_array_size(super_copy);
4530        if (array_size + item_size + sizeof(disk_key)
4531                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4532                mutex_unlock(&fs_info->chunk_mutex);
4533                return -EFBIG;
4534        }
4535
4536        ptr = super_copy->sys_chunk_array + array_size;
4537        btrfs_cpu_key_to_disk(&disk_key, key);
4538        memcpy(ptr, &disk_key, sizeof(disk_key));
4539        ptr += sizeof(disk_key);
4540        memcpy(ptr, chunk, item_size);
4541        item_size += sizeof(disk_key);
4542        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4543        mutex_unlock(&fs_info->chunk_mutex);
4544
4545        return 0;
4546}
4547
4548/*
4549 * sort the devices in descending order by max_avail, total_avail
4550 */
4551static int btrfs_cmp_device_info(const void *a, const void *b)
4552{
4553        const struct btrfs_device_info *di_a = a;
4554        const struct btrfs_device_info *di_b = b;
4555
4556        if (di_a->max_avail > di_b->max_avail)
4557                return -1;
4558        if (di_a->max_avail < di_b->max_avail)
4559                return 1;
4560        if (di_a->total_avail > di_b->total_avail)
4561                return -1;
4562        if (di_a->total_avail < di_b->total_avail)
4563                return 1;
4564        return 0;
4565}
4566
4567static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4568{
4569        if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4570                return;
4571
4572        btrfs_set_fs_incompat(info, RAID56);
4573}
4574
4575#define BTRFS_MAX_DEVS(r) ((BTRFS_MAX_ITEM_SIZE(r->fs_info)             \
4576                        - sizeof(struct btrfs_chunk))           \
4577                        / sizeof(struct btrfs_stripe) + 1)
4578
4579#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE        \
4580                                - 2 * sizeof(struct btrfs_disk_key)     \
4581                                - 2 * sizeof(struct btrfs_chunk))       \
4582                                / sizeof(struct btrfs_stripe) + 1)
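    /*
     * Both limits count how many struct btrfs_stripe entries fit:
     * BTRFS_MAX_DEVS() within one chunk item in a single tree leaf,
     * BTRFS_MAX_DEVS_SYS_CHUNK within the superblock's sys_chunk_array
     * while reserving room for two key/chunk pairs. The "+ 1" is the
     * stripe already embedded in struct btrfs_chunk.
     */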
4583
4584static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4585                               u64 start, u64 type)
4586{
4587        struct btrfs_fs_info *info = trans->fs_info;
4588        struct btrfs_fs_devices *fs_devices = info->fs_devices;
4589        struct btrfs_device *device;
4590        struct map_lookup *map = NULL;
4591        struct extent_map_tree *em_tree;
4592        struct extent_map *em;
4593        struct btrfs_device_info *devices_info = NULL;
4594        u64 total_avail;
4595        int num_stripes;        /* total number of stripes to allocate */
4596        int data_stripes;       /* number of stripes that count for
4597                                   block group size */
4598        int sub_stripes;        /* sub_stripes info for map */
4599        int dev_stripes;        /* stripes per dev */
4600        int devs_max;           /* max devs to use */
4601        int devs_min;           /* min devs needed */
4602        int devs_increment;     /* ndevs has to be a multiple of this */
4603        int ncopies;            /* how many copies the data has */
4604        int ret;
4605        u64 max_stripe_size;
4606        u64 max_chunk_size;
4607        u64 stripe_size;
4608        u64 num_bytes;
4609        int ndevs;
4610        int i;
4611        int j;
4612        int index;
4613
4614        BUG_ON(!alloc_profile_is_valid(type, 0));
4615
4616        if (list_empty(&fs_devices->alloc_list))
4617                return -ENOSPC;
4618
4619        index = __get_raid_index(type);
4620
4621        sub_stripes = btrfs_raid_array[index].sub_stripes;
4622        dev_stripes = btrfs_raid_array[index].dev_stripes;
4623        devs_max = btrfs_raid_array[index].devs_max;
4624        devs_min = btrfs_raid_array[index].devs_min;
4625        devs_increment = btrfs_raid_array[index].devs_increment;
4626        ncopies = btrfs_raid_array[index].ncopies;
4627
4628        if (type & BTRFS_BLOCK_GROUP_DATA) {
4629                max_stripe_size = SZ_1G;
4630                max_chunk_size = 10 * max_stripe_size;
4631                if (!devs_max)
4632                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4633        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4634                /* for larger filesystems, use larger metadata chunks */
4635                if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4636                        max_stripe_size = SZ_1G;
4637                else
4638                        max_stripe_size = SZ_256M;
4639                max_chunk_size = max_stripe_size;
4640                if (!devs_max)
4641                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4642        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4643                max_stripe_size = SZ_32M;
4644                max_chunk_size = 2 * max_stripe_size;
4645                if (!devs_max)
4646                        devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4647        } else {
4648                btrfs_err(info, "invalid chunk type 0x%llx requested",
4649                       type);
4650                BUG_ON(1);
4651        }
4652
4653        /* we don't want a chunk larger than 10% of writeable space */
4654        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4655                             max_chunk_size);
4656
4657        devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
4658                               GFP_NOFS);
4659        if (!devices_info)
4660                return -ENOMEM;
4661
4662        /*
4663         * in the first pass through the devices list, we gather information
4664         * about the available holes on each device.
4665         */
4666        ndevs = 0;
4667        list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
4668                u64 max_avail;
4669                u64 dev_offset;
4670
4671                if (!device->writeable) {
4672                        WARN(1,
4673                               "BTRFS: read-only device in alloc_list\n");
4674                        continue;
4675                }
4676
4677                if (!device->in_fs_metadata ||
4678                    device->is_tgtdev_for_dev_replace)
4679                        continue;
4680
4681                if (device->total_bytes > device->bytes_used)
4682                        total_avail = device->total_bytes - device->bytes_used;
4683                else
4684                        total_avail = 0;
4685
4686                /* If there is no space on this device, skip it. */
4687                if (total_avail == 0)
4688                        continue;
4689
4690                ret = find_free_dev_extent(trans, device,
4691                                           max_stripe_size * dev_stripes,
4692                                           &dev_offset, &max_avail);
4693                if (ret && ret != -ENOSPC)
4694                        goto error;
4695
4696                if (ret == 0)
4697                        max_avail = max_stripe_size * dev_stripes;
4698
4699                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4700                        continue;
4701
4702                if (ndevs == fs_devices->rw_devices) {
4703                        WARN(1, "%s: found more than %llu devices\n",
4704                             __func__, fs_devices->rw_devices);
4705                        break;
4706                }
4707                devices_info[ndevs].dev_offset = dev_offset;
4708                devices_info[ndevs].max_avail = max_avail;
4709                devices_info[ndevs].total_avail = total_avail;
4710                devices_info[ndevs].dev = device;
4711                ++ndevs;
4712        }
4713
4714        /*
4715         * now sort the devices by hole size / available space
4716         */
4717        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4718             btrfs_cmp_device_info, NULL);
4719
4720        /* round down to number of usable stripes */
4721        ndevs = round_down(ndevs, devs_increment);
4722
4723        if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4724                ret = -ENOSPC;
4725                goto error;
4726        }
4727
4728        ndevs = min(ndevs, devs_max);
4729
4730        /*
4731         * the primary goal is to maximize the number of stripes, so use as many
4732         * devices as possible, even if the stripes are not maximum sized.
4733         */
4734        stripe_size = devices_info[ndevs-1].max_avail;
4735        num_stripes = ndevs * dev_stripes;
4736
4737        /*
4738         * this will have to be fixed for RAID1 and RAID10 over
4739         * more drives
4740         */
4741        data_stripes = num_stripes / ncopies;
4742
4743        if (type & BTRFS_BLOCK_GROUP_RAID5)
4744                data_stripes = num_stripes - 1;
4745
4746        if (type & BTRFS_BLOCK_GROUP_RAID6)
4747                data_stripes = num_stripes - 2;
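            /*
             * e.g. (illustrative): RAID10 with num_stripes = 4 and
             * ncopies = 2 gives 2 data stripes; RAID6 with
             * num_stripes = 6 gives 4.
             */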
4748
4749        /*
4750         * Use the number of data stripes to figure out how big this chunk
4751         * is really going to be in terms of logical address space,
4752         * and compare that answer with the max chunk size
4753         */
4754        if (stripe_size * data_stripes > max_chunk_size) {
4755                u64 mask = (1ULL << 24) - 1;
4756
4757                stripe_size = div_u64(max_chunk_size, data_stripes);
4758
4759                /* bump the answer up to a 16MB boundary */
4760                stripe_size = (stripe_size + mask) & ~mask;
4761
4762                /* but don't go higher than the limits we found
4763                 * while searching for free extents
4764                 */
4765                if (stripe_size > devices_info[ndevs-1].max_avail)
4766                        stripe_size = devices_info[ndevs-1].max_avail;
4767        }
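            /*
             * e.g. (illustrative): max_chunk_size = 10GiB over 6 data
             * stripes gives ~1706MiB, bumped to 1712MiB by the 16MiB
             * mask, then capped at the smallest hole found above.
             */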
4768
4769        stripe_size = div_u64(stripe_size, dev_stripes);
4770
4771        /* align to BTRFS_STRIPE_LEN */
4772        stripe_size = round_down(stripe_size, BTRFS_STRIPE_LEN);
4773
4774        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4775        if (!map) {
4776                ret = -ENOMEM;
4777                goto error;
4778        }
4779        map->num_stripes = num_stripes;
4780
4781        for (i = 0; i < ndevs; ++i) {
4782                for (j = 0; j < dev_stripes; ++j) {
4783                        int s = i * dev_stripes + j;
4784                        map->stripes[s].dev = devices_info[i].dev;
4785                        map->stripes[s].physical = devices_info[i].dev_offset +
4786                                                   j * stripe_size;
4787                }
4788        }
4789        map->stripe_len = BTRFS_STRIPE_LEN;
4790        map->io_align = BTRFS_STRIPE_LEN;
4791        map->io_width = BTRFS_STRIPE_LEN;
4792        map->type = type;
4793        map->sub_stripes = sub_stripes;
4794
4795        num_bytes = stripe_size * data_stripes;
4796
4797        trace_btrfs_chunk_alloc(info, map, start, num_bytes);
4798
4799        em = alloc_extent_map();
4800        if (!em) {
4801                kfree(map);
4802                ret = -ENOMEM;
4803                goto error;
4804        }
4805        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4806        em->map_lookup = map;
4807        em->start = start;
4808        em->len = num_bytes;
4809        em->block_start = 0;
4810        em->block_len = em->len;
4811        em->orig_block_len = stripe_size;
4812
4813        em_tree = &info->mapping_tree.map_tree;
4814        write_lock(&em_tree->lock);
4815        ret = add_extent_mapping(em_tree, em, 0);
4816        if (!ret) {
4817                list_add_tail(&em->list, &trans->transaction->pending_chunks);
4818                refcount_inc(&em->refs);
4819        }
4820        write_unlock(&em_tree->lock);
4821        if (ret) {
4822                free_extent_map(em);
4823                goto error;
4824        }
4825
4826        ret = btrfs_make_block_group(trans, info, 0, type, start, num_bytes);
4827        if (ret)
4828                goto error_del_extent;
4829
4830        for (i = 0; i < map->num_stripes; i++) {
4831                num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4832                btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4833        }
4834
4835        atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
4836
4837        free_extent_map(em);
4838        check_raid56_incompat_flag(info, type);
4839
4840        kfree(devices_info);
4841        return 0;
4842
4843error_del_extent:
4844        write_lock(&em_tree->lock);
4845        remove_extent_mapping(em_tree, em);
4846        write_unlock(&em_tree->lock);
4847
4848        /* One for our allocation */
4849        free_extent_map(em);
4850        /* One for the tree reference */
4851        free_extent_map(em);
4852        /* One for the pending_chunks list reference */
4853        free_extent_map(em);
4854error:
4855        kfree(devices_info);
4856        return ret;
4857}
4858
4859int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4860                                struct btrfs_fs_info *fs_info,
4861                                u64 chunk_offset, u64 chunk_size)
4862{
4863        struct btrfs_root *extent_root = fs_info->extent_root;
4864        struct btrfs_root *chunk_root = fs_info->chunk_root;
4865        struct btrfs_key key;
4866        struct btrfs_device *device;
4867        struct btrfs_chunk *chunk;
4868        struct btrfs_stripe *stripe;
4869        struct extent_map *em;
4870        struct map_lookup *map;
4871        size_t item_size;
4872        u64 dev_offset;
4873        u64 stripe_size;
4874        int i = 0;
4875        int ret = 0;
4876
4877        em = get_chunk_map(fs_info, chunk_offset, chunk_size);
4878        if (IS_ERR(em))
4879                return PTR_ERR(em);
4880
4881        map = em->map_lookup;
4882        item_size = btrfs_chunk_item_size(map->num_stripes);
4883        stripe_size = em->orig_block_len;
4884
4885        chunk = kzalloc(item_size, GFP_NOFS);
4886        if (!chunk) {
4887                ret = -ENOMEM;
4888                goto out;
4889        }
4890
4891        /*
4892         * Take the device list mutex to prevent races with the final phase of
4893         * a device replace operation that replaces the device object associated
4894         * with the map's stripes, because the device object's id can change
4895         * at any time during that final phase of the device replace operation
4896         * (dev-replace.c:btrfs_dev_replace_finishing()).
4897         */
4898        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4899        for (i = 0; i < map->num_stripes; i++) {
4900                device = map->stripes[i].dev;
4901                dev_offset = map->stripes[i].physical;
4902
4903                ret = btrfs_update_device(trans, device);
4904                if (ret)
4905                        break;
4906                ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
4907                                             dev_offset, stripe_size);
4908                if (ret)
4909                        break;
4910        }
4911        if (ret) {
4912                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4913                goto out;
4914        }
4915
4916        stripe = &chunk->stripe;
4917        for (i = 0; i < map->num_stripes; i++) {
4918                device = map->stripes[i].dev;
4919                dev_offset = map->stripes[i].physical;
4920
4921                btrfs_set_stack_stripe_devid(stripe, device->devid);
4922                btrfs_set_stack_stripe_offset(stripe, dev_offset);
4923                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4924                stripe++;
4925        }
4926        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4927
4928        btrfs_set_stack_chunk_length(chunk, chunk_size);
4929        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4930        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4931        btrfs_set_stack_chunk_type(chunk, map->type);
4932        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4933        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4934        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4935        btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
4936        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4937
4938        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4939        key.type = BTRFS_CHUNK_ITEM_KEY;
4940        key.offset = chunk_offset;
4941
4942        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4943        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4944                /*
4945                 * TODO: Cleanup of inserted chunk root in case of
4946                 * failure.
4947                 */
4948                ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
4949        }
4950
4951out:
4952        kfree(chunk);
4953        free_extent_map(em);
4954        return ret;
4955}
4956
4957/*
4958 * Chunk allocation falls into two parts. The first part does the work
4959 * that makes the newly allocated chunk usable, but does not do any
4960 * operation that modifies the chunk tree. The second part does the work
4961 * that requires modifying the chunk tree. This division is important for
4962 * the bootstrap process of adding storage to a seed btrfs.
4963 */
4964int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4965                      struct btrfs_fs_info *fs_info, u64 type)
4966{
4967        u64 chunk_offset;
4968
4969        ASSERT(mutex_is_locked(&fs_info->chunk_mutex));
4970        chunk_offset = find_next_chunk(fs_info);
4971        return __btrfs_alloc_chunk(trans, chunk_offset, type);
4972}
4973
4974static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4975                                         struct btrfs_fs_info *fs_info)
4976{
4977        u64 chunk_offset;
4978        u64 sys_chunk_offset;
4979        u64 alloc_profile;
4980        int ret;
4981
4982        chunk_offset = find_next_chunk(fs_info);
4983        alloc_profile = btrfs_metadata_alloc_profile(fs_info);
4984        ret = __btrfs_alloc_chunk(trans, chunk_offset, alloc_profile);
4985        if (ret)
4986                return ret;
4987
4988        sys_chunk_offset = find_next_chunk(fs_info);
4989        alloc_profile = btrfs_system_alloc_profile(fs_info);
4990        ret = __btrfs_alloc_chunk(trans, sys_chunk_offset, alloc_profile);
4991        return ret;
4992}
4993
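    /*
     * Maximum number of failed stripes a chunk of this type can tolerate:
     * 1 for RAID1/RAID10/RAID5/DUP, 2 for RAID6, 0 for everything else.
     */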
4994static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4995{
4996        int max_errors;
4997
4998        if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4999                         BTRFS_BLOCK_GROUP_RAID10 |
5000                         BTRFS_BLOCK_GROUP_RAID5 |
5001                         BTRFS_BLOCK_GROUP_DUP)) {
5002                max_errors = 1;
5003        } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
5004                max_errors = 2;
5005        } else {
5006                max_errors = 0;
5007        }
5008
5009        return max_errors;
5010}
5011
5012int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5013{
5014        struct extent_map *em;
5015        struct map_lookup *map;
5016        int readonly = 0;
5017        int miss_ndevs = 0;
5018        int i;
5019
5020        em = get_chunk_map(fs_info, chunk_offset, 1);
5021        if (IS_ERR(em))
5022                return 1;
5023
5024        map = em->map_lookup;
5025        for (i = 0; i < map->num_stripes; i++) {
5026                if (map->stripes[i].dev->missing) {
5027                        miss_ndevs++;
5028                        continue;
5029                }
5030
5031                if (!map->stripes[i].dev->writeable) {
5032                        readonly = 1;
5033                        goto end;
5034                }
5035        }
5036
5037        /*
5038         * If the number of missing devices is larger than max errors,
5039         * we cannot write the data into that chunk successfully, so
5040         * set it readonly.
5041         */
5042        if (miss_ndevs > btrfs_chunk_max_errors(map))
5043                readonly = 1;
5044end:
5045        free_extent_map(em);
5046        return readonly;
5047}
5048
5049void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
5050{
5051        extent_map_tree_init(&tree->map_tree);
5052}
5053
5054void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
5055{
5056        struct extent_map *em;
5057
5058        while (1) {
5059                write_lock(&tree->map_tree.lock);
5060                em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
5061                if (em)
5062                        remove_extent_mapping(&tree->map_tree, em);
5063                write_unlock(&tree->map_tree.lock);
5064                if (!em)
5065                        break;
5066                /* once for us */
5067                free_extent_map(em);
5068                /* once for the tree */
5069                free_extent_map(em);
5070        }
5071}
5072
5073int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5074{
5075        struct extent_map *em;
5076        struct map_lookup *map;
5077        int ret;
5078
5079        em = get_chunk_map(fs_info, logical, len);
5080        if (IS_ERR(em))
5081                /*
5082                 * We could return errors for these cases, but that could get
5083                 * ugly, and we'd probably end up doing the same thing anyway:
5084                 * nothing else but exit. So return 1 so the callers don't try
5085                 * to use other copies.
5086                 */
5087                return 1;
5088
5089        map = em->map_lookup;
5090        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
5091                ret = map->num_stripes;
5092        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5093                ret = map->sub_stripes;
5094        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5095                ret = 2;
5096        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5097                ret = 3;
5098        else
5099                ret = 1;
5100        free_extent_map(em);
5101
5102        btrfs_dev_replace_lock(&fs_info->dev_replace, 0);
5103        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5104            fs_info->dev_replace.tgtdev)
5105                ret++;
5106        btrfs_dev_replace_unlock(&fs_info->dev_replace, 0);
5107
5108        return ret;
5109}
5110
5111unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5112                                    u64 logical)
5113{
5114        struct extent_map *em;
5115        struct map_lookup *map;
5116        unsigned long len = fs_info->sectorsize;
5117
5118        em = get_chunk_map(fs_info, logical, len);
5119
5120        if (!WARN_ON(IS_ERR(em))) {
5121                map = em->map_lookup;
5122                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5123                        len = map->stripe_len * nr_data_stripes(map);
5124                free_extent_map(em);
5125        }
5126        return len;
5127}
5128
5129int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5130{
5131        struct extent_map *em;
5132        struct map_lookup *map;
5133        int ret = 0;
5134
5135        em = get_chunk_map(fs_info, logical, len);
5136
5137        if (!WARN_ON(IS_ERR(em))) {
5138                map = em->map_lookup;
5139                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5140                        ret = 1;
5141                free_extent_map(em);
5142        }
5143        return ret;
5144}
5145
5146static int find_live_mirror(struct btrfs_fs_info *fs_info,
5147                            struct map_lookup *map, int first, int num,
5148                            int optimal, int dev_replace_is_ongoing)
5149{
5150        int i;
5151        int tolerance;
5152        struct btrfs_device *srcdev;
5153
5154        if (dev_replace_is_ongoing &&
5155            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5156             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5157                srcdev = fs_info->dev_replace.srcdev;
5158        else
5159                srcdev = NULL;
5160
5161        /*
5162         * try to avoid the drive that is the source drive for a
5163         * dev-replace procedure, only choose it if no other non-missing
5164         * mirror is available
5165         */
5166        for (tolerance = 0; tolerance < 2; tolerance++) {
5167                if (map->stripes[optimal].dev->bdev &&
5168                    (tolerance || map->stripes[optimal].dev != srcdev))
5169                        return optimal;
5170                for (i = first; i < first + num; i++) {
5171                        if (map->stripes[i].dev->bdev &&
5172                            (tolerance || map->stripes[i].dev != srcdev))
5173                                return i;
5174                }
5175        }
5176
5177        /* We couldn't find one that doesn't fail. Just return something
5178         * and the I/O error handling code will clean up eventually.
5179         */
5180        return optimal;
5181}
5182
5183static inline int parity_smaller(u64 a, u64 b)
5184{
5185        return a > b;
5186}
5187
5188/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
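    /*
     * This works because data stripes carry their logical address in
     * raid_map while parity stripes use the sentinels BTRFS_RAID5_P_STRIPE
     * ((u64)-2) and BTRFS_RAID6_Q_STRIPE ((u64)-1), which compare greater
     * than any logical address.
     */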
5189static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5190{
5191        struct btrfs_bio_stripe s;
5192        int i;
5193        u64 l;
5194        int again = 1;
5195
5196        while (again) {
5197                again = 0;
5198                for (i = 0; i < num_stripes - 1; i++) {
5199                        if (parity_smaller(bbio->raid_map[i],
5200                                           bbio->raid_map[i+1])) {
5201                                s = bbio->stripes[i];
5202                                l = bbio->raid_map[i];
5203                                bbio->stripes[i] = bbio->stripes[i+1];
5204                                bbio->raid_map[i] = bbio->raid_map[i+1];
5205                                bbio->stripes[i+1] = s;
5206                                bbio->raid_map[i+1] = l;
5207
5208                                again = 1;
5209                        }
5210                }
5211        }
5212}
5213
5214static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5215{
5216        struct btrfs_bio *bbio = kzalloc(
5217                 /* the size of the btrfs_bio */
5218                sizeof(struct btrfs_bio) +
5219                /* plus the variable array for the stripes */
5220                sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5221                /* plus the variable array for the tgt dev */
5222                sizeof(int) * (real_stripes) +
5223                /*
5224                 * plus the raid_map, which includes both the tgt dev
5225                 * and the stripes
5226                 */
5227                sizeof(u64) * (total_stripes),
5228                GFP_NOFS|__GFP_NOFAIL);
5229
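            /* __GFP_NOFAIL: this allocation cannot fail, hence no NULL check */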
5230        atomic_set(&bbio->error, 0);
5231        refcount_set(&bbio->refs, 1);
5232
5233        return bbio;
5234}
5235
5236void btrfs_get_bbio(struct btrfs_bio *bbio)
5237{
5238        WARN_ON(!refcount_read(&bbio->refs));
5239        refcount_inc(&bbio->refs);
5240}
5241
5242void btrfs_put_bbio(struct btrfs_bio *bbio)
5243{
5244        if (!bbio)
5245                return;
5246        if (refcount_dec_and_test(&bbio->refs))
5247                kfree(bbio);
5248}
5249
5250/*
5251 * Can REQ_OP_DISCARD be sent with other REQ ops like REQ_OP_WRITE?
5252 * Note that a discard won't be sent to the target device of a device
5253 * replace operation.
5254 */
5255static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5256                                         u64 logical, u64 length,
5257                                         struct btrfs_bio **bbio_ret)
5258{
5259        struct extent_map *em;
5260        struct map_lookup *map;
5261        struct btrfs_bio *bbio;
5262        u64 offset;
5263        u64 stripe_nr;
5264        u64 stripe_nr_end;
5265        u64 stripe_end_offset;
5266        u64 stripe_cnt;
5267        u64 stripe_len;
5268        u64 stripe_offset;
5269        u64 num_stripes;
5270        u32 stripe_index;
5271        u32 factor = 0;
5272        u32 sub_stripes = 0;
5273        u64 stripes_per_dev = 0;
5274        u32 remaining_stripes = 0;
5275        u32 last_stripe = 0;
5276        int ret = 0;
5277        int i;
5278
5279        /* discard always returns a bbio */
5280        ASSERT(bbio_ret);
5281
5282        em = get_chunk_map(fs_info, logical, length);
5283        if (IS_ERR(em))
5284                return PTR_ERR(em);
5285
5286        map = em->map_lookup;
5287        /* we don't discard raid56 yet */
5288        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5289                ret = -EOPNOTSUPP;
5290                goto out;
5291        }
5292
5293        offset = logical - em->start;
5294        length = min_t(u64, em->len - offset, length);
5295
5296        stripe_len = map->stripe_len;
5297        /*
5298         * stripe_nr counts the total number of stripes we have to stride
5299         * to get to this block
5300         */
5301        stripe_nr = div64_u64(offset, stripe_len);
5302
5303        /* stripe_offset is the offset of this block in its stripe */
5304        stripe_offset = offset - stripe_nr * stripe_len;
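            /*
             * e.g. (illustrative): stripe_len = 64KiB and offset = 200KiB
             * give stripe_nr = 3 and stripe_offset = 8KiB into the fourth
             * stripe.
             */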
5305
5306        stripe_nr_end = round_up(offset + length, map->stripe_len);
5307        stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5308        stripe_cnt = stripe_nr_end - stripe_nr;
5309        stripe_end_offset = stripe_nr_end * map->stripe_len -
5310                            (offset + length);
5311        /*
5312         * after this, stripe_nr is the number of stripes on this
5313         * device we have to walk to find the data, and stripe_index is
5314         * the number of our device in the stripe array
5315         */
5316        num_stripes = 1;
5317        stripe_index = 0;
5318        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5319                         BTRFS_BLOCK_GROUP_RAID10)) {
5320                if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5321                        sub_stripes = 1;
5322                else
5323                        sub_stripes = map->sub_stripes;
5324
5325                factor = map->num_stripes / sub_stripes;
5326                num_stripes = min_t(u64, map->num_stripes,
5327                                    sub_stripes * stripe_cnt);
5328                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5329                stripe_index *= sub_stripes;
5330                stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5331                                              &remaining_stripes);
5332                div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5333                last_stripe *= sub_stripes;
5334        } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
5335                                BTRFS_BLOCK_GROUP_DUP)) {
5336                num_stripes = map->num_stripes;
5337        } else {
5338                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5339                                        &stripe_index);
5340        }
5341
5342        bbio = alloc_btrfs_bio(num_stripes, 0);
5343        if (!bbio) {
5344                ret = -ENOMEM;
5345                goto out;
5346        }
5347
5348        for (i = 0; i < num_stripes; i++) {
5349                bbio->stripes[i].physical =
5350                        map->stripes[stripe_index].physical +
5351                        stripe_offset + stripe_nr * map->stripe_len;
5352                bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5353
5354                if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5355                                 BTRFS_BLOCK_GROUP_RAID10)) {
5356                        bbio->stripes[i].length = stripes_per_dev *
5357                                map->stripe_len;
5358
5359                        if (i / sub_stripes < remaining_stripes)
5360                                bbio->stripes[i].length +=
5361                                        map->stripe_len;
5362
5363                        /*
5364                         * Special for the first stripe and
5365                         * the last stripe:
5366                         *
5367                         * |-------|...|-------|
5368                         *     |----------|
5369                         *    off     end_off
5370                         */
5371                        if (i < sub_stripes)
5372                                bbio->stripes[i].length -=
5373                                        stripe_offset;
5374
5375                        if (stripe_index >= last_stripe &&
5376                            stripe_index <= (last_stripe +
5377                                             sub_stripes - 1))
5378                                bbio->stripes[i].length -=
5379                                        stripe_end_offset;
5380
5381                        if (i == sub_stripes - 1)
5382                                stripe_offset = 0;
5383                } else {
5384                        bbio->stripes[i].length = length;
5385                }
5386
5387                stripe_index++;
5388                if (stripe_index == map->num_stripes) {
5389                        stripe_index = 0;
5390                        stripe_nr++;
5391                }
5392        }
5393
5394        *bbio_ret = bbio;
5395        bbio->map_type = map->type;
5396        bbio->num_stripes = num_stripes;
5397out:
5398        free_extent_map(em);
5399        return ret;
5400}
5401
5402/*
5403 * In dev-replace case, for repair case (that's the only case where the mirror
5404 * is selected explicitly when calling btrfs_map_block), blocks left of the
5405 * left cursor can also be read from the target drive.
5406 *
5407 * For BTRFS_MAP_GET_READ_MIRRORS, the target drive is added as the last
5408 * one to the array of stripes.
5409 * For READ, it also needs to be supported using the same mirror number.
5410 *
5411 * If the requested block is not left of the left cursor, EIO is returned. This
5412 * can happen because btrfs_num_copies() returns one more in the dev-replace
5413 * case.
5414 */
5415static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5416                                         u64 logical, u64 length,
5417                                         u64 srcdev_devid, int *mirror_num,
5418                                         u64 *physical)
5419{
5420        struct btrfs_bio *bbio = NULL;
5421        int num_stripes;
5422        int index_srcdev = 0;
5423        int found = 0;
5424        u64 physical_of_found = 0;
5425        int i;
5426        int ret = 0;
5427
5428        ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5429                                logical, &length, &bbio, 0, 0);
5430        if (ret) {
5431                ASSERT(bbio == NULL);
5432                return ret;
5433        }
5434
5435        num_stripes = bbio->num_stripes;
5436        if (*mirror_num > num_stripes) {
5437                /*
5438                 * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5439                 * that means that the requested area is not left of the left
5440                 * cursor
5441                 */
5442                btrfs_put_bbio(bbio);
5443                return -EIO;
5444        }
5445
5446        /*
5447         * Process the rest of the function using the mirror_num of the source
5448         * drive. Therefore look it up first. At the end, patch the device
5449         * pointer to that of the target drive.
5450         */
5451        for (i = 0; i < num_stripes; i++) {
5452                if (bbio->stripes[i].dev->devid != srcdev_devid)
5453                        continue;
5454
5455                /*
5456                 * In case of DUP, in order to keep it simple, only add the
5457                 * mirror with the lowest physical address
5458                 */
5459                if (found &&
5460                    physical_of_found <= bbio->stripes[i].physical)
5461                        continue;
5462
5463                index_srcdev = i;
5464                found = 1;
5465                physical_of_found = bbio->stripes[i].physical;
5466        }
5467
5468        btrfs_put_bbio(bbio);
5469
5470        ASSERT(found);
5471        if (!found)
5472                return -EIO;
5473
5474        *mirror_num = index_srcdev + 1;
5475        *physical = physical_of_found;
5476        return ret;
5477}
5478
5479static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5480                                      struct btrfs_bio **bbio_ret,
5481                                      struct btrfs_dev_replace *dev_replace,
5482                                      int *num_stripes_ret, int *max_errors_ret)
5483{
5484        struct btrfs_bio *bbio = *bbio_ret;
5485        u64 srcdev_devid = dev_replace->srcdev->devid;
5486        int tgtdev_indexes = 0;
5487        int num_stripes = *num_stripes_ret;
5488        int max_errors = *max_errors_ret;
5489        int i;
5490
5491        if (op == BTRFS_MAP_WRITE) {
5492                int index_where_to_add;
5493
5494                /*
5495                 * duplicate the write operations while the dev replace
5496                 * procedure is running. Since the copying of the old disk to
5497                 * the new disk takes place at run time while the filesystem is
5498                 * mounted writable, the regular write operations to the old
5499                 * disk have to be duplicated to go to the new disk as well.
5500                 *
5501                 * Note that device->missing is handled by the caller, and that
5502                 * the write to the old disk is already set up in the stripes
5503                 * array.
5504                 */
5505                index_where_to_add = num_stripes;
5506                for (i = 0; i < num_stripes; i++) {
5507                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5508                                /* write to new disk, too */
5509                                struct btrfs_bio_stripe *new =
5510                                        bbio->stripes + index_where_to_add;
5511                                struct btrfs_bio_stripe *old =
5512                                        bbio->stripes + i;
5513
5514                                new->physical = old->physical;
5515                                new->length = old->length;
5516                                new->dev = dev_replace->tgtdev;
5517                                bbio->tgtdev_map[i] = index_where_to_add;
5518                                index_where_to_add++;
5519                                max_errors++;
5520                                tgtdev_indexes++;
5521                        }
5522                }
5523                num_stripes = index_where_to_add;
5524        } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5525                int index_srcdev = 0;
5526                int found = 0;
5527                u64 physical_of_found = 0;
5528
5529                /*
5530                 * During the dev-replace procedure, the target drive can also
5531                 * be used to read data in case it is needed to repair a corrupt
5532                 * block elsewhere. This is possible if the requested area is
5533                 * left of the left cursor. In this area, the target drive is a
5534                 * full copy of the source drive.
5535                 */
5536                for (i = 0; i < num_stripes; i++) {
5537                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5538                                /*
5539                                 * In case of DUP, in order to keep it simple,
5540                                 * only add the mirror with the lowest physical
5541                                 * address
5542                                 */
5543                                if (found &&
5544                                    physical_of_found <=
5545                                     bbio->stripes[i].physical)
5546                                        continue;
5547                                index_srcdev = i;
5548                                found = 1;
5549                                physical_of_found = bbio->stripes[i].physical;
5550                        }
5551                }
5552                if (found) {
5553                        struct btrfs_bio_stripe *tgtdev_stripe =
5554                                bbio->stripes + num_stripes;
5555
5556                        tgtdev_stripe->physical = physical_of_found;
5557                        tgtdev_stripe->length =
5558                                bbio->stripes[index_srcdev].length;
5559                        tgtdev_stripe->dev = dev_replace->tgtdev;
5560                        bbio->tgtdev_map[index_srcdev] = num_stripes;
5561
5562                        tgtdev_indexes++;
5563                        num_stripes++;
5564                }
5565        }
5566
5567        *num_stripes_ret = num_stripes;
5568        *max_errors_ret = max_errors;
5569        bbio->num_tgtdevs = tgtdev_indexes;
5570        *bbio_ret = bbio;
5571}
5572
5573static bool need_full_stripe(enum btrfs_map_op op)
5574{
5575        return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
5576}
5577
5578static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
5579                             enum btrfs_map_op op,
5580                             u64 logical, u64 *length,
5581                             struct btrfs_bio **bbio_ret,
5582                             int mirror_num, int need_raid_map)
5583{
5584        struct extent_map *em;
5585        struct map_lookup *map;
5586        u64 offset;
5587        u64 stripe_offset;
5588        u64 stripe_nr;
5589        u64 stripe_len;
5590        u32 stripe_index;
5591        int i;
5592        int ret = 0;
5593        int num_stripes;
5594        int max_errors = 0;
5595        int tgtdev_indexes = 0;
5596        struct btrfs_bio *bbio = NULL;
5597        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
5598        int dev_replace_is_ongoing = 0;
5599        int num_alloc_stripes;
5600        int patch_the_first_stripe_for_dev_replace = 0;
5601        u64 physical_to_patch_in_first_stripe = 0;
5602        u64 raid56_full_stripe_start = (u64)-1;
5603
5604        if (op == BTRFS_MAP_DISCARD)
5605                return __btrfs_map_block_for_discard(fs_info, logical,
5606                                                     *length, bbio_ret);
5607
5608        em = get_chunk_map(fs_info, logical, *length);
5609        if (IS_ERR(em))
5610                return PTR_ERR(em);
5611
5612        map = em->map_lookup;
5613        offset = logical - em->start;
5614
5615        stripe_len = map->stripe_len;
5616        stripe_nr = offset;
5617        /*
5618         * stripe_nr counts the total number of stripes we have to stride
5619         * to get to this block
5620         */
5621        stripe_nr = div64_u64(stripe_nr, stripe_len);
5622
5623        stripe_offset = stripe_nr * stripe_len;
5624        if (offset < stripe_offset) {
5625                btrfs_crit(fs_info,
5626                           "stripe math has gone wrong, stripe_offset=%llu, offset=%llu, start=%llu, logical=%llu, stripe_len=%llu",
5627                           stripe_offset, offset, em->start, logical,
5628                           stripe_len);
5629                free_extent_map(em);
5630                return -EINVAL;
5631        }
5632
5633        /* stripe_offset is the offset of this block in its stripe */
5634        stripe_offset = offset - stripe_offset;
5635
5636        /* if we're here for raid56, we need to know the stripe aligned start */
5637        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5638                unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5639                raid56_full_stripe_start = offset;
5640
5641                /* allow a write of a full stripe, but make sure we don't
5642                 * allow straddling of stripes
5643                 */
5644                raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
5645                                full_stripe_len);
5646                raid56_full_stripe_start *= full_stripe_len;
5647        }
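            /*
             * e.g. (illustrative): 4 data stripes * 64KiB stripe_len means
             * 256KiB full stripes, so offset = 300KiB rounds down to a
             * full-stripe start of 256KiB.
             */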
5648
5649        if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5650                u64 max_len;
5651                /* For writes to RAID[56], allow a full stripeset across all
5652                 * disks. For other RAID types and for RAID[56] reads, just
5653                 * allow a single stripe (on a single disk). */
5654                if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5655                    (op == BTRFS_MAP_WRITE)) {
5656                        max_len = stripe_len * nr_data_stripes(map) -
5657                                (offset - raid56_full_stripe_start);
5658                } else {
5659                        /* we limit the length of each bio to what fits in a stripe */
5660                        max_len = stripe_len - stripe_offset;
5661                }
5662                *length = min_t(u64, em->len - offset, max_len);
5663        } else {
5664                *length = em->len - offset;
5665        }
5666
5667        /* This is for when we're called from btrfs_merge_bio_hook() and
5668         * all it cares about is the length. */
5669        if (!bbio_ret)
5670                goto out;
5671
5672        btrfs_dev_replace_lock(dev_replace, 0);
5673        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5674        if (!dev_replace_is_ongoing)
5675                btrfs_dev_replace_unlock(dev_replace, 0);
5676        else
5677                btrfs_dev_replace_set_lock_blocking(dev_replace);
5678
5679        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5680            !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
5681                ret = get_extra_mirror_from_replace(fs_info, logical, *length,
5682                                                    dev_replace->srcdev->devid,
5683                                                    &mirror_num,
5684                                            &physical_to_patch_in_first_stripe);
5685                if (ret)
5686                        goto out;
5687                else
5688                        patch_the_first_stripe_for_dev_replace = 1;
5689        } else if (mirror_num > map->num_stripes) {
5690                mirror_num = 0;
5691        }
5692
5693        num_stripes = 1;
5694        stripe_index = 0;
5695        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5696                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5697                                &stripe_index);
5698                if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS)
5699                        mirror_num = 1;
5700        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5701                if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
5702                        num_stripes = map->num_stripes;
5703                else if (mirror_num)
5704                        stripe_index = mirror_num - 1;
5705                else {
5706                        stripe_index = find_live_mirror(fs_info, map, 0,
5707                                            map->num_stripes,
5708                                            current->pid % map->num_stripes,
5709                                            dev_replace_is_ongoing);
5710                        mirror_num = stripe_index + 1;
5711                }
5712
5713        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5714                if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) {
5715                        num_stripes = map->num_stripes;
5716                } else if (mirror_num) {
5717                        stripe_index = mirror_num - 1;
5718                } else {
5719                        mirror_num = 1;
5720                }
5721
5722        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5723                u32 factor = map->num_stripes / map->sub_stripes;
5724
5725                stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5726                stripe_index *= map->sub_stripes;
5727
5728                if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS)
5729                        num_stripes = map->sub_stripes;
5730                else if (mirror_num)
5731                        stripe_index += mirror_num - 1;
5732                else {
5733                        int old_stripe_index = stripe_index;
5734                        stripe_index = find_live_mirror(fs_info, map,
5735                                              stripe_index,
5736                                              map->sub_stripes, stripe_index +
5737                                              current->pid % map->sub_stripes,
5738                                              dev_replace_is_ongoing);
5739                        mirror_num = stripe_index - old_stripe_index + 1;
5740                }
5741
5742        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5743                if (need_raid_map &&
5744                    (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS ||
5745                     mirror_num > 1)) {
5746                        /* push stripe_nr back to the start of the full stripe */
5747                        stripe_nr = div64_u64(raid56_full_stripe_start,
5748                                        stripe_len * nr_data_stripes(map));
5749
5750                        /* RAID[56] write or recovery. Return all stripes */
5751                        num_stripes = map->num_stripes;
5752                        max_errors = nr_parity_stripes(map);
5753
5754                        *length = map->stripe_len;
5755                        stripe_index = 0;
5756                        stripe_offset = 0;
5757                } else {
5758                        /*
5759                         * Mirror #0 or #1 means the original data block.
5760                         * Mirror #2 is RAID5 parity block.
5761                         * Mirror #3 is RAID6 Q block.
5762                         */
5763                        stripe_nr = div_u64_rem(stripe_nr,
5764                                        nr_data_stripes(map), &stripe_index);
5765                        if (mirror_num > 1)
5766                                stripe_index = nr_data_stripes(map) +
5767                                                mirror_num - 2;
5768
5769                        /* We distribute the parity blocks across stripes */
5770                        div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
5771                                        &stripe_index);
5772                        if ((op != BTRFS_MAP_WRITE &&
5773                             op != BTRFS_MAP_GET_READ_MIRRORS) &&
5774                            mirror_num <= 1)
5775                                mirror_num = 1;
5776                }
5777        } else {
5778                /*
5779                 * after this, stripe_nr is the number of stripes on this
5780                 * device we have to walk to find the data, and stripe_index is
5781                 * the number of our device in the stripe array
5782                 */
5783                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5784                                &stripe_index);
5785                mirror_num = stripe_index + 1;
5786        }
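        /*
         * Worked example (illustrative, values hypothetical): a RAID10
         * chunk with num_stripes = 4 and sub_stripes = 2 has factor = 2.
         * An offset that yields stripe_nr = 5 gives div_u64_rem(5, 2)
         * -> stripe_nr = 2, stripe_index = 1, then stripe_index *= 2 -> 2,
         * i.e. the I/O targets the second mirror pair (stripes 2 and 3),
         * two stripes down on each device.
         */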
5787        if (stripe_index >= map->num_stripes) {
5788                btrfs_crit(fs_info,
5789                           "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
5790                           stripe_index, map->num_stripes);
5791                ret = -EINVAL;
5792                goto out;
5793        }
5794
5795        num_alloc_stripes = num_stripes;
5796        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
5797                if (op == BTRFS_MAP_WRITE)
5798                        num_alloc_stripes <<= 1;
5799                if (op == BTRFS_MAP_GET_READ_MIRRORS)
5800                        num_alloc_stripes++;
5801                tgtdev_indexes = num_stripes;
5802        }
5803
5804        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5805        if (!bbio) {
5806                ret = -ENOMEM;
5807                goto out;
5808        }
5809        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL)
5810                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5811
5812        /* build raid_map */
5813        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
5814            (need_full_stripe(op) || mirror_num > 1)) {
5815                u64 tmp;
5816                unsigned rot;
5817
5818                bbio->raid_map = (u64 *)((void *)bbio->stripes +
5819                                 sizeof(struct btrfs_bio_stripe) *
5820                                 num_alloc_stripes +
5821                                 sizeof(int) * tgtdev_indexes);
5822
5823                /* Work out the disk rotation on this stripe-set */
5824                div_u64_rem(stripe_nr, num_stripes, &rot);
5825
5826                /* Fill in the logical address of each stripe */
5827                tmp = stripe_nr * nr_data_stripes(map);
5828                for (i = 0; i < nr_data_stripes(map); i++)
5829                        bbio->raid_map[(i+rot) % num_stripes] =
5830                                em->start + (tmp + i) * map->stripe_len;
5831
5832                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5833                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5834                        bbio->raid_map[(i+rot+1) % num_stripes] =
5835                                RAID6_Q_STRIPE;
5836        }
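        /*
         * Worked example (illustrative, values hypothetical): RAID5 over
         * three devices gives nr_data_stripes = 2 and num_stripes = 3.
         * For the second full stripe (stripe_nr = 1), div_u64_rem() yields
         * rot = 1 and tmp = 1 * 2 = 2, so the loop places the logical
         * addresses of data stripes 2 and 3 in raid_map slots 1 and 2, and
         * the P marker lands in slot (2 + 1) % 3 = 0: parity occupies a
         * different device on each successive full stripe.
         */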
5837
5838
5839        for (i = 0; i < num_stripes; i++) {
5840                bbio->stripes[i].physical =
5841                        map->stripes[stripe_index].physical +
5842                        stripe_offset +
5843                        stripe_nr * map->stripe_len;
5844                bbio->stripes[i].dev =
5845                        map->stripes[stripe_index].dev;
5846                stripe_index++;
5847        }
5848
5849        if (need_full_stripe(op))
5850                max_errors = btrfs_chunk_max_errors(map);
5851
5852        if (bbio->raid_map)
5853                sort_parity_stripes(bbio, num_stripes);
5854
5855        if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
5856            need_full_stripe(op)) {
5857                handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
5858                                          &max_errors);
5859        }
5860
5861        *bbio_ret = bbio;
5862        bbio->map_type = map->type;
5863        bbio->num_stripes = num_stripes;
5864        bbio->max_errors = max_errors;
5865        bbio->mirror_num = mirror_num;
5866
5867        /*
5868         * This is the case of a read while dev_replace_is_ongoing &&
5869         * mirror_num == map->num_stripes + 1 && the dev_replace target
5870         * drive is available as a mirror.
5871         */
5872        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5873                WARN_ON(num_stripes > 1);
5874                bbio->stripes[0].dev = dev_replace->tgtdev;
5875                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5876                bbio->mirror_num = map->num_stripes + 1;
5877        }
5878out:
5879        if (dev_replace_is_ongoing) {
5880                btrfs_dev_replace_clear_lock_blocking(dev_replace);
5881                btrfs_dev_replace_unlock(dev_replace, 0);
5882        }
5883        free_extent_map(em);
5884        return ret;
5885}
5886
5887int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5888                      u64 logical, u64 *length,
5889                      struct btrfs_bio **bbio_ret, int mirror_num)
5890{
5891        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
5892                                 mirror_num, 0);
5893}
5894
5895/* For Scrub/replace */
5896int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
5897                     u64 logical, u64 *length,
5898                     struct btrfs_bio **bbio_ret)
5899{
5900        return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
5901}
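/*
 * Usage sketch (illustrative only, not from the original source): a
 * read-side caller typically maps a logical range and consumes the
 * returned btrfs_bio, e.g.:
 *
 *	u64 length = fs_info->sectorsize;
 *	struct btrfs_bio *bbio = NULL;
 *	int ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
 *				  &length, &bbio, 0);
 *	if (!ret) {
 *		... submit to bbio->stripes[0].dev at stripes[0].physical ...
 *		btrfs_put_bbio(bbio);
 *	}
 */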
5902
5903int btrfs_rmap_block(struct btrfs_fs_info *fs_info,
5904                     u64 chunk_start, u64 physical, u64 devid,
5905                     u64 **logical, int *naddrs, int *stripe_len)
5906{
5907        struct extent_map *em;
5908        struct map_lookup *map;
5909        u64 *buf;
5910        u64 bytenr;
5911        u64 length;
5912        u64 stripe_nr;
5913        u64 rmap_len;
5914        int i, j, nr = 0;
5915
5916        em = get_chunk_map(fs_info, chunk_start, 1);
5917        if (IS_ERR(em))
5918                return -EIO;
5919
5920        map = em->map_lookup;
5921        length = em->len;
5922        rmap_len = map->stripe_len;
5923
5924        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5925                length = div_u64(length, map->num_stripes / map->sub_stripes);
5926        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5927                length = div_u64(length, map->num_stripes);
5928        else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5929                length = div_u64(length, nr_data_stripes(map));
5930                rmap_len = map->stripe_len * nr_data_stripes(map);
5931        }
5932
5933        buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
5934        BUG_ON(!buf); /* -ENOMEM */
5935
5936        for (i = 0; i < map->num_stripes; i++) {
5937                if (devid && map->stripes[i].dev->devid != devid)
5938                        continue;
5939                if (map->stripes[i].physical > physical ||
5940                    map->stripes[i].physical + length <= physical)
5941                        continue;
5942
5943                stripe_nr = physical - map->stripes[i].physical;
5944                stripe_nr = div64_u64(stripe_nr, map->stripe_len);
5945
5946                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5947                        stripe_nr = stripe_nr * map->num_stripes + i;
5948                        stripe_nr = div_u64(stripe_nr, map->sub_stripes);
5949                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5950                        stripe_nr = stripe_nr * map->num_stripes + i;
5951                } /* else if RAID[56], multiply by nr_data_stripes().
5952                   * Alternatively, just use rmap_len below instead of
5953                   * map->stripe_len */
5954
5955                bytenr = chunk_start + stripe_nr * rmap_len;
5956                WARN_ON(nr >= map->num_stripes);
5957                for (j = 0; j < nr; j++) {
5958                        if (buf[j] == bytenr)
5959                                break;
5960                }
5961                if (j == nr) {
5962                        WARN_ON(nr >= map->num_stripes);
5963                        buf[nr++] = bytenr;
5964                }
5965        }
5966
5967        *logical = buf;
5968        *naddrs = nr;
5969        *stripe_len = rmap_len;
5970
5971        free_extent_map(em);
5972        return 0;
5973}
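/*
 * Worked example (illustrative): for a two-device RAID0 chunk with 64K
 * stripe_len, a physical address 128K into stripe i = 1 gives
 * stripe_nr = 128K / 64K = 2, then 2 * num_stripes + 1 = 5, so the
 * reported logical address is chunk_start + 5 * 64K.
 */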
5974
5975static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
5976{
5977        bio->bi_private = bbio->private;
5978        bio->bi_end_io = bbio->end_io;
5979        bio_endio(bio);
5980
5981        btrfs_put_bbio(bbio);
5982}
5983
5984static void btrfs_end_bio(struct bio *bio)
5985{
5986        struct btrfs_bio *bbio = bio->bi_private;
5987        int is_orig_bio = 0;
5988
5989        if (bio->bi_status) {
5990                atomic_inc(&bbio->error);
5991                if (bio->bi_status == BLK_STS_IOERR ||
5992                    bio->bi_status == BLK_STS_TARGET) {
5993                        unsigned int stripe_index =
5994                                btrfs_io_bio(bio)->stripe_index;
5995                        struct btrfs_device *dev;
5996
5997                        BUG_ON(stripe_index >= bbio->num_stripes);
5998                        dev = bbio->stripes[stripe_index].dev;
5999                        if (dev->bdev) {
6000                                if (bio_op(bio) == REQ_OP_WRITE)
6001                                        btrfs_dev_stat_inc(dev,
6002                                                BTRFS_DEV_STAT_WRITE_ERRS);
6003                                else
6004                                        btrfs_dev_stat_inc(dev,
6005                                                BTRFS_DEV_STAT_READ_ERRS);
6006                                if (bio->bi_opf & REQ_PREFLUSH)
6007                                        btrfs_dev_stat_inc(dev,
6008                                                BTRFS_DEV_STAT_FLUSH_ERRS);
6009                                btrfs_dev_stat_print_on_error(dev);
6010                        }
6011                }
6012        }
6013
6014        if (bio == bbio->orig_bio)
6015                is_orig_bio = 1;
6016
6017        btrfs_bio_counter_dec(bbio->fs_info);
6018
6019        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6020                if (!is_orig_bio) {
6021                        bio_put(bio);
6022                        bio = bbio->orig_bio;
6023                }
6024
6025                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6026                /* only send an error to the higher layers if it is
6027                 * beyond the tolerance of the btrfs bio
6028                 */
6029                if (atomic_read(&bbio->error) > bbio->max_errors) {
6030                        bio->bi_status = BLK_STS_IOERR;
6031                } else {
6032                        /*
6033                         * this bio is actually up to date, we didn't
6034                         * go over the max number of errors
6035                         */
6036                        bio->bi_status = 0;
6037                }
6038
6039                btrfs_end_bbio(bbio, bio);
6040        } else if (!is_orig_bio) {
6041                bio_put(bio);
6042        }
6043}
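/*
 * Example of the tolerance check above (illustrative): a RAID1 write has
 * two stripe bios and btrfs_chunk_max_errors() gives max_errors = 1, so
 * a single failed stripe leaves error == 1, which is not > max_errors,
 * and the original bio completes with bi_status = 0; only a second
 * failure propagates BLK_STS_IOERR to the upper layers.
 */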
6044
6045/*
6046 * see run_scheduled_bios for a description of why bios are collected for
6047 * async submit.
6048 *
6049 * This will add one bio to the pending list for a device and make sure
6050 * the work struct is scheduled.
6051 */
6052static noinline void btrfs_schedule_bio(struct btrfs_device *device,
6053                                        struct bio *bio)
6054{
6055        struct btrfs_fs_info *fs_info = device->fs_info;
6056        int should_queue = 1;
6057        struct btrfs_pending_bios *pending_bios;
6058
6059        if (device->missing || !device->bdev) {
6060                bio_io_error(bio);
6061                return;
6062        }
6063
6064        /* don't bother with additional async steps for reads, right now */
6065        if (bio_op(bio) == REQ_OP_READ) {
6066                bio_get(bio);
6067                btrfsic_submit_bio(bio);
6068                bio_put(bio);
6069                return;
6070        }
6071
6072        /*
6073         * nr_async_bios allows us to reliably return congestion to the
6074         * higher layers.  Otherwise, the async bio makes it appear we have
6075         * made progress against dirty pages when we've really just put it
6076         * on a queue for later
6077         */
6078        atomic_inc(&fs_info->nr_async_bios);
6079        WARN_ON(bio->bi_next);
6080        bio->bi_next = NULL;
6081
6082        spin_lock(&device->io_lock);
6083        if (op_is_sync(bio->bi_opf))
6084                pending_bios = &device->pending_sync_bios;
6085        else
6086                pending_bios = &device->pending_bios;
6087
6088        if (pending_bios->tail)
6089                pending_bios->tail->bi_next = bio;
6090
6091        pending_bios->tail = bio;
6092        if (!pending_bios->head)
6093                pending_bios->head = bio;
6094        if (device->running_pending)
6095                should_queue = 0;
6096
6097        spin_unlock(&device->io_lock);
6098
6099        if (should_queue)
6100                btrfs_queue_work(fs_info->submit_workers, &device->work);
6101}
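/*
 * Illustrative note: op_is_sync() routes REQ_SYNC writes to
 * device->pending_sync_bios; the queued work item (pending_bios_fn)
 * later runs run_scheduled_bios(), which drains the sync list
 * preferentially over the regular pending_bios list.
 */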
6102
6103static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6104                              u64 physical, int dev_nr, int async)
6105{
6106        struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
6107        struct btrfs_fs_info *fs_info = bbio->fs_info;
6108
6109        bio->bi_private = bbio;
6110        btrfs_io_bio(bio)->stripe_index = dev_nr;
6111        bio->bi_end_io = btrfs_end_bio;
6112        bio->bi_iter.bi_sector = physical >> 9;
6113#ifdef DEBUG
6114        {
6115                struct rcu_string *name;
6116
6117                rcu_read_lock();
6118                name = rcu_dereference(dev->name);
6119                btrfs_debug(fs_info,
6120                        "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6121                        bio_op(bio), bio->bi_opf,
6122                        (u64)bio->bi_iter.bi_sector,
6123                        (u_long)dev->bdev->bd_dev, name->str, dev->devid,
6124                        bio->bi_iter.bi_size);
6125                rcu_read_unlock();
6126        }
6127#endif
6128        bio_set_dev(bio, dev->bdev);
6129
6130        btrfs_bio_counter_inc_noblocked(fs_info);
6131
6132        if (async)
6133                btrfs_schedule_bio(dev, bio);
6134        else
6135                btrfsic_submit_bio(bio);
6136}
6137
6138static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6139{
6140        atomic_inc(&bbio->error);
6141        if (atomic_dec_and_test(&bbio->stripes_pending)) {
6142                /* Should be the original bio. */
6143                WARN_ON(bio != bbio->orig_bio);
6144
6145                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6146                bio->bi_iter.bi_sector = logical >> 9;
6147                bio->bi_status = BLK_STS_IOERR;
6148                btrfs_end_bbio(bbio, bio);
6149        }
6150}
6151
6152blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6153                           int mirror_num, int async_submit)
6154{
6155        struct btrfs_device *dev;
6156        struct bio *first_bio = bio;
6157        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6158        u64 length = 0;
6159        u64 map_length;
6160        int ret;
6161        int dev_nr;
6162        int total_devs;
6163        struct btrfs_bio *bbio = NULL;
6164
6165        length = bio->bi_iter.bi_size;
6166        map_length = length;
6167
6168        btrfs_bio_counter_inc_blocked(fs_info);
6169        ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6170                                &map_length, &bbio, mirror_num, 1);
6171        if (ret) {
6172                btrfs_bio_counter_dec(fs_info);
6173                return errno_to_blk_status(ret);
6174        }
6175
6176        total_devs = bbio->num_stripes;
6177        bbio->orig_bio = first_bio;
6178        bbio->private = first_bio->bi_private;
6179        bbio->end_io = first_bio->bi_end_io;
6180        bbio->fs_info = fs_info;
6181        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6182
6183        if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6184            ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6185                /* In this case, map_length has been set to the length of
6186                 * a single stripe, not the whole write */
6187                if (bio_op(bio) == REQ_OP_WRITE) {
6188                        ret = raid56_parity_write(fs_info, bio, bbio,
6189                                                  map_length);
6190                } else {
6191                        ret = raid56_parity_recover(fs_info, bio, bbio,
6192                                                    map_length, mirror_num, 1);
6193                }
6194
6195                btrfs_bio_counter_dec(fs_info);
6196                return errno_to_blk_status(ret);
6197        }
6198
6199        if (map_length < length) {
6200                btrfs_crit(fs_info,
6201                           "mapping failed logical %llu bio len %llu len %llu",
6202                           logical, length, map_length);
6203                BUG();
6204        }
6205
6206        for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6207                dev = bbio->stripes[dev_nr].dev;
6208                if (!dev || !dev->bdev ||
6209                    (bio_op(first_bio) == REQ_OP_WRITE && !dev->writeable)) {
6210                        bbio_error(bbio, first_bio, logical);
6211                        continue;
6212                }
6213
6214                if (dev_nr < total_devs - 1)
6215                        bio = btrfs_bio_clone(first_bio);
6216                else
6217                        bio = first_bio;
6218
6219                submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical,
6220                                  dev_nr, async_submit);
6221        }
6222        btrfs_bio_counter_dec(fs_info);
6223        return BLK_STS_OK;
6224}
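/*
 * Worked example (illustrative): a write to a RAID1 chunk maps to
 * total_devs = 2.  The loop clones first_bio for dev_nr = 0 and submits
 * the original bio itself for dev_nr = 1; btrfs_end_bio() then completes
 * the upper bio once both stripes have finished, honoring max_errors.
 */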
6225
6226struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
6227                                       u8 *uuid, u8 *fsid)
6228{
6229        struct btrfs_device *device;
6230        struct btrfs_fs_devices *cur_devices;
6231
6232        cur_devices = fs_info->fs_devices;
6233        while (cur_devices) {
6234                if (!fsid ||
6235                    !memcmp(cur_devices->fsid, fsid, BTRFS_FSID_SIZE)) {
6236                        device = find_device(cur_devices, devid, uuid);
6237                        if (device)
6238                                return device;
6239                }
6240                cur_devices = cur_devices->seed;
6241        }
6242        return NULL;
6243}
6244
6245static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6246                                            u64 devid, u8 *dev_uuid)
6247{
6248        struct btrfs_device *device;
6249
6250        device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6251        if (IS_ERR(device))
6252                return NULL;
6253
6254        list_add(&device->dev_list, &fs_devices->devices);
6255        device->fs_devices = fs_devices;
6256        fs_devices->num_devices++;
6257
6258        device->missing = 1;
6259        fs_devices->missing_devices++;
6260
6261        return device;
6262}
6263
6264/**
6265 * btrfs_alloc_device - allocate struct btrfs_device
6266 * @fs_info:    used only for generating a new devid, can be NULL if
6267 *              devid is provided (i.e. @devid != NULL).
6268 * @devid:      a pointer to devid for this device.  If NULL a new devid
6269 *              is generated.
6270 * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6271 *              is generated.
6272 *
6273 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6274 * on error.  Returned struct is not linked onto any lists and can be
6275 * destroyed with kfree() right away.
6276 */
6277struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6278                                        const u64 *devid,
6279                                        const u8 *uuid)
6280{
6281        struct btrfs_device *dev;
6282        u64 tmp;
6283
6284        if (WARN_ON(!devid && !fs_info))
6285                return ERR_PTR(-EINVAL);
6286
6287        dev = __alloc_device();
6288        if (IS_ERR(dev))
6289                return dev;
6290
6291        if (devid)
6292                tmp = *devid;
6293        else {
6294                int ret;
6295
6296                ret = find_next_devid(fs_info, &tmp);
6297                if (ret) {
6298                        kfree(dev);
6299                        return ERR_PTR(ret);
6300                }
6301        }
6302        dev->devid = tmp;
6303
6304        if (uuid)
6305                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6306        else
6307                generate_random_uuid(dev->uuid);
6308
6309        btrfs_init_work(&dev->work, btrfs_submit_helper,
6310                        pending_bios_fn, NULL, NULL);
6311
6312        return dev;
6313}
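/*
 * Usage sketch (illustrative): add_missing_dev() above passes a NULL
 * fs_info together with an explicit devid, so no new devid is
 * allocated:
 *
 *	struct btrfs_device *dev = btrfs_alloc_device(NULL, &devid, uuid);
 *	if (IS_ERR(dev))
 *		return PTR_ERR(dev);
 */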
6314
6315/* Return -EIO if any error, otherwise return 0. */
6316static int btrfs_check_chunk_valid(struct btrfs_fs_info *fs_info,
6317                                   struct extent_buffer *leaf,
6318                                   struct btrfs_chunk *chunk, u64 logical)
6319{
6320        u64 length;
6321        u64 stripe_len;
6322        u16 num_stripes;
6323        u16 sub_stripes;
6324        u64 type;
6325
6326        length = btrfs_chunk_length(leaf, chunk);
6327        stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6328        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6329        sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6330        type = btrfs_chunk_type(leaf, chunk);
6331
6332        if (!num_stripes) {
6333                btrfs_err(fs_info, "invalid chunk num_stripes: %u",
6334                          num_stripes);
6335                return -EIO;
6336        }
6337        if (!IS_ALIGNED(logical, fs_info->sectorsize)) {
6338                btrfs_err(fs_info, "invalid chunk logical %llu", logical);
6339                return -EIO;
6340        }
6341        if (btrfs_chunk_sector_size(leaf, chunk) != fs_info->sectorsize) {
6342                btrfs_err(fs_info, "invalid chunk sectorsize %u",
6343                          btrfs_chunk_sector_size(leaf, chunk));
6344                return -EIO;
6345        }
6346        if (!length || !IS_ALIGNED(length, fs_info->sectorsize)) {
6347                btrfs_err(fs_info, "invalid chunk length %llu", length);
6348                return -EIO;
6349        }
6350        if (!is_power_of_2(stripe_len) || stripe_len != BTRFS_STRIPE_LEN) {
6351                btrfs_err(fs_info, "invalid chunk stripe length: %llu",
6352                          stripe_len);
6353                return -EIO;
6354        }
6355        if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6356            type) {
6357                btrfs_err(fs_info, "unrecognized chunk type: %llu",
6358                          ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
6359                            BTRFS_BLOCK_GROUP_PROFILE_MASK) &
6360                          btrfs_chunk_type(leaf, chunk));
6361                return -EIO;
6362        }
6363        if ((type & BTRFS_BLOCK_GROUP_RAID10 && sub_stripes != 2) ||
6364            (type & BTRFS_BLOCK_GROUP_RAID1 && num_stripes < 1) ||
6365            (type & BTRFS_BLOCK_GROUP_RAID5 && num_stripes < 2) ||
6366            (type & BTRFS_BLOCK_GROUP_RAID6 && num_stripes < 3) ||
6367            (type & BTRFS_BLOCK_GROUP_DUP && num_stripes > 2) ||
6368            ((type & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0 &&
6369             num_stripes != 1)) {
6370                btrfs_err(fs_info,
6371                        "invalid num_stripes:sub_stripes %u:%u for profile %llu",
6372                        num_stripes, sub_stripes,
6373                        type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
6374                return -EIO;
6375        }
6376
6377        return 0;
6378}
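/*
 * Examples of rejected items (illustrative): a chunk claiming
 * BTRFS_BLOCK_GROUP_RAID10 with sub_stripes = 1, or a RAID6 chunk with
 * only two stripes, fails the final profile check above and the chunk
 * tree is treated as corrupted (-EIO).
 */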
6379
6380static int read_one_chunk(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
6381                          struct extent_buffer *leaf,
6382                          struct btrfs_chunk *chunk)
6383{
6384        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6385        struct map_lookup *map;
6386        struct extent_map *em;
6387        u64 logical;
6388        u64 length;
6389        u64 devid;
6390        u8 uuid[BTRFS_UUID_SIZE];
6391        int num_stripes;
6392        int ret;
6393        int i;
6394
6395        logical = key->offset;
6396        length = btrfs_chunk_length(leaf, chunk);
6397        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6398
6399        ret = btrfs_check_chunk_valid(fs_info, leaf, chunk, logical);
6400        if (ret)
6401                return ret;
6402
6403        read_lock(&map_tree->map_tree.lock);
6404        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6405        read_unlock(&map_tree->map_tree.lock);
6406
6407        /* already mapped? */
6408        if (em && em->start <= logical && em->start + em->len > logical) {
6409                free_extent_map(em);
6410                return 0;
6411        } else if (em) {
6412                free_extent_map(em);
6413        }
6414
6415        em = alloc_extent_map();
6416        if (!em)
6417                return -ENOMEM;
6418        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6419        if (!map) {
6420                free_extent_map(em);
6421                return -ENOMEM;
6422        }
6423
6424        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6425        em->map_lookup = map;
6426        em->start = logical;
6427        em->len = length;
6428        em->orig_start = 0;
6429        em->block_start = 0;
6430        em->block_len = em->len;
6431
6432        map->num_stripes = num_stripes;
6433        map->io_width = btrfs_chunk_io_width(leaf, chunk);
6434        map->io_align = btrfs_chunk_io_align(leaf, chunk);
6435        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6436        map->type = btrfs_chunk_type(leaf, chunk);
6437        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6438        for (i = 0; i < num_stripes; i++) {
6439                map->stripes[i].physical =
6440                        btrfs_stripe_offset_nr(leaf, chunk, i);
6441                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6442                read_extent_buffer(leaf, uuid, (unsigned long)
6443                                   btrfs_stripe_dev_uuid_nr(chunk, i),
6444                                   BTRFS_UUID_SIZE);
6445                map->stripes[i].dev = btrfs_find_device(fs_info, devid,
6446                                                        uuid, NULL);
6447                if (!map->stripes[i].dev &&
6448                    !btrfs_test_opt(fs_info, DEGRADED)) {
6449                        free_extent_map(em);
6450                        btrfs_report_missing_device(fs_info, devid, uuid);
6451                        return -EIO;
6452                }
6453                if (!map->stripes[i].dev) {
6454                        map->stripes[i].dev =
6455                                add_missing_dev(fs_info->fs_devices, devid,
6456                                                uuid);
6457                        if (!map->stripes[i].dev) {
6458                                free_extent_map(em);
6459                                return -EIO;
6460                        }
6461                        btrfs_report_missing_device(fs_info, devid, uuid);
6462                }
6463                map->stripes[i].dev->in_fs_metadata = 1;
6464        }
6465
6466        write_lock(&map_tree->map_tree.lock);
6467        ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6468        write_unlock(&map_tree->map_tree.lock);
6469        BUG_ON(ret); /* Tree corruption */
6470        free_extent_map(em);
6471
6472        return 0;
6473}
6474
6475static void fill_device_from_item(struct extent_buffer *leaf,
6476                                 struct btrfs_dev_item *dev_item,
6477                                 struct btrfs_device *device)
6478{
6479        unsigned long ptr;
6480
6481        device->devid = btrfs_device_id(leaf, dev_item);
6482        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6483        device->total_bytes = device->disk_total_bytes;
6484        device->commit_total_bytes = device->disk_total_bytes;
6485        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6486        device->commit_bytes_used = device->bytes_used;
6487        device->type = btrfs_device_type(leaf, dev_item);
6488        device->io_align = btrfs_device_io_align(leaf, dev_item);
6489        device->io_width = btrfs_device_io_width(leaf, dev_item);
6490        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6491        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6492        device->is_tgtdev_for_dev_replace = 0;
6493
6494        ptr = btrfs_device_uuid(dev_item);
6495        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6496}
6497
6498static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6499                                                  u8 *fsid)
6500{
6501        struct btrfs_fs_devices *fs_devices;
6502        int ret;
6503
6504        BUG_ON(!mutex_is_locked(&uuid_mutex));
6505        ASSERT(fsid);
6506
6507        fs_devices = fs_info->fs_devices->seed;
6508        while (fs_devices) {
6509                if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6510                        return fs_devices;
6511
6512                fs_devices = fs_devices->seed;
6513        }
6514
6515        fs_devices = find_fsid(fsid);
6516        if (!fs_devices) {
6517                if (!btrfs_test_opt(fs_info, DEGRADED))
6518                        return ERR_PTR(-ENOENT);
6519
6520                fs_devices = alloc_fs_devices(fsid);
6521                if (IS_ERR(fs_devices))
6522                        return fs_devices;
6523
6524                fs_devices->seeding = 1;
6525                fs_devices->opened = 1;
6526                return fs_devices;
6527        }
6528
6529        fs_devices = clone_fs_devices(fs_devices);
6530        if (IS_ERR(fs_devices))
6531                return fs_devices;
6532
6533        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6534                                   fs_info->bdev_holder);
6535        if (ret) {
6536                free_fs_devices(fs_devices);
6537                fs_devices = ERR_PTR(ret);
6538                goto out;
6539        }
6540
6541        if (!fs_devices->seeding) {
6542                __btrfs_close_devices(fs_devices);
6543                free_fs_devices(fs_devices);
6544                fs_devices = ERR_PTR(-EINVAL);
6545                goto out;
6546        }
6547
6548        fs_devices->seed = fs_info->fs_devices->seed;
6549        fs_info->fs_devices->seed = fs_devices;
6550out:
6551        return fs_devices;
6552}
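/*
 * Illustrative note: sprouted filesystems are chained through
 * fs_devices->seed, so the loop above first checks already-attached
 * seeds before falling back to find_fsid() and, on a DEGRADED mount,
 * to allocating a placeholder btrfs_fs_devices.
 */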
6553
6554static int read_one_dev(struct btrfs_fs_info *fs_info,
6555                        struct extent_buffer *leaf,
6556                        struct btrfs_dev_item *dev_item)
6557{
6558        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6559        struct btrfs_device *device;
6560        u64 devid;
6561        int ret;
6562        u8 fs_uuid[BTRFS_FSID_SIZE];
6563        u8 dev_uuid[BTRFS_UUID_SIZE];
6564
6565        devid = btrfs_device_id(leaf, dev_item);
6566        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6567                           BTRFS_UUID_SIZE);
6568        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6569                           BTRFS_FSID_SIZE);
6570
6571        if (memcmp(fs_uuid, fs_info->fsid, BTRFS_FSID_SIZE)) {
6572                fs_devices = open_seed_devices(fs_info, fs_uuid);
6573                if (IS_ERR(fs_devices))
6574                        return PTR_ERR(fs_devices);
6575        }
6576
6577        device = btrfs_find_device(fs_info, devid, dev_uuid, fs_uuid);
6578        if (!device) {
6579                if (!btrfs_test_opt(fs_info, DEGRADED)) {
6580                        btrfs_report_missing_device(fs_info, devid, dev_uuid);
6581                        return -EIO;
6582                }
6583
6584                device = add_missing_dev(fs_devices, devid, dev_uuid);
6585                if (!device)
6586                        return -ENOMEM;
6587                btrfs_report_missing_device(fs_info, devid, dev_uuid);
6588        } else {
6589                if (!device->bdev) {
6590                        btrfs_report_missing_device(fs_info, devid, dev_uuid);
6591                        if (!btrfs_test_opt(fs_info, DEGRADED))
6592                                return -EIO;
6593                }
6594
6595                if (!device->bdev && !device->missing) {
6596                        /*
6597                         * This happens when a device that was set up
6598                         * properly in the device info lists suddenly goes
6599                         * bad. device->bdev is NULL, so we have to set
6600                         * device->missing to 1 here.
6601                         */
6602                        device->fs_devices->missing_devices++;
6603                        device->missing = 1;
6604                }
6605
6606                /* Move the device to its own fs_devices */
6607                if (device->fs_devices != fs_devices) {
6608                        ASSERT(device->missing);
6609
6610                        list_move(&device->dev_list, &fs_devices->devices);
6611                        device->fs_devices->num_devices--;
6612                        fs_devices->num_devices++;
6613
6614                        device->fs_devices->missing_devices--;
6615                        fs_devices->missing_devices++;
6616
6617                        device->fs_devices = fs_devices;
6618                }
6619        }
6620
6621        if (device->fs_devices != fs_info->fs_devices) {
6622                BUG_ON(device->writeable);
6623                if (device->generation !=
6624                    btrfs_device_generation(leaf, dev_item))
6625                        return -EINVAL;
6626        }
6627
6628        fill_device_from_item(leaf, dev_item, device);
6629        device->in_fs_metadata = 1;
6630        if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6631                device->fs_devices->total_rw_bytes += device->total_bytes;
6632                atomic64_add(device->total_bytes - device->bytes_used,
6633                                &fs_info->free_chunk_space);
6634        }
6635        ret = 0;
6636        return ret;
6637}
6638
6639int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6640{
6641        struct btrfs_root *root = fs_info->tree_root;
6642        struct btrfs_super_block *super_copy = fs_info->super_copy;
6643        struct extent_buffer *sb;
6644        struct btrfs_disk_key *disk_key;
6645        struct btrfs_chunk *chunk;
6646        u8 *array_ptr;
6647        unsigned long sb_array_offset;
6648        int ret = 0;
6649        u32 num_stripes;
6650        u32 array_size;
6651        u32 len = 0;
6652        u32 cur_offset;
6653        u64 type;
6654        struct btrfs_key key;
6655
6656        ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6657        /*
6658         * This will create an extent buffer of nodesize; the superblock size
6659         * is fixed at BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6660         * overallocate, but we can keep it as-is since only the first page is used.
6661         */
6662        sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
6663        if (IS_ERR(sb))
6664                return PTR_ERR(sb);
6665        set_extent_buffer_uptodate(sb);
6666        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6667        /*
6668         * The sb extent buffer is artificial and just used to read the system array.
6669         * The set_extent_buffer_uptodate() call does not properly mark all
6670         * its pages up-to-date when the page is larger: the extent does not
6671         * cover the whole page, and consequently check_page_uptodate does
6672         * not find all the page's extents up-to-date (the hole beyond sb);
6673         * write_extent_buffer then triggers a WARN_ON.
6674         *
6675         * Regular short extents go through the mark_extent_buffer_dirty/
6676         * writeback cycle, but sb spans only this function. Add an explicit
6677         * SetPageUptodate call to silence the warning, e.g. on PowerPC 64.
6678         */
6679        if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
6680                SetPageUptodate(sb->pages[0]);
6681
6682        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6683        array_size = btrfs_super_sys_array_size(super_copy);
6684
6685        array_ptr = super_copy->sys_chunk_array;
6686        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6687        cur_offset = 0;
6688
6689        while (cur_offset < array_size) {
6690                disk_key = (struct btrfs_disk_key *)array_ptr;
6691                len = sizeof(*disk_key);
6692                if (cur_offset + len > array_size)
6693                        goto out_short_read;
6694
6695                btrfs_disk_key_to_cpu(&key, disk_key);
6696
6697                array_ptr += len;
6698                sb_array_offset += len;
6699                cur_offset += len;
6700
6701                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6702                        chunk = (struct btrfs_chunk *)sb_array_offset;
6703                        /*
6704                         * At least one btrfs_chunk with one stripe must be
6705                         * present; the exact stripe count check comes afterwards
6706                         */
6707                        len = btrfs_chunk_item_size(1);
6708                        if (cur_offset + len > array_size)
6709                                goto out_short_read;
6710
6711                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6712                        if (!num_stripes) {
6713                                btrfs_err(fs_info,
6714                                        "invalid number of stripes %u in sys_array at offset %u",
6715                                        num_stripes, cur_offset);
6716                                ret = -EIO;
6717                                break;
6718                        }
6719
6720                        type = btrfs_chunk_type(sb, chunk);
6721                        if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
6722                                btrfs_err(fs_info,
6723                            "invalid chunk type %llu in sys_array at offset %u",
6724                                        type, cur_offset);
6725                                ret = -EIO;
6726                                break;
6727                        }
6728
6729                        len = btrfs_chunk_item_size(num_stripes);
6730                        if (cur_offset + len > array_size)
6731                                goto out_short_read;
6732
6733                        ret = read_one_chunk(fs_info, &key, sb, chunk);
6734                        if (ret)
6735                                break;
6736                } else {
6737                        btrfs_err(fs_info,
6738                            "unexpected item type %u in sys_array at offset %u",
6739                                  (u32)key.type, cur_offset);
6740                        ret = -EIO;
6741                        break;
6742                }
6743                array_ptr += len;
6744                sb_array_offset += len;
6745                cur_offset += len;
6746        }
6747        clear_extent_buffer_uptodate(sb);
6748        free_extent_buffer_stale(sb);
6749        return ret;
6750
6751out_short_read:
6752        btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
6753                        len, cur_offset);
6754        clear_extent_buffer_uptodate(sb);
6755        free_extent_buffer_stale(sb);
6756        return -EIO;
6757}
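/*
 * sys_chunk_array layout (illustrative, assuming the usual on-disk
 * sizes): records are back-to-back pairs of a btrfs_disk_key (17 bytes)
 * followed by a btrfs_chunk whose size depends on its stripe count
 * (btrfs_chunk_item_size(2) = 112 bytes), so a single two-stripe system
 * chunk consumes 129 bytes of the array.
 */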
6758
6759void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, u64 devid,
6760                                 u8 *uuid)
6761{
6762        btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing", devid, uuid);
6763}
6764
6765/*
6766 * Check if all chunks in the fs are OK for read-write degraded mount
6767 *
6768 * Return true if all chunks meet the minimal RW mount requirements.
6769 * Return false if any chunk doesn't meet the minimal RW mount requirements.
6770 */
6771bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info)
6772{
6773        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
6774        struct extent_map *em;
6775        u64 next_start = 0;
6776        bool ret = true;
6777
6778        read_lock(&map_tree->map_tree.lock);
6779        em = lookup_extent_mapping(&map_tree->map_tree, 0, (u64)-1);
6780        read_unlock(&map_tree->map_tree.lock);
6781        /* No chunk at all? Return false anyway */
6782        if (!em) {
6783                ret = false;
6784                goto out;
6785        }
6786        while (em) {
6787                struct map_lookup *map;
6788                int missing = 0;
6789                int max_tolerated;
6790                int i;
6791
6792                map = em->map_lookup;
6793                max_tolerated =
6794                        btrfs_get_num_tolerated_disk_barrier_failures(
6795                                        map->type);
6796                for (i = 0; i < map->num_stripes; i++) {
6797                        struct btrfs_device *dev = map->stripes[i].dev;
6798
6799                        if (!dev || !dev->bdev || dev->missing ||
6800                            dev->last_flush_error)
6801                                missing++;
6802                }
6803                if (missing > max_tolerated) {
6804                        btrfs_warn(fs_info,
6805        "chunk %llu missing %d devices, max tolerance is %d for writeable mount",
6806                                   em->start, missing, max_tolerated);
6807                        free_extent_map(em);
6808                        ret = false;
6809                        goto out;
6810                }
6811                next_start = extent_map_end(em);
6812                free_extent_map(em);
6813
6814                read_lock(&map_tree->map_tree.lock);
6815                em = lookup_extent_mapping(&map_tree->map_tree, next_start,
6816                                           (u64)(-1) - next_start);
6817                read_unlock(&map_tree->map_tree.lock);
6818        }
6819out:
6820        return ret;
6821}
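/*
 * Example (illustrative): on a two-device RAID1 filesystem with one disk
 * absent, every chunk sees missing = 1 against max_tolerated = 1, so the
 * function returns true and a degraded read-write mount may proceed; a
 * second missing device would tip any RAID1 chunk over the limit.
 */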
6822
6823int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
6824{
6825        struct btrfs_root *root = fs_info->chunk_root;
6826        struct btrfs_path *path;
6827        struct extent_buffer *leaf;
6828        struct btrfs_key key;
6829        struct btrfs_key found_key;
6830        int ret;
6831        int slot;
6832        u64 total_dev = 0;
6833
6834        path = btrfs_alloc_path();
6835        if (!path)
6836                return -ENOMEM;
6837
6838        mutex_lock(&uuid_mutex);
6839        mutex_lock(&fs_info->chunk_mutex);
6840
6841        /*
6842         * Read all device items, and then all the chunk items. All
6843         * device items are found before any chunk item (their object id
6844         * is smaller than the lowest possible object id for a chunk
6845         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6846         */
6847        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6848        key.offset = 0;
6849        key.type = 0;
6850        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6851        if (ret < 0)
6852                goto error;
6853        while (1) {
6854                leaf = path->nodes[0];
6855                slot = path->slots[0];
6856                if (slot >= btrfs_header_nritems(leaf)) {
6857                        ret = btrfs_next_leaf(root, path);
6858                        if (ret == 0)
6859                                continue;
6860                        if (ret < 0)
6861                                goto error;
6862                        break;
6863                }
6864                btrfs_item_key_to_cpu(leaf, &found_key, slot);
6865                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6866                        struct btrfs_dev_item *dev_item;
6867                        dev_item = btrfs_item_ptr(leaf, slot,
6868                                                  struct btrfs_dev_item);
6869                        ret = read_one_dev(fs_info, leaf, dev_item);
6870                        if (ret)
6871                                goto error;
6872                        total_dev++;
6873                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6874                        struct btrfs_chunk *chunk;
6875                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6876                        ret = read_one_chunk(fs_info, &found_key, leaf, chunk);
6877                        if (ret)
6878                                goto error;
6879                }
6880                path->slots[0]++;
6881        }
6882
6883        /*
6884         * After loading the chunk tree, we've got all device information,
6885         * so do another round of validation checks.
6886         */
6887        if (total_dev != fs_info->fs_devices->total_devices) {
6888                btrfs_err(fs_info,
6889           "super_num_devices %llu mismatch with num_devices %llu found here",
6890                          btrfs_super_num_devices(fs_info->super_copy),
6891                          total_dev);
6892                ret = -EINVAL;
6893                goto error;
6894        }
6895        if (btrfs_super_total_bytes(fs_info->super_copy) <
6896            fs_info->fs_devices->total_rw_bytes) {
6897                btrfs_err(fs_info,
6898        "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
6899                          btrfs_super_total_bytes(fs_info->super_copy),
6900                          fs_info->fs_devices->total_rw_bytes);
6901                ret = -EINVAL;
6902                goto error;
6903        }
6904        ret = 0;
6905error:
6906        mutex_unlock(&fs_info->chunk_mutex);
6907        mutex_unlock(&uuid_mutex);
6908
6909        btrfs_free_path(path);
6910        return ret;
6911}
6912
6913void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6914{
6915        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6916        struct btrfs_device *device;
6917
6918        while (fs_devices) {
6919                mutex_lock(&fs_devices->device_list_mutex);
6920                list_for_each_entry(device, &fs_devices->devices, dev_list)
6921                        device->fs_info = fs_info;
6922                mutex_unlock(&fs_devices->device_list_mutex);
6923
6924                fs_devices = fs_devices->seed;
6925        }
6926}
6927
6928static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6929{
6930        int i;
6931
6932        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6933                btrfs_dev_stat_reset(dev, i);
6934}
6935
6936int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6937{
6938        struct btrfs_key key;
6939        struct btrfs_key found_key;
6940        struct btrfs_root *dev_root = fs_info->dev_root;
6941        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6942        struct extent_buffer *eb;
6943        int slot;
6944        int ret = 0;
6945        struct btrfs_device *device;
6946        struct btrfs_path *path = NULL;
6947        int i;
6948
6949        path = btrfs_alloc_path();
6950        if (!path) {
6951                ret = -ENOMEM;
6952                goto out;
6953        }
6954
6955        mutex_lock(&fs_devices->device_list_mutex);
6956        list_for_each_entry(device, &fs_devices->devices, dev_list) {
6957                int item_size;
6958                struct btrfs_dev_stats_item *ptr;
6959
6960                key.objectid = BTRFS_DEV_STATS_OBJECTID;
6961                key.type = BTRFS_PERSISTENT_ITEM_KEY;
6962                key.offset = device->devid;
6963                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6964                if (ret) {
6965                        __btrfs_reset_dev_stats(device);
6966                        device->dev_stats_valid = 1;
6967                        btrfs_release_path(path);
6968                        continue;
6969                }
6970                slot = path->slots[0];
6971                eb = path->nodes[0];
6972                btrfs_item_key_to_cpu(eb, &found_key, slot);
6973                item_size = btrfs_item_size_nr(eb, slot);
6974
6975                ptr = btrfs_item_ptr(eb, slot,
6976                                     struct btrfs_dev_stats_item);
6977
6978                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6979                        if (item_size >= (1 + i) * sizeof(__le64))
6980                                btrfs_dev_stat_set(device, i,
6981                                        btrfs_dev_stats_value(eb, ptr, i));
6982                        else
6983                                btrfs_dev_stat_reset(device, i);
6984                }
6985
6986                device->dev_stats_valid = 1;
6987                btrfs_dev_stat_print_on_load(device);
6988                btrfs_release_path(path);
6989        }
6990        mutex_unlock(&fs_devices->device_list_mutex);
6991
6992out:
6993        btrfs_free_path(path);
6994        return ret < 0 ? ret : 0;
6995}
6996
6997static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6998                                struct btrfs_fs_info *fs_info,
6999                                struct btrfs_device *device)
7000{
7001        struct btrfs_root *dev_root = fs_info->dev_root;
7002        struct btrfs_path *path;
7003        struct btrfs_key key;
7004        struct extent_buffer *eb;
7005        struct btrfs_dev_stats_item *ptr;
7006        int ret;
7007        int i;
7008
7009        key.objectid = BTRFS_DEV_STATS_OBJECTID;
7010        key.type = BTRFS_PERSISTENT_ITEM_KEY;
7011        key.offset = device->devid;
7012
7013        path = btrfs_alloc_path();
7014        if (!path)
7015                return -ENOMEM;
7016        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7017        if (ret < 0) {
7018                btrfs_warn_in_rcu(fs_info,
7019                        "error %d while searching for dev_stats item for device %s",
7020                              ret, rcu_str_deref(device->name));
7021                goto out;
7022        }
7023
7024        if (ret == 0 &&
7025            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7026                /* need to delete old one and insert a new one */
7027                ret = btrfs_del_item(trans, dev_root, path);
7028                if (ret != 0) {
7029                        btrfs_warn_in_rcu(fs_info,
7030                                "delete too small dev_stats item for device %s failed %d",
7031                                      rcu_str_deref(device->name), ret);
7032                        goto out;
7033                }
7034                ret = 1;
7035        }
7036
7037        if (ret == 1) {
7038                /* need to insert a new item */
7039                btrfs_release_path(path);
7040                ret = btrfs_insert_empty_item(trans, dev_root, path,
7041                                              &key, sizeof(*ptr));
7042                if (ret < 0) {
7043                        btrfs_warn_in_rcu(fs_info,
7044                                "insert dev_stats item for device %s failed %d",
7045                                rcu_str_deref(device->name), ret);
7046                        goto out;
7047                }
7048        }
7049
7050        eb = path->nodes[0];
7051        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7052        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7053                btrfs_set_dev_stats_value(eb, ptr, i,
7054                                          btrfs_dev_stat_read(device, i));
7055        btrfs_mark_buffer_dirty(eb);
7056
7057out:
7058        btrfs_free_path(path);
7059        return ret;
7060}
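/*
 * Note on the ret handling above (illustrative): btrfs_search_slot()
 * returns 0 when the key exists and 1 when it does not, so ret == 1
 * falls through to inserting a fresh, full-sized btrfs_dev_stats_item
 * before the counters are copied in.
 */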
7061
7062/*
7063 * Called from commit_transaction(). Writes all changed device stats to disk.
7064 */
7065int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
7066                        struct btrfs_fs_info *fs_info)
7067{
7068        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7069        struct btrfs_device *device;
7070        int stats_cnt;
7071        int ret = 0;
7072
7073        mutex_lock(&fs_devices->device_list_mutex);
7074        list_for_each_entry(device, &fs_devices->devices, dev_list) {
7075                if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
7076                        continue;
7077
7078                stats_cnt = atomic_read(&device->dev_stats_ccnt);
7079                ret = update_dev_stat_item(trans, fs_info, device);
7080                if (!ret)
7081                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7082        }
7083        mutex_unlock(&fs_devices->device_list_mutex);
7084
7085        return ret;
7086}
7087
7088void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7089{
7090        btrfs_dev_stat_inc(dev, index);
7091        btrfs_dev_stat_print_on_error(dev);
7092}
7093
7094static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7095{
7096        if (!dev->dev_stats_valid)
7097                return;
7098        btrfs_err_rl_in_rcu(dev->fs_info,
7099                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7100                           rcu_str_deref(dev->name),
7101                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7102                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7103                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7104                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7105                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7106}
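
/*
 * Editor's note: the rate-limited printer is deliberate here. A failing
 * disk can generate error storms, and one summary line per rate-limit
 * window is enough. The on-load variant below uses the plain printer
 * instead: it runs once per mount, and it is suppressed entirely when all
 * counters are zero.
 */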
7107
7108static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7109{
7110        int i;
7111
7112        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7113                if (btrfs_dev_stat_read(dev, i) != 0)
7114                        break;
7115        if (i == BTRFS_DEV_STAT_VALUES_MAX)
7116                return; /* all values == 0, suppress message */
7117
7118        btrfs_info_in_rcu(dev->fs_info,
7119                "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7120               rcu_str_deref(dev->name),
7121               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7122               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7123               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7124               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7125               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7126}
7127
7128int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7129                        struct btrfs_ioctl_get_dev_stats *stats)
7130{
7131        struct btrfs_device *dev;
7132        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7133        int i;
7134
7135        mutex_lock(&fs_devices->device_list_mutex);
7136        dev = btrfs_find_device(fs_info, stats->devid, NULL, NULL);
7137        mutex_unlock(&fs_devices->device_list_mutex);
7138
7139        if (!dev) {
7140                btrfs_warn(fs_info, "get dev_stats failed, device not found");
7141                return -ENODEV;
7142        } else if (!dev->dev_stats_valid) {
7143                btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7144                return -ENODEV;
7145        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7146                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7147                        if (stats->nr_items > i)
7148                                stats->values[i] =
7149                                        btrfs_dev_stat_read_and_reset(dev, i);
7150                        else
7151                                btrfs_dev_stat_reset(dev, i);
7152                }
7153        } else {
7154                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7155                        if (stats->nr_items > i)
7156                                stats->values[i] = btrfs_dev_stat_read(dev, i);
7157        }
7158        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7159                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7160        return 0;
7161}
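
/*
 * Editor's note: a hedged sketch of how userspace reaches this function,
 * assuming the usual BTRFS_IOC_GET_DEV_STATS wiring (the ioctl handler
 * itself lives in ioctl.c, not in this file):
 *
 *	struct btrfs_ioctl_get_dev_stats stats = {
 *		.devid    = 1,                          // devid to query
 *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,  // want all counters
 *	};
 *	if (ioctl(fs_fd, BTRFS_IOC_GET_DEV_STATS, &stats) == 0)
 *		printf("write errs: %llu\n", (unsigned long long)
 *		       stats.values[BTRFS_DEV_STAT_WRITE_ERRS]);
 *
 * Note that nr_items is clamped on return, so a caller may pass a larger
 * count and read back how many values the kernel actually filled in.
 */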
7162
7163void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path)
7164{
7165        struct buffer_head *bh;
7166        struct btrfs_super_block *disk_super;
7167        int copy_num;
7168
7169        if (!bdev)
7170                return;
7171
7172        for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX;
7173             copy_num++) {
7174
7175                if (btrfs_read_dev_one_super(bdev, copy_num, &bh))
7176                        continue;
7177
7178                disk_super = (struct btrfs_super_block *)bh->b_data;
7179
7180                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
7181                set_buffer_dirty(bh);
7182                sync_dirty_buffer(bh);
7183                brelse(bh);
7184        }
7185
7186        /* Notify udev that the device has changed */
7187        btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
7188
7189        /* Update ctime/mtime for device path for libblkid */
7190        update_dev_time(device_path);
7191}
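
/*
 * Editor's note: zeroing only the magic field is enough to scratch a
 * superblock; both the kernel's device scan and libblkid recognize a
 * btrfs superblock by its magic, so a cleared magic makes every copy
 * unidentifiable without destroying the rest of the block. The mirror
 * copies sit at fixed offsets resolved inside btrfs_read_dev_one_super()
 * via btrfs_sb_offset(), and iterating copy_num up to
 * BTRFS_SUPER_MIRROR_MAX covers all of them.
 */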
7192
7193/*
7194 * Update the committed size of all resized devices. These values are
7195 * what gets written out in the super blocks.
7196 */
7197void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
7198{
7199        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7200        struct btrfs_device *curr, *next;
7201
7202        if (list_empty(&fs_devices->resized_devices))
7203                return;
7204
7205        mutex_lock(&fs_devices->device_list_mutex);
7206        mutex_lock(&fs_info->chunk_mutex);
7207        list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
7208                                 resized_list) {
7209                list_del_init(&curr->resized_list);
7210                curr->commit_total_bytes = curr->disk_total_bytes;
7211        }
7212        mutex_unlock(&fs_info->chunk_mutex);
7213        mutex_unlock(&fs_devices->device_list_mutex);
7214}
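
/*
 * Editor's note: resize (grow or shrink) updates disk_total_bytes right
 * away, but super blocks must describe committed state, so the new size is
 * staged on fs_devices->resized_devices and only copied into
 * commit_total_bytes here, at commit time. A hedged sketch of the producer
 * side (the real code lives in the resize paths, not in this function):
 *
 *	btrfs_device_set_disk_total_bytes(device, new_size);
 *	if (list_empty(&device->resized_list))
 *		list_add_tail(&device->resized_list,
 *			      &fs_devices->resized_devices);
 */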
7215
7216/* Must be invoked during transaction commit */
7217void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
7218                                        struct btrfs_transaction *transaction)
7219{
7220        struct extent_map *em;
7221        struct map_lookup *map;
7222        struct btrfs_device *dev;
7223        int i;
7224
7225        if (list_empty(&transaction->pending_chunks))
7226                return;
7227
7228        /* Take chunk_mutex in order to kick the device replace finish process */
7229        mutex_lock(&fs_info->chunk_mutex);
7230        list_for_each_entry(em, &transaction->pending_chunks, list) {
7231                map = em->map_lookup;
7232
7233                for (i = 0; i < map->num_stripes; i++) {
7234                        dev = map->stripes[i].dev;
7235                        dev->commit_bytes_used = dev->bytes_used;
7236                }
7237        }
7238        mutex_unlock(&fs_info->chunk_mutex);
7239}
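
/*
 * Editor's note: transaction->pending_chunks holds the extent maps of
 * chunks allocated in this transaction; walking their stripes folds every
 * affected device's in-memory bytes_used into commit_bytes_used, the value
 * the dev items and super blocks are written from. chunk_mutex serializes
 * this walk against concurrent chunk allocation, which modifies both the
 * pending list and bytes_used.
 */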
7240
7241void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info)
7242{
7243        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7244        while (fs_devices) {
7245                fs_devices->fs_info = fs_info;
7246                fs_devices = fs_devices->seed;
7247        }
7248}
7249
7250void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
7251{
7252        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7253        while (fs_devices) {
7254                fs_devices->fs_info = NULL;
7255                fs_devices = fs_devices->seed;
7256        }
7257}
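
/*
 * Editor's note: seed filesystems chain through fs_devices->seed, with the
 * sprouted filesystem's own fs_devices at the head of a singly linked
 * list, which is why both walkers above loop rather than touching a single
 * struct:
 *
 *	fs_info->fs_devices -> seed A -> seed B -> NULL
 *
 * The set/reset pair appears to bracket the lifetime of the mounted fs:
 * the back-pointers are installed when the fs_info becomes live and
 * cleared again before it goes away, so a stale fs_info is never reachable
 * from a seed's fs_devices.
 */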
7258