linux/fs/btrfs/volumes.c
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */
#include <linux/sched.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/semaphore.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "math.h"
#include "dev-replace.h"
#include "sysfs.h"

static int init_first_rw_device(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_device *device);
static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);

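/*
 * uuid_mutex protects the global fs_uuids list below, i.e. the set of all
 * btrfs filesystems (and their device lists) known to this system.
 */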
DEFINE_MUTEX(uuid_mutex);
static LIST_HEAD(fs_uuids);

static struct btrfs_fs_devices *__alloc_fs_devices(void)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
        if (!fs_devs)
                return ERR_PTR(-ENOMEM);

        mutex_init(&fs_devs->device_list_mutex);

        INIT_LIST_HEAD(&fs_devs->devices);
        INIT_LIST_HEAD(&fs_devs->resized_devices);
        INIT_LIST_HEAD(&fs_devs->alloc_list);
        INIT_LIST_HEAD(&fs_devs->list);

        return fs_devs;
}

/**
 * alloc_fs_devices - allocate struct btrfs_fs_devices
 * @fsid:       a pointer to UUID for this FS.  If NULL a new UUID is
 *              generated.
 *
 * Return: a pointer to a new &struct btrfs_fs_devices on success;
 * ERR_PTR() on error.  Returned struct is not linked onto any lists and
 * can be destroyed with kfree() right away.
 */
static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
{
        struct btrfs_fs_devices *fs_devs;

        fs_devs = __alloc_fs_devices();
        if (IS_ERR(fs_devs))
                return fs_devs;

        if (fsid)
                memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
        else
                generate_random_uuid(fs_devs->fsid);

        return fs_devs;
}

static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;
        WARN_ON(fs_devices->opened);
        while (!list_empty(&fs_devices->devices)) {
                device = list_entry(fs_devices->devices.next,
                                    struct btrfs_device, dev_list);
                list_del(&device->dev_list);
                rcu_string_free(device->name);
                kfree(device);
        }
        kfree(fs_devices);
}

static void btrfs_kobject_uevent(struct block_device *bdev,
                                 enum kobject_action action)
{
        int ret;

        ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
        if (ret)
                pr_warn("BTRFS: Sending event '%d' to kobject: '%s' (%p): failed\n",
                        action,
                        kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
                        &disk_to_dev(bdev->bd_disk)->kobj);
}

void btrfs_cleanup_fs_uuids(void)
{
        struct btrfs_fs_devices *fs_devices;

        while (!list_empty(&fs_uuids)) {
                fs_devices = list_entry(fs_uuids.next,
                                        struct btrfs_fs_devices, list);
                list_del(&fs_devices->list);
                free_fs_devices(fs_devices);
        }
}

static struct btrfs_device *__alloc_device(void)
{
        struct btrfs_device *dev;

        dev = kzalloc(sizeof(*dev), GFP_NOFS);
        if (!dev)
                return ERR_PTR(-ENOMEM);

        INIT_LIST_HEAD(&dev->dev_list);
        INIT_LIST_HEAD(&dev->dev_alloc_list);
        INIT_LIST_HEAD(&dev->resized_list);

        spin_lock_init(&dev->io_lock);

        spin_lock_init(&dev->reada_lock);
        atomic_set(&dev->reada_in_flight, 0);
        atomic_set(&dev->dev_stats_ccnt, 0);
        INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
        INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);

        return dev;
}

static noinline struct btrfs_device *__find_device(struct list_head *head,
                                                   u64 devid, u8 *uuid)
{
        struct btrfs_device *dev;

        list_for_each_entry(dev, head, dev_list) {
                if (dev->devid == devid &&
                    (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) {
                        return dev;
                }
        }
        return NULL;
}

static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid)
{
        struct btrfs_fs_devices *fs_devices;

        list_for_each_entry(fs_devices, &fs_uuids, list) {
                if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
                        return fs_devices;
        }
        return NULL;
}

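/*
 * Open the block device at @device_path and read its btrfs super block,
 * optionally flushing dirty pages first.  On failure both *bdev and *bh
 * are reset to NULL.
 */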
static int
btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
                      int flush, struct block_device **bdev,
                      struct buffer_head **bh)
{
        int ret;

        *bdev = blkdev_get_by_path(device_path, flags, holder);

        if (IS_ERR(*bdev)) {
                ret = PTR_ERR(*bdev);
                printk(KERN_INFO "BTRFS: open %s failed\n", device_path);
                goto error;
        }

        if (flush)
                filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
        ret = set_blocksize(*bdev, 4096);
        if (ret) {
                blkdev_put(*bdev, flags);
                goto error;
        }
        invalidate_bdev(*bdev);
        *bh = btrfs_read_dev_super(*bdev);
        if (!*bh) {
                ret = -EINVAL;
                blkdev_put(*bdev, flags);
                goto error;
        }

        return 0;

error:
        *bdev = NULL;
        *bh = NULL;
        return ret;
}

static void requeue_list(struct btrfs_pending_bios *pending_bios,
                        struct bio *head, struct bio *tail)
{
        struct bio *old_head;

        old_head = pending_bios->head;
        pending_bios->head = head;
        if (pending_bios->tail)
                tail->bi_next = old_head;
        else
                pending_bios->tail = tail;
}

/*
 * We try to collect pending bios for a device so we don't get a large
 * number of procs sending bios down to the same device.  This greatly
 * improves the scheduler's ability to collect and merge the bios.
 *
 * But, it also turns into a long list of bios to process and that is sure
 * to eventually make the worker thread block.  The solution here is to
 * make some progress and then put this work struct back at the end of
 * the list if the block device is congested.  This way, multiple devices
 * can make progress from a single worker thread.
 */
static noinline void run_scheduled_bios(struct btrfs_device *device)
{
        struct bio *pending;
        struct backing_dev_info *bdi;
        struct btrfs_fs_info *fs_info;
        struct btrfs_pending_bios *pending_bios;
        struct bio *tail;
        struct bio *cur;
        int again = 0;
        unsigned long num_run;
        unsigned long batch_run = 0;
        unsigned long limit;
        unsigned long last_waited = 0;
        int force_reg = 0;
        int sync_pending = 0;
        struct blk_plug plug;

        /*
         * this function runs all the bios we've collected for
         * a particular device.  We don't want to wander off to
         * another device without first sending all of these down.
         * So, set up a plug here and finish it off before we return
         */
        blk_start_plug(&plug);

        bdi = blk_get_backing_dev_info(device->bdev);
        fs_info = device->dev_root->fs_info;
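        /*
         * Use ~2/3 of the async submit limit as the wakeup threshold:
         * once the number of in-flight async bios drops below this,
         * waiters on async_submit_wait are woken (see the wake_up()
         * in the loop below).
         */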
        limit = btrfs_async_submit_limit(fs_info);
        limit = limit * 2 / 3;

loop:
        spin_lock(&device->io_lock);

loop_lock:
        num_run = 0;

        /* take all the bios off the list at once and process them
         * later on (without the lock held).  But, remember the
         * tail and other pointers so the bios can be properly reinserted
         * into the list if we hit congestion
         */
        if (!force_reg && device->pending_sync_bios.head) {
                pending_bios = &device->pending_sync_bios;
                force_reg = 1;
        } else {
                pending_bios = &device->pending_bios;
                force_reg = 0;
        }

        pending = pending_bios->head;
        tail = pending_bios->tail;
        WARN_ON(pending && !tail);

        /*
         * if pending was null this time around, no bios need processing
         * at all and we can stop.  Otherwise it'll loop back up again
         * and do an additional check so no bios are missed.
         *
         * device->running_pending is used to synchronize with the
         * schedule_bio code.
         */
        if (device->pending_sync_bios.head == NULL &&
            device->pending_bios.head == NULL) {
                again = 0;
                device->running_pending = 0;
        } else {
                again = 1;
                device->running_pending = 1;
        }

        pending_bios->head = NULL;
        pending_bios->tail = NULL;

        spin_unlock(&device->io_lock);

        while (pending) {

                rmb();
                /* we want to work on both lists, but do more bios on the
                 * sync list than the regular list
                 */
                if ((num_run > 32 &&
                    pending_bios != &device->pending_sync_bios &&
                    device->pending_sync_bios.head) ||
                   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
                    device->pending_bios.head)) {
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        goto loop_lock;
                }

                cur = pending;
                pending = pending->bi_next;
                cur->bi_next = NULL;

                if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
                    waitqueue_active(&fs_info->async_submit_wait))
                        wake_up(&fs_info->async_submit_wait);

                BUG_ON(atomic_read(&cur->bi_cnt) == 0);

                /*
                 * if we're doing the sync list, record that our
                 * plug has some sync requests on it
                 *
                 * If we're doing the regular list and there are
                 * sync requests sitting around, unplug before
                 * we add more
                 */
                if (pending_bios == &device->pending_sync_bios) {
                        sync_pending = 1;
                } else if (sync_pending) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }

                btrfsic_submit_bio(cur->bi_rw, cur);
                num_run++;
                batch_run++;
                if (need_resched())
                        cond_resched();

                /*
                 * we made progress, there is more work to do and the bdi
                 * is now congested.  Back off and let other work structs
                 * run instead
                 */
                if (pending && bdi_write_congested(bdi) && batch_run > 8 &&
                    fs_info->fs_devices->open_devices > 1) {
                        struct io_context *ioc;

                        ioc = current->io_context;

                        /*
                         * the main goal here is that we don't want to
                         * block if we're going to be able to submit
                         * more requests without blocking.
                         *
                         * This code does two great things: it pokes into
                         * the elevator code from a filesystem _and_
                         * it makes assumptions about how batching works.
                         */
                        if (ioc && ioc->nr_batch_requests > 0 &&
                            time_before(jiffies, ioc->last_waited + HZ/50UL) &&
                            (last_waited == 0 ||
                             ioc->last_waited == last_waited)) {
                                /*
                                 * we want to go through our batch of
                                 * requests and stop.  So, we copy out
                                 * the ioc->last_waited time and test
                                 * against it before looping
                                 */
                                last_waited = ioc->last_waited;
                                if (need_resched())
                                        cond_resched();
                                continue;
                        }
                        spin_lock(&device->io_lock);
                        requeue_list(pending_bios, pending, tail);
                        device->running_pending = 1;

                        spin_unlock(&device->io_lock);
                        btrfs_queue_work(fs_info->submit_workers,
                                         &device->work);
                        goto done;
                }
                /* unplug every 64 requests just for good measure */
                if (batch_run % 64 == 0) {
                        blk_finish_plug(&plug);
                        blk_start_plug(&plug);
                        sync_pending = 0;
                }
        }

        cond_resched();
        if (again)
                goto loop;

        spin_lock(&device->io_lock);
        if (device->pending_bios.head || device->pending_sync_bios.head)
                goto loop_lock;
        spin_unlock(&device->io_lock);

done:
        blk_finish_plug(&plug);
}

static void pending_bios_fn(struct btrfs_work *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, work);
        run_scheduled_bios(device);
}

/*
 * Add new device to list of registered devices
 *
 * Returns:
 * 1   - first time device is seen
 * 0   - device already known
 * < 0 - error
 */
static noinline int device_list_add(const char *path,
                           struct btrfs_super_block *disk_super,
                           u64 devid, struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_device *device;
        struct btrfs_fs_devices *fs_devices;
        struct rcu_string *name;
        int ret = 0;
        u64 found_transid = btrfs_super_generation(disk_super);

        fs_devices = find_fsid(disk_super->fsid);
        if (!fs_devices) {
                fs_devices = alloc_fs_devices(disk_super->fsid);
                if (IS_ERR(fs_devices))
                        return PTR_ERR(fs_devices);

                list_add(&fs_devices->list, &fs_uuids);

                device = NULL;
        } else {
                device = __find_device(&fs_devices->devices, devid,
                                       disk_super->dev_item.uuid);
        }

        if (!device) {
                if (fs_devices->opened)
                        return -EBUSY;

                device = btrfs_alloc_device(NULL, &devid,
                                            disk_super->dev_item.uuid);
                if (IS_ERR(device)) {
                        /* we can safely leave the fs_devices entry around */
                        return PTR_ERR(device);
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name) {
                        kfree(device);
                        return -ENOMEM;
                }
                rcu_assign_pointer(device->name, name);

                mutex_lock(&fs_devices->device_list_mutex);
                list_add_rcu(&device->dev_list, &fs_devices->devices);
                fs_devices->num_devices++;
                mutex_unlock(&fs_devices->device_list_mutex);

                ret = 1;
                device->fs_devices = fs_devices;
        } else if (!device->name || strcmp(device->name->str, path)) {
                /*
                 * When FS is already mounted.
                 * 1. If you are here and if the device->name is NULL that
                 *    means this device was missing at time of FS mount.
                 * 2. If you are here and if the device->name is different
                 *    from 'path' that means either
                 *      a. The same device disappeared and reappeared with
                 *         a different name, or
                 *      b. The missing disk which was replaced has
                 *         reappeared now.
                 *
                 * We must allow 1 and 2a above. But 2b would be spurious
                 * and unintentional.
                 *
                 * Further, in case of 1 and 2a above, the disk at 'path'
                 * would have missed some transactions while it was away,
                 * and in case of 2a the stale bdev has to be updated as
                 * well.  2b must not be allowed at any time.
                 */

                /*
                 * For now, we do allow update to btrfs_fs_device through the
                 * btrfs dev scan cli after FS has been mounted.  We're still
                 * tracking a problem where systems fail mount by subvolume id
                 * when we reject replacement on a mounted FS.
                 */
                if (!fs_devices->opened && found_transid < device->generation) {
                        /*
                         * That is, if the FS is _not_ mounted and if you
                         * are here, there is more than one disk with the
                         * same uuid and devid.  We keep the one with the
                         * larger generation number, or the last-in if the
                         * generations are equal.
                         */
                        return -EEXIST;
                }

                name = rcu_string_strdup(path, GFP_NOFS);
                if (!name)
                        return -ENOMEM;
                rcu_string_free(device->name);
                rcu_assign_pointer(device->name, name);
                if (device->missing) {
                        fs_devices->missing_devices--;
                        device->missing = 0;
                }
        }

        /*
         * Unmount does not free the btrfs_device struct but would zero
         * the generation along with most of the other members.  So just
         * update it back.  We need it to pick the disk with the largest
         * generation (as above).
         */
        if (!fs_devices->opened)
                device->generation = found_transid;

        *fs_devices_ret = fs_devices;

        return ret;
}

static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
{
        struct btrfs_fs_devices *fs_devices;
        struct btrfs_device *device;
        struct btrfs_device *orig_dev;

        fs_devices = alloc_fs_devices(orig->fsid);
        if (IS_ERR(fs_devices))
                return fs_devices;

        mutex_lock(&orig->device_list_mutex);
        fs_devices->total_devices = orig->total_devices;

        /* We have held the volume lock; it is safe to get the devices. */
        list_for_each_entry(orig_dev, &orig->devices, dev_list) {
                struct rcu_string *name;

                device = btrfs_alloc_device(NULL, &orig_dev->devid,
                                            orig_dev->uuid);
                if (IS_ERR(device))
                        goto error;

                /*
                 * This is ok to do without rcu read locked because we hold the
                 * uuid mutex so nothing we touch in here is going to disappear.
                 */
                if (orig_dev->name) {
                        name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
                        if (!name) {
                                kfree(device);
                                goto error;
                        }
                        rcu_assign_pointer(device->name, name);
                }

                list_add(&device->dev_list, &fs_devices->devices);
                device->fs_devices = fs_devices;
                fs_devices->num_devices++;
        }
        mutex_unlock(&orig->device_list_mutex);
        return fs_devices;
error:
        mutex_unlock(&orig->device_list_mutex);
        free_fs_devices(fs_devices);
        return ERR_PTR(-ENOMEM);
}

void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
                               struct btrfs_fs_devices *fs_devices, int step)
{
        struct btrfs_device *device, *next;
        struct btrfs_device *latest_dev = NULL;

        mutex_lock(&uuid_mutex);
again:
        /* This is the initialized path; it is safe to release the devices. */
        list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
                if (device->in_fs_metadata) {
                        if (!device->is_tgtdev_for_dev_replace &&
                            (!latest_dev ||
                             device->generation > latest_dev->generation)) {
                                latest_dev = device;
                        }
                        continue;
                }

                if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
                        /*
                         * In the first step, keep the device which has
                         * the correct fsid and the devid that is used
                         * for the dev_replace procedure.
                         * In the second step, the dev_replace state is
                         * read from the device tree and it is known
                         * whether the procedure is really active or
                         * not, which means whether this device is
                         * used or whether it should be removed.
                         */
                        if (step == 0 || device->is_tgtdev_for_dev_replace) {
                                continue;
                        }
                }
                if (device->bdev) {
                        blkdev_put(device->bdev, device->mode);
                        device->bdev = NULL;
                        fs_devices->open_devices--;
                }
                if (device->writeable) {
                        list_del_init(&device->dev_alloc_list);
                        device->writeable = 0;
                        if (!device->is_tgtdev_for_dev_replace)
                                fs_devices->rw_devices--;
                }
                list_del_init(&device->dev_list);
                fs_devices->num_devices--;
                rcu_string_free(device->name);
                kfree(device);
        }

        if (fs_devices->seed) {
                fs_devices = fs_devices->seed;
                goto again;
        }

        fs_devices->latest_bdev = latest_dev->bdev;

        mutex_unlock(&uuid_mutex);
}

static void __free_device(struct work_struct *work)
{
        struct btrfs_device *device;

        device = container_of(work, struct btrfs_device, rcu_work);

        if (device->bdev)
                blkdev_put(device->bdev, device->mode);

        rcu_string_free(device->name);
        kfree(device);
}

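/*
 * RCU callback for freeing a btrfs_device.  blkdev_put() may sleep, so
 * rather than tearing the device down here (in softirq context), defer
 * the actual work to a workqueue via __free_device() above.
 */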
static void free_device(struct rcu_head *head)
{
        struct btrfs_device *device;

        device = container_of(head, struct btrfs_device, rcu);

        INIT_WORK(&device->rcu_work, __free_device);
        schedule_work(&device->rcu_work);
}

static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_device *device;

        if (--fs_devices->opened > 0)
                return 0;

        mutex_lock(&fs_devices->device_list_mutex);
        list_for_each_entry(device, &fs_devices->devices, dev_list) {
                struct btrfs_device *new_device;
                struct rcu_string *name;

                if (device->bdev)
                        fs_devices->open_devices--;

                if (device->writeable &&
                    device->devid != BTRFS_DEV_REPLACE_DEVID) {
                        list_del_init(&device->dev_alloc_list);
                        fs_devices->rw_devices--;
                }

                if (device->missing)
                        fs_devices->missing_devices--;

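                /*
                 * Replace the device in the list with a fresh copy so
                 * that RCU readers keep seeing a valid entry; the old
                 * struct (and its open bdev) is only freed after a
                 * grace period, via call_rcu() below.
                 */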
                new_device = btrfs_alloc_device(NULL, &device->devid,
                                                device->uuid);
                BUG_ON(IS_ERR(new_device)); /* -ENOMEM */

                /* Safe because we are under uuid_mutex */
                if (device->name) {
                        name = rcu_string_strdup(device->name->str, GFP_NOFS);
                        BUG_ON(!name); /* -ENOMEM */
                        rcu_assign_pointer(new_device->name, name);
                }

                list_replace_rcu(&device->dev_list, &new_device->dev_list);
                new_device->fs_devices = device->fs_devices;

                call_rcu(&device->rcu, free_device);
        }
        mutex_unlock(&fs_devices->device_list_mutex);

        WARN_ON(fs_devices->open_devices);
        WARN_ON(fs_devices->rw_devices);
        fs_devices->opened = 0;
        fs_devices->seeding = 0;

        return 0;
}

int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
{
        struct btrfs_fs_devices *seed_devices = NULL;
        int ret;

        mutex_lock(&uuid_mutex);
        ret = __btrfs_close_devices(fs_devices);
        if (!fs_devices->opened) {
                seed_devices = fs_devices->seed;
                fs_devices->seed = NULL;
        }
        mutex_unlock(&uuid_mutex);

        while (seed_devices) {
                fs_devices = seed_devices;
                seed_devices = fs_devices->seed;
                __btrfs_close_devices(fs_devices);
                free_fs_devices(fs_devices);
        }
        /*
         * Wait for the rcu kworkers under __btrfs_close_devices to
         * finish all blkdev_puts so the devices are really free when
         * umount is done.
         */
        rcu_barrier();
        return ret;
}

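/*
 * Open every device in @fs_devices that has a recorded name, reading and
 * validating each super block; devices whose devid or uuid no longer
 * match are skipped.  At least one device must open successfully.
 */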
static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                                fmode_t flags, void *holder)
{
        struct request_queue *q;
        struct block_device *bdev;
        struct list_head *head = &fs_devices->devices;
        struct btrfs_device *device;
        struct btrfs_device *latest_dev = NULL;
        struct buffer_head *bh;
        struct btrfs_super_block *disk_super;
        u64 devid;
        int seeding = 1;
        int ret = 0;

        flags |= FMODE_EXCL;

        list_for_each_entry(device, head, dev_list) {
                if (device->bdev)
                        continue;
                if (!device->name)
                        continue;

                /* Just open everything we can; ignore failures here */
                if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
                                            &bdev, &bh))
                        continue;

                disk_super = (struct btrfs_super_block *)bh->b_data;
                devid = btrfs_stack_device_id(&disk_super->dev_item);
                if (devid != device->devid)
                        goto error_brelse;

                if (memcmp(device->uuid, disk_super->dev_item.uuid,
                           BTRFS_UUID_SIZE))
                        goto error_brelse;

                device->generation = btrfs_super_generation(disk_super);
                if (!latest_dev ||
                    device->generation > latest_dev->generation)
                        latest_dev = device;

                if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
                        device->writeable = 0;
                } else {
                        device->writeable = !bdev_read_only(bdev);
                        seeding = 0;
                }

                q = bdev_get_queue(bdev);
                if (blk_queue_discard(q))
                        device->can_discard = 1;

                device->bdev = bdev;
                device->in_fs_metadata = 0;
                device->mode = flags;

                if (!blk_queue_nonrot(bdev_get_queue(bdev)))
                        fs_devices->rotating = 1;

                fs_devices->open_devices++;
                if (device->writeable &&
                    device->devid != BTRFS_DEV_REPLACE_DEVID) {
                        fs_devices->rw_devices++;
                        list_add(&device->dev_alloc_list,
                                 &fs_devices->alloc_list);
                }
                brelse(bh);
                continue;

error_brelse:
                brelse(bh);
                blkdev_put(bdev, flags);
                continue;
        }
        if (fs_devices->open_devices == 0) {
                ret = -EINVAL;
                goto out;
        }
        fs_devices->seeding = seeding;
        fs_devices->opened = 1;
        fs_devices->latest_bdev = latest_dev->bdev;
        fs_devices->total_rw_bytes = 0;
out:
        return ret;
}

int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
                       fmode_t flags, void *holder)
{
        int ret;

        mutex_lock(&uuid_mutex);
        if (fs_devices->opened) {
                fs_devices->opened++;
                ret = 0;
        } else {
                ret = __btrfs_open_devices(fs_devices, flags, holder);
        }
        mutex_unlock(&uuid_mutex);
        return ret;
}

/*
 * Look for a btrfs signature on a device.  This may be called out of the
 * mount path and we are not allowed to call set_blocksize during the
 * scan, so the superblock is read via the pagecache.
 */
int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
                          struct btrfs_fs_devices **fs_devices_ret)
{
        struct btrfs_super_block *disk_super;
        struct block_device *bdev;
        struct page *page;
        void *p;
        int ret = -EINVAL;
        u64 devid;
        u64 transid;
        u64 total_devices;
        u64 bytenr;
        pgoff_t index;

        /*
         * we would like to check all the supers, but that would make
         * a btrfs mount succeed after a mkfs from a different FS.
         * So, we need to add a special mount option to scan for
         * later supers, using BTRFS_SUPER_MIRROR_MAX instead
         */
        bytenr = btrfs_sb_offset(0);
        flags |= FMODE_EXCL;
        mutex_lock(&uuid_mutex);

        bdev = blkdev_get_by_path(path, flags, holder);

        if (IS_ERR(bdev)) {
                ret = PTR_ERR(bdev);
                goto error;
        }

        /* make sure our super fits in the device */
        if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
                goto error_bdev_put;

        /* make sure our super fits in the page */
        if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
                goto error_bdev_put;

        /* make sure our super doesn't straddle pages on disk */
        index = bytenr >> PAGE_CACHE_SHIFT;
        if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
                goto error_bdev_put;

        /* pull in the page with our super */
        page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
                                   index, GFP_NOFS);

        if (IS_ERR_OR_NULL(page))
                goto error_bdev_put;

        p = kmap(page);

        /* align our pointer to the offset of the super block */
        disk_super = p + (bytenr & ~PAGE_CACHE_MASK);

        if (btrfs_super_bytenr(disk_super) != bytenr ||
            btrfs_super_magic(disk_super) != BTRFS_MAGIC)
                goto error_unmap;

        devid = btrfs_stack_device_id(&disk_super->dev_item);
        transid = btrfs_super_generation(disk_super);
        total_devices = btrfs_super_num_devices(disk_super);

        ret = device_list_add(path, disk_super, devid, fs_devices_ret);
        if (ret > 0) {
                if (disk_super->label[0]) {
                        if (disk_super->label[BTRFS_LABEL_SIZE - 1])
                                disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
                        printk(KERN_INFO "BTRFS: device label %s ", disk_super->label);
                } else {
                        printk(KERN_INFO "BTRFS: device fsid %pU ", disk_super->fsid);
                }

                printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
                ret = 0;
        }
        if (!ret && fs_devices_ret)
                (*fs_devices_ret)->total_devices = total_devices;

error_unmap:
        kunmap(page);
        page_cache_release(page);

error_bdev_put:
        blkdev_put(bdev, flags);
error:
        mutex_unlock(&uuid_mutex);
        return ret;
}

/* helper to account the used device space in the range */
int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
                                   u64 end, u64 *length)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 extent_end;
        int ret;
        int slot;
        struct extent_buffer *l;

        *length = 0;

        if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
                return 0;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        path->reada = 2;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
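                /*
                 * Clip the dev extent against [start, end]: it may cover
                 * the whole range, overlap only one end, or lie entirely
                 * inside it.
                 */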
                if (key.offset <= start && extent_end > end) {
                        *length = end - start + 1;
                        break;
                } else if (key.offset <= start && extent_end > start)
                        *length += extent_end - start;
                else if (key.offset > start && extent_end <= end)
                        *length += extent_end - key.offset;
                else if (key.offset > start && key.offset <= end) {
                        *length += end - key.offset + 1;
                        break;
                } else if (key.offset > end)
                        break;

next:
                path->slots[0]++;
        }
        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

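/*
 * Check whether [*start, *start + len) overlaps any chunk that is still
 * pending in this transaction or pinned; if so, advance *start past the
 * overlap and return 1 so the caller can retry with the new offset.
 */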
static int contains_pending_extent(struct btrfs_trans_handle *trans,
                                   struct btrfs_device *device,
                                   u64 *start, u64 len)
{
        struct extent_map *em;
        struct list_head *search_list = &trans->transaction->pending_chunks;
        int ret = 0;

again:
        list_for_each_entry(em, search_list, list) {
                struct map_lookup *map;
                int i;

                map = (struct map_lookup *)em->bdev;
                for (i = 0; i < map->num_stripes; i++) {
                        if (map->stripes[i].dev != device)
                                continue;
                        if (map->stripes[i].physical >= *start + len ||
                            map->stripes[i].physical + em->orig_block_len <=
                            *start)
                                continue;
                        *start = map->stripes[i].physical +
                                em->orig_block_len;
                        ret = 1;
                }
        }
        if (search_list == &trans->transaction->pending_chunks) {
                search_list = &trans->root->fs_info->pinned_chunks;
                goto again;
        }

        return ret;
}

/*
 * find_free_dev_extent - find free space in the specified device
 * @device:     the device which we search the free space in
 * @num_bytes:  the size of the free space that we need
 * @start:      store the start of the free space.
 * @len:        the size of the free space that we find, or the size of
 *              the max free space if we don't find suitable free space
 *
 * this uses a pretty simple search, the expectation is that it is
 * called very infrequently and that a given device has a small number
 * of extents
 *
 * @start is used to store the start of the free space if we find one. But
 * if we don't find suitable free space, it will be used to store the start
 * position of the max free space.
 *
 * @len is used to store the size of the free space that we find.
 * But if we don't find suitable free space, it is used to store the size of
 * the max free space.
 */
int find_free_dev_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_device *device, u64 num_bytes,
                         u64 *start, u64 *len)
{
        struct btrfs_key key;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *dev_extent;
        struct btrfs_path *path;
        u64 hole_size;
        u64 max_hole_start;
        u64 max_hole_size;
        u64 extent_end;
        u64 search_start;
        u64 search_end = device->total_bytes;
        int ret;
        int slot;
        struct extent_buffer *l;

        /* FIXME use last free of some kind */

        /* we don't want to overwrite the superblock on the drive,
         * so we make sure to start at an offset of at least 1MB
         */
        search_start = max(root->fs_info->alloc_start, 1024ull * 1024);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
again:
        max_hole_start = search_start;
        max_hole_size = 0;
        hole_size = 0;

        if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
                ret = -ENOSPC;
                goto out;
        }

        path->reada = 2;
        path->search_commit_root = 1;
        path->skip_locking = 1;

        key.objectid = device->devid;
        key.offset = search_start;
        key.type = BTRFS_DEV_EXTENT_KEY;

        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
        if (ret < 0)
                goto out;
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid, key.type);
                if (ret < 0)
                        goto out;
        }

        while (1) {
                l = path->nodes[0];
                slot = path->slots[0];
                if (slot >= btrfs_header_nritems(l)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret == 0)
                                continue;
                        if (ret < 0)
                                goto out;

                        break;
                }
                btrfs_item_key_to_cpu(l, &key, slot);

                if (key.objectid < device->devid)
                        goto next;

                if (key.objectid > device->devid)
                        break;

                if (key.type != BTRFS_DEV_EXTENT_KEY)
                        goto next;

                if (key.offset > search_start) {
                        hole_size = key.offset - search_start;

                        /*
                         * Have to check before we set max_hole_start, otherwise
                         * we could end up sending back this offset anyway.
                         */
                        if (contains_pending_extent(trans, device,
                                                    &search_start,
                                                    hole_size))
                                hole_size = 0;

                        if (hole_size > max_hole_size) {
                                max_hole_start = search_start;
                                max_hole_size = hole_size;
                        }

                        /*
                         * If this free space is greater than what we need,
                         * it must be the max free space that we have found
                         * until now, so max_hole_start must point to the
                         * start of this free space and the length of this
                         * free space is stored in max_hole_size. Thus, we
                         * return max_hole_start and max_hole_size and go
                         * back to the caller.
                         */
                        if (hole_size >= num_bytes) {
                                ret = 0;
                                goto out;
                        }
                }

                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
                extent_end = key.offset + btrfs_dev_extent_length(l,
                                                                  dev_extent);
                if (extent_end > search_start)
                        search_start = extent_end;
next:
                path->slots[0]++;
                cond_resched();
        }

        /*
         * At this point, search_start should be the end of
         * allocated dev extents, and when shrinking the device,
         * search_end may be smaller than search_start.
         */
        if (search_end > search_start)
                hole_size = search_end - search_start;

        if (hole_size > max_hole_size) {
                max_hole_start = search_start;
                max_hole_size = hole_size;
        }

        if (contains_pending_extent(trans, device, &search_start, hole_size)) {
                btrfs_release_path(path);
                goto again;
        }

        /* See above. */
        if (hole_size < num_bytes)
                ret = -ENOSPC;
        else
                ret = 0;

out:
        btrfs_free_path(path);
        *start = max_hole_start;
        if (len)
                *len = max_hole_size;
        return ret;
}

static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
                          struct btrfs_device *device,
                          u64 start, u64 *dev_extent_len)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct extent_buffer *leaf = NULL;
        struct btrfs_dev_extent *extent = NULL;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
again:
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = btrfs_previous_item(root, path, key.objectid,
                                          BTRFS_DEV_EXTENT_KEY);
                if (ret)
                        goto out;
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
                BUG_ON(found_key.offset > start || found_key.offset +
                       btrfs_dev_extent_length(leaf, extent) < start);
                key = found_key;
                btrfs_release_path(path);
                goto again;
        } else if (ret == 0) {
                leaf = path->nodes[0];
                extent = btrfs_item_ptr(leaf, path->slots[0],
                                        struct btrfs_dev_extent);
        } else {
                btrfs_error(root->fs_info, ret, "Slot search failed");
                goto out;
        }

        *dev_extent_len = btrfs_dev_extent_length(leaf, extent);

        ret = btrfs_del_item(trans, root, path);
        if (ret) {
                btrfs_error(root->fs_info, ret,
                            "Failed to remove dev extent item");
        } else {
                trans->transaction->have_free_bgs = 1;
        }
out:
        btrfs_free_path(path);
        return ret;
}

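/*
 * Insert a dev extent item recording that the byte range
 * [start, start + num_bytes) on @device backs the chunk at @chunk_offset.
 */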
static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
                                  struct btrfs_device *device,
                                  u64 chunk_tree, u64 chunk_objectid,
                                  u64 chunk_offset, u64 start, u64 num_bytes)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_root *root = device->dev_root;
        struct btrfs_dev_extent *extent;
        struct extent_buffer *leaf;
        struct btrfs_key key;

        WARN_ON(!device->in_fs_metadata);
        WARN_ON(device->is_tgtdev_for_dev_replace);
        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = device->devid;
        key.offset = start;
        key.type = BTRFS_DEV_EXTENT_KEY;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*extent));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        extent = btrfs_item_ptr(leaf, path->slots[0],
                                struct btrfs_dev_extent);
        btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree);
        btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid);
        btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);

        write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
                    btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);

        btrfs_set_dev_extent_length(leaf, extent, num_bytes);
        btrfs_mark_buffer_dirty(leaf);
out:
        btrfs_free_path(path);
        return ret;
}

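/*
 * Return the logical address just past the highest existing chunk
 * mapping, i.e. the first offset where a new chunk can be placed.
 */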
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
        struct extent_map_tree *em_tree;
        struct extent_map *em;
        struct rb_node *n;
        u64 ret = 0;

        em_tree = &fs_info->mapping_tree.map_tree;
        read_lock(&em_tree->lock);
        n = rb_last(&em_tree->map);
        if (n) {
                em = rb_entry(n, struct extent_map, rb_node);
                ret = em->start + em->len;
        }
        read_unlock(&em_tree->lock);

        return ret;
}

static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
                                    u64 *devid_ret)
{
        int ret;
        struct btrfs_key key;
        struct btrfs_key found_key;
        struct btrfs_path *path;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = (u64)-1;

        ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
        if (ret < 0)
                goto error;

        BUG_ON(ret == 0); /* Corruption */

        ret = btrfs_previous_item(fs_info->chunk_root, path,
                                  BTRFS_DEV_ITEMS_OBJECTID,
                                  BTRFS_DEV_ITEM_KEY);
        if (ret) {
                *devid_ret = 1;
        } else {
                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
                                      path->slots[0]);
                *devid_ret = found_key.offset + 1;
        }
        ret = 0;
error:
        btrfs_free_path(path);
        return ret;
}

/*
 * The device information is stored in the chunk root.  The btrfs_device
 * struct should be fully filled in before calling this.
 */
static int btrfs_add_device(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root,
                            struct btrfs_device *device)
{
        int ret;
        struct btrfs_path *path;
        struct btrfs_dev_item *dev_item;
        struct extent_buffer *leaf;
        struct btrfs_key key;
        unsigned long ptr;

        root = root->fs_info->chunk_root;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;

        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      sizeof(*dev_item));
        if (ret)
                goto out;

        leaf = path->nodes[0];
        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);

        btrfs_set_device_id(leaf, dev_item, device->devid);
        btrfs_set_device_generation(leaf, dev_item, 0);
        btrfs_set_device_type(leaf, dev_item, device->type);
        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
        btrfs_set_device_total_bytes(leaf, dev_item,
                                     btrfs_device_get_disk_total_bytes(device));
        btrfs_set_device_bytes_used(leaf, dev_item,
                                    btrfs_device_get_bytes_used(device));
        btrfs_set_device_group(leaf, dev_item, 0);
        btrfs_set_device_seek_speed(leaf, dev_item, 0);
        btrfs_set_device_bandwidth(leaf, dev_item, 0);
        btrfs_set_device_start_offset(leaf, dev_item, 0);

        ptr = btrfs_device_uuid(dev_item);
        write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
        ptr = btrfs_device_fsid(dev_item);
        write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
        btrfs_mark_buffer_dirty(leaf);

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}

1481/*
1482 * Update ctime/mtime for a given device path.
1483 * Mainly used by ctime/mtime-based probes such as libblkid.
1484 */
1485static void update_dev_time(char *path_name)
1486{
1487        struct file *filp;
1488
1489        filp = filp_open(path_name, O_RDWR, 0);
1490        if (IS_ERR(filp))
1491                return;
1492        file_update_time(filp);
1493        filp_close(filp, NULL);
1494        return;
1495}
1496
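/*
 * As the code below shows: delete the DEV_ITEM for @device from the
 * chunk tree, in its own transaction that is committed before returning.
 */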
1497static int btrfs_rm_dev_item(struct btrfs_root *root,
1498                             struct btrfs_device *device)
1499{
1500        int ret;
1501        struct btrfs_path *path;
1502        struct btrfs_key key;
1503        struct btrfs_trans_handle *trans;
1504
1505        root = root->fs_info->chunk_root;
1506
1507        path = btrfs_alloc_path();
1508        if (!path)
1509                return -ENOMEM;
1510
1511        trans = btrfs_start_transaction(root, 0);
1512        if (IS_ERR(trans)) {
1513                btrfs_free_path(path);
1514                return PTR_ERR(trans);
1515        }
1516        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1517        key.type = BTRFS_DEV_ITEM_KEY;
1518        key.offset = device->devid;
1519
1520        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1521        if (ret < 0)
1522                goto out;
1523
1524        if (ret > 0) {
1525                ret = -ENOENT;
1526                goto out;
1527        }
1528
1529        ret = btrfs_del_item(trans, root, path);
1530        if (ret)
1531                goto out;
1532out:
1533        btrfs_free_path(path);
1534        btrfs_commit_transaction(trans, root);
1535        return ret;
1536}
1537
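/*
 * Remove a device from a mounted filesystem.  @device_path may be the
 * literal string "missing", which selects the first device that is
 * recorded in the metadata but has no backing bdev.  The minimum device
 * counts of the RAID profiles in use are checked before the device is
 * shrunk to zero and its items are removed.
 */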
1538int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1539{
1540        struct btrfs_device *device;
1541        struct btrfs_device *next_device;
1542        struct block_device *bdev;
1543        struct buffer_head *bh = NULL;
1544        struct btrfs_super_block *disk_super;
1545        struct btrfs_fs_devices *cur_devices;
1546        u64 all_avail;
1547        u64 devid;
1548        u64 num_devices;
1549        u8 *dev_uuid;
1550        unsigned seq;
1551        int ret = 0;
1552        bool clear_super = false;
1553
1554        mutex_lock(&uuid_mutex);
1555
1556        do {
1557                seq = read_seqbegin(&root->fs_info->profiles_lock);
1558
1559                all_avail = root->fs_info->avail_data_alloc_bits |
1560                            root->fs_info->avail_system_alloc_bits |
1561                            root->fs_info->avail_metadata_alloc_bits;
1562        } while (read_seqretry(&root->fs_info->profiles_lock, seq));
1563
1564        num_devices = root->fs_info->fs_devices->num_devices;
1565        btrfs_dev_replace_lock(&root->fs_info->dev_replace);
1566        if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
1567                WARN_ON(num_devices < 1);
1568                num_devices--;
1569        }
1570        btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
1571
1572        if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
1573                ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
1574                goto out;
1575        }
1576
1577        if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
1578                ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
1579                goto out;
1580        }
1581
1582        if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
1583            root->fs_info->fs_devices->rw_devices <= 2) {
1584                ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
1585                goto out;
1586        }
1587        if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
1588            root->fs_info->fs_devices->rw_devices <= 3) {
1589                ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
1590                goto out;
1591        }
1592
1593        if (strcmp(device_path, "missing") == 0) {
1594                struct list_head *devices;
1595                struct btrfs_device *tmp;
1596
1597                device = NULL;
1598                devices = &root->fs_info->fs_devices->devices;
1599                /*
1600                 * It is safe to read the devices since the volume_mutex
1601                 * is held.
1602                 */
1603                list_for_each_entry(tmp, devices, dev_list) {
1604                        if (tmp->in_fs_metadata &&
1605                            !tmp->is_tgtdev_for_dev_replace &&
1606                            !tmp->bdev) {
1607                                device = tmp;
1608                                break;
1609                        }
1610                }
1611                bdev = NULL;
1612                bh = NULL;
1613                disk_super = NULL;
1614                if (!device) {
1615                        ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
1616                        goto out;
1617                }
1618        } else {
1619                ret = btrfs_get_bdev_and_sb(device_path,
1620                                            FMODE_WRITE | FMODE_EXCL,
1621                                            root->fs_info->bdev_holder, 0,
1622                                            &bdev, &bh);
1623                if (ret)
1624                        goto out;
1625                disk_super = (struct btrfs_super_block *)bh->b_data;
1626                devid = btrfs_stack_device_id(&disk_super->dev_item);
1627                dev_uuid = disk_super->dev_item.uuid;
1628                device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1629                                           disk_super->fsid);
1630                if (!device) {
1631                        ret = -ENOENT;
1632                        goto error_brelse;
1633                }
1634        }
1635
1636        if (device->is_tgtdev_for_dev_replace) {
1637                ret = BTRFS_ERROR_DEV_TGT_REPLACE;
1638                goto error_brelse;
1639        }
1640
1641        if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
1642                ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
1643                goto error_brelse;
1644        }
1645
1646        if (device->writeable) {
1647                lock_chunks(root);
1648                list_del_init(&device->dev_alloc_list);
1649                device->fs_devices->rw_devices--;
1650                unlock_chunks(root);
1651                clear_super = true;
1652        }
1653
1654        mutex_unlock(&uuid_mutex);
1655        ret = btrfs_shrink_device(device, 0);
1656        mutex_lock(&uuid_mutex);
1657        if (ret)
1658                goto error_undo;
1659
1660        /*
1661         * TODO: the superblock still includes this device in its num_devices
1662         * counter although write_all_supers() is not locked out. This
1663         * could leave the filesystem in a state that requires a degraded mount.
1664         */
1665        ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1666        if (ret)
1667                goto error_undo;
1668
1669        device->in_fs_metadata = 0;
1670        btrfs_scrub_cancel_dev(root->fs_info, device);
1671
1672        /*
1673         * the device list mutex makes sure that we don't change
1674         * the device list while someone else is writing out all
1675         * the device supers. Whoever is writing all supers should
1676         * lock the device list mutex before getting the number of
1677         * devices in the super block (super_copy). Conversely,
1678         * whoever updates the number of devices in the super block
1679         * (super_copy) should hold the device list mutex.
1680         */
1681
1682        cur_devices = device->fs_devices;
1683        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1684        list_del_rcu(&device->dev_list);
1685
1686        device->fs_devices->num_devices--;
1687        device->fs_devices->total_devices--;
1688
1689        if (device->missing)
1690                device->fs_devices->missing_devices--;
1691
1692        next_device = list_entry(root->fs_info->fs_devices->devices.next,
1693                                 struct btrfs_device, dev_list);
1694        if (device->bdev == root->fs_info->sb->s_bdev)
1695                root->fs_info->sb->s_bdev = next_device->bdev;
1696        if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1697                root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1698
1699        if (device->bdev) {
1700                device->fs_devices->open_devices--;
1701                /* remove sysfs entry */
1702                btrfs_kobj_rm_device(root->fs_info, device);
1703        }
1704
1705        call_rcu(&device->rcu, free_device);
1706
1707        num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1708        btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1709        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1710
1711        if (cur_devices->open_devices == 0) {
1712                struct btrfs_fs_devices *fs_devices;
1713                fs_devices = root->fs_info->fs_devices;
1714                while (fs_devices) {
1715                        if (fs_devices->seed == cur_devices) {
1716                                fs_devices->seed = cur_devices->seed;
1717                                break;
1718                        }
1719                        fs_devices = fs_devices->seed;
1720                }
1721                cur_devices->seed = NULL;
1722                __btrfs_close_devices(cur_devices);
1723                free_fs_devices(cur_devices);
1724        }
1725
1726        root->fs_info->num_tolerated_disk_barrier_failures =
1727                btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
1728
1729        /*
1730         * at this point, the device is zero sized.  We want to
1731         * remove it from the devices list and zero out the old super
1732         */
1733        if (clear_super && disk_super) {
1734                u64 bytenr;
1735                int i;
1736
1737                /* make sure this device isn't detected as part of
1738                 * the FS anymore
1739                 */
1740                memset(&disk_super->magic, 0, sizeof(disk_super->magic));
1741                set_buffer_dirty(bh);
1742                sync_dirty_buffer(bh);
1743
1744                /* Clear the mirror copies of the super block on the disk
1745                 * being removed. The 0th copy was taken care of above;
1746                 * the loop below handles the rest.
1747                 */
1748                for (i = 1; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1749                        bytenr = btrfs_sb_offset(i);
1750                        if (bytenr + BTRFS_SUPER_INFO_SIZE >=
1751                                        i_size_read(bdev->bd_inode))
1752                                break;
1753
1754                        brelse(bh);
1755                        bh = __bread(bdev, bytenr / 4096,
1756                                        BTRFS_SUPER_INFO_SIZE);
1757                        if (!bh)
1758                                continue;
1759
1760                        disk_super = (struct btrfs_super_block *)bh->b_data;
1761
1762                        if (btrfs_super_bytenr(disk_super) != bytenr ||
1763                                btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1764                                continue;
1765                        }
1766                        memset(&disk_super->magic, 0,
1767                                                sizeof(disk_super->magic));
1768                        set_buffer_dirty(bh);
1769                        sync_dirty_buffer(bh);
1770                }
1771        }
1772
1773        ret = 0;
1774
1775        if (bdev) {
1776                /* Notify udev that device has changed */
1777                btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
1778
1779                /* Update ctime/mtime for device path for libblkid */
1780                update_dev_time(device_path);
1781        }
1782
1783error_brelse:
1784        brelse(bh);
1785        if (bdev)
1786                blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1787out:
1788        mutex_unlock(&uuid_mutex);
1789        return ret;
1790error_undo:
1791        if (device->writeable) {
1792                lock_chunks(root);
1793                list_add(&device->dev_alloc_list,
1794                         &root->fs_info->fs_devices->alloc_list);
1795                device->fs_devices->rw_devices++;
1796                unlock_chunks(root);
1797        }
1798        goto error_brelse;
1799}
1800
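/*
 * Unlink the replace source device from its fs_devices and adjust the
 * device counters.  Expects device_list_mutex to be held (see the
 * WARN_ON below).
 */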
1801void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_fs_info *fs_info,
1802                                        struct btrfs_device *srcdev)
1803{
1804        struct btrfs_fs_devices *fs_devices;
1805
1806        WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
1807
1808        /*
1809         * For a filesystem with no seed, srcdev->fs_devices points to the
1810         * fs_devices of fs_info. However, when the device being replaced is
1811         * a seed device, it points to the seed's local fs_devices. In short,
1812         * srcdev has its correct fs_devices in both cases.
1813         */
1814        fs_devices = srcdev->fs_devices;
1815
1816        list_del_rcu(&srcdev->dev_list);
1817        list_del_rcu(&srcdev->dev_alloc_list);
1818        fs_devices->num_devices--;
1819        if (srcdev->missing)
1820                fs_devices->missing_devices--;
1821
1822        if (srcdev->writeable) {
1823                fs_devices->rw_devices--;
1824                /* zero out the old super if it is writable */
1825                btrfs_scratch_superblock(srcdev);
1826        }
1827
1828        if (srcdev->bdev)
1829                fs_devices->open_devices--;
1830}
1831
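/*
 * Free the replace source device via RCU and, if it was the last
 * device of a seed fs_devices, tear down that fs_devices as well.
 */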
1832void btrfs_rm_dev_replace_free_srcdev(struct btrfs_fs_info *fs_info,
1833                                      struct btrfs_device *srcdev)
1834{
1835        struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
1836
1837        call_rcu(&srcdev->rcu, free_device);
1838
1839        /*
1840         * Unless fs_devices is a seed filesystem, num_devices
1841         * shouldn't go to zero.
1842         */
1843        BUG_ON(!fs_devices->num_devices && !fs_devices->seeding);
1844
1845        /* If there are no devices left, delete the fs_devices. */
1846        if (!fs_devices->num_devices) {
1847                struct btrfs_fs_devices *tmp_fs_devices;
1848
1849                tmp_fs_devices = fs_info->fs_devices;
1850                while (tmp_fs_devices) {
1851                        if (tmp_fs_devices->seed == fs_devices) {
1852                                tmp_fs_devices->seed = fs_devices->seed;
1853                                break;
1854                        }
1855                        tmp_fs_devices = tmp_fs_devices->seed;
1856                }
1857                fs_devices->seed = NULL;
1858                __btrfs_close_devices(fs_devices);
1859                free_fs_devices(fs_devices);
1860        }
1861}
1862
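/*
 * Remove and free the replace target device, scratching its superblock
 * and repointing sb->s_bdev and latest_bdev if they referenced it.
 */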
1863void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
1864                                      struct btrfs_device *tgtdev)
1865{
1866        struct btrfs_device *next_device;
1867
1868        mutex_lock(&uuid_mutex);
1869        WARN_ON(!tgtdev);
1870        mutex_lock(&fs_info->fs_devices->device_list_mutex);
1871        if (tgtdev->bdev) {
1872                btrfs_scratch_superblock(tgtdev);
1873                fs_info->fs_devices->open_devices--;
1874        }
1875        fs_info->fs_devices->num_devices--;
1876
1877        next_device = list_entry(fs_info->fs_devices->devices.next,
1878                                 struct btrfs_device, dev_list);
1879        if (tgtdev->bdev == fs_info->sb->s_bdev)
1880                fs_info->sb->s_bdev = next_device->bdev;
1881        if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
1882                fs_info->fs_devices->latest_bdev = next_device->bdev;
1883        list_del_rcu(&tgtdev->dev_list);
1884
1885        call_rcu(&tgtdev->rcu, free_device);
1886
1887        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1888        mutex_unlock(&uuid_mutex);
1889}
1890
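/*
 * Read the superblock at @device_path and look up the corresponding
 * btrfs_device by devid, device uuid and fsid.
 */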
1891static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
1892                                     struct btrfs_device **device)
1893{
1894        int ret = 0;
1895        struct btrfs_super_block *disk_super;
1896        u64 devid;
1897        u8 *dev_uuid;
1898        struct block_device *bdev;
1899        struct buffer_head *bh;
1900
1901        *device = NULL;
1902        ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
1903                                    root->fs_info->bdev_holder, 0, &bdev, &bh);
1904        if (ret)
1905                return ret;
1906        disk_super = (struct btrfs_super_block *)bh->b_data;
1907        devid = btrfs_stack_device_id(&disk_super->dev_item);
1908        dev_uuid = disk_super->dev_item.uuid;
1909        *device = btrfs_find_device(root->fs_info, devid, dev_uuid,
1910                                    disk_super->fsid);
1911        brelse(bh);
1912        if (!*device)
1913                ret = -ENOENT;
1914        blkdev_put(bdev, FMODE_READ);
1915        return ret;
1916}
1917
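/*
 * Like btrfs_find_device_by_path(), except that the string "missing"
 * selects the first device present in the metadata that has no
 * backing bdev.
 */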
1918int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
1919                                         char *device_path,
1920                                         struct btrfs_device **device)
1921{
1922        *device = NULL;
1923        if (strcmp(device_path, "missing") == 0) {
1924                struct list_head *devices;
1925                struct btrfs_device *tmp;
1926
1927                devices = &root->fs_info->fs_devices->devices;
1928                /*
1929                 * It is safe to read the devices since the volume_mutex
1930                 * is held by the caller.
1931                 */
1932                list_for_each_entry(tmp, devices, dev_list) {
1933                        if (tmp->in_fs_metadata && !tmp->bdev) {
1934                                *device = tmp;
1935                                break;
1936                        }
1937                }
1938
1939                if (!*device) {
1940                        btrfs_err(root->fs_info, "no missing device found");
1941                        return -ENOENT;
1942                }
1943
1944                return 0;
1945        } else {
1946                return btrfs_find_device_by_path(root, device_path, device);
1947        }
1948}
1949
1950/*
1951 * Does all the dirty work required for changing the filesystem's UUID.
1952 */
1953static int btrfs_prepare_sprout(struct btrfs_root *root)
1954{
1955        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1956        struct btrfs_fs_devices *old_devices;
1957        struct btrfs_fs_devices *seed_devices;
1958        struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1959        struct btrfs_device *device;
1960        u64 super_flags;
1961
1962        BUG_ON(!mutex_is_locked(&uuid_mutex));
1963        if (!fs_devices->seeding)
1964                return -EINVAL;
1965
1966        seed_devices = __alloc_fs_devices();
1967        if (IS_ERR(seed_devices))
1968                return PTR_ERR(seed_devices);
1969
1970        old_devices = clone_fs_devices(fs_devices);
1971        if (IS_ERR(old_devices)) {
1972                kfree(seed_devices);
1973                return PTR_ERR(old_devices);
1974        }
1975
1976        list_add(&old_devices->list, &fs_uuids);
1977
1978        memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
1979        seed_devices->opened = 1;
1980        INIT_LIST_HEAD(&seed_devices->devices);
1981        INIT_LIST_HEAD(&seed_devices->alloc_list);
1982        mutex_init(&seed_devices->device_list_mutex);
1983
1984        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1985        list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1986                              synchronize_rcu);
1987        list_for_each_entry(device, &seed_devices->devices, dev_list)
1988                device->fs_devices = seed_devices;
1989
1990        lock_chunks(root);
1991        list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1992        unlock_chunks(root);
1993
1994        fs_devices->seeding = 0;
1995        fs_devices->num_devices = 0;
1996        fs_devices->open_devices = 0;
1997        fs_devices->missing_devices = 0;
1998        fs_devices->rotating = 0;
1999        fs_devices->seed = seed_devices;
2000
2001        generate_random_uuid(fs_devices->fsid);
2002        memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2003        memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2004        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2005
2006        super_flags = btrfs_super_flags(disk_super) &
2007                      ~BTRFS_SUPER_FLAG_SEEDING;
2008        btrfs_set_super_flags(disk_super, super_flags);
2009
2010        return 0;
2011}
2012
2013/*
2014 * Store the expected generation for seed devices in device items.
2015 */
2016static int btrfs_finish_sprout(struct btrfs_trans_handle *trans,
2017                               struct btrfs_root *root)
2018{
2019        struct btrfs_path *path;
2020        struct extent_buffer *leaf;
2021        struct btrfs_dev_item *dev_item;
2022        struct btrfs_device *device;
2023        struct btrfs_key key;
2024        u8 fs_uuid[BTRFS_UUID_SIZE];
2025        u8 dev_uuid[BTRFS_UUID_SIZE];
2026        u64 devid;
2027        int ret;
2028
2029        path = btrfs_alloc_path();
2030        if (!path)
2031                return -ENOMEM;
2032
2033        root = root->fs_info->chunk_root;
2034        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2035        key.offset = 0;
2036        key.type = BTRFS_DEV_ITEM_KEY;
2037
2038        while (1) {
2039                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2040                if (ret < 0)
2041                        goto error;
2042
2043                leaf = path->nodes[0];
2044next_slot:
2045                if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2046                        ret = btrfs_next_leaf(root, path);
2047                        if (ret > 0)
2048                                break;
2049                        if (ret < 0)
2050                                goto error;
2051                        leaf = path->nodes[0];
2052                        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2053                        btrfs_release_path(path);
2054                        continue;
2055                }
2056
2057                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2058                if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2059                    key.type != BTRFS_DEV_ITEM_KEY)
2060                        break;
2061
2062                dev_item = btrfs_item_ptr(leaf, path->slots[0],
2063                                          struct btrfs_dev_item);
2064                devid = btrfs_device_id(leaf, dev_item);
2065                read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2066                                   BTRFS_UUID_SIZE);
2067                read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2068                                   BTRFS_UUID_SIZE);
2069                device = btrfs_find_device(root->fs_info, devid, dev_uuid,
2070                                           fs_uuid);
2071                BUG_ON(!device); /* Logic error */
2072
2073                if (device->fs_devices->seeding) {
2074                        btrfs_set_device_generation(leaf, dev_item,
2075                                                    device->generation);
2076                        btrfs_mark_buffer_dirty(leaf);
2077                }
2078
2079                path->slots[0]++;
2080                goto next_slot;
2081        }
2082        ret = 0;
2083error:
2084        btrfs_free_path(path);
2085        return ret;
2086}
2087
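/*
 * Add a new writable device at @device_path to a mounted filesystem.
 * If the filesystem is a seed filesystem, it is sprouted first: the
 * existing devices become the seed and the filesystem continues under
 * a freshly generated fsid.
 */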
2088int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
2089{
2090        struct request_queue *q;
2091        struct btrfs_trans_handle *trans;
2092        struct btrfs_device *device;
2093        struct block_device *bdev;
2094        struct list_head *devices;
2095        struct super_block *sb = root->fs_info->sb;
2096        struct rcu_string *name;
2097        u64 tmp;
2098        int seeding_dev = 0;
2099        int ret = 0;
2100
2101        if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
2102                return -EROFS;
2103
2104        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2105                                  root->fs_info->bdev_holder);
2106        if (IS_ERR(bdev))
2107                return PTR_ERR(bdev);
2108
2109        if (root->fs_info->fs_devices->seeding) {
2110                seeding_dev = 1;
2111                down_write(&sb->s_umount);
2112                mutex_lock(&uuid_mutex);
2113        }
2114
2115        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2116
2117        devices = &root->fs_info->fs_devices->devices;
2118
2119        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2120        list_for_each_entry(device, devices, dev_list) {
2121                if (device->bdev == bdev) {
2122                        ret = -EEXIST;
2123                        mutex_unlock(
2124                                &root->fs_info->fs_devices->device_list_mutex);
2125                        goto error;
2126                }
2127        }
2128        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2129
2130        device = btrfs_alloc_device(root->fs_info, NULL, NULL);
2131        if (IS_ERR(device)) {
2132                /* we can safely leave the fs_devices entry around */
2133                ret = PTR_ERR(device);
2134                goto error;
2135        }
2136
2137        name = rcu_string_strdup(device_path, GFP_NOFS);
2138        if (!name) {
2139                kfree(device);
2140                ret = -ENOMEM;
2141                goto error;
2142        }
2143        rcu_assign_pointer(device->name, name);
2144
2145        trans = btrfs_start_transaction(root, 0);
2146        if (IS_ERR(trans)) {
2147                rcu_string_free(device->name);
2148                kfree(device);
2149                ret = PTR_ERR(trans);
2150                goto error;
2151        }
2152
2153        q = bdev_get_queue(bdev);
2154        if (blk_queue_discard(q))
2155                device->can_discard = 1;
2156        device->writeable = 1;
2157        device->generation = trans->transid;
2158        device->io_width = root->sectorsize;
2159        device->io_align = root->sectorsize;
2160        device->sector_size = root->sectorsize;
2161        device->total_bytes = i_size_read(bdev->bd_inode);
2162        device->disk_total_bytes = device->total_bytes;
2163        device->commit_total_bytes = device->total_bytes;
2164        device->dev_root = root->fs_info->dev_root;
2165        device->bdev = bdev;
2166        device->in_fs_metadata = 1;
2167        device->is_tgtdev_for_dev_replace = 0;
2168        device->mode = FMODE_EXCL;
2169        device->dev_stats_valid = 1;
2170        set_blocksize(device->bdev, 4096);
2171
2172        if (seeding_dev) {
2173                sb->s_flags &= ~MS_RDONLY;
2174                ret = btrfs_prepare_sprout(root);
2175                BUG_ON(ret); /* -ENOMEM */
2176        }
2177
2178        device->fs_devices = root->fs_info->fs_devices;
2179
2180        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2181        lock_chunks(root);
2182        list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
2183        list_add(&device->dev_alloc_list,
2184                 &root->fs_info->fs_devices->alloc_list);
2185        root->fs_info->fs_devices->num_devices++;
2186        root->fs_info->fs_devices->open_devices++;
2187        root->fs_info->fs_devices->rw_devices++;
2188        root->fs_info->fs_devices->total_devices++;
2189        root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
2190
2191        spin_lock(&root->fs_info->free_chunk_lock);
2192        root->fs_info->free_chunk_space += device->total_bytes;
2193        spin_unlock(&root->fs_info->free_chunk_lock);
2194
2195        if (!blk_queue_nonrot(bdev_get_queue(bdev)))
2196                root->fs_info->fs_devices->rotating = 1;
2197
2198        tmp = btrfs_super_total_bytes(root->fs_info->super_copy);
2199        btrfs_set_super_total_bytes(root->fs_info->super_copy,
2200                                    tmp + device->total_bytes);
2201
2202        tmp = btrfs_super_num_devices(root->fs_info->super_copy);
2203        btrfs_set_super_num_devices(root->fs_info->super_copy,
2204                                    tmp + 1);
2205
2206        /* add sysfs device entry */
2207        btrfs_kobj_add_device(root->fs_info, device);
2208
2209        /*
2210         * we've got more storage, clear any full flags on the space
2211         * infos
2212         */
2213        btrfs_clear_space_info_full(root->fs_info);
2214
2215        unlock_chunks(root);
2216        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2217
2218        if (seeding_dev) {
2219                lock_chunks(root);
2220                ret = init_first_rw_device(trans, root, device);
2221                unlock_chunks(root);
2222                if (ret) {
2223                        btrfs_abort_transaction(trans, root, ret);
2224                        goto error_trans;
2225                }
2226        }
2227
2228        ret = btrfs_add_device(trans, root, device);
2229        if (ret) {
2230                btrfs_abort_transaction(trans, root, ret);
2231                goto error_trans;
2232        }
2233
2234        if (seeding_dev) {
2235                char fsid_buf[BTRFS_UUID_UNPARSED_SIZE];
2236
2237                ret = btrfs_finish_sprout(trans, root);
2238                if (ret) {
2239                        btrfs_abort_transaction(trans, root, ret);
2240                        goto error_trans;
2241                }
2242
2243                /* Sprouting changes the fsid of the mounted root,
2244                 * so rename the fsid in sysfs as well.
2245                 */
2246                snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU",
2247                                                root->fs_info->fsid);
2248                if (kobject_rename(&root->fs_info->super_kobj, fsid_buf))
2249                        goto error_trans;
2250        }
2251
2252        root->fs_info->num_tolerated_disk_barrier_failures =
2253                btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
2254        ret = btrfs_commit_transaction(trans, root);
2255
2256        if (seeding_dev) {
2257                mutex_unlock(&uuid_mutex);
2258                up_write(&sb->s_umount);
2259
2260                if (ret) /* transaction commit */
2261                        return ret;
2262
2263                ret = btrfs_relocate_sys_chunks(root);
2264                if (ret < 0)
2265                        btrfs_error(root->fs_info, ret,
2266                                    "Failed to relocate sys chunks after "
2267                                    "device initialization. This can be fixed "
2268                                    "using the \"btrfs balance\" command.");
2269                trans = btrfs_attach_transaction(root);
2270                if (IS_ERR(trans)) {
2271                        if (PTR_ERR(trans) == -ENOENT)
2272                                return 0;
2273                        return PTR_ERR(trans);
2274                }
2275                ret = btrfs_commit_transaction(trans, root);
2276        }
2277
2278        /* Update ctime/mtime for libblkid */
2279        update_dev_time(device_path);
2280        return ret;
2281
2282error_trans:
2283        btrfs_end_transaction(trans, root);
2284        rcu_string_free(device->name);
2285        btrfs_kobj_rm_device(root->fs_info, device);
2286        kfree(device);
2287error:
2288        blkdev_put(bdev, FMODE_EXCL);
2289        if (seeding_dev) {
2290                mutex_unlock(&uuid_mutex);
2291                up_write(&sb->s_umount);
2292        }
2293        return ret;
2294}
2295
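/*
 * Open and initialize the device replace target.  The target must not
 * already belong to the filesystem and must be at least as large as
 * the source device.
 */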
2296int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
2297                                  struct btrfs_device *srcdev,
2298                                  struct btrfs_device **device_out)
2299{
2300        struct request_queue *q;
2301        struct btrfs_device *device;
2302        struct block_device *bdev;
2303        struct btrfs_fs_info *fs_info = root->fs_info;
2304        struct list_head *devices;
2305        struct rcu_string *name;
2306        u64 devid = BTRFS_DEV_REPLACE_DEVID;
2307        int ret = 0;
2308
2309        *device_out = NULL;
2310        if (fs_info->fs_devices->seeding) {
2311                btrfs_err(fs_info, "the filesystem is a seed filesystem!");
2312                return -EINVAL;
2313        }
2314
2315        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2316                                  fs_info->bdev_holder);
2317        if (IS_ERR(bdev)) {
2318                btrfs_err(fs_info, "target device %s is invalid!", device_path);
2319                return PTR_ERR(bdev);
2320        }
2321
2322        filemap_write_and_wait(bdev->bd_inode->i_mapping);
2323
2324        devices = &fs_info->fs_devices->devices;
2325        list_for_each_entry(device, devices, dev_list) {
2326                if (device->bdev == bdev) {
2327                        btrfs_err(fs_info, "target device is in the filesystem!");
2328                        ret = -EEXIST;
2329                        goto error;
2330                }
2331        }
2332
2334        if (i_size_read(bdev->bd_inode) <
2335            btrfs_device_get_total_bytes(srcdev)) {
2336                btrfs_err(fs_info, "target device is smaller than source device!");
2337                ret = -EINVAL;
2338                goto error;
2339        }
2340
2342        device = btrfs_alloc_device(NULL, &devid, NULL);
2343        if (IS_ERR(device)) {
2344                ret = PTR_ERR(device);
2345                goto error;
2346        }
2347
2348        name = rcu_string_strdup(device_path, GFP_NOFS);
2349        if (!name) {
2350                kfree(device);
2351                ret = -ENOMEM;
2352                goto error;
2353        }
2354        rcu_assign_pointer(device->name, name);
2355
2356        q = bdev_get_queue(bdev);
2357        if (blk_queue_discard(q))
2358                device->can_discard = 1;
2359        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2360        device->writeable = 1;
2361        device->generation = 0;
2362        device->io_width = root->sectorsize;
2363        device->io_align = root->sectorsize;
2364        device->sector_size = root->sectorsize;
2365        device->total_bytes = btrfs_device_get_total_bytes(srcdev);
2366        device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
2367        device->bytes_used = btrfs_device_get_bytes_used(srcdev);
2368        ASSERT(list_empty(&srcdev->resized_list));
2369        device->commit_total_bytes = srcdev->commit_total_bytes;
2370        device->commit_bytes_used = device->bytes_used;
2371        device->dev_root = fs_info->dev_root;
2372        device->bdev = bdev;
2373        device->in_fs_metadata = 1;
2374        device->is_tgtdev_for_dev_replace = 1;
2375        device->mode = FMODE_EXCL;
2376        device->dev_stats_valid = 1;
2377        set_blocksize(device->bdev, 4096);
2378        device->fs_devices = fs_info->fs_devices;
2379        list_add(&device->dev_list, &fs_info->fs_devices->devices);
2380        fs_info->fs_devices->num_devices++;
2381        fs_info->fs_devices->open_devices++;
2382        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
2383
2384        *device_out = device;
2385        return ret;
2386
2387error:
2388        blkdev_put(bdev, FMODE_EXCL);
2389        return ret;
2390}
2391
2392void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
2393                                              struct btrfs_device *tgtdev)
2394{
2395        WARN_ON(fs_info->fs_devices->rw_devices == 0);
2396        tgtdev->io_width = fs_info->dev_root->sectorsize;
2397        tgtdev->io_align = fs_info->dev_root->sectorsize;
2398        tgtdev->sector_size = fs_info->dev_root->sectorsize;
2399        tgtdev->dev_root = fs_info->dev_root;
2400        tgtdev->in_fs_metadata = 1;
2401}
2402
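/*
 * Write the in-memory state of @device back to its DEV_ITEM in the
 * chunk tree.
 */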
2403static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2404                                        struct btrfs_device *device)
2405{
2406        int ret;
2407        struct btrfs_path *path;
2408        struct btrfs_root *root;
2409        struct btrfs_dev_item *dev_item;
2410        struct extent_buffer *leaf;
2411        struct btrfs_key key;
2412
2413        root = device->dev_root->fs_info->chunk_root;
2414
2415        path = btrfs_alloc_path();
2416        if (!path)
2417                return -ENOMEM;
2418
2419        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2420        key.type = BTRFS_DEV_ITEM_KEY;
2421        key.offset = device->devid;
2422
2423        ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2424        if (ret < 0)
2425                goto out;
2426
2427        if (ret > 0) {
2428                ret = -ENOENT;
2429                goto out;
2430        }
2431
2432        leaf = path->nodes[0];
2433        dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2434
2435        btrfs_set_device_id(leaf, dev_item, device->devid);
2436        btrfs_set_device_type(leaf, dev_item, device->type);
2437        btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2438        btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2439        btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2440        btrfs_set_device_total_bytes(leaf, dev_item,
2441                                     btrfs_device_get_disk_total_bytes(device));
2442        btrfs_set_device_bytes_used(leaf, dev_item,
2443                                    btrfs_device_get_bytes_used(device));
2444        btrfs_mark_buffer_dirty(leaf);
2445
2446out:
2447        btrfs_free_path(path);
2448        return ret;
2449}
2450
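/*
 * Grow @device to @new_size: update the superblock total bytes, adjust
 * total_rw_bytes, queue the device on the resized list and rewrite its
 * device item.
 */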
2451int btrfs_grow_device(struct btrfs_trans_handle *trans,
2452                      struct btrfs_device *device, u64 new_size)
2453{
2454        struct btrfs_super_block *super_copy =
2455                device->dev_root->fs_info->super_copy;
2456        struct btrfs_fs_devices *fs_devices;
2457        u64 old_total;
2458        u64 diff;
2459
2460        if (!device->writeable)
2461                return -EACCES;
2462
2463        lock_chunks(device->dev_root);
2464        old_total = btrfs_super_total_bytes(super_copy);
2465        diff = new_size - device->total_bytes;
2466
2467        if (new_size <= device->total_bytes ||
2468            device->is_tgtdev_for_dev_replace) {
2469                unlock_chunks(device->dev_root);
2470                return -EINVAL;
2471        }
2472
2473        fs_devices = device->dev_root->fs_info->fs_devices;
2474
2475        btrfs_set_super_total_bytes(super_copy, old_total + diff);
2476        device->fs_devices->total_rw_bytes += diff;
2477
2478        btrfs_device_set_total_bytes(device, new_size);
2479        btrfs_device_set_disk_total_bytes(device, new_size);
2480        btrfs_clear_space_info_full(device->dev_root->fs_info);
2481        if (list_empty(&device->resized_list))
2482                list_add_tail(&device->resized_list,
2483                              &fs_devices->resized_devices);
2484        unlock_chunks(device->dev_root);
2485
2486        return btrfs_update_device(trans, device);
2487}
2488
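/*
 * Delete the CHUNK_ITEM at (@chunk_objectid, @chunk_offset) from the
 * chunk tree.
 */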
2489static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
2490                            struct btrfs_root *root,
2491                            u64 chunk_tree, u64 chunk_objectid,
2492                            u64 chunk_offset)
2493{
2494        int ret;
2495        struct btrfs_path *path;
2496        struct btrfs_key key;
2497
2498        root = root->fs_info->chunk_root;
2499        path = btrfs_alloc_path();
2500        if (!path)
2501                return -ENOMEM;
2502
2503        key.objectid = chunk_objectid;
2504        key.offset = chunk_offset;
2505        key.type = BTRFS_CHUNK_ITEM_KEY;
2506
2507        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2508        if (ret < 0)
2509                goto out;
2510        else if (ret > 0) { /* Logic error or corruption */
2511                btrfs_error(root->fs_info, -ENOENT,
2512                            "Failed lookup while freeing chunk.");
2513                ret = -ENOENT;
2514                goto out;
2515        }
2516
2517        ret = btrfs_del_item(trans, root, path);
2518        if (ret < 0)
2519                btrfs_error(root->fs_info, ret,
2520                            "Failed to delete chunk item.");
2521out:
2522        btrfs_free_path(path);
2523        return ret;
2524}
2525
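/*
 * Remove a system chunk's copy from the sys_chunk_array in the
 * in-memory superblock.  The array is a packed sequence of
 * (btrfs_disk_key, btrfs_chunk + stripes) pairs, so the tail is
 * memmove()d up to fill the hole and the array size is shrunk.
 */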
2526static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
2527                        chunk_offset)
2528{
2529        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2530        struct btrfs_disk_key *disk_key;
2531        struct btrfs_chunk *chunk;
2532        u8 *ptr;
2533        int ret = 0;
2534        u32 num_stripes;
2535        u32 array_size;
2536        u32 len = 0;
2537        u32 cur;
2538        struct btrfs_key key;
2539
2540        lock_chunks(root);
2541        array_size = btrfs_super_sys_array_size(super_copy);
2542
2543        ptr = super_copy->sys_chunk_array;
2544        cur = 0;
2545
2546        while (cur < array_size) {
2547                disk_key = (struct btrfs_disk_key *)ptr;
2548                btrfs_disk_key_to_cpu(&key, disk_key);
2549
2550                len = sizeof(*disk_key);
2551
2552                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2553                        chunk = (struct btrfs_chunk *)(ptr + len);
2554                        num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2555                        len += btrfs_chunk_item_size(num_stripes);
2556                } else {
2557                        ret = -EIO;
2558                        break;
2559                }
2560                if (key.objectid == chunk_objectid &&
2561                    key.offset == chunk_offset) {
2562                        memmove(ptr, ptr + len, array_size - (cur + len));
2563                        array_size -= len;
2564                        btrfs_set_super_sys_array_size(super_copy, array_size);
2565                } else {
2566                        ptr += len;
2567                        cur += len;
2568                }
2569        }
2570        unlock_chunks(root);
2571        return ret;
2572}
2573
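/*
 * Remove a chunk: free the device extents of every stripe, delete the
 * chunk item (plus the sys_chunk_array copy for system chunks) and
 * remove the backing block group.
 */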
2574int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
2575                       struct btrfs_root *root, u64 chunk_offset)
2576{
2577        struct extent_map_tree *em_tree;
2578        struct extent_map *em;
2579        struct btrfs_root *extent_root = root->fs_info->extent_root;
2580        struct map_lookup *map;
2581        u64 dev_extent_len = 0;
2582        u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2583        u64 chunk_tree = root->fs_info->chunk_root->objectid;
2584        int i, ret = 0;
2585
2586        /* Just in case */
2587        root = root->fs_info->chunk_root;
2588        em_tree = &root->fs_info->mapping_tree.map_tree;
2589
2590        read_lock(&em_tree->lock);
2591        em = lookup_extent_mapping(em_tree, chunk_offset, 1);
2592        read_unlock(&em_tree->lock);
2593
2594        if (!em || em->start > chunk_offset ||
2595            em->start + em->len < chunk_offset) {
2596                /*
2597                 * This is a logic error, but we don't want to just rely on the
2598                 * user having built with ASSERT enabled, so if ASSERT doesn't
2599                 * do anything we still error out.
2600                 */
2601                ASSERT(0);
2602                if (em)
2603                        free_extent_map(em);
2604                return -EINVAL;
2605        }
2606        map = (struct map_lookup *)em->bdev;
2607
2608        for (i = 0; i < map->num_stripes; i++) {
2609                struct btrfs_device *device = map->stripes[i].dev;
2610                ret = btrfs_free_dev_extent(trans, device,
2611                                            map->stripes[i].physical,
2612                                            &dev_extent_len);
2613                if (ret) {
2614                        btrfs_abort_transaction(trans, root, ret);
2615                        goto out;
2616                }
2617
2618                if (device->bytes_used > 0) {
2619                        lock_chunks(root);
2620                        btrfs_device_set_bytes_used(device,
2621                                        device->bytes_used - dev_extent_len);
2622                        spin_lock(&root->fs_info->free_chunk_lock);
2623                        root->fs_info->free_chunk_space += dev_extent_len;
2624                        spin_unlock(&root->fs_info->free_chunk_lock);
2625                        btrfs_clear_space_info_full(root->fs_info);
2626                        unlock_chunks(root);
2627                }
2628
2629                if (map->stripes[i].dev) {
2630                        ret = btrfs_update_device(trans, map->stripes[i].dev);
2631                        if (ret) {
2632                                btrfs_abort_transaction(trans, root, ret);
2633                                goto out;
2634                        }
2635                }
2636        }
2637        ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid,
2638                               chunk_offset);
2639        if (ret) {
2640                btrfs_abort_transaction(trans, root, ret);
2641                goto out;
2642        }
2643
2644        trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
2645
2646        if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
2647                ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
2648                if (ret) {
2649                        btrfs_abort_transaction(trans, root, ret);
2650                        goto out;
2651                }
2652        }
2653
2654        ret = btrfs_remove_block_group(trans, extent_root, chunk_offset, em);
2655        if (ret) {
2656                btrfs_abort_transaction(trans, extent_root, ret);
2657                goto out;
2658        }
2659
2660out:
2661        /* once for us */
2662        free_extent_map(em);
2663        return ret;
2664}
2665
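/*
 * Relocate all extents out of the chunk at @chunk_offset, then delete
 * the now-empty chunk.
 */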
2666static int btrfs_relocate_chunk(struct btrfs_root *root,
2667                         u64 chunk_tree, u64 chunk_objectid,
2668                         u64 chunk_offset)
2669{
2670        struct btrfs_root *extent_root;
2671        struct btrfs_trans_handle *trans;
2672        int ret;
2673
2674        root = root->fs_info->chunk_root;
2675        extent_root = root->fs_info->extent_root;
2676
2677        ret = btrfs_can_relocate(extent_root, chunk_offset);
2678        if (ret)
2679                return -ENOSPC;
2680
2681        /* step one, relocate all the extents inside this chunk */
2682        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
2683        if (ret)
2684                return ret;
2685
2686        trans = btrfs_start_transaction(root, 0);
2687        if (IS_ERR(trans)) {
2688                ret = PTR_ERR(trans);
2689                btrfs_std_error(root->fs_info, ret);
2690                return ret;
2691        }
2692
2693        /*
2694         * step two, delete the device extents and the
2695         * chunk tree entries
2696         */
2697        ret = btrfs_remove_chunk(trans, root, chunk_offset);
2698        btrfs_end_transaction(trans, root);
2699        return ret;
2700}
2701
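/*
 * Walk the chunk tree backwards and relocate every system chunk.
 * Chunks that fail with -ENOSPC are retried once after the others
 * have been moved.
 */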
2702static int btrfs_relocate_sys_chunks(struct btrfs_root *root)
2703{
2704        struct btrfs_root *chunk_root = root->fs_info->chunk_root;
2705        struct btrfs_path *path;
2706        struct extent_buffer *leaf;
2707        struct btrfs_chunk *chunk;
2708        struct btrfs_key key;
2709        struct btrfs_key found_key;
2710        u64 chunk_tree = chunk_root->root_key.objectid;
2711        u64 chunk_type;
2712        bool retried = false;
2713        int failed = 0;
2714        int ret;
2715
2716        path = btrfs_alloc_path();
2717        if (!path)
2718                return -ENOMEM;
2719
2720again:
2721        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2722        key.offset = (u64)-1;
2723        key.type = BTRFS_CHUNK_ITEM_KEY;
2724
2725        while (1) {
2726                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
2727                if (ret < 0)
2728                        goto error;
2729                BUG_ON(ret == 0); /* Corruption */
2730
2731                ret = btrfs_previous_item(chunk_root, path, key.objectid,
2732                                          key.type);
2733                if (ret < 0)
2734                        goto error;
2735                if (ret > 0)
2736                        break;
2737
2738                leaf = path->nodes[0];
2739                btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2740
2741                chunk = btrfs_item_ptr(leaf, path->slots[0],
2742                                       struct btrfs_chunk);
2743                chunk_type = btrfs_chunk_type(leaf, chunk);
2744                btrfs_release_path(path);
2745
2746                if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
2747                        ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
2748                                                   found_key.objectid,
2749                                                   found_key.offset);
2750                        if (ret == -ENOSPC)
2751                                failed++;
2752                        else
2753                                BUG_ON(ret);
2754                }
2755
2756                if (found_key.offset == 0)
2757                        break;
2758                key.offset = found_key.offset - 1;
2759        }
2760        ret = 0;
2761        if (failed && !retried) {
2762                failed = 0;
2763                retried = true;
2764                goto again;
2765        } else if (WARN_ON(failed && retried)) {
2766                ret = -ENOSPC;
2767        }
2768error:
2769        btrfs_free_path(path);
2770        return ret;
2771}
2772
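/*
 * Persist the balance parameters in the BALANCE_ITEM so that an
 * interrupted balance can be resumed later.
 */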
2773static int insert_balance_item(struct btrfs_root *root,
2774                               struct btrfs_balance_control *bctl)
2775{
2776        struct btrfs_trans_handle *trans;
2777        struct btrfs_balance_item *item;
2778        struct btrfs_disk_balance_args disk_bargs;
2779        struct btrfs_path *path;
2780        struct extent_buffer *leaf;
2781        struct btrfs_key key;
2782        int ret, err;
2783
2784        path = btrfs_alloc_path();
2785        if (!path)
2786                return -ENOMEM;
2787
2788        trans = btrfs_start_transaction(root, 0);
2789        if (IS_ERR(trans)) {
2790                btrfs_free_path(path);
2791                return PTR_ERR(trans);
2792        }
2793
2794        key.objectid = BTRFS_BALANCE_OBJECTID;
2795        key.type = BTRFS_BALANCE_ITEM_KEY;
2796        key.offset = 0;
2797
2798        ret = btrfs_insert_empty_item(trans, root, path, &key,
2799                                      sizeof(*item));
2800        if (ret)
2801                goto out;
2802
2803        leaf = path->nodes[0];
2804        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
2805
2806        memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
2807
2808        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
2809        btrfs_set_balance_data(leaf, item, &disk_bargs);
2810        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
2811        btrfs_set_balance_meta(leaf, item, &disk_bargs);
2812        btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
2813        btrfs_set_balance_sys(leaf, item, &disk_bargs);
2814
2815        btrfs_set_balance_flags(leaf, item, bctl->flags);
2816
2817        btrfs_mark_buffer_dirty(leaf);
2818out:
2819        btrfs_free_path(path);
2820        err = btrfs_commit_transaction(trans, root);
2821        if (err && !ret)
2822                ret = err;
2823        return ret;
2824}
2825
2826static int del_balance_item(struct btrfs_root *root)
2827{
2828        struct btrfs_trans_handle *trans;
2829        struct btrfs_path *path;
2830        struct btrfs_key key;
2831        int ret, err;
2832
2833        path = btrfs_alloc_path();
2834        if (!path)
2835                return -ENOMEM;
2836
2837        trans = btrfs_start_transaction(root, 0);
2838        if (IS_ERR(trans)) {
2839                btrfs_free_path(path);
2840                return PTR_ERR(trans);
2841        }
2842
2843        key.objectid = BTRFS_BALANCE_OBJECTID;
2844        key.type = BTRFS_BALANCE_ITEM_KEY;
2845        key.offset = 0;
2846
2847        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2848        if (ret < 0)
2849                goto out;
2850        if (ret > 0) {
2851                ret = -ENOENT;
2852                goto out;
2853        }
2854
2855        ret = btrfs_del_item(trans, root, path);
2856out:
2857        btrfs_free_path(path);
2858        err = btrfs_commit_transaction(trans, root);
2859        if (err && !ret)
2860                ret = err;
2861        return ret;
2862}
2863
2864/*
2865 * This is a heuristic used to reduce the number of chunks balanced on
2866 * resume after balance was interrupted.
2867 */
2868static void update_balance_args(struct btrfs_balance_control *bctl)
2869{
2870        /*
2871         * Turn on soft mode for chunk types that were being converted.
2872         */
2873        if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
2874                bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
2875        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
2876                bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
2877        if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
2878                bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
2879
2880        /*
2881         * Turn on the usage filter if it is not already in use.  The idea is
2882         * that chunks that we have already balanced should be
2883         * reasonably full.  Don't do it for chunks that are being
2884         * converted - that will keep us from relocating unconverted
2885         * (albeit full) chunks.
2886         */
2887        if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2888            !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2889                bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
2890                bctl->data.usage = 90;
2891        }
2892        if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2893            !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2894                bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
2895                bctl->sys.usage = 90;
2896        }
2897        if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
2898            !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
2899                bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
2900                bctl->meta.usage = 90;
2901        }
2902}
2903
2904/*
2905 * Should be called with both balance and volume mutexes held to
2906 * serialize other volume operations (add_dev/rm_dev/resize) with
2907 * restriper.  Same goes for unset_balance_control.
2908 */
2909static void set_balance_control(struct btrfs_balance_control *bctl)
2910{
2911        struct btrfs_fs_info *fs_info = bctl->fs_info;
2912
2913        BUG_ON(fs_info->balance_ctl);
2914
2915        spin_lock(&fs_info->balance_lock);
2916        fs_info->balance_ctl = bctl;
2917        spin_unlock(&fs_info->balance_lock);
2918}
2919
2920static void unset_balance_control(struct btrfs_fs_info *fs_info)
2921{
2922        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
2923
2924        BUG_ON(!fs_info->balance_ctl);
2925
2926        spin_lock(&fs_info->balance_lock);
2927        fs_info->balance_ctl = NULL;
2928        spin_unlock(&fs_info->balance_lock);
2929
2930        kfree(bctl);
2931}
2932
2933/*
2934 * Balance filters.  Return 1 if chunk should be filtered out
2935 * (should not be balanced).
2936 */
2937static int chunk_profiles_filter(u64 chunk_type,
2938                                 struct btrfs_balance_args *bargs)
2939{
2940        chunk_type = chunk_to_extended(chunk_type) &
2941                                BTRFS_EXTENDED_PROFILE_MASK;
2942
2943        if (bargs->profiles & chunk_type)
2944                return 0;
2945
2946        return 1;
2947}
2948
2949static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
2950                              struct btrfs_balance_args *bargs)
2951{
2952        struct btrfs_block_group_cache *cache;
2953        u64 chunk_used, user_thresh;
2954        int ret = 1;
2955
2956        cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2957        chunk_used = btrfs_block_group_used(&cache->item);
2958
2959        if (bargs->usage == 0)
2960                user_thresh = 1;
2961        else if (bargs->usage > 100)
2962                user_thresh = cache->key.offset;
2963        else
2964                user_thresh = div_factor_fine(cache->key.offset,
2965                                              bargs->usage);
2966
2967        if (chunk_used < user_thresh)
2968                ret = 0;
2969
2970        btrfs_put_block_group(cache);
2971        return ret;
2972}
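/*
 * Worked example: with usage=30 on a 1 GiB chunk, user_thresh =
 * div_factor_fine(1 GiB, 30) = 1 GiB * 30 / 100, roughly 307 MiB, so
 * the chunk is relocated only while less than ~307 MiB of it is used.
 * usage=0 sets the threshold to a single byte and matches only empty
 * chunks; usage > 100 matches any chunk that is not completely full.
 */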
2973
2974static int chunk_devid_filter(struct extent_buffer *leaf,
2975                              struct btrfs_chunk *chunk,
2976                              struct btrfs_balance_args *bargs)
2977{
2978        struct btrfs_stripe *stripe;
2979        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2980        int i;
2981
2982        for (i = 0; i < num_stripes; i++) {
2983                stripe = btrfs_stripe_nr(chunk, i);
2984                if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
2985                        return 0;
2986        }
2987
2988        return 1;
2989}
2990
2991/* [pstart, pend) */
2992static int chunk_drange_filter(struct extent_buffer *leaf,
2993                               struct btrfs_chunk *chunk,
2994                               u64 chunk_offset,
2995                               struct btrfs_balance_args *bargs)
2996{
2997        struct btrfs_stripe *stripe;
2998        int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
2999        u64 stripe_offset;
3000        u64 stripe_length;
3001        int factor;
3002        int i;
3003
3004        if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3005                return 0;
3006
3007        if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
3008             BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
3009                factor = num_stripes / 2;
3010        } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
3011                factor = num_stripes - 1;
3012        } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
3013                factor = num_stripes - 2;
3014        } else {
3015                factor = num_stripes;
3016        }
3017
3018        for (i = 0; i < num_stripes; i++) {
3019                stripe = btrfs_stripe_nr(chunk, i);
3020                if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3021                        continue;
3022
3023                stripe_offset = btrfs_stripe_offset(leaf, stripe);
3024                stripe_length = btrfs_chunk_length(leaf, chunk);
3025                do_div(stripe_length, factor);
3026
3027                if (stripe_offset < bargs->pend &&
3028                    stripe_offset + stripe_length > bargs->pstart)
3029                        return 0;
3030        }
3031
3032        return 1;
3033}
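/*
 * Worked example: a 2 GiB RAID10 chunk with num_stripes = 4 gets
 * factor = num_stripes / 2 = 2, so each device extent is
 * 2 GiB / 2 = 1 GiB long; the chunk is balanced when a stripe on the
 * requested devid has [stripe_offset, stripe_offset + 1 GiB)
 * intersecting [pstart, pend).
 */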
3034
3035/* [vstart, vend) */
3036static int chunk_vrange_filter(struct extent_buffer *leaf,
3037                               struct btrfs_chunk *chunk,
3038                               u64 chunk_offset,
3039                               struct btrfs_balance_args *bargs)
3040{
3041        if (chunk_offset < bargs->vend &&
3042            chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3043                /* at least part of the chunk is inside this vrange */
3044                return 0;
3045
3046        return 1;
3047}
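/*
 * This is the usual half-open interval test: [a, b) and [c, d)
 * intersect iff a < d && b > c, here with a = chunk_offset and
 * b = chunk_offset + chunk length.
 */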
3048
3049static int chunk_soft_convert_filter(u64 chunk_type,
3050                                     struct btrfs_balance_args *bargs)
3051{
3052        if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3053                return 0;
3054
3055        chunk_type = chunk_to_extended(chunk_type) &
3056                                BTRFS_EXTENDED_PROFILE_MASK;
3057
3058        if (bargs->target == chunk_type)
3059                return 1;
3060
3061        return 0;
3062}
3063
3064static int should_balance_chunk(struct btrfs_root *root,
3065                                struct extent_buffer *leaf,
3066                                struct btrfs_chunk *chunk, u64 chunk_offset)
3067{
3068        struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
3069        struct btrfs_balance_args *bargs = NULL;
3070        u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3071
3072        /* type filter */
3073        if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3074              (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3075                return 0;
3076        }
3077
3078        if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3079                bargs = &bctl->data;
3080        else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3081                bargs = &bctl->sys;
3082        else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3083                bargs = &bctl->meta;
3084
3085        /* profiles filter */
3086        if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3087            chunk_profiles_filter(chunk_type, bargs)) {
3088                return 0;
3089        }
3090
3091        /* usage filter */
3092        if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3093            chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
3094                return 0;
3095        }
3096
3097        /* devid filter */
3098        if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3099            chunk_devid_filter(leaf, chunk, bargs)) {
3100                return 0;
3101        }
3102
3103        /* drange filter, makes sense only with devid filter */
3104        if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3105            chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
3106                return 0;
3107        }
3108
3109        /* vrange filter */
3110        if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3111            chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3112                return 0;
3113        }
3114
3115        /* soft profile changing mode */
3116        if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3117            chunk_soft_convert_filter(chunk_type, bargs)) {
3118                return 0;
3119        }
3120
3121        /*
3122         * limited by count, must be the last filter
3123         */
3124        if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3125                if (bargs->limit == 0)
3126                        return 0;
3127                else
3128                        bargs->limit--;
3129        }
3130
3131        return 1;
3132}
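/*
 * For example, balancing data with devid=1,drange=0..1G sets
 * BTRFS_BALANCE_ARGS_DEVID | BTRFS_BALANCE_ARGS_DRANGE in
 * bctl->data.flags, so a data chunk is relocated only if one of its
 * stripes sits on device 1 with a device extent intersecting the
 * first gigabyte of that device; filters whose flag is unset match
 * everything.
 */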
3133
3134static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3135{
3136        struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3137        struct btrfs_root *chunk_root = fs_info->chunk_root;
3138        struct btrfs_root *dev_root = fs_info->dev_root;
3139        struct list_head *devices;
3140        struct btrfs_device *device;
3141        u64 old_size;
3142        u64 size_to_free;
3143        struct btrfs_chunk *chunk;
3144        struct btrfs_path *path;
3145        struct btrfs_key key;
3146        struct btrfs_key found_key;
3147        struct btrfs_trans_handle *trans;
3148        struct extent_buffer *leaf;
3149        int slot;
3150        int ret;
3151        int enospc_errors = 0;
3152        bool counting = true;
3153        u64 limit_data = bctl->data.limit;
3154        u64 limit_meta = bctl->meta.limit;
3155        u64 limit_sys = bctl->sys.limit;
3156
3157        /* step one, make some room on all the devices */
3158        devices = &fs_info->fs_devices->devices;
3159        list_for_each_entry(device, devices, dev_list) {
3160                old_size = btrfs_device_get_total_bytes(device);
3161                size_to_free = div_factor(old_size, 1);
3162                size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
3163                if (!device->writeable ||
3164                    btrfs_device_get_total_bytes(device) -
3165                    btrfs_device_get_bytes_used(device) > size_to_free ||
3166                    device->is_tgtdev_for_dev_replace)
3167                        continue;
3168
3169                ret = btrfs_shrink_device(device, old_size - size_to_free);
3170                if (ret == -ENOSPC)
3171                        break;
3172                BUG_ON(ret);
3173
3174                trans = btrfs_start_transaction(dev_root, 0);
3175                BUG_ON(IS_ERR(trans));
3176
3177                ret = btrfs_grow_device(trans, device, old_size);
3178                BUG_ON(ret);
3179
3180                btrfs_end_transaction(trans, dev_root);
3181        }
3182
3183        /* step two, relocate all the chunks */
3184        path = btrfs_alloc_path();
3185        if (!path) {
3186                ret = -ENOMEM;
3187                goto error;
3188        }
3189
3190        /* zero out stat counters */
3191        spin_lock(&fs_info->balance_lock);
3192        memset(&bctl->stat, 0, sizeof(bctl->stat));
3193        spin_unlock(&fs_info->balance_lock);
3194again:
3195        if (!counting) {
3196                bctl->data.limit = limit_data;
3197                bctl->meta.limit = limit_meta;
3198                bctl->sys.limit = limit_sys;
3199        }
3200        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3201        key.offset = (u64)-1;
3202        key.type = BTRFS_CHUNK_ITEM_KEY;
3203
3204        while (1) {
3205                if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3206                    atomic_read(&fs_info->balance_cancel_req)) {
3207                        ret = -ECANCELED;
3208                        goto error;
3209                }
3210
3211                ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3212                if (ret < 0)
3213                        goto error;
3214
3215                /*
3216                 * this shouldn't happen, it means the last relocation
3217                 * failed
3218                 */
3219                if (ret == 0)
3220                        BUG(); /* FIXME break ? */
3221
3222                ret = btrfs_previous_item(chunk_root, path, 0,
3223                                          BTRFS_CHUNK_ITEM_KEY);
3224                if (ret) {
3225                        ret = 0;
3226                        break;
3227                }
3228
3229                leaf = path->nodes[0];
3230                slot = path->slots[0];
3231                btrfs_item_key_to_cpu(leaf, &found_key, slot);
3232
3233                if (found_key.objectid != key.objectid)
3234                        break;
3235
3236                chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3237
3238                if (!counting) {
3239                        spin_lock(&fs_info->balance_lock);
3240                        bctl->stat.considered++;
3241                        spin_unlock(&fs_info->balance_lock);
3242                }
3243
3244                ret = should_balance_chunk(chunk_root, leaf, chunk,
3245                                           found_key.offset);
3246                btrfs_release_path(path);
3247                if (!ret)
3248                        goto loop;
3249
3250                if (counting) {
3251                        spin_lock(&fs_info->balance_lock);
3252                        bctl->stat.expected++;
3253                        spin_unlock(&fs_info->balance_lock);
3254                        goto loop;
3255                }
3256
3257                ret = btrfs_relocate_chunk(chunk_root,
3258                                           chunk_root->root_key.objectid,
3259                                           found_key.objectid,
3260                                           found_key.offset);
3261                if (ret && ret != -ENOSPC)
3262                        goto error;
3263                if (ret == -ENOSPC) {
3264                        enospc_errors++;
3265                } else {
3266                        spin_lock(&fs_info->balance_lock);
3267                        bctl->stat.completed++;
3268                        spin_unlock(&fs_info->balance_lock);
3269                }
3270loop:
3271                if (found_key.offset == 0)
3272                        break;
3273                key.offset = found_key.offset - 1;
3274        }
3275
3276        if (counting) {
3277                btrfs_release_path(path);
3278                counting = false;
3279                goto again;
3280        }
3281error:
3282        btrfs_free_path(path);
3283        if (enospc_errors) {
3284                btrfs_info(fs_info, "%d enospc errors during balance",
3285                       enospc_errors);
3286                if (!ret)
3287                        ret = -ENOSPC;
3288        }
3289
3290        return ret;
3291}
3292
3293/**
3294 * alloc_profile_is_valid - see if a given profile is valid and reduced
3295 * @flags: profile to validate
3296 * @extended: if true @flags is treated as an extended profile
3297 */
3298static int alloc_profile_is_valid(u64 flags, int extended)
3299{
3300        u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3301                               BTRFS_BLOCK_GROUP_PROFILE_MASK);
3302
3303        flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3304
3305        /* 1) check that all other bits are zeroed */
3306        if (flags & ~mask)
3307                return 0;
3308
3309        /* 2) see if profile is reduced */
3310        if (flags == 0)
3311                return !extended; /* "0" is valid for usual profiles */
3312
3313        /* true if exactly one bit set */
3314        return (flags & (flags - 1)) == 0;
3315}
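/*
 * Example: flags = BTRFS_BLOCK_GROUP_RAID1 is valid (exactly one
 * profile bit set), while RAID0 | RAID1 is not reduced:
 * flags & (flags - 1) clears only the lowest set bit, so the result
 * is non-zero exactly when more than one bit is set.
 */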
3316
3317static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3318{
3319        /* cancel requested || normal exit path */
3320        return atomic_read(&fs_info->balance_cancel_req) ||
3321                (atomic_read(&fs_info->balance_pause_req) == 0 &&
3322                 atomic_read(&fs_info->balance_cancel_req) == 0);
3323}
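/*
 * In other words, this is false only while a pause is pending: a
 * paused balance keeps its balance_ctl and on-disk balance item so
 * that it can be resumed later, every other exit tears the state down.
 */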
3324
3325static void __cancel_balance(struct btrfs_fs_info *fs_info)
3326{
3327        int ret;
3328
3329        unset_balance_control(fs_info);
3330        ret = del_balance_item(fs_info->tree_root);
3331        if (ret)
3332                btrfs_std_error(fs_info, ret);
3333
3334        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3335}
3336
3337/*
3338 * Should be called with both balance and volume mutexes held
3339 */
3340int btrfs_balance(struct btrfs_balance_control *bctl,
3341                  struct btrfs_ioctl_balance_args *bargs)
3342{
3343        struct btrfs_fs_info *fs_info = bctl->fs_info;
3344        u64 allowed;
3345        int mixed = 0;
3346        int ret;
3347        u64 num_devices;
3348        unsigned seq;
3349
3350        if (btrfs_fs_closing(fs_info) ||
3351            atomic_read(&fs_info->balance_pause_req) ||
3352            atomic_read(&fs_info->balance_cancel_req)) {
3353                ret = -EINVAL;
3354                goto out;
3355        }
3356
3357        allowed = btrfs_super_incompat_flags(fs_info->super_copy);
3358        if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
3359                mixed = 1;
3360
3361        /*
3362         * In case of mixed groups both data and metadata must be balanced,
3363         * and identical options must be given for both of them.
3364         */
3365        allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
3366        if (mixed && (bctl->flags & allowed)) {
3367                if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
3368                    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
3369                    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
3370                        btrfs_err(fs_info, "with mixed groups data and "
3371                                   "metadata balance options must be the same");
3372                        ret = -EINVAL;
3373                        goto out;
3374                }
3375        }
3376
3377        num_devices = fs_info->fs_devices->num_devices;
3378        btrfs_dev_replace_lock(&fs_info->dev_replace);
3379        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
3380                BUG_ON(num_devices < 1);
3381                num_devices--;
3382        }
3383        btrfs_dev_replace_unlock(&fs_info->dev_replace);
3384        allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
3385        if (num_devices == 1)
3386                allowed |= BTRFS_BLOCK_GROUP_DUP;
3387        else if (num_devices > 1)
3388                allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
3389        if (num_devices > 2)
3390                allowed |= BTRFS_BLOCK_GROUP_RAID5;
3391        if (num_devices > 3)
3392                allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
3393                            BTRFS_BLOCK_GROUP_RAID6);
3394        if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3395            (!alloc_profile_is_valid(bctl->data.target, 1) ||
3396             (bctl->data.target & ~allowed))) {
3397                btrfs_err(fs_info, "unable to start balance with target "
3398                           "data profile %llu",
3399                       bctl->data.target);
3400                ret = -EINVAL;
3401                goto out;
3402        }
3403        if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3404            (!alloc_profile_is_valid(bctl->meta.target, 1) ||
3405             (bctl->meta.target & ~allowed))) {
3406                btrfs_err(fs_info,
3407                           "unable to start balance with target metadata profile %llu",
3408                       bctl->meta.target);
3409                ret = -EINVAL;
3410                goto out;
3411        }
3412        if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3413            (!alloc_profile_is_valid(bctl->sys.target, 1) ||
3414             (bctl->sys.target & ~allowed))) {
3415                btrfs_err(fs_info,
3416                           "unable to start balance with target system profile %llu",
3417                       bctl->sys.target);
3418                ret = -EINVAL;
3419                goto out;
3420        }
3421
3422        /* allow dup'ed data chunks only in mixed mode */
3423        if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3424            (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
3425                btrfs_err(fs_info, "dup for data is not allowed");
3426                ret = -EINVAL;
3427                goto out;
3428        }
3429
3430        /* allow reducing meta or sys integrity only if force is set */
3431        allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3432                        BTRFS_BLOCK_GROUP_RAID10 |
3433                        BTRFS_BLOCK_GROUP_RAID5 |
3434                        BTRFS_BLOCK_GROUP_RAID6;
3435        do {
3436                seq = read_seqbegin(&fs_info->profiles_lock);
3437
3438                if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3439                     (fs_info->avail_system_alloc_bits & allowed) &&
3440                     !(bctl->sys.target & allowed)) ||
3441                    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
3442                     (fs_info->avail_metadata_alloc_bits & allowed) &&
3443                     !(bctl->meta.target & allowed))) {
3444                        if (bctl->flags & BTRFS_BALANCE_FORCE) {
3445                                btrfs_info(fs_info, "force reducing metadata integrity");
3446                        } else {
3447                                btrfs_err(fs_info, "balance will reduce metadata "
3448                                           "integrity, use force if you want this");
3449                                ret = -EINVAL;
3450                                goto out;
3451                        }
3452                }
3453        } while (read_seqretry(&fs_info->profiles_lock, seq));
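        /*
         * For example, converting metadata from raid1 to single reduces
         * redundancy and is refused above unless BTRFS_BALANCE_FORCE is
         * set; the seqlock loop rereads the avail_*_alloc_bits if they
         * changed while being checked.
         */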
3454
3455        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3456                int num_tolerated_disk_barrier_failures;
3457                u64 target = bctl->sys.target;
3458
3459                num_tolerated_disk_barrier_failures =
3460                        btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3461                if (num_tolerated_disk_barrier_failures > 0 &&
3462                    (target &
3463                     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3464                      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
3465                        num_tolerated_disk_barrier_failures = 0;
3466                else if (num_tolerated_disk_barrier_failures > 1 &&
3467                         (target &
3468                          (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
3469                        num_tolerated_disk_barrier_failures = 1;
3470
3471                fs_info->num_tolerated_disk_barrier_failures =
3472                        num_tolerated_disk_barrier_failures;
3473        }
3474
3475        ret = insert_balance_item(fs_info->tree_root, bctl);
3476        if (ret && ret != -EEXIST)
3477                goto out;
3478
3479        if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
3480                BUG_ON(ret == -EEXIST);
3481                set_balance_control(bctl);
3482        } else {
3483                BUG_ON(ret != -EEXIST);
3484                spin_lock(&fs_info->balance_lock);
3485                update_balance_args(bctl);
3486                spin_unlock(&fs_info->balance_lock);
3487        }
3488
3489        atomic_inc(&fs_info->balance_running);
3490        mutex_unlock(&fs_info->balance_mutex);
3491
3492        ret = __btrfs_balance(fs_info);
3493
3494        mutex_lock(&fs_info->balance_mutex);
3495        atomic_dec(&fs_info->balance_running);
3496
3497        if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3498                fs_info->num_tolerated_disk_barrier_failures =
3499                        btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
3500        }
3501
3502        if (bargs) {
3503                memset(bargs, 0, sizeof(*bargs));
3504                update_ioctl_balance_args(fs_info, 0, bargs);
3505        }
3506
3507        if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
3508            balance_need_close(fs_info)) {
3509                __cancel_balance(fs_info);
3510        }
3511
3512        wake_up(&fs_info->balance_wait_q);
3513
3514        return ret;
3515out:
3516        if (bctl->flags & BTRFS_BALANCE_RESUME)
3517                __cancel_balance(fs_info);
3518        else {
3519                kfree(bctl);
3520                atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
3521        }
3522        return ret;
3523}
3524
3525static int balance_kthread(void *data)
3526{
3527        struct btrfs_fs_info *fs_info = data;
3528        int ret = 0;
3529
3530        mutex_lock(&fs_info->volume_mutex);
3531        mutex_lock(&fs_info->balance_mutex);
3532
3533        if (fs_info->balance_ctl) {
3534                btrfs_info(fs_info, "continuing balance");
3535                ret = btrfs_balance(fs_info->balance_ctl, NULL);
3536        }
3537
3538        mutex_unlock(&fs_info->balance_mutex);
3539        mutex_unlock(&fs_info->volume_mutex);
3540
3541        return ret;
3542}
3543
3544int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
3545{
3546        struct task_struct *tsk;
3547
3548        spin_lock(&fs_info->balance_lock);
3549        if (!fs_info->balance_ctl) {
3550                spin_unlock(&fs_info->balance_lock);
3551                return 0;
3552        }
3553        spin_unlock(&fs_info->balance_lock);
3554
3555        if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
3556                btrfs_info(fs_info, "force skipping balance");
3557                return 0;
3558        }
3559
3560        tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
3561        return PTR_ERR_OR_ZERO(tsk);
3562}
3563
3564int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
3565{
3566        struct btrfs_balance_control *bctl;
3567        struct btrfs_balance_item *item;
3568        struct btrfs_disk_balance_args disk_bargs;
3569        struct btrfs_path *path;
3570        struct extent_buffer *leaf;
3571        struct btrfs_key key;
3572        int ret;
3573
3574        path = btrfs_alloc_path();
3575        if (!path)
3576                return -ENOMEM;
3577
3578        key.objectid = BTRFS_BALANCE_OBJECTID;
3579        key.type = BTRFS_BALANCE_ITEM_KEY;
3580        key.offset = 0;
3581
3582        ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
3583        if (ret < 0)
3584                goto out;
3585        if (ret > 0) { /* ret = -ENOENT; */
3586                ret = 0;
3587                goto out;
3588        }
3589
3590        bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
3591        if (!bctl) {
3592                ret = -ENOMEM;
3593                goto out;
3594        }
3595
3596        leaf = path->nodes[0];
3597        item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3598
3599        bctl->fs_info = fs_info;
3600        bctl->flags = btrfs_balance_flags(leaf, item);
3601        bctl->flags |= BTRFS_BALANCE_RESUME;
3602
3603        btrfs_balance_data(leaf, item, &disk_bargs);
3604        btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
3605        btrfs_balance_meta(leaf, item, &disk_bargs);
3606        btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
3607        btrfs_balance_sys(leaf, item, &disk_bargs);
3608        btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
3609
3610        WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
3611
3612        mutex_lock(&fs_info->volume_mutex);
3613        mutex_lock(&fs_info->balance_mutex);
3614
3615        set_balance_control(bctl);
3616
3617        mutex_unlock(&fs_info->balance_mutex);
3618        mutex_unlock(&fs_info->volume_mutex);
3619out:
3620        btrfs_free_path(path);
3621        return ret;
3622}
3623
3624int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
3625{
3626        int ret = 0;
3627
3628        mutex_lock(&fs_info->balance_mutex);
3629        if (!fs_info->balance_ctl) {
3630                mutex_unlock(&fs_info->balance_mutex);
3631                return -ENOTCONN;
3632        }
3633
3634        if (atomic_read(&fs_info->balance_running)) {
3635                atomic_inc(&fs_info->balance_pause_req);
3636                mutex_unlock(&fs_info->balance_mutex);
3637
3638                wait_event(fs_info->balance_wait_q,
3639                           atomic_read(&fs_info->balance_running) == 0);
3640
3641                mutex_lock(&fs_info->balance_mutex);
3642                /* we are good with balance_ctl ripped off from under us */
3643                BUG_ON(atomic_read(&fs_info->balance_running));
3644                atomic_dec(&fs_info->balance_pause_req);
3645        } else {
3646                ret = -ENOTCONN;
3647        }
3648
3649        mutex_unlock(&fs_info->balance_mutex);
3650        return ret;
3651}
3652
3653int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
3654{
3655        if (fs_info->sb->s_flags & MS_RDONLY)
3656                return -EROFS;
3657
3658        mutex_lock(&fs_info->balance_mutex);
3659        if (!fs_info->balance_ctl) {
3660                mutex_unlock(&fs_info->balance_mutex);
3661                return -ENOTCONN;
3662        }
3663
3664        atomic_inc(&fs_info->balance_cancel_req);
3665        /*
3666         * if we are running, just wait and return; the balance item
3667         * is deleted by btrfs_balance() in this case
3668         */
3669        if (atomic_read(&fs_info->balance_running)) {
3670                mutex_unlock(&fs_info->balance_mutex);
3671                wait_event(fs_info->balance_wait_q,
3672                           atomic_read(&fs_info->balance_running) == 0);
3673                mutex_lock(&fs_info->balance_mutex);
3674        } else {
3675                /* __cancel_balance needs volume_mutex */
3676                mutex_unlock(&fs_info->balance_mutex);
3677                mutex_lock(&fs_info->volume_mutex);
3678                mutex_lock(&fs_info->balance_mutex);
3679
3680                if (fs_info->balance_ctl)
3681                        __cancel_balance(fs_info);
3682
3683                mutex_unlock(&fs_info->volume_mutex);
3684        }
3685
3686        BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
3687        atomic_dec(&fs_info->balance_cancel_req);
3688        mutex_unlock(&fs_info->balance_mutex);
3689        return 0;
3690}
3691
3692static int btrfs_uuid_scan_kthread(void *data)
3693{
3694        struct btrfs_fs_info *fs_info = data;
3695        struct btrfs_root *root = fs_info->tree_root;
3696        struct btrfs_key key;
3697        struct btrfs_key max_key;
3698        struct btrfs_path *path = NULL;
3699        int ret = 0;
3700        struct extent_buffer *eb;
3701        int slot;
3702        struct btrfs_root_item root_item;
3703        u32 item_size;
3704        struct btrfs_trans_handle *trans = NULL;
3705
3706        path = btrfs_alloc_path();
3707        if (!path) {
3708                ret = -ENOMEM;
3709                goto out;
3710        }
3711
3712        key.objectid = 0;
3713        key.type = BTRFS_ROOT_ITEM_KEY;
3714        key.offset = 0;
3715
3716        max_key.objectid = (u64)-1;
3717        max_key.type = BTRFS_ROOT_ITEM_KEY;
3718        max_key.offset = (u64)-1;
3719
3720        while (1) {
3721                ret = btrfs_search_forward(root, &key, path, 0);
3722                if (ret) {
3723                        if (ret > 0)
3724                                ret = 0;
3725                        break;
3726                }
3727
3728                if (key.type != BTRFS_ROOT_ITEM_KEY ||
3729                    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
3730                     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
3731                    key.objectid > BTRFS_LAST_FREE_OBJECTID)
3732                        goto skip;
3733
3734                eb = path->nodes[0];
3735                slot = path->slots[0];
3736                item_size = btrfs_item_size_nr(eb, slot);
3737                if (item_size < sizeof(root_item))
3738                        goto skip;
3739
3740                read_extent_buffer(eb, &root_item,
3741                                   btrfs_item_ptr_offset(eb, slot),
3742                                   (int)sizeof(root_item));
3743                if (btrfs_root_refs(&root_item) == 0)
3744                        goto skip;
3745
3746                if (!btrfs_is_empty_uuid(root_item.uuid) ||
3747                    !btrfs_is_empty_uuid(root_item.received_uuid)) {
3748                        if (trans)
3749                                goto update_tree;
3750
3751                        btrfs_release_path(path);
3752                        /*
3753                         * 1 - subvol uuid item
3754                         * 1 - received_subvol uuid item
3755                         */
3756                        trans = btrfs_start_transaction(fs_info->uuid_root, 2);
3757                        if (IS_ERR(trans)) {
3758                                ret = PTR_ERR(trans);
3759                                break;
3760                        }
3761                        continue;
3762                } else {
3763                        goto skip;
3764                }
3765update_tree:
3766                if (!btrfs_is_empty_uuid(root_item.uuid)) {
3767                        ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3768                                                  root_item.uuid,
3769                                                  BTRFS_UUID_KEY_SUBVOL,
3770                                                  key.objectid);
3771                        if (ret < 0) {
3772                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
3773                                        ret);
3774                                break;
3775                        }
3776                }
3777
3778                if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
3779                        ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
3780                                                  root_item.received_uuid,
3781                                                 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
3782                                                  key.objectid);
3783                        if (ret < 0) {
3784                                btrfs_warn(fs_info, "uuid_tree_add failed %d",
3785                                        ret);
3786                                break;
3787                        }
3788                }
3789
3790skip:
3791                if (trans) {
3792                        ret = btrfs_end_transaction(trans, fs_info->uuid_root);
3793                        trans = NULL;
3794                        if (ret)
3795                                break;
3796                }
3797
3798                btrfs_release_path(path);
3799                if (key.offset < (u64)-1) {
3800                        key.offset++;
3801                } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
3802                        key.offset = 0;
3803                        key.type = BTRFS_ROOT_ITEM_KEY;
3804                } else if (key.objectid < (u64)-1) {
3805                        key.offset = 0;
3806                        key.type = BTRFS_ROOT_ITEM_KEY;
3807                        key.objectid++;
3808                } else {
3809                        break;
3810                }
3811                cond_resched();
3812        }
3813
3814out:
3815        btrfs_free_path(path);
3816        if (trans && !IS_ERR(trans))
3817                btrfs_end_transaction(trans, fs_info->uuid_root);
3818        if (ret)
3819                btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
3820        else
3821                fs_info->update_uuid_tree_gen = 1;
3822        up(&fs_info->uuid_tree_rescan_sem);
3823        return 0;
3824}
3825
3826/*
3827 * Callback for btrfs_uuid_tree_iterate().
3828 * returns:
3829 * 0    check succeeded, the entry is not outdated.
3830 * < 0  if an error occurred.
3831 * > 0  if the check failed, which means the caller shall remove the entry.
3832 */
3833static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
3834                                       u8 *uuid, u8 type, u64 subid)
3835{
3836        struct btrfs_key key;
3837        int ret = 0;
3838        struct btrfs_root *subvol_root;
3839
3840        if (type != BTRFS_UUID_KEY_SUBVOL &&
3841            type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
3842                goto out;
3843
3844        key.objectid = subid;
3845        key.type = BTRFS_ROOT_ITEM_KEY;
3846        key.offset = (u64)-1;
3847        subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
3848        if (IS_ERR(subvol_root)) {
3849                ret = PTR_ERR(subvol_root);
3850                if (ret == -ENOENT)
3851                        ret = 1;
3852                goto out;
3853        }
3854
3855        switch (type) {
3856        case BTRFS_UUID_KEY_SUBVOL:
3857                if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
3858                        ret = 1;
3859                break;
3860        case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
3861                if (memcmp(uuid, subvol_root->root_item.received_uuid,
3862                           BTRFS_UUID_SIZE))
3863                        ret = 1;
3864                break;
3865        }
3866
3867out:
3868        return ret;
3869}
3870
3871static int btrfs_uuid_rescan_kthread(void *data)
3872{
3873        struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
3874        int ret;
3875
3876        /*
3877         * 1st step is to iterate through the existing UUID tree and
3878         * to delete all entries that contain outdated data.
3879         * 2nd step is to add all missing entries to the UUID tree.
3880         */
3881        ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
3882        if (ret < 0) {
3883                btrfs_warn(fs_info, "iterating uuid_tree failed %d", ret);
3884                up(&fs_info->uuid_tree_rescan_sem);
3885                return ret;
3886        }
3887        return btrfs_uuid_scan_kthread(data);
3888}
3889
3890int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
3891{
3892        struct btrfs_trans_handle *trans;
3893        struct btrfs_root *tree_root = fs_info->tree_root;
3894        struct btrfs_root *uuid_root;
3895        struct task_struct *task;
3896        int ret;
3897
3898        /*
3899         * 1 - root node
3900         * 1 - root item
3901         */
3902        trans = btrfs_start_transaction(tree_root, 2);
3903        if (IS_ERR(trans))
3904                return PTR_ERR(trans);
3905
3906        uuid_root = btrfs_create_tree(trans, fs_info,
3907                                      BTRFS_UUID_TREE_OBJECTID);
3908        if (IS_ERR(uuid_root)) {
3909                btrfs_abort_transaction(trans, tree_root,
3910                                        PTR_ERR(uuid_root));
3911                return PTR_ERR(uuid_root);
3912        }
3913
3914        fs_info->uuid_root = uuid_root;
3915
3916        ret = btrfs_commit_transaction(trans, tree_root);
3917        if (ret)
3918                return ret;
3919
3920        down(&fs_info->uuid_tree_rescan_sem);
3921        task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
3922        if (IS_ERR(task)) {
3923                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
3924                btrfs_warn(fs_info, "failed to start uuid_scan task");
3925                up(&fs_info->uuid_tree_rescan_sem);
3926                return PTR_ERR(task);
3927        }
3928
3929        return 0;
3930}
3931
3932int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
3933{
3934        struct task_struct *task;
3935
3936        down(&fs_info->uuid_tree_rescan_sem);
3937        task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
3938        if (IS_ERR(task)) {
3939                /* fs_info->update_uuid_tree_gen remains 0 in all error cases */
3940                btrfs_warn(fs_info, "failed to start uuid_rescan task");
3941                up(&fs_info->uuid_tree_rescan_sem);
3942                return PTR_ERR(task);
3943        }
3944
3945        return 0;
3946}
3947
3948/*
3949 * shrinking a device means finding all of the device extents past
3950 * the new size, and then following the back refs to the chunks.
3951 * The chunk relocation code actually frees the device extents.
3952 */
3953int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
3954{
3955        struct btrfs_trans_handle *trans;
3956        struct btrfs_root *root = device->dev_root;
3957        struct btrfs_dev_extent *dev_extent = NULL;
3958        struct btrfs_path *path;
3959        u64 length;
3960        u64 chunk_tree;
3961        u64 chunk_objectid;
3962        u64 chunk_offset;
3963        int ret;
3964        int slot;
3965        int failed = 0;
3966        bool retried = false;
3967        struct extent_buffer *l;
3968        struct btrfs_key key;
3969        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3970        u64 old_total = btrfs_super_total_bytes(super_copy);
3971        u64 old_size = btrfs_device_get_total_bytes(device);
3972        u64 diff = old_size - new_size;
3973
3974        if (device->is_tgtdev_for_dev_replace)
3975                return -EINVAL;
3976
3977        path = btrfs_alloc_path();
3978        if (!path)
3979                return -ENOMEM;
3980
3981        path->reada = 2;
3982
3983        lock_chunks(root);
3984
3985        btrfs_device_set_total_bytes(device, new_size);
3986        if (device->writeable) {
3987                device->fs_devices->total_rw_bytes -= diff;
3988                spin_lock(&root->fs_info->free_chunk_lock);
3989                root->fs_info->free_chunk_space -= diff;
3990                spin_unlock(&root->fs_info->free_chunk_lock);
3991        }
3992        unlock_chunks(root);
3993
3994again:
3995        key.objectid = device->devid;
3996        key.offset = (u64)-1;
3997        key.type = BTRFS_DEV_EXTENT_KEY;
3998
3999        do {
4000                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4001                if (ret < 0)
4002                        goto done;
4003
4004                ret = btrfs_previous_item(root, path, 0, key.type);
4005                if (ret < 0)
4006                        goto done;
4007                if (ret) {
4008                        ret = 0;
4009                        btrfs_release_path(path);
4010                        break;
4011                }
4012
4013                l = path->nodes[0];
4014                slot = path->slots[0];
4015                btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4016
4017                if (key.objectid != device->devid) {
4018                        btrfs_release_path(path);
4019                        break;
4020                }
4021
4022                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4023                length = btrfs_dev_extent_length(l, dev_extent);
4024
4025                if (key.offset + length <= new_size) {
4026                        btrfs_release_path(path);
4027                        break;
4028                }
4029
4030                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
4031                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
4032                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4033                btrfs_release_path(path);
4034
4035                ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
4036                                           chunk_offset);
4037                if (ret && ret != -ENOSPC)
4038                        goto done;
4039                if (ret == -ENOSPC)
4040                        failed++;
4041        } while (key.offset-- > 0);
4042
4043        if (failed && !retried) {
4044                failed = 0;
4045                retried = true;
4046                goto again;
4047        } else if (failed && retried) {
4048                ret = -ENOSPC;
4049                lock_chunks(root);
4050
4051                btrfs_device_set_total_bytes(device, old_size);
4052                if (device->writeable)
4053                        device->fs_devices->total_rw_bytes += diff;
4054                spin_lock(&root->fs_info->free_chunk_lock);
4055                root->fs_info->free_chunk_space += diff;
4056                spin_unlock(&root->fs_info->free_chunk_lock);
4057                unlock_chunks(root);
4058                goto done;
4059        }
4060
4061        /* Shrinking succeeded, else we would be at "done". */
4062        trans = btrfs_start_transaction(root, 0);
4063        if (IS_ERR(trans)) {
4064                ret = PTR_ERR(trans);
4065                goto done;
4066        }
4067
4068        lock_chunks(root);
4069        btrfs_device_set_disk_total_bytes(device, new_size);
4070        if (list_empty(&device->resized_list))
4071                list_add_tail(&device->resized_list,
4072                              &root->fs_info->fs_devices->resized_devices);
4073
4074        WARN_ON(diff > old_total);
4075        btrfs_set_super_total_bytes(super_copy, old_total - diff);
4076        unlock_chunks(root);
4077
4078        /* Now btrfs_update_device() will change the on-disk size. */
4079        ret = btrfs_update_device(trans, device);
4080        btrfs_end_transaction(trans, root);
4081done:
4082        btrfs_free_path(path);
4083        return ret;
4084}
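/*
 * Example: shrinking a 100 GiB device to 60 GiB walks the dev extent
 * items backwards from offset (u64)-1, relocating every chunk whose
 * extent reaches past the 60 GiB mark; -ENOSPC failures are retried
 * once (earlier relocations may have freed space), and only when all
 * extents have been moved does the final transaction update the
 * device item and the superblock total.
 */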
4085
4086static int btrfs_add_system_chunk(struct btrfs_root *root,
4087                           struct btrfs_key *key,
4088                           struct btrfs_chunk *chunk, int item_size)
4089{
4090        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
4091        struct btrfs_disk_key disk_key;
4092        u32 array_size;
4093        u8 *ptr;
4094
4095        lock_chunks(root);
4096        array_size = btrfs_super_sys_array_size(super_copy);
4097        if (array_size + item_size + sizeof(disk_key)
4098                        > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4099                unlock_chunks(root);
4100                return -EFBIG;
4101        }
4102
4103        ptr = super_copy->sys_chunk_array + array_size;
4104        btrfs_cpu_key_to_disk(&disk_key, key);
4105        memcpy(ptr, &disk_key, sizeof(disk_key));
4106        ptr += sizeof(disk_key);
4107        memcpy(ptr, chunk, item_size);
4108        item_size += sizeof(disk_key);
4109        btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4110        unlock_chunks(root);
4111
4112        return 0;
4113}
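/*
 * The superblock's sys_chunk_array is a packed sequence of
 * (struct btrfs_disk_key, struct btrfs_chunk incl. stripes) pairs:
 * adding a chunk grows it by sizeof(disk_key) + item_size bytes, and
 * -EFBIG is returned if the fixed 2 KiB array would overflow.
 */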
4114
4115/*
4116 * sort the devices in descending order by max_avail, total_avail
4117 */
4118static int btrfs_cmp_device_info(const void *a, const void *b)
4119{
4120        const struct btrfs_device_info *di_a = a;
4121        const struct btrfs_device_info *di_b = b;
4122
4123        if (di_a->max_avail > di_b->max_avail)
4124                return -1;
4125        if (di_a->max_avail < di_b->max_avail)
4126                return 1;
4127        if (di_a->total_avail > di_b->total_avail)
4128                return -1;
4129        if (di_a->total_avail < di_b->total_avail)
4130                return 1;
4131        return 0;
4132}
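/*
 * Used with sort() in __btrfs_alloc_chunk() so that devices_info[0]
 * ends up as the device with the largest contiguous hole, with ties
 * broken by total free space.
 */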
4133
4134static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
4135        [BTRFS_RAID_RAID10] = {
4136                .sub_stripes    = 2,
4137                .dev_stripes    = 1,
4138                .devs_max       = 0,    /* 0 == as many as possible */
4139                .devs_min       = 4,
4140                .devs_increment = 2,
4141                .ncopies        = 2,
4142        },
4143        [BTRFS_RAID_RAID1] = {
4144                .sub_stripes    = 1,
4145                .dev_stripes    = 1,
4146                .devs_max       = 2,
4147                .devs_min       = 2,
4148                .devs_increment = 2,
4149                .ncopies        = 2,
4150        },
4151        [BTRFS_RAID_DUP] = {
4152                .sub_stripes    = 1,
4153                .dev_stripes    = 2,
4154                .devs_max       = 1,
4155                .devs_min       = 1,
4156                .devs_increment = 1,
4157                .ncopies        = 2,
4158        },
4159        [BTRFS_RAID_RAID0] = {
4160                .sub_stripes    = 1,
4161                .dev_stripes    = 1,
4162                .devs_max       = 0,
4163                .devs_min       = 2,
4164                .devs_increment = 1,
4165                .ncopies        = 1,
4166        },
4167        [BTRFS_RAID_SINGLE] = {
4168                .sub_stripes    = 1,
4169                .dev_stripes    = 1,
4170                .devs_max       = 1,
4171                .devs_min       = 1,
4172                .devs_increment = 1,
4173                .ncopies        = 1,
4174        },
4175        [BTRFS_RAID_RAID5] = {
4176                .sub_stripes    = 1,
4177                .dev_stripes    = 1,
4178                .devs_max       = 0,
4179                .devs_min       = 2,
4180                .devs_increment = 1,
4181                .ncopies        = 2,
4182        },
4183        [BTRFS_RAID_RAID6] = {
4184                .sub_stripes    = 1,
4185                .dev_stripes    = 1,
4186                .devs_max       = 0,
4187                .devs_min       = 3,
4188                .devs_increment = 1,
4189                .ncopies        = 3,
4190        },
4191};
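/*
 * Example row: for RAID10, sub_stripes = 2 mirrors each stripe once,
 * devs_increment = 2 keeps the device count even, and ncopies = 2
 * halves the usable capacity, so a chunk across 6 devices gets
 * num_stripes = 6 and data_stripes = num_stripes / ncopies = 3.
 */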
4192
4193static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4194{
4195        /* TODO allow them to set a preferred stripe size */
4196        return 64 * 1024;
4197}
4198
4199static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4200{
4201        if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4202                return;
4203
4204        btrfs_set_fs_incompat(info, RAID56);
4205}
4206
4207#define BTRFS_MAX_DEVS(r) ((BTRFS_LEAF_DATA_SIZE(r)             \
4208                        - sizeof(struct btrfs_item)             \
4209                        - sizeof(struct btrfs_chunk))           \
4210                        / sizeof(struct btrfs_stripe) + 1)
4211
4212#define BTRFS_MAX_DEVS_SYS_CHUNK ((BTRFS_SYSTEM_CHUNK_ARRAY_SIZE        \
4213                                - 2 * sizeof(struct btrfs_disk_key)     \
4214                                - 2 * sizeof(struct btrfs_chunk))       \
4215                                / sizeof(struct btrfs_stripe) + 1)
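/*
 * The "+ 1" in both macros accounts for the first stripe embedded in
 * struct btrfs_chunk itself; BTRFS_MAX_DEVS_SYS_CHUNK additionally
 * leaves room in the 2 KiB sys_chunk_array for two key/chunk pairs.
 */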
4216
4217static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4218                               struct btrfs_root *extent_root, u64 start,
4219                               u64 type)
4220{
4221        struct btrfs_fs_info *info = extent_root->fs_info;
4222        struct btrfs_fs_devices *fs_devices = info->fs_devices;
4223        struct list_head *cur;
4224        struct map_lookup *map = NULL;
4225        struct extent_map_tree *em_tree;
4226        struct extent_map *em;
4227        struct btrfs_device_info *devices_info = NULL;
4228        u64 total_avail;
4229        int num_stripes;        /* total number of stripes to allocate */
4230        int data_stripes;       /* number of stripes that count for
4231                                   block group size */
4232        int sub_stripes;        /* sub_stripes info for map */
4233        int dev_stripes;        /* stripes per dev */
4234        int devs_max;           /* max devs to use */
4235        int devs_min;           /* min devs needed */
4236        int devs_increment;     /* ndevs has to be a multiple of this */
4237        int ncopies;            /* how many copies of the data */
4238        int ret;
4239        u64 max_stripe_size;
4240        u64 max_chunk_size;
4241        u64 stripe_size;
4242        u64 num_bytes;
4243        u64 raid_stripe_len = BTRFS_STRIPE_LEN;
4244        int ndevs;
4245        int i;
4246        int j;
4247        int index;
4248
4249        BUG_ON(!alloc_profile_is_valid(type, 0));
4250
4251        if (list_empty(&fs_devices->alloc_list))
4252                return -ENOSPC;
4253
4254        index = __get_raid_index(type);
4255
4256        sub_stripes = btrfs_raid_array[index].sub_stripes;
4257        dev_stripes = btrfs_raid_array[index].dev_stripes;
4258        devs_max = btrfs_raid_array[index].devs_max;
4259        devs_min = btrfs_raid_array[index].devs_min;
4260        devs_increment = btrfs_raid_array[index].devs_increment;
4261        ncopies = btrfs_raid_array[index].ncopies;
4262
4263        if (type & BTRFS_BLOCK_GROUP_DATA) {
4264                max_stripe_size = 1024 * 1024 * 1024;
4265                max_chunk_size = 10 * max_stripe_size;
4266                if (!devs_max)
4267                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4268        } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4269                /* for larger filesystems, use larger metadata chunks */
4270                if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
4271                        max_stripe_size = 1024 * 1024 * 1024;
4272                else
4273                        max_stripe_size = 256 * 1024 * 1024;
4274                max_chunk_size = max_stripe_size;
4275                if (!devs_max)
4276                        devs_max = BTRFS_MAX_DEVS(info->chunk_root);
4277        } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4278                max_stripe_size = 32 * 1024 * 1024;
4279                max_chunk_size = 2 * max_stripe_size;
4280                if (!devs_max)
4281                        devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
4282        } else {
4283                btrfs_err(info, "invalid chunk type 0x%llx requested",
4284                       type);
4285                BUG_ON(1);
4286        }
4287
4288        /* we don't want a chunk larger than 10% of writeable space */
4289        max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4290                             max_chunk_size);
4291
4292        devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
4293                               GFP_NOFS);
4294        if (!devices_info)
4295                return -ENOMEM;
4296
4297        cur = fs_devices->alloc_list.next;
4298
4299        /*
4300         * in the first pass through the devices list, we gather information
4301         * about the available holes on each device.
4302         */
4303        ndevs = 0;
4304        while (cur != &fs_devices->alloc_list) {
4305                struct btrfs_device *device;
4306                u64 max_avail;
4307                u64 dev_offset;
4308
4309                device = list_entry(cur, struct btrfs_device, dev_alloc_list);
4310
4311                cur = cur->next;
4312
4313                if (!device->writeable) {
4314                        WARN(1, KERN_ERR
4315                               "BTRFS: read-only device in alloc_list\n");
4316                        continue;
4317                }
4318
4319                if (!device->in_fs_metadata ||
4320                    device->is_tgtdev_for_dev_replace)
4321                        continue;
4322
4323                if (device->total_bytes > device->bytes_used)
4324                        total_avail = device->total_bytes - device->bytes_used;
4325                else
4326                        total_avail = 0;
4327
4328                /* If there is no space on this device, skip it. */
4329                if (total_avail == 0)
4330                        continue;
4331
4332                ret = find_free_dev_extent(trans, device,
4333                                           max_stripe_size * dev_stripes,
4334                                           &dev_offset, &max_avail);
4335                if (ret && ret != -ENOSPC)
4336                        goto error;
4337
4338                if (ret == 0)
4339                        max_avail = max_stripe_size * dev_stripes;
4340
4341                if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
4342                        continue;
4343
4344                if (ndevs == fs_devices->rw_devices) {
4345                        WARN(1, "%s: found more than %llu devices\n",
4346                             __func__, fs_devices->rw_devices);
4347                        break;
4348                }
4349                devices_info[ndevs].dev_offset = dev_offset;
4350                devices_info[ndevs].max_avail = max_avail;
4351                devices_info[ndevs].total_avail = total_avail;
4352                devices_info[ndevs].dev = device;
4353                ++ndevs;
4354        }
4355
4356        /*
4357         * now sort the devices by hole size / available space
4358         */
4359        sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
4360             btrfs_cmp_device_info, NULL);
4361
4362        /* round down to number of usable stripes */
4363        ndevs -= ndevs % devs_increment;
4364
4365        if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
4366                ret = -ENOSPC;
4367                goto error;
4368        }
4369
4370        if (devs_max && ndevs > devs_max)
4371                ndevs = devs_max;
4372        /*
4373         * the primary goal is to maximize the number of stripes, so use as many
4374         * devices as possible, even if the stripes are not maximum sized.
4375         */
4376        stripe_size = devices_info[ndevs-1].max_avail;
4377        num_stripes = ndevs * dev_stripes;
4378
4379        /*
4380         * this will have to be fixed for RAID1 and RAID10 over
4381         * more drives
4382         */
4383        data_stripes = num_stripes / ncopies;
4384
4385        if (type & BTRFS_BLOCK_GROUP_RAID5) {
4386                raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
4387                                 btrfs_super_stripesize(info->super_copy));
4388                data_stripes = num_stripes - 1;
4389        }
4390        if (type & BTRFS_BLOCK_GROUP_RAID6) {
4391                raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
4392                                 btrfs_super_stripesize(info->super_copy));
4393                data_stripes = num_stripes - 2;
4394        }
4395
4396        /*
4397         * Use the number of data stripes to figure out how big this chunk
4398         * is really going to be in terms of logical address space,
4399         * and compare that answer with the max chunk size
4400         */
4401        if (stripe_size * data_stripes > max_chunk_size) {
4402                u64 mask = (1ULL << 24) - 1;
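                    /*
                     * 1ULL << 24 is 16 MiB, so masking with ~mask below
                     * rounds stripe_size up to the next 16 MiB multiple
                     * (e.g. 20 MiB -> 32 MiB).
                     */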
4403                stripe_size = max_chunk_size;
4404                do_div(stripe_size, data_stripes);
4405
4406                /* bump the answer up to a 16MB boundary */
4407                stripe_size = (stripe_size + mask) & ~mask;
4408
4409                /* but don't go higher than the limits we found
4410                 * while searching for free extents
4411                 */
4412                if (stripe_size > devices_info[ndevs-1].max_avail)
4413                        stripe_size = devices_info[ndevs-1].max_avail;
4414        }
4415
4416        do_div(stripe_size, dev_stripes);
4417
4418        /* align to BTRFS_STRIPE_LEN */
4419        do_div(stripe_size, raid_stripe_len);
4420        stripe_size *= raid_stripe_len;
4421
4422        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
4423        if (!map) {
4424                ret = -ENOMEM;
4425                goto error;
4426        }
4427        map->num_stripes = num_stripes;
4428
4429        for (i = 0; i < ndevs; ++i) {
4430                for (j = 0; j < dev_stripes; ++j) {
4431                        int s = i * dev_stripes + j;
4432                        map->stripes[s].dev = devices_info[i].dev;
4433                        map->stripes[s].physical = devices_info[i].dev_offset +
4434                                                   j * stripe_size;
4435                }
4436        }
4437        map->sector_size = extent_root->sectorsize;
4438        map->stripe_len = raid_stripe_len;
4439        map->io_align = raid_stripe_len;
4440        map->io_width = raid_stripe_len;
4441        map->type = type;
4442        map->sub_stripes = sub_stripes;
4443
4444        num_bytes = stripe_size * data_stripes;
4445
4446        trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
4447
4448        em = alloc_extent_map();
4449        if (!em) {
4450                kfree(map);
4451                ret = -ENOMEM;
4452                goto error;
4453        }
4454        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
4455        em->bdev = (struct block_device *)map;
4456        em->start = start;
4457        em->len = num_bytes;
4458        em->block_start = 0;
4459        em->block_len = em->len;
4460        em->orig_block_len = stripe_size;
4461
4462        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4463        write_lock(&em_tree->lock);
4464        ret = add_extent_mapping(em_tree, em, 0);
4465        if (!ret) {
4466                list_add_tail(&em->list, &trans->transaction->pending_chunks);
4467                atomic_inc(&em->refs);
4468        }
4469        write_unlock(&em_tree->lock);
4470        if (ret) {
4471                free_extent_map(em);
4472                goto error;
4473        }
4474
4475        ret = btrfs_make_block_group(trans, extent_root, 0, type,
4476                                     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4477                                     start, num_bytes);
4478        if (ret)
4479                goto error_del_extent;
4480
4481        for (i = 0; i < map->num_stripes; i++) {
4482                num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
4483                btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
4484        }
4485
4486        spin_lock(&extent_root->fs_info->free_chunk_lock);
4487        extent_root->fs_info->free_chunk_space -= (stripe_size *
4488                                                   map->num_stripes);
4489        spin_unlock(&extent_root->fs_info->free_chunk_lock);
4490
4491        free_extent_map(em);
4492        check_raid56_incompat_flag(extent_root->fs_info, type);
4493
4494        kfree(devices_info);
4495        return 0;
4496
4497error_del_extent:
4498        write_lock(&em_tree->lock);
4499        remove_extent_mapping(em_tree, em);
4500        write_unlock(&em_tree->lock);
4501
4502        /* One for our allocation */
4503        free_extent_map(em);
4504        /* One for the tree reference */
4505        free_extent_map(em);
4506        /* One for the pending_chunks list reference */
4507        free_extent_map(em);
4508error:
4509        kfree(devices_info);
4510        return ret;
4511}
4512
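    /*
     * Second phase of chunk allocation: for each stripe of the chunk,
     * update the device item and allocate a dev extent, then build the
     * on-disk chunk item and insert it into the chunk tree.  SYSTEM
     * chunks are additionally appended to the superblock's
     * sys_chunk_array via btrfs_add_system_chunk().
     */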
4513int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
4514                                struct btrfs_root *extent_root,
4515                                u64 chunk_offset, u64 chunk_size)
4516{
4517        struct btrfs_key key;
4518        struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
4519        struct btrfs_device *device;
4520        struct btrfs_chunk *chunk;
4521        struct btrfs_stripe *stripe;
4522        struct extent_map_tree *em_tree;
4523        struct extent_map *em;
4524        struct map_lookup *map;
4525        size_t item_size;
4526        u64 dev_offset;
4527        u64 stripe_size;
4528        int i = 0;
4529        int ret;
4530
4531        em_tree = &extent_root->fs_info->mapping_tree.map_tree;
4532        read_lock(&em_tree->lock);
4533        em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
4534        read_unlock(&em_tree->lock);
4535
4536        if (!em) {
4537                btrfs_crit(extent_root->fs_info, "unable to find logical "
4538                           "%Lu len %Lu", chunk_offset, chunk_size);
4539                return -EINVAL;
4540        }
4541
4542        if (em->start != chunk_offset || em->len != chunk_size) {
4543                btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
4544                          " %Lu-%Lu, found %Lu-%Lu", chunk_offset,
4545                          chunk_size, em->start, em->len);
4546                free_extent_map(em);
4547                return -EINVAL;
4548        }
4549
4550        map = (struct map_lookup *)em->bdev;
4551        item_size = btrfs_chunk_item_size(map->num_stripes);
4552        stripe_size = em->orig_block_len;
4553
4554        chunk = kzalloc(item_size, GFP_NOFS);
4555        if (!chunk) {
4556                ret = -ENOMEM;
4557                goto out;
4558        }
4559
4560        for (i = 0; i < map->num_stripes; i++) {
4561                device = map->stripes[i].dev;
4562                dev_offset = map->stripes[i].physical;
4563
4564                ret = btrfs_update_device(trans, device);
4565                if (ret)
4566                        goto out;
4567                ret = btrfs_alloc_dev_extent(trans, device,
4568                                             chunk_root->root_key.objectid,
4569                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID,
4570                                             chunk_offset, dev_offset,
4571                                             stripe_size);
4572                if (ret)
4573                        goto out;
4574        }
4575
4576        stripe = &chunk->stripe;
4577        for (i = 0; i < map->num_stripes; i++) {
4578                device = map->stripes[i].dev;
4579                dev_offset = map->stripes[i].physical;
4580
4581                btrfs_set_stack_stripe_devid(stripe, device->devid);
4582                btrfs_set_stack_stripe_offset(stripe, dev_offset);
4583                memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
4584                stripe++;
4585        }
4586
4587        btrfs_set_stack_chunk_length(chunk, chunk_size);
4588        btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
4589        btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
4590        btrfs_set_stack_chunk_type(chunk, map->type);
4591        btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
4592        btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
4593        btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
4594        btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
4595        btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
4596
4597        key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
4598        key.type = BTRFS_CHUNK_ITEM_KEY;
4599        key.offset = chunk_offset;
4600
4601        ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
4602        if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
4603                /*
4604                 * TODO: Cleanup of inserted chunk root in case of
4605                 * failure.
4606                 */
4607                ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
4608                                             item_size);
4609        }
4610
4611out:
4612        kfree(chunk);
4613        free_extent_map(em);
4614        return ret;
4615}
4616
4617/*
4618 * Chunk allocation falls into two parts. The first part does the work
4619 * that makes the newly allocated chunk usable, but does not do any
4620 * operation that modifies the chunk tree. The second part does the work
4621 * that requires modifying the chunk tree. This division is important
4622 * for the bootstrap process of adding storage to a seed btrfs.
4623 */
4624int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
4625                      struct btrfs_root *extent_root, u64 type)
4626{
4627        u64 chunk_offset;
4628
4629        chunk_offset = find_next_chunk(extent_root->fs_info);
4630        return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
4631}
4632
4633static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
4634                                         struct btrfs_root *root,
4635                                         struct btrfs_device *device)
4636{
4637        u64 chunk_offset;
4638        u64 sys_chunk_offset;
4639        u64 alloc_profile;
4640        struct btrfs_fs_info *fs_info = root->fs_info;
4641        struct btrfs_root *extent_root = fs_info->extent_root;
4642        int ret;
4643
4644        chunk_offset = find_next_chunk(fs_info);
4645        alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
4646        ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
4647                                  alloc_profile);
4648        if (ret)
4649                return ret;
4650
4651        sys_chunk_offset = find_next_chunk(fs_info);
4652        alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
4653        ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
4654                                  alloc_profile);
4655        return ret;
4656}
4657
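    /*
     * How many device failures a chunk of this type can tolerate:
     * one for the profiles that keep a second copy or single parity
     * (RAID1, RAID10, RAID5, DUP), two for RAID6, zero otherwise.
     */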
4658static inline int btrfs_chunk_max_errors(struct map_lookup *map)
4659{
4660        int max_errors;
4661
4662        if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
4663                         BTRFS_BLOCK_GROUP_RAID10 |
4664                         BTRFS_BLOCK_GROUP_RAID5 |
4665                         BTRFS_BLOCK_GROUP_DUP)) {
4666                max_errors = 1;
4667        } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
4668                max_errors = 2;
4669        } else {
4670                max_errors = 0;
4671        }
4672
4673        return max_errors;
4674}
4675
4676int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
4677{
4678        struct extent_map *em;
4679        struct map_lookup *map;
4680        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
4681        int readonly = 0;
4682        int miss_ndevs = 0;
4683        int i;
4684
4685        read_lock(&map_tree->map_tree.lock);
4686        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
4687        read_unlock(&map_tree->map_tree.lock);
4688        if (!em)
4689                return 1;
4690
4691        map = (struct map_lookup *)em->bdev;
4692        for (i = 0; i < map->num_stripes; i++) {
4693                if (map->stripes[i].dev->missing) {
4694                        miss_ndevs++;
4695                        continue;
4696                }
4697
4698                if (!map->stripes[i].dev->writeable) {
4699                        readonly = 1;
4700                        goto end;
4701                }
4702        }
4703
4704        /*
4705         * If the number of missing devices is larger than max errors,
4706         * we cannot write the data into that chunk successfully, so
4707         * mark it read-only.
4708         */
4709        if (miss_ndevs > btrfs_chunk_max_errors(map))
4710                readonly = 1;
4711end:
4712        free_extent_map(em);
4713        return readonly;
4714}
4715
4716void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
4717{
4718        extent_map_tree_init(&tree->map_tree);
4719}
4720
4721void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
4722{
4723        struct extent_map *em;
4724
4725        while (1) {
4726                write_lock(&tree->map_tree.lock);
4727                em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
4728                if (em)
4729                        remove_extent_mapping(&tree->map_tree, em);
4730                write_unlock(&tree->map_tree.lock);
4731                if (!em)
4732                        break;
4733                /* once for us */
4734                free_extent_map(em);
4735                /* once for the tree */
4736                free_extent_map(em);
4737        }
4738}
4739
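    /*
     * How many copies of the block at @logical can be read: num_stripes
     * for DUP/RAID1, sub_stripes for RAID10, 2 for RAID5 (the data plus
     * reconstruction from parity), 3 for RAID6, and 1 otherwise.  An
     * ongoing dev-replace adds one because the target drive can serve
     * reads for ranges it has already copied.
     */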
4740int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
4741{
4742        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4743        struct extent_map *em;
4744        struct map_lookup *map;
4745        struct extent_map_tree *em_tree = &map_tree->map_tree;
4746        int ret;
4747
4748        read_lock(&em_tree->lock);
4749        em = lookup_extent_mapping(em_tree, logical, len);
4750        read_unlock(&em_tree->lock);
4751
4752        /*
4753         * We could return errors for these cases, but that could get ugly
4754         * and we'd probably handle them the same way anyway: do nothing
4755         * else and exit.  So return 1 so the callers don't try other copies.
4756         */
4757        if (!em) {
4758                btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical,
4759                            logical+len);
4760                return 1;
4761        }
4762
4763        if (em->start > logical || em->start + em->len < logical) {
4764                btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
4765                            "%Lu-%Lu", logical, logical+len, em->start,
4766                            em->start + em->len);
4767                free_extent_map(em);
4768                return 1;
4769        }
4770
4771        map = (struct map_lookup *)em->bdev;
4772        if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
4773                ret = map->num_stripes;
4774        else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
4775                ret = map->sub_stripes;
4776        else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
4777                ret = 2;
4778        else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
4779                ret = 3;
4780        else
4781                ret = 1;
4782        free_extent_map(em);
4783
4784        btrfs_dev_replace_lock(&fs_info->dev_replace);
4785        if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
4786                ret++;
4787        btrfs_dev_replace_unlock(&fs_info->dev_replace);
4788
4789        return ret;
4790}
4791
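    /*
     * Length of a full stripe for the chunk containing @logical: for
     * RAID5/6 this is stripe_len times the number of data stripes
     * (e.g. 64 KiB * 3 = 192 KiB for RAID5 across four devices); for
     * all other profiles it is just one sector.
     */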
4792unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4793                                    struct btrfs_mapping_tree *map_tree,
4794                                    u64 logical)
4795{
4796        struct extent_map *em;
4797        struct map_lookup *map;
4798        struct extent_map_tree *em_tree = &map_tree->map_tree;
4799        unsigned long len = root->sectorsize;
4800
4801        read_lock(&em_tree->lock);
4802        em = lookup_extent_mapping(em_tree, logical, len);
4803        read_unlock(&em_tree->lock);
4804        BUG_ON(!em);
4805
4806        BUG_ON(em->start > logical || em->start + em->len < logical);
4807        map = (struct map_lookup *)em->bdev;
4808        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4809                len = map->stripe_len * nr_data_stripes(map);
4810        free_extent_map(em);
4811        return len;
4812}
4813
4814int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4815                           u64 logical, u64 len, int mirror_num)
4816{
4817        struct extent_map *em;
4818        struct map_lookup *map;
4819        struct extent_map_tree *em_tree = &map_tree->map_tree;
4820        int ret = 0;
4821
4822        read_lock(&em_tree->lock);
4823        em = lookup_extent_mapping(em_tree, logical, len);
4824        read_unlock(&em_tree->lock);
4825        BUG_ON(!em);
4826
4827        BUG_ON(em->start > logical || em->start + em->len < logical);
4828        map = (struct map_lookup *)em->bdev;
4829        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4830                ret = 1;
4831        free_extent_map(em);
4832        return ret;
4833}
4834
4835static int find_live_mirror(struct btrfs_fs_info *fs_info,
4836                            struct map_lookup *map, int first, int num,
4837                            int optimal, int dev_replace_is_ongoing)
4838{
4839        int i;
4840        int tolerance;
4841        struct btrfs_device *srcdev;
4842
4843        if (dev_replace_is_ongoing &&
4844            fs_info->dev_replace.cont_reading_from_srcdev_mode ==
4845             BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
4846                srcdev = fs_info->dev_replace.srcdev;
4847        else
4848                srcdev = NULL;
4849
4850        /*
4851         * try to avoid the drive that is the source drive for a
4852         * dev-replace procedure; only choose it if no other non-missing
4853         * mirror is available
4854         */
4855        for (tolerance = 0; tolerance < 2; tolerance++) {
4856                if (map->stripes[optimal].dev->bdev &&
4857                    (tolerance || map->stripes[optimal].dev != srcdev))
4858                        return optimal;
4859                for (i = first; i < first + num; i++) {
4860                        if (map->stripes[i].dev->bdev &&
4861                            (tolerance || map->stripes[i].dev != srcdev))
4862                                return i;
4863                }
4864        }
4865
4866        /* we couldn't find a mirror that doesn't fail.  Just return something
4867         * and the I/O error handling code will clean up eventually
4868         */
4869        return optimal;
4870}
4871
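    /*
     * The raid_map entries for parity stripes are the sentinels
     * RAID5_P_STRIPE and RAID6_Q_STRIPE, the two largest u64 values, so
     * sorting the stripes by ascending raid_map address (below) moves
     * parity behind all data stripes.
     */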
4872static inline int parity_smaller(u64 a, u64 b)
4873{
4874        return a > b;
4875}
4876
4877/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4878static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
4879{
4880        struct btrfs_bio_stripe s;
4881        int i;
4882        u64 l;
4883        int again = 1;
4884
4885        while (again) {
4886                again = 0;
4887                for (i = 0; i < num_stripes - 1; i++) {
4888                        if (parity_smaller(bbio->raid_map[i],
4889                                           bbio->raid_map[i+1])) {
4890                                s = bbio->stripes[i];
4891                                l = bbio->raid_map[i];
4892                                bbio->stripes[i] = bbio->stripes[i+1];
4893                                bbio->raid_map[i] = bbio->raid_map[i+1];
4894                                bbio->stripes[i+1] = s;
4895                                bbio->raid_map[i+1] = l;
4896
4897                                again = 1;
4898                        }
4899                }
4900        }
4901}
4902
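    /*
     * A btrfs_bio is allocated as one block of memory laid out as
     * [struct btrfs_bio][stripes][tgtdev_map][raid_map]; the callers
     * carve the tgtdev_map and raid_map pointers out of the tail when
     * they are needed (see __btrfs_map_block()).
     */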
4903static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
4904{
4905        struct btrfs_bio *bbio = kzalloc(
4906                 /* the size of the btrfs_bio */
4907                sizeof(struct btrfs_bio) +
4908                /* plus the variable array for the stripes */
4909                sizeof(struct btrfs_bio_stripe) * (total_stripes) +
4910                /* plus the variable array for the tgt dev */
4911                sizeof(int) * (real_stripes) +
4912                /*
4913                 * plus the raid_map, which includes both the tgt dev
4914                 * and the stripes
4915                 */
4916                sizeof(u64) * (total_stripes),
4917                GFP_NOFS);
4918        if (!bbio)
4919                return NULL;
4920
4921        atomic_set(&bbio->error, 0);
4922        atomic_set(&bbio->refs, 1);
4923
4924        return bbio;
4925}
4926
4927void btrfs_get_bbio(struct btrfs_bio *bbio)
4928{
4929        WARN_ON(!atomic_read(&bbio->refs));
4930        atomic_inc(&bbio->refs);
4931}
4932
4933void btrfs_put_bbio(struct btrfs_bio *bbio)
4934{
4935        if (!bbio)
4936                return;
4937        if (atomic_dec_and_test(&bbio->refs))
4938                kfree(bbio);
4939}
4940
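    /*
     * Core mapping routine: translate a logical range into the set of
     * physical stripes that back it.  *length is trimmed to what can be
     * handled in one go, and when @bbio_ret is non-NULL a btrfs_bio is
     * returned describing the target devices and physical offsets (plus
     * the raid_map for RAID5/6 when @need_raid_map is set, and extra
     * target-device stripes while a dev-replace is running).
     */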
4941static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4942                             u64 logical, u64 *length,
4943                             struct btrfs_bio **bbio_ret,
4944                             int mirror_num, int need_raid_map)
4945{
4946        struct extent_map *em;
4947        struct map_lookup *map;
4948        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
4949        struct extent_map_tree *em_tree = &map_tree->map_tree;
4950        u64 offset;
4951        u64 stripe_offset;
4952        u64 stripe_end_offset;
4953        u64 stripe_nr;
4954        u64 stripe_nr_orig;
4955        u64 stripe_nr_end;
4956        u64 stripe_len;
4957        int stripe_index;
4958        int i;
4959        int ret = 0;
4960        int num_stripes;
4961        int max_errors = 0;
4962        int tgtdev_indexes = 0;
4963        struct btrfs_bio *bbio = NULL;
4964        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
4965        int dev_replace_is_ongoing = 0;
4966        int num_alloc_stripes;
4967        int patch_the_first_stripe_for_dev_replace = 0;
4968        u64 physical_to_patch_in_first_stripe = 0;
4969        u64 raid56_full_stripe_start = (u64)-1;
4970
4971        read_lock(&em_tree->lock);
4972        em = lookup_extent_mapping(em_tree, logical, *length);
4973        read_unlock(&em_tree->lock);
4974
4975        if (!em) {
4976                btrfs_crit(fs_info, "unable to find logical %llu len %llu",
4977                        logical, *length);
4978                return -EINVAL;
4979        }
4980
4981        if (em->start > logical || em->start + em->len < logical) {
4982                btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
4983                           "found %Lu-%Lu", logical, em->start,
4984                           em->start + em->len);
4985                free_extent_map(em);
4986                return -EINVAL;
4987        }
4988
4989        map = (struct map_lookup *)em->bdev;
4990        offset = logical - em->start;
4991
4992        stripe_len = map->stripe_len;
4993        stripe_nr = offset;
4994        /*
4995         * stripe_nr counts the total number of stripes we have to stride
4996         * to get to this block
4997         */
4998        do_div(stripe_nr, stripe_len);
4999
5000        stripe_offset = stripe_nr * stripe_len;
5001        BUG_ON(offset < stripe_offset);
5002
5003        /* stripe_offset is the offset of this block in its stripe */
5004        stripe_offset = offset - stripe_offset;
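            /*
             * Example: with stripe_len = 64 KiB and offset = 200 KiB,
             * stripe_nr is 3 and stripe_offset is 8 KiB: the block sits
             * 8 KiB into the fourth stripe.
             */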
5005
5006        /* if we're here for raid56, we need to know the stripe aligned start */
5007        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5008                unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
5009                raid56_full_stripe_start = offset;
5010
5011                /* allow a write of a full stripe, but make sure we don't
5012                 * allow straddling of stripes
5013                 */
5014                do_div(raid56_full_stripe_start, full_stripe_len);
5015                raid56_full_stripe_start *= full_stripe_len;
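                    /*
                     * e.g. with 3 data stripes of 64 KiB the full stripe
                     * is 192 KiB, so an offset of 200 KiB rounds down to
                     * a full stripe start of 192 KiB.
                     */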
5016        }
5017
5018        if (rw & REQ_DISCARD) {
5019                /* we don't discard raid56 yet */
5020                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5021                        ret = -EOPNOTSUPP;
5022                        goto out;
5023                }
5024                *length = min_t(u64, em->len - offset, *length);
5025        } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
5026                u64 max_len;
5027                /* For writes to RAID[56], allow a full stripe set across all
5028                 * disks.  For other RAID types and for RAID[56] reads, just
5029                 * allow a single stripe (on a single disk). */
5030                if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5031                    (rw & REQ_WRITE)) {
5032                        max_len = stripe_len * nr_data_stripes(map) -
5033                                (offset - raid56_full_stripe_start);
5034                } else {
5035                        /* we limit the length of each bio to what fits in a stripe */
5036                        max_len = stripe_len - stripe_offset;
5037                }
5038                *length = min_t(u64, em->len - offset, max_len);
5039        } else {
5040                *length = em->len - offset;
5041        }
5042
5043        /* This is for when we're called from btrfs_merge_bio_hook() and all
5044         * it cares about is the length */
5045        if (!bbio_ret)
5046                goto out;
5047
5048        btrfs_dev_replace_lock(dev_replace);
5049        dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
5050        if (!dev_replace_is_ongoing)
5051                btrfs_dev_replace_unlock(dev_replace);
5052
5053        if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
5054            !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
5055            dev_replace->tgtdev != NULL) {
5056                /*
5057                 * In the dev-replace case, for the repair case (which is the
5058                 * only case where the mirror is selected explicitly when
5059                 * calling btrfs_map_block), blocks left of the left cursor
5060                 * can also be read from the target drive.
5061                 * For REQ_GET_READ_MIRRORS, the target drive is added as
5062                 * the last one to the array of stripes. For READ, it also
5063                 * needs to be supported using the same mirror number.
5064                 * If the requested block is not left of the left cursor,
5065                 * EIO is returned. This can happen because btrfs_num_copies()
5066                 * returns one more in the dev-replace case.
5067                 */
5068                u64 tmp_length = *length;
5069                struct btrfs_bio *tmp_bbio = NULL;
5070                int tmp_num_stripes;
5071                u64 srcdev_devid = dev_replace->srcdev->devid;
5072                int index_srcdev = 0;
5073                int found = 0;
5074                u64 physical_of_found = 0;
5075
5076                ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
5077                             logical, &tmp_length, &tmp_bbio, 0, 0);
5078                if (ret) {
5079                        WARN_ON(tmp_bbio != NULL);
5080                        goto out;
5081                }
5082
5083                tmp_num_stripes = tmp_bbio->num_stripes;
5084                if (mirror_num > tmp_num_stripes) {
5085                        /*
5086                         * REQ_GET_READ_MIRRORS does not contain this
5087                         * mirror, that means that the requested area
5088                         * is not left of the left cursor
5089                         */
5090                        ret = -EIO;
5091                        btrfs_put_bbio(tmp_bbio);
5092                        goto out;
5093                }
5094
5095                /*
5096                 * Process the rest of the function using the mirror_num
5097                 * of the source drive; therefore look it up first.
5098                 * At the end, patch the device pointer to that of the
5099                 * target drive.
5100                 */
5101                for (i = 0; i < tmp_num_stripes; i++) {
5102                        if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
5103                                /*
5104                                 * In case of DUP, in order to keep it
5105                                 * simple, only add the mirror with the
5106                                 * lowest physical address
5107                                 */
5108                                if (found &&
5109                                    physical_of_found <=
5110                                     tmp_bbio->stripes[i].physical)
5111                                        continue;
5112                                index_srcdev = i;
5113                                found = 1;
5114                                physical_of_found =
5115                                        tmp_bbio->stripes[i].physical;
5116                        }
5117                }
5118
5119                if (found) {
5120                        mirror_num = index_srcdev + 1;
5121                        patch_the_first_stripe_for_dev_replace = 1;
5122                        physical_to_patch_in_first_stripe = physical_of_found;
5123                } else {
5124                        WARN_ON(1);
5125                        ret = -EIO;
5126                        btrfs_put_bbio(tmp_bbio);
5127                        goto out;
5128                }
5129
5130                btrfs_put_bbio(tmp_bbio);
5131        } else if (mirror_num > map->num_stripes) {
5132                mirror_num = 0;
5133        }
5134
5135        num_stripes = 1;
5136        stripe_index = 0;
5137        stripe_nr_orig = stripe_nr;
5138        stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
5139        do_div(stripe_nr_end, map->stripe_len);
5140        stripe_end_offset = stripe_nr_end * map->stripe_len -
5141                            (offset + *length);
5142
5143        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5144                if (rw & REQ_DISCARD)
5145                        num_stripes = min_t(u64, map->num_stripes,
5146                                            stripe_nr_end - stripe_nr_orig);
5147                stripe_index = do_div(stripe_nr, map->num_stripes);
5148                if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)))
5149                        mirror_num = 1;
5150        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
5151                if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
5152                        num_stripes = map->num_stripes;
5153                else if (mirror_num)
5154                        stripe_index = mirror_num - 1;
5155                else {
5156                        stripe_index = find_live_mirror(fs_info, map, 0,
5157                                            map->num_stripes,
5158                                            current->pid % map->num_stripes,
5159                                            dev_replace_is_ongoing);
5160                        mirror_num = stripe_index + 1;
5161                }
5162
5163        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
5164                if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
5165                        num_stripes = map->num_stripes;
5166                } else if (mirror_num) {
5167                        stripe_index = mirror_num - 1;
5168                } else {
5169                        mirror_num = 1;
5170                }
5171
5172        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5173                int factor = map->num_stripes / map->sub_stripes;
5174
5175                stripe_index = do_div(stripe_nr, factor);
5176                stripe_index *= map->sub_stripes;
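                    /*
                     * e.g. with num_stripes = 4 and sub_stripes = 2, factor
                     * is 2: the division's remainder picks which mirror pair
                     * holds the block (scaled by sub_stripes to index its
                     * first stripe) and the quotient says how many stripes
                     * to stride on the chosen devices.
                     */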
5177
5178                if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5179                        num_stripes = map->sub_stripes;
5180                else if (rw & REQ_DISCARD)
5181                        num_stripes = min_t(u64, map->sub_stripes *
5182                                            (stripe_nr_end - stripe_nr_orig),
5183                                            map->num_stripes);
5184                else if (mirror_num)
5185                        stripe_index += mirror_num - 1;
5186                else {
5187                        int old_stripe_index = stripe_index;
5188                        stripe_index = find_live_mirror(fs_info, map,
5189                                              stripe_index,
5190                                              map->sub_stripes, stripe_index +
5191                                              current->pid % map->sub_stripes,
5192                                              dev_replace_is_ongoing);
5193                        mirror_num = stripe_index - old_stripe_index + 1;
5194                }
5195
5196        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5197                if (need_raid_map &&
5198                    ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5199                     mirror_num > 1)) {
5200                        /* push stripe_nr back to the start of the full stripe */
5201                        stripe_nr = raid56_full_stripe_start;
5202                        do_div(stripe_nr, stripe_len * nr_data_stripes(map));
5203
5204                        /* RAID[56] write or recovery. Return all stripes */
5205                        num_stripes = map->num_stripes;
5206                        max_errors = nr_parity_stripes(map);
5207
5208                        *length = map->stripe_len;
5209                        stripe_index = 0;
5210                        stripe_offset = 0;
5211                } else {
5212                        u64 tmp;
5213
5214                        /*
5215                         * Mirror #0 or #1 means the original data block.
5216                         * Mirror #2 is RAID5 parity block.
5217                         * Mirror #3 is RAID6 Q block.
5218                         */
5219                        stripe_index = do_div(stripe_nr, nr_data_stripes(map));
5220                        if (mirror_num > 1)
5221                                stripe_index = nr_data_stripes(map) +
5222                                                mirror_num - 2;
5223
5224                        /* We distribute the parity blocks across stripes */
5225                        tmp = stripe_nr + stripe_index;
5226                        stripe_index = do_div(tmp, map->num_stripes);
5227                        if (!(rw & (REQ_WRITE | REQ_DISCARD |
5228                                    REQ_GET_READ_MIRRORS)) && mirror_num <= 1)
5229                                mirror_num = 1;
5230                }
5231        } else {
5232                /*
5233                 * after this do_div call, stripe_nr is the number of stripes
5234                 * on this device we have to walk to find the data, and
5235                 * stripe_index is the number of our device in the stripe array
5236                 */
5237                stripe_index = do_div(stripe_nr, map->num_stripes);
5238                mirror_num = stripe_index + 1;
5239        }
5240        BUG_ON(stripe_index >= map->num_stripes);
5241
5242        num_alloc_stripes = num_stripes;
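            /*
             * While a dev-replace runs, writes are duplicated to the
             * target device and REQ_GET_READ_MIRRORS reports the target
             * as one extra mirror, so reserve stripe slots for both
             * cases before allocating the bbio.
             */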
5243        if (dev_replace_is_ongoing) {
5244                if (rw & (REQ_WRITE | REQ_DISCARD))
5245                        num_alloc_stripes <<= 1;
5246                if (rw & REQ_GET_READ_MIRRORS)
5247                        num_alloc_stripes++;
5248                tgtdev_indexes = num_stripes;
5249        }
5250
5251        bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5252        if (!bbio) {
5253                ret = -ENOMEM;
5254                goto out;
5255        }
5256        if (dev_replace_is_ongoing)
5257                bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5258
5259        /* build raid_map */
5260        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
5261            need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5262            mirror_num > 1)) {
5263                u64 tmp;
5264                int i, rot;
5265
5266                bbio->raid_map = (u64 *)((void *)bbio->stripes +
5267                                 sizeof(struct btrfs_bio_stripe) *
5268                                 num_alloc_stripes +
5269                                 sizeof(int) * tgtdev_indexes);
5270
5271                /* Work out the disk rotation on this stripe-set */
5272                tmp = stripe_nr;
5273                rot = do_div(tmp, num_stripes);
5274
5275                /* Fill in the logical address of each stripe */
5276                tmp = stripe_nr * nr_data_stripes(map);
5277                for (i = 0; i < nr_data_stripes(map); i++)
5278                        bbio->raid_map[(i+rot) % num_stripes] =
5279                                em->start + (tmp + i) * map->stripe_len;
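                    /*
                     * e.g. for RAID5 over four devices (3 data + P) with
                     * rot == 1, the data stripes land in slots 1..3 and the
                     * P stripe wraps around into slot 0: [P, d0, d1, d2].
                     */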
5280
5281                bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5282                if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5283                        bbio->raid_map[(i+rot+1) % num_stripes] =
5284                                RAID6_Q_STRIPE;
5285        }
5286
5287        if (rw & REQ_DISCARD) {
5288                int factor = 0;
5289                int sub_stripes = 0;
5290                u64 stripes_per_dev = 0;
5291                u32 remaining_stripes = 0;
5292                u32 last_stripe = 0;
5293
5294                if (map->type &
5295                    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
5296                        if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5297                                sub_stripes = 1;
5298                        else
5299                                sub_stripes = map->sub_stripes;
5300
5301                        factor = map->num_stripes / sub_stripes;
5302                        stripes_per_dev = div_u64_rem(stripe_nr_end -
5303                                                      stripe_nr_orig,
5304                                                      factor,
5305                                                      &remaining_stripes);
5306                        div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5307                        last_stripe *= sub_stripes;
5308                }
5309
5310                for (i = 0; i < num_stripes; i++) {
5311                        bbio->stripes[i].physical =
5312                                map->stripes[stripe_index].physical +
5313                                stripe_offset + stripe_nr * map->stripe_len;
5314                        bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5315
5316                        if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5317                                         BTRFS_BLOCK_GROUP_RAID10)) {
5318                                bbio->stripes[i].length = stripes_per_dev *
5319                                                          map->stripe_len;
5320
5321                                if (i / sub_stripes < remaining_stripes)
5322                                        bbio->stripes[i].length +=
5323                                                map->stripe_len;
5324
5325                                /*
5326                                 * Special for the first stripe and
5327                                 * the last stripe:
5328                                 *
5329                                 * |-------|...|-------|
5330                                 *     |----------|
5331                                 *    off     end_off
5332                                 */
5333                                if (i < sub_stripes)
5334                                        bbio->stripes[i].length -=
5335                                                stripe_offset;
5336
5337                                if (stripe_index >= last_stripe &&
5338                                    stripe_index <= (last_stripe +
5339                                                     sub_stripes - 1))
5340                                        bbio->stripes[i].length -=
5341                                                stripe_end_offset;
5342
5343                                if (i == sub_stripes - 1)
5344                                        stripe_offset = 0;
5345                        } else
5346                                bbio->stripes[i].length = *length;
5347
5348                        stripe_index++;
5349                        if (stripe_index == map->num_stripes) {
5350                                /* This could only happen for RAID0/10 */
5351                                stripe_index = 0;
5352                                stripe_nr++;
5353                        }
5354                }
5355        } else {
5356                for (i = 0; i < num_stripes; i++) {
5357                        bbio->stripes[i].physical =
5358                                map->stripes[stripe_index].physical +
5359                                stripe_offset +
5360                                stripe_nr * map->stripe_len;
5361                        bbio->stripes[i].dev =
5362                                map->stripes[stripe_index].dev;
5363                        stripe_index++;
5364                }
5365        }
5366
5367        if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5368                max_errors = btrfs_chunk_max_errors(map);
5369
5370        if (bbio->raid_map)
5371                sort_parity_stripes(bbio, num_stripes);
5372
5373        tgtdev_indexes = 0;
5374        if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5375            dev_replace->tgtdev != NULL) {
5376                int index_where_to_add;
5377                u64 srcdev_devid = dev_replace->srcdev->devid;
5378
5379                /*
5380                 * duplicate the write operations while the dev replace
5381                 * procedure is running. Since the copying of the old disk
5382                 * to the new disk takes place at run time while the
5383                 * filesystem is mounted writable, the regular write
5384                 * operations to the old disk have to be duplicated to go
5385                 * to the new disk as well.
5386                 * Note that device->missing is handled by the caller, and
5387                 * that the write to the old disk is already set up in the
5388                 * stripes array.
5389                 */
5390                index_where_to_add = num_stripes;
5391                for (i = 0; i < num_stripes; i++) {
5392                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5393                                /* write to new disk, too */
5394                                struct btrfs_bio_stripe *new =
5395                                        bbio->stripes + index_where_to_add;
5396                                struct btrfs_bio_stripe *old =
5397                                        bbio->stripes + i;
5398
5399                                new->physical = old->physical;
5400                                new->length = old->length;
5401                                new->dev = dev_replace->tgtdev;
5402                                bbio->tgtdev_map[i] = index_where_to_add;
5403                                index_where_to_add++;
5404                                max_errors++;
5405                                tgtdev_indexes++;
5406                        }
5407                }
5408                num_stripes = index_where_to_add;
5409        } else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
5410                   dev_replace->tgtdev != NULL) {
5411                u64 srcdev_devid = dev_replace->srcdev->devid;
5412                int index_srcdev = 0;
5413                int found = 0;
5414                u64 physical_of_found = 0;
5415
5416                /*
5417                 * During the dev-replace procedure, the target drive can
5418                 * also be used to read data in case it is needed to repair
5419                 * a corrupt block elsewhere. This is possible if the
5420                 * requested area is left of the left cursor. In this area,
5421                 * the target drive is a full copy of the source drive.
5422                 */
5423                for (i = 0; i < num_stripes; i++) {
5424                        if (bbio->stripes[i].dev->devid == srcdev_devid) {
5425                                /*
5426                                 * In case of DUP, in order to keep it
5427                                 * simple, only add the mirror with the
5428                                 * lowest physical address
5429                                 */
5430                                if (found &&
5431                                    physical_of_found <=
5432                                     bbio->stripes[i].physical)
5433                                        continue;
5434                                index_srcdev = i;
5435                                found = 1;
5436                                physical_of_found = bbio->stripes[i].physical;
5437                        }
5438                }
5439                if (found) {
5440                        u64 length = map->stripe_len;
5441
5442                        if (physical_of_found + length <=
5443                            dev_replace->cursor_left) {
5444                                struct btrfs_bio_stripe *tgtdev_stripe =
5445                                        bbio->stripes + num_stripes;
5446
5447                                tgtdev_stripe->physical = physical_of_found;
5448                                tgtdev_stripe->length =
5449                                        bbio->stripes[index_srcdev].length;
5450                                tgtdev_stripe->dev = dev_replace->tgtdev;
5451                                bbio->tgtdev_map[index_srcdev] = num_stripes;
5452
5453                                tgtdev_indexes++;
5454                                num_stripes++;
5455                        }
5456                }
5457        }
5458
5459        *bbio_ret = bbio;
5460        bbio->map_type = map->type;
5461        bbio->num_stripes = num_stripes;
5462        bbio->max_errors = max_errors;
5463        bbio->mirror_num = mirror_num;
5464        bbio->num_tgtdevs = tgtdev_indexes;
5465
5466        /*
5467         * this is the case where REQ_READ && dev_replace_is_ongoing &&
5468         * mirror_num == num_stripes + 1 && the dev_replace target drive is
5469         * available as a mirror
5470         */
5471        if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
5472                WARN_ON(num_stripes > 1);
5473                bbio->stripes[0].dev = dev_replace->tgtdev;
5474                bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5475                bbio->mirror_num = map->num_stripes + 1;
5476        }
5477out:
5478        if (dev_replace_is_ongoing)
5479                btrfs_dev_replace_unlock(dev_replace);
5480        free_extent_map(em);
5481        return ret;
5482}
5483
5484int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5485                      u64 logical, u64 *length,
5486                      struct btrfs_bio **bbio_ret, int mirror_num)
5487{
5488        return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5489                                 mirror_num, 0);
5490}
5491
5492/* For Scrub/replace */
5493int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5494                     u64 logical, u64 *length,
5495                     struct btrfs_bio **bbio_ret, int mirror_num,
5496                     int need_raid_map)
5497{
5498        return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5499                                 mirror_num, need_raid_map);
5500}
5501
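    /*
     * Reverse mapping: find the logical addresses that are backed by the
     * physical byte @physical on device @devid (0 matches any device)
     * within the chunk starting at @chunk_start.  The addresses are
     * returned in *@logical, their count in *@naddrs, and the logical
     * stripe length in *@stripe_len.
     */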
5502int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5503                     u64 chunk_start, u64 physical, u64 devid,
5504                     u64 **logical, int *naddrs, int *stripe_len)
5505{
5506        struct extent_map_tree *em_tree = &map_tree->map_tree;
5507        struct extent_map *em;
5508        struct map_lookup *map;
5509        u64 *buf;
5510        u64 bytenr;
5511        u64 length;
5512        u64 stripe_nr;
5513        u64 rmap_len;
5514        int i, j, nr = 0;
5515
5516        read_lock(&em_tree->lock);
5517        em = lookup_extent_mapping(em_tree, chunk_start, 1);
5518        read_unlock(&em_tree->lock);
5519
5520        if (!em) {
5521                printk(KERN_ERR "BTRFS: couldn't find em for chunk %Lu\n",
5522                       chunk_start);
5523                return -EIO;
5524        }
5525
5526        if (em->start != chunk_start) {
5527                printk(KERN_ERR "BTRFS: bad chunk start, em=%Lu, wanted=%Lu\n",
5528                       em->start, chunk_start);
5529                free_extent_map(em);
5530                return -EIO;
5531        }
5532        map = (struct map_lookup *)em->bdev;
5533
5534        length = em->len;
5535        rmap_len = map->stripe_len;
5536
5537        if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5538                do_div(length, map->num_stripes / map->sub_stripes);
5539        else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5540                do_div(length, map->num_stripes);
5541        else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5542                do_div(length, nr_data_stripes(map));
5543                rmap_len = map->stripe_len * nr_data_stripes(map);
5544        }
5545
5546        buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
5547        BUG_ON(!buf); /* -ENOMEM */
5548
5549        for (i = 0; i < map->num_stripes; i++) {
5550                if (devid && map->stripes[i].dev->devid != devid)
5551                        continue;
5552                if (map->stripes[i].physical > physical ||
5553                    map->stripes[i].physical + length <= physical)
5554                        continue;
5555
5556                stripe_nr = physical - map->stripes[i].physical;
5557                do_div(stripe_nr, map->stripe_len);
5558
5559                if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
5560                        stripe_nr = stripe_nr * map->num_stripes + i;
5561                        do_div(stripe_nr, map->sub_stripes);
5562                } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
5563                        stripe_nr = stripe_nr * map->num_stripes + i;
5564                } /* else if RAID[56], multiply by nr_data_stripes().
5565                   * Alternatively, just use rmap_len below instead of
5566                   * map->stripe_len */
5567
5568                bytenr = chunk_start + stripe_nr * rmap_len;
5569                WARN_ON(nr >= map->num_stripes);
5570                for (j = 0; j < nr; j++) {
5571                        if (buf[j] == bytenr)
5572                                break;
5573                }
5574                if (j == nr) {
5575                        WARN_ON(nr >= map->num_stripes);
5576                        buf[nr++] = bytenr;
5577                }
5578        }
5579
5580        *logical = buf;
5581        *naddrs = nr;
5582        *stripe_len = rmap_len;
5583
5584        free_extent_map(em);
5585        return 0;
5586}
5587
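    /*
     * Complete the upper-layer bio for this bbio and drop our bbio
     * reference.  When BTRFS_BIO_ORIG_BIO_SUBMITTED is set, the original
     * bio was itself submitted down the stack, so bi_remaining must not
     * be decremented again here; hence bio_endio_nodec().
     */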
5588static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err)
5589{
5590        if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED))
5591                bio_endio_nodec(bio, err);
5592        else
5593                bio_endio(bio, err);
5594        btrfs_put_bbio(bbio);
5595}
5596
5597static void btrfs_end_bio(struct bio *bio, int err)
5598{
5599        struct btrfs_bio *bbio = bio->bi_private;
5600        struct btrfs_device *dev = bbio->stripes[0].dev;
5601        int is_orig_bio = 0;
5602
5603        if (err) {
5604                atomic_inc(&bbio->error);
5605                if (err == -EIO || err == -EREMOTEIO) {
5606                        unsigned int stripe_index =
5607                                btrfs_io_bio(bio)->stripe_index;
5608
5609                        BUG_ON(stripe_index >= bbio->num_stripes);
5610                        dev = bbio->stripes[stripe_index].dev;
5611                        if (dev->bdev) {
5612                                if (bio->bi_rw & WRITE)
5613                                        btrfs_dev_stat_inc(dev,
5614                                                BTRFS_DEV_STAT_WRITE_ERRS);
5615                                else
5616                                        btrfs_dev_stat_inc(dev,
5617                                                BTRFS_DEV_STAT_READ_ERRS);
5618                                if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
5619                                        btrfs_dev_stat_inc(dev,
5620                                                BTRFS_DEV_STAT_FLUSH_ERRS);
5621                                btrfs_dev_stat_print_on_error(dev);
5622                        }
5623                }
5624        }
5625
5626        if (bio == bbio->orig_bio)
5627                is_orig_bio = 1;
5628
5629        btrfs_bio_counter_dec(bbio->fs_info);
5630
5631        if (atomic_dec_and_test(&bbio->stripes_pending)) {
5632                if (!is_orig_bio) {
5633                        bio_put(bio);
5634                        bio = bbio->orig_bio;
5635                }
5636
5637                bio->bi_private = bbio->private;
5638                bio->bi_end_io = bbio->end_io;
5639                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5640                /* only send an error to the higher layers if it is
5641                 * beyond the tolerance of the btrfs bio
5642                 */
5643                if (atomic_read(&bbio->error) > bbio->max_errors) {
5644                        err = -EIO;
5645                } else {
5646                        /*
5647                         * this bio is actually up to date, we didn't
5648                         * go over the max number of errors
5649                         */
5650                        set_bit(BIO_UPTODATE, &bio->bi_flags);
5651                        err = 0;
5652                }
5653
5654                btrfs_end_bbio(bbio, bio, err);
5655        } else if (!is_orig_bio) {
5656                bio_put(bio);
5657        }
5658}
5659
5660/*
5661 * see run_scheduled_bios for a description of why bios are collected for
5662 * async submit.
5663 *
5664 * This will add one bio to the pending list for a device and make sure
5665 * the work struct is scheduled.
5666 */
5667static noinline void btrfs_schedule_bio(struct btrfs_root *root,
5668                                        struct btrfs_device *device,
5669                                        int rw, struct bio *bio)
5670{
5671        int should_queue = 1;
5672        struct btrfs_pending_bios *pending_bios;
5673
5674        if (device->missing || !device->bdev) {
5675                bio_endio(bio, -EIO);
5676                return;
5677        }
5678
5679        /* don't bother with additional async steps for reads, right now */
5680        if (!(rw & REQ_WRITE)) {
5681                bio_get(bio);
5682                btrfsic_submit_bio(rw, bio);
5683                bio_put(bio);
5684                return;
5685        }
5686
5687        /*
5688         * nr_async_bios allows us to reliably return congestion to the
5689         * higher layers.  Otherwise, the async bio makes it appear we have
5690         * made progress against dirty pages when we've really just put it
5691         * on a queue for later.
5692         */
5693        atomic_inc(&root->fs_info->nr_async_bios);
5694        WARN_ON(bio->bi_next);
5695        bio->bi_next = NULL;
5696        bio->bi_rw |= rw;
5697
5698        spin_lock(&device->io_lock);
5699        if (bio->bi_rw & REQ_SYNC)
5700                pending_bios = &device->pending_sync_bios;
5701        else
5702                pending_bios = &device->pending_bios;
5703
5704        if (pending_bios->tail)
5705                pending_bios->tail->bi_next = bio;
5706
5707        pending_bios->tail = bio;
5708        if (!pending_bios->head)
5709                pending_bios->head = bio;
5710        if (device->running_pending)
5711                should_queue = 0;
5712
5713        spin_unlock(&device->io_lock);
5714
5715        if (should_queue)
5716                btrfs_queue_work(root->fs_info->submit_workers,
5717                                 &device->work);
5718}
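
    /*
     * Editor's sketch (assumed consumer shape; run_scheduled_bios lives
     * earlier in this file and differs in detail): the work queued above is
     * serviced per device, detaching the singly linked list under io_lock
     * and submitting in FIFO order, with REQ_SYNC bios kept on their own
     * list so they can jump ahead of bulk writeback:
     *
     *	spin_lock(&device->io_lock);
     *	bio = pending_bios->head;
     *	pending_bios->head = NULL;
     *	pending_bios->tail = NULL;
     *	spin_unlock(&device->io_lock);
     *	while (bio) {
     *		next = bio->bi_next;
     *		bio->bi_next = NULL;
     *		btrfsic_submit_bio(bio->bi_rw, bio);
     *		bio = next;
     *	}
     */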
5719
5720static int bio_size_ok(struct block_device *bdev, struct bio *bio,
5721                       sector_t sector)
5722{
5723        struct bio_vec *prev;
5724        struct request_queue *q = bdev_get_queue(bdev);
5725        unsigned int max_sectors = queue_max_sectors(q);
5726        struct bvec_merge_data bvm = {
5727                .bi_bdev = bdev,
5728                .bi_sector = sector,
5729                .bi_rw = bio->bi_rw,
5730        };
5731
5732        if (WARN_ON(bio->bi_vcnt == 0))
5733                return 1;
5734
5735        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
5736        if (bio_sectors(bio) > max_sectors)
5737                return 0;
5738
5739        if (!q->merge_bvec_fn)
5740                return 1;
5741
5742        bvm.bi_size = bio->bi_iter.bi_size - prev->bv_len;
5743        if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
5744                return 0;
5745        return 1;
5746}
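
    /*
     * Editor's note (illustrative caller-side sketch): bio_size_ok() answers
     * "may this bio go to @bdev as-is?".  The extra merge_bvec_fn probe
     * matters for stacked drivers (MD/DM), which may accept fewer bytes than
     * queue_max_sectors() alone suggests:
     *
     *	if (!bio_size_ok(dev->bdev, first_bio, physical >> 9))
     *		breakup_stripe_bio(...);	// split before submitting
     */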
5747
5748static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5749                              struct bio *bio, u64 physical, int dev_nr,
5750                              int rw, int async)
5751{
5752        struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
5753
5754        bio->bi_private = bbio;
5755        btrfs_io_bio(bio)->stripe_index = dev_nr;
5756        bio->bi_end_io = btrfs_end_bio;
5757        bio->bi_iter.bi_sector = physical >> 9;
5758#ifdef DEBUG
5759        {
5760                struct rcu_string *name;
5761
5762                rcu_read_lock();
5763                name = rcu_dereference(dev->name);
5764                pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
5765                         "(%s id %llu), size=%u\n", rw,
5766                         (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev,
5767                         name->str, dev->devid, bio->bi_iter.bi_size);
5768                rcu_read_unlock();
5769        }
5770#endif
5771        bio->bi_bdev = dev->bdev;
5772
5773        btrfs_bio_counter_inc_noblocked(root->fs_info);
5774
5775        if (async)
5776                btrfs_schedule_bio(root, dev, rw, bio);
5777        else
5778                btrfsic_submit_bio(rw, bio);
5779}
5780
5781static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
5782                              struct bio *first_bio, struct btrfs_device *dev,
5783                              int dev_nr, int rw, int async)
5784{
5785        struct bio_vec *bvec = first_bio->bi_io_vec;
5786        struct bio *bio;
5787        int nr_vecs = bio_get_nr_vecs(dev->bdev);
5788        u64 physical = bbio->stripes[dev_nr].physical;
5789
5790again:
5791        bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
5792        if (!bio)
5793                return -ENOMEM;
5794
5795        while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
5796                if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5797                                 bvec->bv_offset) < bvec->bv_len) {
5798                        u64 len = bio->bi_iter.bi_size;
5799
5800                        atomic_inc(&bbio->stripes_pending);
5801                        submit_stripe_bio(root, bbio, bio, physical, dev_nr,
5802                                          rw, async);
5803                        physical += len;
5804                        goto again;
5805                }
5806                bvec++;
5807        }
5808
5809        submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
5810        return 0;
5811}
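
    /*
     * Editor's worked example (assumed limits, for illustration only): with
     * a queue that accepts at most 512 KiB per bio, a 1 MiB first_bio is
     * re-added page by page until bio_add_page() refuses, yielding two
     * submissions at consecutive physical offsets of the same stripe:
     *
     *	submit_stripe_bio(..., physical, ...);		// bytes 0..512K
     *	submit_stripe_bio(..., physical + 512K, ...);	// bytes 512K..1M
     */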
5812
5813static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
5814{
5815        atomic_inc(&bbio->error);
5816        if (atomic_dec_and_test(&bbio->stripes_pending)) {
5817                /* Should be the original bio. */
5818                WARN_ON(bio != bbio->orig_bio);
5819
5820                bio->bi_private = bbio->private;
5821                bio->bi_end_io = bbio->end_io;
5822                btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
5823                bio->bi_iter.bi_sector = logical >> 9;
5824
5825                btrfs_end_bbio(bbio, bio, -EIO);
5826        }
5827}
5828
5829int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5830                  int mirror_num, int async_submit)
5831{
5832        struct btrfs_device *dev;
5833        struct bio *first_bio = bio;
5834        u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5835        u64 length = 0;
5836        u64 map_length;
5837        int ret;
5838        int dev_nr = 0;
5839        int total_devs = 1;
5840        struct btrfs_bio *bbio = NULL;
5841
5842        length = bio->bi_iter.bi_size;
5843        map_length = length;
5844
5845        btrfs_bio_counter_inc_blocked(root->fs_info);
5846        ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5847                              mirror_num, 1);
5848        if (ret) {
5849                btrfs_bio_counter_dec(root->fs_info);
5850                return ret;
5851        }
5852
5853        total_devs = bbio->num_stripes;
5854        bbio->orig_bio = first_bio;
5855        bbio->private = first_bio->bi_private;
5856        bbio->end_io = first_bio->bi_end_io;
5857        bbio->fs_info = root->fs_info;
5858        atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5859
5860        if (bbio->raid_map) {
5861                /* In this case, map_length has been set to the length of
5862                 * a single stripe, not the whole write. */
5863                if (rw & WRITE) {
5864                        ret = raid56_parity_write(root, bio, bbio, map_length);
5865                } else {
5866                        ret = raid56_parity_recover(root, bio, bbio, map_length,
5867                                                    mirror_num, 1);
5868                }
5869
5870                btrfs_bio_counter_dec(root->fs_info);
5871                return ret;
5872        }
5873
5874        if (map_length < length) {
5875                btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
5876                        logical, length, map_length);
5877                BUG();
5878        }
5879
5880        while (dev_nr < total_devs) {
5881                dev = bbio->stripes[dev_nr].dev;
5882                if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
5883                        bbio_error(bbio, first_bio, logical);
5884                        dev_nr++;
5885                        continue;
5886                }
5887
5888                /*
5889                 * Check and see if we're OK with this bio based on its size
5890                 * and offset within the given device.
5891                 */
5892                if (!bio_size_ok(dev->bdev, first_bio,
5893                                 bbio->stripes[dev_nr].physical >> 9)) {
5894                        ret = breakup_stripe_bio(root, bbio, first_bio, dev,
5895                                                 dev_nr, rw, async_submit);
5896                        BUG_ON(ret);
5897                        dev_nr++;
5898                        continue;
5899                }
5900
5901                if (dev_nr < total_devs - 1) {
5902                        bio = btrfs_bio_clone(first_bio, GFP_NOFS);
5903                        BUG_ON(!bio); /* -ENOMEM */
5904                } else {
5905                        bio = first_bio;
5906                        bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED;
5907                }
5908
5909                submit_stripe_bio(root, bbio, bio,
5910                                  bbio->stripes[dev_nr].physical, dev_nr, rw,
5911                                  async_submit);
5912                dev_nr++;
5913        }
5914        btrfs_bio_counter_dec(root->fs_info);
5915        return 0;
5916}
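
    /*
     * Editor's sketch of a typical caller (simplified; the real submit hooks
     * live in other files): the caller hands over a fully formed bio and only
     * has to complete it by hand if mapping fails up front:
     *
     *	ret = btrfs_map_bio(root, READ, bio, mirror_num, 0);
     *	if (ret)
     *		bio_endio(bio, ret);
     *
     * On success, completion is reported through bbio->end_io once enough
     * stripes finish (see btrfs_end_bio() above).
     */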
5917
5918struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
5919                                       u8 *uuid, u8 *fsid)
5920{
5921        struct btrfs_device *device;
5922        struct btrfs_fs_devices *cur_devices;
5923
5924        cur_devices = fs_info->fs_devices;
5925        while (cur_devices) {
5926                if (!fsid ||
5927                    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
5928                        device = __find_device(&cur_devices->devices,
5929                                               devid, uuid);
5930                        if (device)
5931                                return device;
5932                }
5933                cur_devices = cur_devices->seed;
5934        }
5935        return NULL;
5936}
5937
5938static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
5939                                            struct btrfs_fs_devices *fs_devices,
5940                                            u64 devid, u8 *dev_uuid)
5941{
5942        struct btrfs_device *device;
5943
5944        device = btrfs_alloc_device(NULL, &devid, dev_uuid);
5945        if (IS_ERR(device))
5946                return NULL;
5947
5948        list_add(&device->dev_list, &fs_devices->devices);
5949        device->fs_devices = fs_devices;
5950        fs_devices->num_devices++;
5951
5952        device->missing = 1;
5953        fs_devices->missing_devices++;
5954
5955        return device;
5956}
5957
5958/**
5959 * btrfs_alloc_device - allocate struct btrfs_device
5960 * @fs_info:    used only for generating a new devid, can be NULL if
5961 *              devid is provided (i.e. @devid != NULL).
5962 * @devid:      a pointer to devid for this device.  If NULL a new devid
5963 *              is generated.
5964 * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
5965 *              is generated.
5966 *
5967 * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
5968 * on error.  Returned struct is not linked onto any lists and can be
5969 * destroyed with kfree() right away.
5970 */
5971struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
5972                                        const u64 *devid,
5973                                        const u8 *uuid)
5974{
5975        struct btrfs_device *dev;
5976        u64 tmp;
5977
5978        if (WARN_ON(!devid && !fs_info))
5979                return ERR_PTR(-EINVAL);
5980
5981        dev = __alloc_device();
5982        if (IS_ERR(dev))
5983                return dev;
5984
5985        if (devid)
5986                tmp = *devid;
5987        else {
5988                int ret;
5989
5990                ret = find_next_devid(fs_info, &tmp);
5991                if (ret) {
5992                        kfree(dev);
5993                        return ERR_PTR(ret);
5994                }
5995        }
5996        dev->devid = tmp;
5997
5998        if (uuid)
5999                memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6000        else
6001                generate_random_uuid(dev->uuid);
6002
6003        btrfs_init_work(&dev->work, btrfs_submit_helper,
6004                        pending_bios_fn, NULL, NULL);
6005
6006        return dev;
6007}
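
    /*
     * Editor's usage sketch (illustrative): the two calling conventions
     * documented above look like this in practice:
     *
     *	// identity known, e.g. when reading a device item from disk:
     *	dev = btrfs_alloc_device(NULL, &devid, dev_uuid);
     *
     *	// brand-new device: allocate the next devid and a fresh UUID
     *	dev = btrfs_alloc_device(fs_info, NULL, NULL);
     *	if (IS_ERR(dev))
     *		return PTR_ERR(dev);
     */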
6008
6009static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
6010                          struct extent_buffer *leaf,
6011                          struct btrfs_chunk *chunk)
6012{
6013        struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
6014        struct map_lookup *map;
6015        struct extent_map *em;
6016        u64 logical;
6017        u64 length;
6018        u64 devid;
6019        u8 uuid[BTRFS_UUID_SIZE];
6020        int num_stripes;
6021        int ret;
6022        int i;
6023
6024        logical = key->offset;
6025        length = btrfs_chunk_length(leaf, chunk);
6026
6027        read_lock(&map_tree->map_tree.lock);
6028        em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
6029        read_unlock(&map_tree->map_tree.lock);
6030
6031        /* already mapped? */
6032        if (em && em->start <= logical && em->start + em->len > logical) {
6033                free_extent_map(em);
6034                return 0;
6035        } else if (em) {
6036                free_extent_map(em);
6037        }
6038
6039        em = alloc_extent_map();
6040        if (!em)
6041                return -ENOMEM;
6042        num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6043        map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6044        if (!map) {
6045                free_extent_map(em);
6046                return -ENOMEM;
6047        }
6048
6049        set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6050        em->bdev = (struct block_device *)map;
6051        em->start = logical;
6052        em->len = length;
6053        em->orig_start = 0;
6054        em->block_start = 0;
6055        em->block_len = em->len;
6056
6057        map->num_stripes = num_stripes;
6058        map->io_width = btrfs_chunk_io_width(leaf, chunk);
6059        map->io_align = btrfs_chunk_io_align(leaf, chunk);
6060        map->sector_size = btrfs_chunk_sector_size(leaf, chunk);
6061        map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6062        map->type = btrfs_chunk_type(leaf, chunk);
6063        map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6064        for (i = 0; i < num_stripes; i++) {
6065                map->stripes[i].physical =
6066                        btrfs_stripe_offset_nr(leaf, chunk, i);
6067                devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6068                read_extent_buffer(leaf, uuid, (unsigned long)
6069                                   btrfs_stripe_dev_uuid_nr(chunk, i),
6070                                   BTRFS_UUID_SIZE);
6071                map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
6072                                                        uuid, NULL);
6073                if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
6074                        free_extent_map(em);
6075                        return -EIO;
6076                }
6077                if (!map->stripes[i].dev) {
6078                        map->stripes[i].dev =
6079                                add_missing_dev(root, root->fs_info->fs_devices,
6080                                                devid, uuid);
6081                        if (!map->stripes[i].dev) {
6082                                free_extent_map(em);
6083                                return -EIO;
6084                        }
6085                }
6086                map->stripes[i].dev->in_fs_metadata = 1;
6087        }
6088
6089        write_lock(&map_tree->map_tree.lock);
6090        ret = add_extent_mapping(&map_tree->map_tree, em, 0);
6091        write_unlock(&map_tree->map_tree.lock);
6092        BUG_ON(ret); /* Tree corruption */
6093        free_extent_map(em);
6094
6095        return 0;
6096}
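
    /*
     * Editor's note (illustrative numbers, not from this file): the
     * extent_map built above is what __btrfs_map_block() later walks to
     * translate logical addresses.  For a SINGLE-profile chunk with
     * em->start == 1G, em->len == 256M and stripes[0].physical == 512M,
     * logical 1G + 4K resolves to physical 512M + 4K on stripes[0].dev;
     * striped profiles additionally fold the offset through stripe_len.
     */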
6097
6098static void fill_device_from_item(struct extent_buffer *leaf,
6099                                 struct btrfs_dev_item *dev_item,
6100                                 struct btrfs_device *device)
6101{
6102        unsigned long ptr;
6103
6104        device->devid = btrfs_device_id(leaf, dev_item);
6105        device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6106        device->total_bytes = device->disk_total_bytes;
6107        device->commit_total_bytes = device->disk_total_bytes;
6108        device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6109        device->commit_bytes_used = device->bytes_used;
6110        device->type = btrfs_device_type(leaf, dev_item);
6111        device->io_align = btrfs_device_io_align(leaf, dev_item);
6112        device->io_width = btrfs_device_io_width(leaf, dev_item);
6113        device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6114        WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6115        device->is_tgtdev_for_dev_replace = 0;
6116
6117        ptr = btrfs_device_uuid(dev_item);
6118        read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6119}
6120
6121static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root,
6122                                                  u8 *fsid)
6123{
6124        struct btrfs_fs_devices *fs_devices;
6125        int ret;
6126
6127        BUG_ON(!mutex_is_locked(&uuid_mutex));
6128
6129        fs_devices = root->fs_info->fs_devices->seed;
6130        while (fs_devices) {
6131                if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE))
6132                        return fs_devices;
6133
6134                fs_devices = fs_devices->seed;
6135        }
6136
6137        fs_devices = find_fsid(fsid);
6138        if (!fs_devices) {
6139                if (!btrfs_test_opt(root, DEGRADED))
6140                        return ERR_PTR(-ENOENT);
6141
6142                fs_devices = alloc_fs_devices(fsid);
6143                if (IS_ERR(fs_devices))
6144                        return fs_devices;
6145
6146                fs_devices->seeding = 1;
6147                fs_devices->opened = 1;
6148                return fs_devices;
6149        }
6150
6151        fs_devices = clone_fs_devices(fs_devices);
6152        if (IS_ERR(fs_devices))
6153                return fs_devices;
6154
6155        ret = __btrfs_open_devices(fs_devices, FMODE_READ,
6156                                   root->fs_info->bdev_holder);
6157        if (ret) {
6158                free_fs_devices(fs_devices);
6159                fs_devices = ERR_PTR(ret);
6160                goto out;
6161        }
6162
6163        if (!fs_devices->seeding) {
6164                __btrfs_close_devices(fs_devices);
6165                free_fs_devices(fs_devices);
6166                fs_devices = ERR_PTR(-EINVAL);
6167                goto out;
6168        }
6169
6170        fs_devices->seed = root->fs_info->fs_devices->seed;
6171        root->fs_info->fs_devices->seed = fs_devices;
6172out:
6173        return fs_devices;
6174}
6175
6176static int read_one_dev(struct btrfs_root *root,
6177                        struct extent_buffer *leaf,
6178                        struct btrfs_dev_item *dev_item)
6179{
6180        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6181        struct btrfs_device *device;
6182        u64 devid;
6183        int ret;
6184        u8 fs_uuid[BTRFS_UUID_SIZE];
6185        u8 dev_uuid[BTRFS_UUID_SIZE];
6186
6187        devid = btrfs_device_id(leaf, dev_item);
6188        read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6189                           BTRFS_UUID_SIZE);
6190        read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6191                           BTRFS_UUID_SIZE);
6192
6193        if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
6194                fs_devices = open_seed_devices(root, fs_uuid);
6195                if (IS_ERR(fs_devices))
6196                        return PTR_ERR(fs_devices);
6197        }
6198
6199        device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
6200        if (!device) {
6201                if (!btrfs_test_opt(root, DEGRADED))
6202                        return -EIO;
6203
6204                btrfs_warn(root->fs_info, "devid %llu missing", devid);
6205                device = add_missing_dev(root, fs_devices, devid, dev_uuid);
6206                if (!device)
6207                        return -ENOMEM;
6208        } else {
6209                if (!device->bdev && !btrfs_test_opt(root, DEGRADED))
6210                        return -EIO;
6211
6212                if (!device->bdev && !device->missing) {
6213                        /*
6214                         * This happens when a device that was properly set up
6215                         * in the device info lists suddenly goes bad.
6216                         * device->bdev is NULL, and so we have to set
6217                         * device->missing to one here.
6218                         */
6219                        device->fs_devices->missing_devices++;
6220                        device->missing = 1;
6221                }
6222
6223                /* Move the device to its own fs_devices */
6224                if (device->fs_devices != fs_devices) {
6225                        ASSERT(device->missing);
6226
6227                        list_move(&device->dev_list, &fs_devices->devices);
6228                        device->fs_devices->num_devices--;
6229                        fs_devices->num_devices++;
6230
6231                        device->fs_devices->missing_devices--;
6232                        fs_devices->missing_devices++;
6233
6234                        device->fs_devices = fs_devices;
6235                }
6236        }
6237
6238        if (device->fs_devices != root->fs_info->fs_devices) {
6239                BUG_ON(device->writeable);
6240                if (device->generation !=
6241                    btrfs_device_generation(leaf, dev_item))
6242                        return -EINVAL;
6243        }
6244
6245        fill_device_from_item(leaf, dev_item, device);
6246        device->in_fs_metadata = 1;
6247        if (device->writeable && !device->is_tgtdev_for_dev_replace) {
6248                device->fs_devices->total_rw_bytes += device->total_bytes;
6249                spin_lock(&root->fs_info->free_chunk_lock);
6250                root->fs_info->free_chunk_space += device->total_bytes -
6251                        device->bytes_used;
6252                spin_unlock(&root->fs_info->free_chunk_lock);
6253        }
6254        ret = 0;
6255        return ret;
6256}
6257
6258int btrfs_read_sys_array(struct btrfs_root *root)
6259{
6260        struct btrfs_super_block *super_copy = root->fs_info->super_copy;
6261        struct extent_buffer *sb;
6262        struct btrfs_disk_key *disk_key;
6263        struct btrfs_chunk *chunk;
6264        u8 *array_ptr;
6265        unsigned long sb_array_offset;
6266        int ret = 0;
6267        u32 num_stripes;
6268        u32 array_size;
6269        u32 len = 0;
6270        u32 cur_offset;
6271        struct btrfs_key key;
6272
6273        ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
6274        /*
6275         * This will create an extent buffer of nodesize; the superblock size
6276         * is fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6277         * overallocate, but we can keep it as-is; only the first page is used.
6278         */
6279        sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
6280        if (!sb)
6281                return -ENOMEM;
6282        btrfs_set_buffer_uptodate(sb);
6283        btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
6284        /*
6285         * The sb extent buffer is artificial and just used to read the system array.
6286         * The btrfs_set_buffer_uptodate() call does not properly mark all its
6287         * pages up-to-date when the page is larger: the extent does not cover
6288         * the whole page and consequently check_page_uptodate does not find
6289         * all the page's extents up-to-date (the hole beyond sb);
6290         * write_extent_buffer then triggers a WARN_ON.
6291         *
6292         * Regular short extents go through the mark_extent_buffer_dirty/
6293         * writeback cycle, but sb spans only this function. Add an explicit
6294         * SetPageUptodate call to silence the warning, e.g. on PowerPC 64.
6295         */
6296        if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
6297                SetPageUptodate(sb->pages[0]);
6298
6299        write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6300        array_size = btrfs_super_sys_array_size(super_copy);
6301
6302        array_ptr = super_copy->sys_chunk_array;
6303        sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6304        cur_offset = 0;
6305
6306        while (cur_offset < array_size) {
6307                disk_key = (struct btrfs_disk_key *)array_ptr;
6308                len = sizeof(*disk_key);
6309                if (cur_offset + len > array_size)
6310                        goto out_short_read;
6311
6312                btrfs_disk_key_to_cpu(&key, disk_key);
6313
6314                array_ptr += len;
6315                sb_array_offset += len;
6316                cur_offset += len;
6317
6318                if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6319                        chunk = (struct btrfs_chunk *)sb_array_offset;
6320                        /*
6321                         * At least one btrfs_chunk with one stripe must be
6322                         * present; the exact stripe count check comes afterwards.
6323                         */
6324                        len = btrfs_chunk_item_size(1);
6325                        if (cur_offset + len > array_size)
6326                                goto out_short_read;
6327
6328                        num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6329                        len = btrfs_chunk_item_size(num_stripes);
6330                        if (cur_offset + len > array_size)
6331                                goto out_short_read;
6332
6333                        ret = read_one_chunk(root, &key, sb, chunk);
6334                        if (ret)
6335                                break;
6336                } else {
6337                        ret = -EIO;
6338                        break;
6339                }
6340                array_ptr += len;
6341                sb_array_offset += len;
6342                cur_offset += len;
6343        }
6344        free_extent_buffer(sb);
6345        return ret;
6346
6347out_short_read:
6348        printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
6349                        len, cur_offset);
6350        free_extent_buffer(sb);
6351        return -EIO;
6352}
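
    /*
     * Editor's note (illustrative): the sys_chunk_array decoded above is a
     * packed sequence of (btrfs_disk_key, btrfs_chunk) pairs, where each
     * chunk's size depends on its stripe count:
     *
     *	| disk_key | chunk (n1 stripes) | disk_key | chunk (n2 stripes) |
     *
     * hence the two-step bounds check: btrfs_chunk_item_size(1) first, to
     * safely read num_stripes, then btrfs_chunk_item_size(num_stripes) for
     * the full item.
     */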
6353
6354int btrfs_read_chunk_tree(struct btrfs_root *root)
6355{
6356        struct btrfs_path *path;
6357        struct extent_buffer *leaf;
6358        struct btrfs_key key;
6359        struct btrfs_key found_key;
6360        int ret;
6361        int slot;
6362
6363        root = root->fs_info->chunk_root;
6364
6365        path = btrfs_alloc_path();
6366        if (!path)
6367                return -ENOMEM;
6368
6369        mutex_lock(&uuid_mutex);
6370        lock_chunks(root);
6371
6372        /*
6373         * Read all device items, and then all the chunk items. All
6374         * device items are found before any chunk item (their object id
6375         * is smaller than the lowest possible object id for a chunk
6376         * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
6377         */
6378        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
6379        key.offset = 0;
6380        key.type = 0;
6381        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6382        if (ret < 0)
6383                goto error;
6384        while (1) {
6385                leaf = path->nodes[0];
6386                slot = path->slots[0];
6387                if (slot >= btrfs_header_nritems(leaf)) {
6388                        ret = btrfs_next_leaf(root, path);
6389                        if (ret == 0)
6390                                continue;
6391                        if (ret < 0)
6392                                goto error;
6393                        break;
6394                }
6395                btrfs_item_key_to_cpu(leaf, &found_key, slot);
6396                if (found_key.type == BTRFS_DEV_ITEM_KEY) {
6397                        struct btrfs_dev_item *dev_item;
6398                        dev_item = btrfs_item_ptr(leaf, slot,
6399                                                  struct btrfs_dev_item);
6400                        ret = read_one_dev(root, leaf, dev_item);
6401                        if (ret)
6402                                goto error;
6403                } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
6404                        struct btrfs_chunk *chunk;
6405                        chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
6406                        ret = read_one_chunk(root, &found_key, leaf, chunk);
6407                        if (ret)
6408                                goto error;
6409                }
6410                path->slots[0]++;
6411        }
6412        ret = 0;
6413error:
6414        unlock_chunks(root);
6415        mutex_unlock(&uuid_mutex);
6416
6417        btrfs_free_path(path);
6418        return ret;
6419}
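
    /*
     * Editor's note (illustrative; objectid values as defined in ctree.h):
     * the single forward scan works because of key ordering.  With
     * BTRFS_DEV_ITEMS_OBJECTID == 1 and
     * BTRFS_FIRST_CHUNK_TREE_OBJECTID == 256, the chunk tree sorts as:
     *
     *	(1, DEV_ITEM, devid 1), (1, DEV_ITEM, devid 2), ...,
     *	(256, CHUNK_ITEM, logical A), (256, CHUNK_ITEM, logical B), ...
     *
     * so every device is known before the first chunk references it.
     */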
6420
6421void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
6422{
6423        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6424        struct btrfs_device *device;
6425
6426        while (fs_devices) {
6427                mutex_lock(&fs_devices->device_list_mutex);
6428                list_for_each_entry(device, &fs_devices->devices, dev_list)
6429                        device->dev_root = fs_info->dev_root;
6430                mutex_unlock(&fs_devices->device_list_mutex);
6431
6432                fs_devices = fs_devices->seed;
6433        }
6434}
6435
6436static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
6437{
6438        int i;
6439
6440        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6441                btrfs_dev_stat_reset(dev, i);
6442}
6443
6444int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
6445{
6446        struct btrfs_key key;
6447        struct btrfs_key found_key;
6448        struct btrfs_root *dev_root = fs_info->dev_root;
6449        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6450        struct extent_buffer *eb;
6451        int slot;
6452        int ret = 0;
6453        struct btrfs_device *device;
6454        struct btrfs_path *path = NULL;
6455        int i;
6456
6457        path = btrfs_alloc_path();
6458        if (!path) {
6459                ret = -ENOMEM;
6460                goto out;
6461        }
6462
6463        mutex_lock(&fs_devices->device_list_mutex);
6464        list_for_each_entry(device, &fs_devices->devices, dev_list) {
6465                int item_size;
6466                struct btrfs_dev_stats_item *ptr;
6467
6468                key.objectid = 0;
6469                key.type = BTRFS_DEV_STATS_KEY;
6470                key.offset = device->devid;
6471                ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
6472                if (ret) {
6473                        __btrfs_reset_dev_stats(device);
6474                        device->dev_stats_valid = 1;
6475                        btrfs_release_path(path);
6476                        continue;
6477                }
6478                slot = path->slots[0];
6479                eb = path->nodes[0];
6480                btrfs_item_key_to_cpu(eb, &found_key, slot);
6481                item_size = btrfs_item_size_nr(eb, slot);
6482
6483                ptr = btrfs_item_ptr(eb, slot,
6484                                     struct btrfs_dev_stats_item);
6485
6486                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6487                        if (item_size >= (1 + i) * sizeof(__le64))
6488                                btrfs_dev_stat_set(device, i,
6489                                        btrfs_dev_stats_value(eb, ptr, i));
6490                        else
6491                                btrfs_dev_stat_reset(device, i);
6492                }
6493
6494                device->dev_stats_valid = 1;
6495                btrfs_dev_stat_print_on_load(device);
6496                btrfs_release_path(path);
6497        }
6498        mutex_unlock(&fs_devices->device_list_mutex);
6499
6500out:
6501        btrfs_free_path(path);
6502        return ret < 0 ? ret : 0;
6503}
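
    /*
     * Editor's note (illustrative): the item_size checks above make the
     * on-disk dev_stats item, an array of __le64 counters keyed by
     * (0, BTRFS_DEV_STATS_KEY, devid), tolerant of size changes.  A kernel
     * expecting 5 counters that reads a 3-counter item keeps the known
     * values and resets the rest:
     *
     *	item_size == 3 * sizeof(__le64)
     *	i = 0..2: btrfs_dev_stat_set(device, i, on-disk value)
     *	i = 3..4: btrfs_dev_stat_reset(device, i)
     */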
6504
6505static int update_dev_stat_item(struct btrfs_trans_handle *trans,
6506                                struct btrfs_root *dev_root,
6507                                struct btrfs_device *device)
6508{
6509        struct btrfs_path *path;
6510        struct btrfs_key key;
6511        struct extent_buffer *eb;
6512        struct btrfs_dev_stats_item *ptr;
6513        int ret;
6514        int i;
6515
6516        key.objectid = 0;
6517        key.type = BTRFS_DEV_STATS_KEY;
6518        key.offset = device->devid;
6519
6520        path = btrfs_alloc_path();
6521        BUG_ON(!path);
6522        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
6523        if (ret < 0) {
6524                printk_in_rcu(KERN_WARNING "BTRFS: "
6525                        "error %d while searching for dev_stats item for device %s!\n",
6526                              ret, rcu_str_deref(device->name));
6527                goto out;
6528        }
6529
6530        if (ret == 0 &&
6531            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
6532                /* need to delete old one and insert a new one */
6533                ret = btrfs_del_item(trans, dev_root, path);
6534                if (ret != 0) {
6535                        printk_in_rcu(KERN_WARNING "BTRFS: "
6536                                "delete too small dev_stats item for device %s failed %d!\n",
6537                                      rcu_str_deref(device->name), ret);
6538                        goto out;
6539                }
6540                ret = 1;
6541        }
6542
6543        if (ret == 1) {
6544                /* need to insert a new item */
6545                btrfs_release_path(path);
6546                ret = btrfs_insert_empty_item(trans, dev_root, path,
6547                                              &key, sizeof(*ptr));
6548                if (ret < 0) {
6549                        printk_in_rcu(KERN_WARNING "BTRFS: "
6550                                          "insert dev_stats item for device %s failed %d!\n",
6551                                      rcu_str_deref(device->name), ret);
6552                        goto out;
6553                }
6554        }
6555
6556        eb = path->nodes[0];
6557        ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
6558        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6559                btrfs_set_dev_stats_value(eb, ptr, i,
6560                                          btrfs_dev_stat_read(device, i));
6561        btrfs_mark_buffer_dirty(eb);
6562
6563out:
6564        btrfs_free_path(path);
6565        return ret;
6566}
6567
6568/*
6569 * called from commit_transaction. Writes all changed device stats to disk.
6570 */
6571int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
6572                        struct btrfs_fs_info *fs_info)
6573{
6574        struct btrfs_root *dev_root = fs_info->dev_root;
6575        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6576        struct btrfs_device *device;
6577        int stats_cnt;
6578        int ret = 0;
6579
6580        mutex_lock(&fs_devices->device_list_mutex);
6581        list_for_each_entry(device, &fs_devices->devices, dev_list) {
6582                if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device))
6583                        continue;
6584
6585                stats_cnt = atomic_read(&device->dev_stats_ccnt);
6586                ret = update_dev_stat_item(trans, dev_root, device);
6587                if (!ret)
6588                        atomic_sub(stats_cnt, &device->dev_stats_ccnt);
6589        }
6590        mutex_unlock(&fs_devices->device_list_mutex);
6591
6592        return ret;
6593}
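
    /*
     * Editor's note (illustrative): dev_stats_ccnt counts stat updates not
     * yet on disk.  Sampling it first and subtracting the sample afterwards
     * (instead of zeroing) keeps updates that race in during
     * update_dev_stat_item() dirty for the next commit:
     *
     *	stats_cnt = atomic_read(&device->dev_stats_ccnt);	// e.g. 3
     *	// a concurrent error bumps the counter to 4 meanwhile
     *	atomic_sub(stats_cnt, &device->dev_stats_ccnt);		// 1 left
     */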
6594
6595void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
6596{
6597        btrfs_dev_stat_inc(dev, index);
6598        btrfs_dev_stat_print_on_error(dev);
6599}
6600
6601static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
6602{
6603        if (!dev->dev_stats_valid)
6604                return;
6605        printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
6606                           "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6607                           rcu_str_deref(dev->name),
6608                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6609                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6610                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6611                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6612                           btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6613}
6614
6615static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
6616{
6617        int i;
6618
6619        for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6620                if (btrfs_dev_stat_read(dev, i) != 0)
6621                        break;
6622        if (i == BTRFS_DEV_STAT_VALUES_MAX)
6623                return; /* all values == 0, suppress message */
6624
6625        printk_in_rcu(KERN_INFO "BTRFS: "
6626                   "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
6627               rcu_str_deref(dev->name),
6628               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
6629               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
6630               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
6631               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
6632               btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
6633}
6634
6635int btrfs_get_dev_stats(struct btrfs_root *root,
6636                        struct btrfs_ioctl_get_dev_stats *stats)
6637{
6638        struct btrfs_device *dev;
6639        struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
6640        int i;
6641
6642        mutex_lock(&fs_devices->device_list_mutex);
6643        dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
6644        mutex_unlock(&fs_devices->device_list_mutex);
6645
6646        if (!dev) {
6647                btrfs_warn(root->fs_info, "get dev_stats failed, device not found");
6648                return -ENODEV;
6649        } else if (!dev->dev_stats_valid) {
6650                btrfs_warn(root->fs_info, "get dev_stats failed, not yet valid");
6651                return -ENODEV;
6652        } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
6653                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
6654                        if (stats->nr_items > i)
6655                                stats->values[i] =
6656                                        btrfs_dev_stat_read_and_reset(dev, i);
6657                        else
6658                                btrfs_dev_stat_reset(dev, i);
6659                }
6660        } else {
6661                for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
6662                        if (stats->nr_items > i)
6663                                stats->values[i] = btrfs_dev_stat_read(dev, i);
6664        }
6665        if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
6666                stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
6667        return 0;
6668}
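
    /*
     * Editor's sketch of the userspace side (assumes the
     * BTRFS_IOC_GET_DEV_STATS ioctl from the btrfs UAPI header; field names
     * per struct btrfs_ioctl_get_dev_stats):
     *
     *	struct btrfs_ioctl_get_dev_stats args = {
     *		.devid = 1,
     *		.nr_items = BTRFS_DEV_STAT_VALUES_MAX,
     *		// .flags = BTRFS_DEV_STATS_RESET to read-and-zero
     *	};
     *	ioctl(fd, BTRFS_IOC_GET_DEV_STATS, &args);
     *	// args.values[BTRFS_DEV_STAT_WRITE_ERRS] etc. now hold counters
     */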
6669
6670int btrfs_scratch_superblock(struct btrfs_device *device)
6671{
6672        struct buffer_head *bh;
6673        struct btrfs_super_block *disk_super;
6674
6675        bh = btrfs_read_dev_super(device->bdev);
6676        if (!bh)
6677                return -EINVAL;
6678        disk_super = (struct btrfs_super_block *)bh->b_data;
6679
6680        memset(&disk_super->magic, 0, sizeof(disk_super->magic));
6681        set_buffer_dirty(bh);
6682        sync_dirty_buffer(bh);
6683        brelse(bh);
6684
6685        return 0;
6686}
6687
6688/*
6689 * Update the sizes of all devices, which are used for writing out the
6690 * super blocks.
6691 */
6692void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info)
6693{
6694        struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6695        struct btrfs_device *curr, *next;
6696
6697        if (list_empty(&fs_devices->resized_devices))
6698                return;
6699
6700        mutex_lock(&fs_devices->device_list_mutex);
6701        lock_chunks(fs_info->dev_root);
6702        list_for_each_entry_safe(curr, next, &fs_devices->resized_devices,
6703                                 resized_list) {
6704                list_del_init(&curr->resized_list);
6705                curr->commit_total_bytes = curr->disk_total_bytes;
6706        }
6707        unlock_chunks(fs_info->dev_root);
6708        mutex_unlock(&fs_devices->device_list_mutex);
6709}
6710
6711/* Must be invoked during the transaction commit */
6712void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
6713                                        struct btrfs_transaction *transaction)
6714{
6715        struct extent_map *em;
6716        struct map_lookup *map;
6717        struct btrfs_device *dev;
6718        int i;
6719
6720        if (list_empty(&transaction->pending_chunks))
6721                return;
6722
6723        /* In order to kick the device replace finish process */
6724        lock_chunks(root);
6725        list_for_each_entry(em, &transaction->pending_chunks, list) {
6726                map = (struct map_lookup *)em->bdev;
6727
6728                for (i = 0; i < map->num_stripes; i++) {
6729                        dev = map->stripes[i].dev;
6730                        dev->commit_bytes_used = dev->bytes_used;
6731                }
6732        }
6733        unlock_chunks(root);
6734}
6735