linux/fs/btrfs/dev-replace.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) STRATO AG 2012.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/bio.h>
  20#include <linux/slab.h>
  21#include <linux/buffer_head.h>
  22#include <linux/blkdev.h>
  23#include <linux/random.h>
  24#include <linux/iocontext.h>
  25#include <linux/capability.h>
  26#include <linux/kthread.h>
  27#include <linux/math64.h>
  28#include <asm/div64.h>
  29#include "ctree.h"
  30#include "extent_map.h"
  31#include "disk-io.h"
  32#include "transaction.h"
  33#include "print-tree.h"
  34#include "volumes.h"
  35#include "async-thread.h"
  36#include "check-integrity.h"
  37#include "rcu-string.h"
  38#include "dev-replace.h"
  39#include "sysfs.h"
  40
  41static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  42                                       int scrub_ret);
  43static void btrfs_dev_replace_update_device_in_mapping_tree(
  44                                                struct btrfs_fs_info *fs_info,
  45                                                struct btrfs_device *srcdev,
  46                                                struct btrfs_device *tgtdev);
  47static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
  48static int btrfs_dev_replace_kthread(void *data);
  49static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
  50
  51
  52int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  53{
  54        struct btrfs_key key;
  55        struct btrfs_root *dev_root = fs_info->dev_root;
  56        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  57        struct extent_buffer *eb;
  58        int slot;
  59        int ret = 0;
  60        struct btrfs_path *path = NULL;
  61        int item_size;
  62        struct btrfs_dev_replace_item *ptr;
  63        u64 src_devid;
  64
  65        path = btrfs_alloc_path();
  66        if (!path) {
  67                ret = -ENOMEM;
  68                goto out;
  69        }
  70
  71        key.objectid = 0;
  72        key.type = BTRFS_DEV_REPLACE_KEY;
  73        key.offset = 0;
  74        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  75        if (ret) {
  76no_valid_dev_replace_entry_found:
  77                ret = 0;
  78                dev_replace->replace_state =
  79                        BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
  80                dev_replace->cont_reading_from_srcdev_mode =
  81                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
  82                dev_replace->replace_state = 0;
  83                dev_replace->time_started = 0;
  84                dev_replace->time_stopped = 0;
  85                atomic64_set(&dev_replace->num_write_errors, 0);
  86                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  87                dev_replace->cursor_left = 0;
  88                dev_replace->committed_cursor_left = 0;
  89                dev_replace->cursor_left_last_write_of_item = 0;
  90                dev_replace->cursor_right = 0;
  91                dev_replace->srcdev = NULL;
  92                dev_replace->tgtdev = NULL;
  93                dev_replace->is_valid = 0;
  94                dev_replace->item_needs_writeback = 0;
  95                goto out;
  96        }
  97        slot = path->slots[0];
  98        eb = path->nodes[0];
  99        item_size = btrfs_item_size_nr(eb, slot);
 100        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 101
 102        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 103                btrfs_warn(fs_info,
 104                        "dev_replace entry found has unexpected size, ignore entry");
 105                goto no_valid_dev_replace_entry_found;
 106        }
 107
 108        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
 109        dev_replace->cont_reading_from_srcdev_mode =
 110                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
 111        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
 112        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
 113        dev_replace->time_stopped =
 114                btrfs_dev_replace_time_stopped(eb, ptr);
 115        atomic64_set(&dev_replace->num_write_errors,
 116                     btrfs_dev_replace_num_write_errors(eb, ptr));
 117        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
 118                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
 119        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
 120        dev_replace->committed_cursor_left = dev_replace->cursor_left;
 121        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
 122        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
 123        dev_replace->is_valid = 1;
 124
 125        dev_replace->item_needs_writeback = 0;
 126        switch (dev_replace->replace_state) {
 127        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 128        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 129        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 130                dev_replace->srcdev = NULL;
 131                dev_replace->tgtdev = NULL;
 132                break;
 133        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 134        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 135                dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
 136                                                        NULL, NULL);
 137                dev_replace->tgtdev = btrfs_find_device(fs_info,
 138                                                        BTRFS_DEV_REPLACE_DEVID,
 139                                                        NULL, NULL);
 140                /*
 141                 * allow 'btrfs dev replace_cancel' if src/tgt device is
 142                 * missing
 143                 */
 144                if (!dev_replace->srcdev &&
 145                    !btrfs_test_opt(fs_info, DEGRADED)) {
 146                        ret = -EIO;
 147                        btrfs_warn(fs_info,
 148                           "cannot mount because device replace operation is ongoing and");
 149                        btrfs_warn(fs_info,
 150                           "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 151                           src_devid);
 152                }
 153                if (!dev_replace->tgtdev &&
 154                    !btrfs_test_opt(fs_info, DEGRADED)) {
 155                        ret = -EIO;
 156                        btrfs_warn(fs_info,
 157                           "cannot mount because device replace operation is ongoing and");
 158                        btrfs_warn(fs_info,
 159                           "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 160                                BTRFS_DEV_REPLACE_DEVID);
 161                }
 162                if (dev_replace->tgtdev) {
 163                        if (dev_replace->srcdev) {
 164                                dev_replace->tgtdev->total_bytes =
 165                                        dev_replace->srcdev->total_bytes;
 166                                dev_replace->tgtdev->disk_total_bytes =
 167                                        dev_replace->srcdev->disk_total_bytes;
 168                                dev_replace->tgtdev->commit_total_bytes =
 169                                        dev_replace->srcdev->commit_total_bytes;
 170                                dev_replace->tgtdev->bytes_used =
 171                                        dev_replace->srcdev->bytes_used;
 172                                dev_replace->tgtdev->commit_bytes_used =
 173                                        dev_replace->srcdev->commit_bytes_used;
 174                        }
 175                        dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
 176                        btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
 177                                dev_replace->tgtdev);
 178                }
 179                break;
 180        }
 181
 182out:
 183        btrfs_free_path(path);
 184        return ret;
 185}
 186
 187/*
 188 * called from commit_transaction. Writes changed device replace state to
 189 * disk.
 190 */
 191int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 192                          struct btrfs_fs_info *fs_info)
 193{
 194        int ret;
 195        struct btrfs_root *dev_root = fs_info->dev_root;
 196        struct btrfs_path *path;
 197        struct btrfs_key key;
 198        struct extent_buffer *eb;
 199        struct btrfs_dev_replace_item *ptr;
 200        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 201
 202        btrfs_dev_replace_lock(dev_replace, 0);
 203        if (!dev_replace->is_valid ||
 204            !dev_replace->item_needs_writeback) {
 205                btrfs_dev_replace_unlock(dev_replace, 0);
 206                return 0;
 207        }
 208        btrfs_dev_replace_unlock(dev_replace, 0);
 209
 210        key.objectid = 0;
 211        key.type = BTRFS_DEV_REPLACE_KEY;
 212        key.offset = 0;
 213
 214        path = btrfs_alloc_path();
 215        if (!path) {
 216                ret = -ENOMEM;
 217                goto out;
 218        }
 219        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 220        if (ret < 0) {
 221                btrfs_warn(fs_info,
 222                           "error %d while searching for dev_replace item!",
 223                           ret);
 224                goto out;
 225        }
 226
 227        if (ret == 0 &&
 228            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 229                /*
 230                 * need to delete old one and insert a new one.
 231                 * Since no attempt is made to recover any old state, if the
 232                 * dev_replace state is 'running', the data on the target
 233                 * drive is lost.
 234                 * It would be possible to recover the state: just make sure
 235                 * that the beginning of the item is never changed and always
 236                 * contains all the essential information. Then read this
 237                 * minimal set of information and use it as a base for the
 238                 * new state.
 239                 */
 240                ret = btrfs_del_item(trans, dev_root, path);
 241                if (ret != 0) {
 242                        btrfs_warn(fs_info,
 243                                   "delete too small dev_replace item failed %d!",
 244                                   ret);
 245                        goto out;
 246                }
 247                ret = 1;
 248        }
 249
 250        if (ret == 1) {
 251                /* need to insert a new item */
 252                btrfs_release_path(path);
 253                ret = btrfs_insert_empty_item(trans, dev_root, path,
 254                                              &key, sizeof(*ptr));
 255                if (ret < 0) {
 256                        btrfs_warn(fs_info,
 257                                   "insert dev_replace item failed %d!", ret);
 258                        goto out;
 259                }
 260        }
 261
 262        eb = path->nodes[0];
 263        ptr = btrfs_item_ptr(eb, path->slots[0],
 264                             struct btrfs_dev_replace_item);
 265
 266        btrfs_dev_replace_lock(dev_replace, 1);
 267        if (dev_replace->srcdev)
 268                btrfs_set_dev_replace_src_devid(eb, ptr,
 269                        dev_replace->srcdev->devid);
 270        else
 271                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
 272        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
 273                dev_replace->cont_reading_from_srcdev_mode);
 274        btrfs_set_dev_replace_replace_state(eb, ptr,
 275                dev_replace->replace_state);
 276        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
 277        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
 278        btrfs_set_dev_replace_num_write_errors(eb, ptr,
 279                atomic64_read(&dev_replace->num_write_errors));
 280        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
 281                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
 282        dev_replace->cursor_left_last_write_of_item =
 283                dev_replace->cursor_left;
 284        btrfs_set_dev_replace_cursor_left(eb, ptr,
 285                dev_replace->cursor_left_last_write_of_item);
 286        btrfs_set_dev_replace_cursor_right(eb, ptr,
 287                dev_replace->cursor_right);
 288        dev_replace->item_needs_writeback = 0;
 289        btrfs_dev_replace_unlock(dev_replace, 1);
 290
 291        btrfs_mark_buffer_dirty(eb);
 292
 293out:
 294        btrfs_free_path(path);
 295
 296        return ret;
 297}
 298
 299void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 300{
 301        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 302
 303        dev_replace->committed_cursor_left =
 304                dev_replace->cursor_left_last_write_of_item;
 305}
 306
 307int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, char *tgtdev_name,
 308                                u64 srcdevid, char *srcdev_name, int read_src)
 309{
 310        struct btrfs_root *root = fs_info->dev_root;
 311        struct btrfs_trans_handle *trans;
 312        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 313        int ret;
 314        struct btrfs_device *tgt_device = NULL;
 315        struct btrfs_device *src_device = NULL;
 316
 317        /* the disk copy procedure reuses the scrub code */
 318        mutex_lock(&fs_info->volume_mutex);
 319        ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
 320                                            srcdev_name, &src_device);
 321        if (ret) {
 322                mutex_unlock(&fs_info->volume_mutex);
 323                return ret;
 324        }
 325
 326        ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
 327                                            src_device, &tgt_device);
 328        mutex_unlock(&fs_info->volume_mutex);
 329        if (ret)
 330                return ret;
 331
 332        /*
 333         * Here we commit the transaction to make sure commit_total_bytes
 334         * of all the devices are updated.
 335         */
 336        trans = btrfs_attach_transaction(root);
 337        if (!IS_ERR(trans)) {
 338                ret = btrfs_commit_transaction(trans);
 339                if (ret)
 340                        return ret;
 341        } else if (PTR_ERR(trans) != -ENOENT) {
 342                return PTR_ERR(trans);
 343        }
 344
 345        btrfs_dev_replace_lock(dev_replace, 1);
 346        switch (dev_replace->replace_state) {
 347        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 348        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 349        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 350                break;
 351        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 352        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 353                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
 354                goto leave;
 355        }
 356
 357        dev_replace->cont_reading_from_srcdev_mode = read_src;
 358        WARN_ON(!src_device);
 359        dev_replace->srcdev = src_device;
 360        WARN_ON(!tgt_device);
 361        dev_replace->tgtdev = tgt_device;
 362
 363        btrfs_info_in_rcu(fs_info,
 364                      "dev_replace from %s (devid %llu) to %s started",
 365                      src_device->missing ? "<missing disk>" :
 366                        rcu_str_deref(src_device->name),
 367                      src_device->devid,
 368                      rcu_str_deref(tgt_device->name));
 369
 370        /*
 371         * from now on, the writes to the srcdev are all duplicated to
 372         * go to the tgtdev as well (refer to btrfs_map_block()).
 373         */
 374        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 375        dev_replace->time_started = get_seconds();
 376        dev_replace->cursor_left = 0;
 377        dev_replace->committed_cursor_left = 0;
 378        dev_replace->cursor_left_last_write_of_item = 0;
 379        dev_replace->cursor_right = 0;
 380        dev_replace->is_valid = 1;
 381        dev_replace->item_needs_writeback = 1;
 382        atomic64_set(&dev_replace->num_write_errors, 0);
 383        atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
 384        btrfs_dev_replace_unlock(dev_replace, 1);
 385
 386        ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
 387        if (ret)
 388                btrfs_err(fs_info, "kobj add dev failed %d", ret);
 389
 390        btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
 391
 392        /* force writing the updated state information to disk */
 393        trans = btrfs_start_transaction(root, 0);
 394        if (IS_ERR(trans)) {
 395                ret = PTR_ERR(trans);
 396                btrfs_dev_replace_lock(dev_replace, 1);
 397                goto leave;
 398        }
 399
 400        ret = btrfs_commit_transaction(trans);
 401        WARN_ON(ret);
 402
 403        /* the disk copy procedure reuses the scrub code */
 404        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
 405                              btrfs_device_get_total_bytes(src_device),
 406                              &dev_replace->scrub_progress, 0, 1);
 407
 408        ret = btrfs_dev_replace_finishing(fs_info, ret);
 409        if (ret == -EINPROGRESS) {
 410                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
 411        } else {
 412                WARN_ON(ret);
 413        }
 414
 415        return ret;
 416
 417leave:
 418        dev_replace->srcdev = NULL;
 419        dev_replace->tgtdev = NULL;
 420        btrfs_dev_replace_unlock(dev_replace, 1);
 421        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 422        return ret;
 423}
 424
 425int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 426                            struct btrfs_ioctl_dev_replace_args *args)
 427{
 428        int ret;
 429
 430        switch (args->start.cont_reading_from_srcdev_mode) {
 431        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 432        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
 433                break;
 434        default:
 435                return -EINVAL;
 436        }
 437
 438        if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
 439            args->start.tgtdev_name[0] == '\0')
 440                return -EINVAL;
 441
 442        ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
 443                                        args->start.srcdevid,
 444                                        args->start.srcdev_name,
 445                                        args->start.cont_reading_from_srcdev_mode);
 446        args->result = ret;
 447        /* don't warn if EINPROGRESS, someone else might be running scrub */
 448        if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS)
 449                ret = 0;
 450
 451        return ret;
 452}
 453
 454/*
 455 * blocked until all in-flight bios operations are finished.
 456 */
 457static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 458{
 459        set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 460        wait_event(fs_info->replace_wait, !percpu_counter_sum(
 461                   &fs_info->bio_counter));
 462}
 463
 464/*
 465 * we have removed target device, it is safe to allow new bios request.
 466 */
 467static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 468{
 469        clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 470        wake_up(&fs_info->replace_wait);
 471}
 472
 473static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 474                                       int scrub_ret)
 475{
 476        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 477        struct btrfs_device *tgt_device;
 478        struct btrfs_device *src_device;
 479        struct btrfs_root *root = fs_info->tree_root;
 480        u8 uuid_tmp[BTRFS_UUID_SIZE];
 481        struct btrfs_trans_handle *trans;
 482        int ret = 0;
 483
 484        /* don't allow cancel or unmount to disturb the finishing procedure */
 485        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 486
 487        btrfs_dev_replace_lock(dev_replace, 0);
 488        /* was the operation canceled, or is it finished? */
 489        if (dev_replace->replace_state !=
 490            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
 491                btrfs_dev_replace_unlock(dev_replace, 0);
 492                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 493                return 0;
 494        }
 495
 496        tgt_device = dev_replace->tgtdev;
 497        src_device = dev_replace->srcdev;
 498        btrfs_dev_replace_unlock(dev_replace, 0);
 499
 500        /*
 501         * flush all outstanding I/O and inode extent mappings before the
 502         * copy operation is declared as being finished
 503         */
 504        ret = btrfs_start_delalloc_roots(fs_info, 0, -1);
 505        if (ret) {
 506                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 507                return ret;
 508        }
 509        btrfs_wait_ordered_roots(fs_info, -1, 0, (u64)-1);
 510
 511        trans = btrfs_start_transaction(root, 0);
 512        if (IS_ERR(trans)) {
 513                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 514                return PTR_ERR(trans);
 515        }
 516        ret = btrfs_commit_transaction(trans);
 517        WARN_ON(ret);
 518
 519        mutex_lock(&uuid_mutex);
 520        /* keep away write_all_supers() during the finishing procedure */
 521        mutex_lock(&fs_info->fs_devices->device_list_mutex);
 522        mutex_lock(&fs_info->chunk_mutex);
 523        btrfs_dev_replace_lock(dev_replace, 1);
 524        dev_replace->replace_state =
 525                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 526                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 527        dev_replace->tgtdev = NULL;
 528        dev_replace->srcdev = NULL;
 529        dev_replace->time_stopped = get_seconds();
 530        dev_replace->item_needs_writeback = 1;
 531
 532        /* replace old device with new one in mapping tree */
 533        if (!scrub_ret) {
 534                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
 535                                                                src_device,
 536                                                                tgt_device);
 537        } else {
 538                btrfs_err_in_rcu(fs_info,
 539                                 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 540                                 src_device->missing ? "<missing disk>" :
 541                                 rcu_str_deref(src_device->name),
 542                                 src_device->devid,
 543                                 rcu_str_deref(tgt_device->name), scrub_ret);
 544                btrfs_dev_replace_unlock(dev_replace, 1);
 545                mutex_unlock(&fs_info->chunk_mutex);
 546                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 547                mutex_unlock(&uuid_mutex);
 548                if (tgt_device)
 549                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 550                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 551
 552                return scrub_ret;
 553        }
 554
 555        btrfs_info_in_rcu(fs_info,
 556                          "dev_replace from %s (devid %llu) to %s finished",
 557                          src_device->missing ? "<missing disk>" :
 558                          rcu_str_deref(src_device->name),
 559                          src_device->devid,
 560                          rcu_str_deref(tgt_device->name));
 561        tgt_device->is_tgtdev_for_dev_replace = 0;
 562        tgt_device->devid = src_device->devid;
 563        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 564        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
 565        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
 566        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
 567        btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
 568        btrfs_device_set_disk_total_bytes(tgt_device,
 569                                          src_device->disk_total_bytes);
 570        btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
 571        ASSERT(list_empty(&src_device->resized_list));
 572        tgt_device->commit_total_bytes = src_device->commit_total_bytes;
 573        tgt_device->commit_bytes_used = src_device->bytes_used;
 574
 575        btrfs_assign_next_active_device(fs_info, src_device, tgt_device);
 576
 577        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 578        fs_info->fs_devices->rw_devices++;
 579
 580        btrfs_dev_replace_unlock(dev_replace, 1);
 581
 582        btrfs_rm_dev_replace_blocked(fs_info);
 583
 584        btrfs_rm_dev_replace_remove_srcdev(fs_info, src_device);
 585
 586        btrfs_rm_dev_replace_unblocked(fs_info);
 587
 588        /*
 589         * this is again a consistent state where no dev_replace procedure
 590         * is running, the target device is part of the filesystem, the
 591         * source device is not part of the filesystem anymore and its 1st
 592         * superblock is scratched out so that it is no longer marked to
 593         * belong to this filesystem.
 594         */
 595        mutex_unlock(&fs_info->chunk_mutex);
 596        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 597        mutex_unlock(&uuid_mutex);
 598
 599        /* replace the sysfs entry */
 600        btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device);
 601        btrfs_rm_dev_replace_free_srcdev(fs_info, src_device);
 602
 603        /* write back the superblocks */
 604        trans = btrfs_start_transaction(root, 0);
 605        if (!IS_ERR(trans))
 606                btrfs_commit_transaction(trans);
 607
 608        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 609
 610        return 0;
 611}
 612
 613static void btrfs_dev_replace_update_device_in_mapping_tree(
 614                                                struct btrfs_fs_info *fs_info,
 615                                                struct btrfs_device *srcdev,
 616                                                struct btrfs_device *tgtdev)
 617{
 618        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 619        struct extent_map *em;
 620        struct map_lookup *map;
 621        u64 start = 0;
 622        int i;
 623
 624        write_lock(&em_tree->lock);
 625        do {
 626                em = lookup_extent_mapping(em_tree, start, (u64)-1);
 627                if (!em)
 628                        break;
 629                map = em->map_lookup;
 630                for (i = 0; i < map->num_stripes; i++)
 631                        if (srcdev == map->stripes[i].dev)
 632                                map->stripes[i].dev = tgtdev;
 633                start = em->start + em->len;
 634                free_extent_map(em);
 635        } while (start);
 636        write_unlock(&em_tree->lock);
 637}
 638
 639void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 640                              struct btrfs_ioctl_dev_replace_args *args)
 641{
 642        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 643        struct btrfs_device *srcdev;
 644
 645        btrfs_dev_replace_lock(dev_replace, 0);
 646        /* even if !dev_replace_is_valid, the values are good enough for
 647         * the replace_status ioctl */
 648        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 649        args->status.replace_state = dev_replace->replace_state;
 650        args->status.time_started = dev_replace->time_started;
 651        args->status.time_stopped = dev_replace->time_stopped;
 652        args->status.num_write_errors =
 653                atomic64_read(&dev_replace->num_write_errors);
 654        args->status.num_uncorrectable_read_errors =
 655                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
 656        switch (dev_replace->replace_state) {
 657        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 658        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 659                args->status.progress_1000 = 0;
 660                break;
 661        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 662                args->status.progress_1000 = 1000;
 663                break;
 664        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 665        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 666                srcdev = dev_replace->srcdev;
 667                args->status.progress_1000 = div_u64(dev_replace->cursor_left,
 668                        div_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 669                break;
 670        }
 671        btrfs_dev_replace_unlock(dev_replace, 0);
 672}
 673
 674int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 675                             struct btrfs_ioctl_dev_replace_args *args)
 676{
 677        args->result = __btrfs_dev_replace_cancel(fs_info);
 678        return 0;
 679}
 680
 681static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 682{
 683        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 684        struct btrfs_device *tgt_device = NULL;
 685        struct btrfs_trans_handle *trans;
 686        struct btrfs_root *root = fs_info->tree_root;
 687        u64 result;
 688        int ret;
 689
 690        if (fs_info->sb->s_flags & MS_RDONLY)
 691                return -EROFS;
 692
 693        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 694        btrfs_dev_replace_lock(dev_replace, 1);
 695        switch (dev_replace->replace_state) {
 696        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 697        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 698        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 699                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
 700                btrfs_dev_replace_unlock(dev_replace, 1);
 701                goto leave;
 702        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 703        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 704                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 705                tgt_device = dev_replace->tgtdev;
 706                dev_replace->tgtdev = NULL;
 707                dev_replace->srcdev = NULL;
 708                break;
 709        }
 710        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 711        dev_replace->time_stopped = get_seconds();
 712        dev_replace->item_needs_writeback = 1;
 713        btrfs_dev_replace_unlock(dev_replace, 1);
 714        btrfs_scrub_cancel(fs_info);
 715
 716        trans = btrfs_start_transaction(root, 0);
 717        if (IS_ERR(trans)) {
 718                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 719                return PTR_ERR(trans);
 720        }
 721        ret = btrfs_commit_transaction(trans);
 722        WARN_ON(ret);
 723        if (tgt_device)
 724                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 725
 726leave:
 727        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 728        return result;
 729}
 730
 731void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 732{
 733        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 734
 735        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 736        btrfs_dev_replace_lock(dev_replace, 1);
 737        switch (dev_replace->replace_state) {
 738        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 739        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 740        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 741        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 742                break;
 743        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 744                dev_replace->replace_state =
 745                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
 746                dev_replace->time_stopped = get_seconds();
 747                dev_replace->item_needs_writeback = 1;
 748                btrfs_info(fs_info, "suspending dev_replace for unmount");
 749                break;
 750        }
 751
 752        btrfs_dev_replace_unlock(dev_replace, 1);
 753        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 754}
 755
 756/* resume dev_replace procedure that was interrupted by unmount */
 757int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 758{
 759        struct task_struct *task;
 760        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 761
 762        btrfs_dev_replace_lock(dev_replace, 1);
 763        switch (dev_replace->replace_state) {
 764        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 765        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 766        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 767                btrfs_dev_replace_unlock(dev_replace, 1);
 768                return 0;
 769        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 770                break;
 771        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 772                dev_replace->replace_state =
 773                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 774                break;
 775        }
 776        if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
 777                btrfs_info(fs_info,
 778                           "cannot continue dev_replace, tgtdev is missing");
 779                btrfs_info(fs_info,
 780                           "you may cancel the operation after 'mount -o degraded'");
 781                btrfs_dev_replace_unlock(dev_replace, 1);
 782                return 0;
 783        }
 784        btrfs_dev_replace_unlock(dev_replace, 1);
 785
 786        WARN_ON(atomic_xchg(
 787                &fs_info->mutually_exclusive_operation_running, 1));
 788        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
 789        return PTR_ERR_OR_ZERO(task);
 790}
 791
 792static int btrfs_dev_replace_kthread(void *data)
 793{
 794        struct btrfs_fs_info *fs_info = data;
 795        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 796        struct btrfs_ioctl_dev_replace_args *status_args;
 797        u64 progress;
 798
 799        status_args = kzalloc(sizeof(*status_args), GFP_KERNEL);
 800        if (status_args) {
 801                btrfs_dev_replace_status(fs_info, status_args);
 802                progress = status_args->status.progress_1000;
 803                kfree(status_args);
 804                progress = div_u64(progress, 10);
 805                btrfs_info_in_rcu(fs_info,
 806                        "continuing dev_replace from %s (devid %llu) to %s @%u%%",
 807                        dev_replace->srcdev->missing ? "<missing disk>" :
 808                        rcu_str_deref(dev_replace->srcdev->name),
 809                        dev_replace->srcdev->devid,
 810                        dev_replace->tgtdev ?
 811                        rcu_str_deref(dev_replace->tgtdev->name) :
 812                        "<missing target disk>",
 813                        (unsigned int)progress);
 814        }
 815        btrfs_dev_replace_continue_on_mount(fs_info);
 816        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 817
 818        return 0;
 819}
 820
 821static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
 822{
 823        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 824        int ret;
 825
 826        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
 827                              dev_replace->committed_cursor_left,
 828                              btrfs_device_get_total_bytes(dev_replace->srcdev),
 829                              &dev_replace->scrub_progress, 0, 1);
 830        ret = btrfs_dev_replace_finishing(fs_info, ret);
 831        WARN_ON(ret);
 832        return 0;
 833}
 834
 835int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 836{
 837        if (!dev_replace->is_valid)
 838                return 0;
 839
 840        switch (dev_replace->replace_state) {
 841        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 842        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 843        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 844                return 0;
 845        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 846        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 847                /*
 848                 * return true even if tgtdev is missing (this is
 849                 * something that can happen if the dev_replace
 850                 * procedure is suspended by an umount and then
 851                 * the tgtdev is missing (or "btrfs dev scan") was
 852                 * not called and the the filesystem is remounted
 853                 * in degraded state. This does not stop the
 854                 * dev_replace procedure. It needs to be canceled
 855                 * manually if the cancellation is wanted.
 856                 */
 857                break;
 858        }
 859        return 1;
 860}
 861
 862void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace, int rw)
 863{
 864        if (rw == 1) {
 865                /* write */
 866again:
 867                wait_event(dev_replace->read_lock_wq,
 868                           atomic_read(&dev_replace->blocking_readers) == 0);
 869                write_lock(&dev_replace->lock);
 870                if (atomic_read(&dev_replace->blocking_readers)) {
 871                        write_unlock(&dev_replace->lock);
 872                        goto again;
 873                }
 874        } else {
 875                read_lock(&dev_replace->lock);
 876                atomic_inc(&dev_replace->read_locks);
 877        }
 878}
 879
 880void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace, int rw)
 881{
 882        if (rw == 1) {
 883                /* write */
 884                ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
 885                write_unlock(&dev_replace->lock);
 886        } else {
 887                ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 888                atomic_dec(&dev_replace->read_locks);
 889                read_unlock(&dev_replace->lock);
 890        }
 891}
 892
 893/* inc blocking cnt and release read lock */
 894void btrfs_dev_replace_set_lock_blocking(
 895                                        struct btrfs_dev_replace *dev_replace)
 896{
 897        /* only set blocking for read lock */
 898        ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 899        atomic_inc(&dev_replace->blocking_readers);
 900        read_unlock(&dev_replace->lock);
 901}
 902
 903/* acquire read lock and dec blocking cnt */
 904void btrfs_dev_replace_clear_lock_blocking(
 905                                        struct btrfs_dev_replace *dev_replace)
 906{
 907        /* only set blocking for read lock */
 908        ASSERT(atomic_read(&dev_replace->read_locks) > 0);
 909        ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
 910        read_lock(&dev_replace->lock);
 911        if (atomic_dec_and_test(&dev_replace->blocking_readers) &&
 912            waitqueue_active(&dev_replace->read_lock_wq))
 913                wake_up(&dev_replace->read_lock_wq);
 914}
 915
 916void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 917{
 918        percpu_counter_inc(&fs_info->bio_counter);
 919}
 920
 921void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
 922{
 923        percpu_counter_sub(&fs_info->bio_counter, amount);
 924
 925        if (waitqueue_active(&fs_info->replace_wait))
 926                wake_up(&fs_info->replace_wait);
 927}
 928
 929void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
 930{
 931        while (1) {
 932                percpu_counter_inc(&fs_info->bio_counter);
 933                if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 934                                     &fs_info->fs_state)))
 935                        break;
 936
 937                btrfs_bio_counter_dec(fs_info);
 938                wait_event(fs_info->replace_wait,
 939                           !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 940                                     &fs_info->fs_state));
 941        }
 942}
 943