linux/fs/btrfs/dev-replace.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) STRATO AG 2012.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/bio.h>
  20#include <linux/slab.h>
  21#include <linux/buffer_head.h>
  22#include <linux/blkdev.h>
  23#include <linux/random.h>
  24#include <linux/iocontext.h>
  25#include <linux/capability.h>
  26#include <linux/kthread.h>
  27#include <linux/math64.h>
  28#include <asm/div64.h>
  29#include "compat.h"
  30#include "ctree.h"
  31#include "extent_map.h"
  32#include "disk-io.h"
  33#include "transaction.h"
  34#include "print-tree.h"
  35#include "volumes.h"
  36#include "async-thread.h"
  37#include "check-integrity.h"
  38#include "rcu-string.h"
  39#include "dev-replace.h"
  40
   /*
    * Forward declarations of the file-local (static) helpers, so that they
    * can be called from functions that appear earlier in this file than
    * their definitions.
    */
   41static u64 btrfs_get_seconds_since_1970(void);
   42static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
   43                                       int scrub_ret);
   44static void btrfs_dev_replace_update_device_in_mapping_tree(
   45                                                struct btrfs_fs_info *fs_info,
   46                                                struct btrfs_device *srcdev,
   47                                                struct btrfs_device *tgtdev);
   48static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
   49                                         char *srcdev_name,
   50                                         struct btrfs_device **device);
   51static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
   52static int btrfs_dev_replace_kthread(void *data);
   53static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
  54
  55
  56int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  57{
  58        struct btrfs_key key;
  59        struct btrfs_root *dev_root = fs_info->dev_root;
  60        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  61        struct extent_buffer *eb;
  62        int slot;
  63        int ret = 0;
  64        struct btrfs_path *path = NULL;
  65        int item_size;
  66        struct btrfs_dev_replace_item *ptr;
  67        u64 src_devid;
  68
  69        path = btrfs_alloc_path();
  70        if (!path) {
  71                ret = -ENOMEM;
  72                goto out;
  73        }
  74
  75        key.objectid = 0;
  76        key.type = BTRFS_DEV_REPLACE_KEY;
  77        key.offset = 0;
  78        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  79        if (ret) {
  80no_valid_dev_replace_entry_found:
  81                ret = 0;
  82                dev_replace->replace_state =
  83                        BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
  84                dev_replace->cont_reading_from_srcdev_mode =
  85                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
  86                dev_replace->replace_state = 0;
  87                dev_replace->time_started = 0;
  88                dev_replace->time_stopped = 0;
  89                atomic64_set(&dev_replace->num_write_errors, 0);
  90                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  91                dev_replace->cursor_left = 0;
  92                dev_replace->committed_cursor_left = 0;
  93                dev_replace->cursor_left_last_write_of_item = 0;
  94                dev_replace->cursor_right = 0;
  95                dev_replace->srcdev = NULL;
  96                dev_replace->tgtdev = NULL;
  97                dev_replace->is_valid = 0;
  98                dev_replace->item_needs_writeback = 0;
  99                goto out;
 100        }
 101        slot = path->slots[0];
 102        eb = path->nodes[0];
 103        item_size = btrfs_item_size_nr(eb, slot);
 104        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 105
 106        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 107                pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
 108                goto no_valid_dev_replace_entry_found;
 109        }
 110
 111        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
 112        dev_replace->cont_reading_from_srcdev_mode =
 113                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
 114        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
 115        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
 116        dev_replace->time_stopped =
 117                btrfs_dev_replace_time_stopped(eb, ptr);
 118        atomic64_set(&dev_replace->num_write_errors,
 119                     btrfs_dev_replace_num_write_errors(eb, ptr));
 120        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
 121                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
 122        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
 123        dev_replace->committed_cursor_left = dev_replace->cursor_left;
 124        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
 125        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
 126        dev_replace->is_valid = 1;
 127
 128        dev_replace->item_needs_writeback = 0;
 129        switch (dev_replace->replace_state) {
 130        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 131        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 132        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 133                dev_replace->srcdev = NULL;
 134                dev_replace->tgtdev = NULL;
 135                break;
 136        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 137        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 138                dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
 139                                                        NULL, NULL);
 140                dev_replace->tgtdev = btrfs_find_device(fs_info,
 141                                                        BTRFS_DEV_REPLACE_DEVID,
 142                                                        NULL, NULL);
 143                /*
 144                 * allow 'btrfs dev replace_cancel' if src/tgt device is
 145                 * missing
 146                 */
 147                if (!dev_replace->srcdev &&
 148                    !btrfs_test_opt(dev_root, DEGRADED)) {
 149                        ret = -EIO;
 150                        pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
 151                                (unsigned long long)src_devid);
 152                }
 153                if (!dev_replace->tgtdev &&
 154                    !btrfs_test_opt(dev_root, DEGRADED)) {
 155                        ret = -EIO;
 156                        pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
 157                                (unsigned long long)BTRFS_DEV_REPLACE_DEVID);
 158                }
 159                if (dev_replace->tgtdev) {
 160                        if (dev_replace->srcdev) {
 161                                dev_replace->tgtdev->total_bytes =
 162                                        dev_replace->srcdev->total_bytes;
 163                                dev_replace->tgtdev->disk_total_bytes =
 164                                        dev_replace->srcdev->disk_total_bytes;
 165                                dev_replace->tgtdev->bytes_used =
 166                                        dev_replace->srcdev->bytes_used;
 167                        }
 168                        dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
 169                        btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
 170                                dev_replace->tgtdev);
 171                }
 172                break;
 173        }
 174
 175out:
 176        if (path)
 177                btrfs_free_path(path);
 178        return ret;
 179}
 180
 181/*
 182 * called from commit_transaction. Writes changed device replace state to
 183 * disk.
 184 */
 185int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 186                          struct btrfs_fs_info *fs_info)
 187{
 188        int ret;
 189        struct btrfs_root *dev_root = fs_info->dev_root;
 190        struct btrfs_path *path;
 191        struct btrfs_key key;
 192        struct extent_buffer *eb;
 193        struct btrfs_dev_replace_item *ptr;
 194        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 195
 196        btrfs_dev_replace_lock(dev_replace);
 197        if (!dev_replace->is_valid ||
 198            !dev_replace->item_needs_writeback) {
 199                btrfs_dev_replace_unlock(dev_replace);
 200                return 0;
 201        }
 202        btrfs_dev_replace_unlock(dev_replace);
 203
 204        key.objectid = 0;
 205        key.type = BTRFS_DEV_REPLACE_KEY;
 206        key.offset = 0;
 207
 208        path = btrfs_alloc_path();
 209        if (!path) {
 210                ret = -ENOMEM;
 211                goto out;
 212        }
 213        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 214        if (ret < 0) {
 215                pr_warn("btrfs: error %d while searching for dev_replace item!\n",
 216                        ret);
 217                goto out;
 218        }
 219
 220        if (ret == 0 &&
 221            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 222                /*
 223                 * need to delete old one and insert a new one.
 224                 * Since no attempt is made to recover any old state, if the
 225                 * dev_replace state is 'running', the data on the target
 226                 * drive is lost.
 227                 * It would be possible to recover the state: just make sure
 228                 * that the beginning of the item is never changed and always
 229                 * contains all the essential information. Then read this
 230                 * minimal set of information and use it as a base for the
 231                 * new state.
 232                 */
 233                ret = btrfs_del_item(trans, dev_root, path);
 234                if (ret != 0) {
 235                        pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
 236                                ret);
 237                        goto out;
 238                }
 239                ret = 1;
 240        }
 241
 242        if (ret == 1) {
 243                /* need to insert a new item */
 244                btrfs_release_path(path);
 245                ret = btrfs_insert_empty_item(trans, dev_root, path,
 246                                              &key, sizeof(*ptr));
 247                if (ret < 0) {
 248                        pr_warn("btrfs: insert dev_replace item failed %d!\n",
 249                                ret);
 250                        goto out;
 251                }
 252        }
 253
 254        eb = path->nodes[0];
 255        ptr = btrfs_item_ptr(eb, path->slots[0],
 256                             struct btrfs_dev_replace_item);
 257
 258        btrfs_dev_replace_lock(dev_replace);
 259        if (dev_replace->srcdev)
 260                btrfs_set_dev_replace_src_devid(eb, ptr,
 261                        dev_replace->srcdev->devid);
 262        else
 263                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
 264        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
 265                dev_replace->cont_reading_from_srcdev_mode);
 266        btrfs_set_dev_replace_replace_state(eb, ptr,
 267                dev_replace->replace_state);
 268        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
 269        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
 270        btrfs_set_dev_replace_num_write_errors(eb, ptr,
 271                atomic64_read(&dev_replace->num_write_errors));
 272        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
 273                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
 274        dev_replace->cursor_left_last_write_of_item =
 275                dev_replace->cursor_left;
 276        btrfs_set_dev_replace_cursor_left(eb, ptr,
 277                dev_replace->cursor_left_last_write_of_item);
 278        btrfs_set_dev_replace_cursor_right(eb, ptr,
 279                dev_replace->cursor_right);
 280        dev_replace->item_needs_writeback = 0;
 281        btrfs_dev_replace_unlock(dev_replace);
 282
 283        btrfs_mark_buffer_dirty(eb);
 284
 285out:
 286        btrfs_free_path(path);
 287
 288        return ret;
 289}
 290
 291void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 292{
 293        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 294
 295        dev_replace->committed_cursor_left =
 296                dev_replace->cursor_left_last_write_of_item;
 297}
 298
 299static u64 btrfs_get_seconds_since_1970(void)
 300{
 301        struct timespec t = CURRENT_TIME_SEC;
 302
 303        return t.tv_sec;
 304}
 305
 306int btrfs_dev_replace_start(struct btrfs_root *root,
 307                            struct btrfs_ioctl_dev_replace_args *args)
 308{
 309        struct btrfs_trans_handle *trans;
 310        struct btrfs_fs_info *fs_info = root->fs_info;
 311        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 312        int ret;
 313        struct btrfs_device *tgt_device = NULL;
 314        struct btrfs_device *src_device = NULL;
 315
 316        if (btrfs_fs_incompat(fs_info, RAID56)) {
 317                pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
 318                return -EINVAL;
 319        }
 320
 321        switch (args->start.cont_reading_from_srcdev_mode) {
 322        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 323        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
 324                break;
 325        default:
 326                return -EINVAL;
 327        }
 328
 329        if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
 330            args->start.tgtdev_name[0] == '\0')
 331                return -EINVAL;
 332
 333        mutex_lock(&fs_info->volume_mutex);
 334        ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
 335                                            &tgt_device);
 336        if (ret) {
 337                pr_err("btrfs: target device %s is invalid!\n",
 338                       args->start.tgtdev_name);
 339                mutex_unlock(&fs_info->volume_mutex);
 340                return -EINVAL;
 341        }
 342
 343        ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
 344                                            args->start.srcdev_name,
 345                                            &src_device);
 346        mutex_unlock(&fs_info->volume_mutex);
 347        if (ret) {
 348                ret = -EINVAL;
 349                goto leave_no_lock;
 350        }
 351
 352        if (tgt_device->total_bytes < src_device->total_bytes) {
 353                pr_err("btrfs: target device is smaller than source device!\n");
 354                ret = -EINVAL;
 355                goto leave_no_lock;
 356        }
 357
 358        btrfs_dev_replace_lock(dev_replace);
 359        switch (dev_replace->replace_state) {
 360        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 361        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 362        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 363                break;
 364        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 365        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 366                args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
 367                goto leave;
 368        }
 369
 370        dev_replace->cont_reading_from_srcdev_mode =
 371                args->start.cont_reading_from_srcdev_mode;
 372        WARN_ON(!src_device);
 373        dev_replace->srcdev = src_device;
 374        WARN_ON(!tgt_device);
 375        dev_replace->tgtdev = tgt_device;
 376
 377        printk_in_rcu(KERN_INFO
 378                      "btrfs: dev_replace from %s (devid %llu) to %s) started\n",
 379                      src_device->missing ? "<missing disk>" :
 380                        rcu_str_deref(src_device->name),
 381                      src_device->devid,
 382                      rcu_str_deref(tgt_device->name));
 383
 384        tgt_device->total_bytes = src_device->total_bytes;
 385        tgt_device->disk_total_bytes = src_device->disk_total_bytes;
 386        tgt_device->bytes_used = src_device->bytes_used;
 387
 388        /*
 389         * from now on, the writes to the srcdev are all duplicated to
 390         * go to the tgtdev as well (refer to btrfs_map_block()).
 391         */
 392        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 393        dev_replace->time_started = btrfs_get_seconds_since_1970();
 394        dev_replace->cursor_left = 0;
 395        dev_replace->committed_cursor_left = 0;
 396        dev_replace->cursor_left_last_write_of_item = 0;
 397        dev_replace->cursor_right = 0;
 398        dev_replace->is_valid = 1;
 399        dev_replace->item_needs_writeback = 1;
 400        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 401        btrfs_dev_replace_unlock(dev_replace);
 402
 403        btrfs_wait_ordered_extents(root, 0);
 404
 405        /* force writing the updated state information to disk */
 406        trans = btrfs_start_transaction(root, 0);
 407        if (IS_ERR(trans)) {
 408                ret = PTR_ERR(trans);
 409                btrfs_dev_replace_lock(dev_replace);
 410                goto leave;
 411        }
 412
 413        ret = btrfs_commit_transaction(trans, root);
 414        WARN_ON(ret);
 415
 416        /* the disk copy procedure reuses the scrub code */
 417        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
 418                              src_device->total_bytes,
 419                              &dev_replace->scrub_progress, 0, 1);
 420
 421        ret = btrfs_dev_replace_finishing(root->fs_info, ret);
 422        WARN_ON(ret);
 423
 424        return 0;
 425
 426leave:
 427        dev_replace->srcdev = NULL;
 428        dev_replace->tgtdev = NULL;
 429        btrfs_dev_replace_unlock(dev_replace);
 430leave_no_lock:
 431        if (tgt_device)
 432                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 433        return ret;
 434}
 435
 436static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 437                                       int scrub_ret)
 438{
 439        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 440        struct btrfs_device *tgt_device;
 441        struct btrfs_device *src_device;
 442        struct btrfs_root *root = fs_info->tree_root;
 443        u8 uuid_tmp[BTRFS_UUID_SIZE];
 444        struct btrfs_trans_handle *trans;
 445        int ret = 0;
 446
 447        /* don't allow cancel or unmount to disturb the finishing procedure */
 448        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 449
 450        btrfs_dev_replace_lock(dev_replace);
 451        /* was the operation canceled, or is it finished? */
 452        if (dev_replace->replace_state !=
 453            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
 454                btrfs_dev_replace_unlock(dev_replace);
 455                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 456                return 0;
 457        }
 458
 459        tgt_device = dev_replace->tgtdev;
 460        src_device = dev_replace->srcdev;
 461        btrfs_dev_replace_unlock(dev_replace);
 462
 463        /* replace old device with new one in mapping tree */
 464        if (!scrub_ret)
 465                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
 466                                                                src_device,
 467                                                                tgt_device);
 468
 469        /*
 470         * flush all outstanding I/O and inode extent mappings before the
 471         * copy operation is declared as being finished
 472         */
 473        ret = btrfs_start_delalloc_inodes(root, 0);
 474        if (ret) {
 475                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 476                return ret;
 477        }
 478        btrfs_wait_ordered_extents(root, 0);
 479
 480        trans = btrfs_start_transaction(root, 0);
 481        if (IS_ERR(trans)) {
 482                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 483                return PTR_ERR(trans);
 484        }
 485        ret = btrfs_commit_transaction(trans, root);
 486        WARN_ON(ret);
 487
 488        /* keep away write_all_supers() during the finishing procedure */
 489        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 490        btrfs_dev_replace_lock(dev_replace);
 491        dev_replace->replace_state =
 492                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 493                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 494        dev_replace->tgtdev = NULL;
 495        dev_replace->srcdev = NULL;
 496        dev_replace->time_stopped = btrfs_get_seconds_since_1970();
 497        dev_replace->item_needs_writeback = 1;
 498
 499        if (scrub_ret) {
 500                printk_in_rcu(KERN_ERR
 501                              "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
 502                              src_device->missing ? "<missing disk>" :
 503                                rcu_str_deref(src_device->name),
 504                              src_device->devid,
 505                              rcu_str_deref(tgt_device->name), scrub_ret);
 506                btrfs_dev_replace_unlock(dev_replace);
 507                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 508                if (tgt_device)
 509                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 510                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 511
 512                return 0;
 513        }
 514
 515        printk_in_rcu(KERN_INFO
 516                      "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
 517                      src_device->missing ? "<missing disk>" :
 518                        rcu_str_deref(src_device->name),
 519                      src_device->devid,
 520                      rcu_str_deref(tgt_device->name));
 521        tgt_device->is_tgtdev_for_dev_replace = 0;
 522        tgt_device->devid = src_device->devid;
 523        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 524        tgt_device->bytes_used = src_device->bytes_used;
 525        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
 526        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
 527        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
 528        tgt_device->total_bytes = src_device->total_bytes;
 529        tgt_device->disk_total_bytes = src_device->disk_total_bytes;
 530        tgt_device->bytes_used = src_device->bytes_used;
 531        if (fs_info->sb->s_bdev == src_device->bdev)
 532                fs_info->sb->s_bdev = tgt_device->bdev;
 533        if (fs_info->fs_devices->latest_bdev == src_device->bdev)
 534                fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 535        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 536
 537        btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 538        if (src_device->bdev) {
 539                /* zero out the old super */
 540                btrfs_scratch_superblock(src_device);
 541        }
 542        /*
 543         * this is again a consistent state where no dev_replace procedure
 544         * is running, the target device is part of the filesystem, the
 545         * source device is not part of the filesystem anymore and its 1st
 546         * superblock is scratched out so that it is no longer marked to
 547         * belong to this filesystem.
 548         */
 549        btrfs_dev_replace_unlock(dev_replace);
 550        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 551
 552        /* write back the superblocks */
 553        trans = btrfs_start_transaction(root, 0);
 554        if (!IS_ERR(trans))
 555                btrfs_commit_transaction(trans, root);
 556
 557        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 558
 559        return 0;
 560}
 561
 562static void btrfs_dev_replace_update_device_in_mapping_tree(
 563                                                struct btrfs_fs_info *fs_info,
 564                                                struct btrfs_device *srcdev,
 565                                                struct btrfs_device *tgtdev)
 566{
 567        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 568        struct extent_map *em;
 569        struct map_lookup *map;
 570        u64 start = 0;
 571        int i;
 572
 573        write_lock(&em_tree->lock);
 574        do {
 575                em = lookup_extent_mapping(em_tree, start, (u64)-1);
 576                if (!em)
 577                        break;
 578                map = (struct map_lookup *)em->bdev;
 579                for (i = 0; i < map->num_stripes; i++)
 580                        if (srcdev == map->stripes[i].dev)
 581                                map->stripes[i].dev = tgtdev;
 582                start = em->start + em->len;
 583                free_extent_map(em);
 584        } while (start);
 585        write_unlock(&em_tree->lock);
 586}
 587
 588static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
 589                                         char *srcdev_name,
 590                                         struct btrfs_device **device)
 591{
 592        int ret;
 593
 594        if (srcdevid) {
 595                ret = 0;
 596                *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
 597                                            NULL);
 598                if (!*device)
 599                        ret = -ENOENT;
 600        } else {
 601                ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
 602                                                           device);
 603        }
 604        return ret;
 605}
 606
 607void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 608                              struct btrfs_ioctl_dev_replace_args *args)
 609{
 610        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 611
 612        btrfs_dev_replace_lock(dev_replace);
 613        /* even if !dev_replace_is_valid, the values are good enough for
 614         * the replace_status ioctl */
 615        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 616        args->status.replace_state = dev_replace->replace_state;
 617        args->status.time_started = dev_replace->time_started;
 618        args->status.time_stopped = dev_replace->time_stopped;
 619        args->status.num_write_errors =
 620                atomic64_read(&dev_replace->num_write_errors);
 621        args->status.num_uncorrectable_read_errors =
 622                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
 623        switch (dev_replace->replace_state) {
 624        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 625        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 626                args->status.progress_1000 = 0;
 627                break;
 628        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 629                args->status.progress_1000 = 1000;
 630                break;
 631        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 632        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 633                args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
 634                        div64_u64(dev_replace->srcdev->total_bytes, 1000));
 635                break;
 636        }
 637        btrfs_dev_replace_unlock(dev_replace);
 638}
 639
 640int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 641                             struct btrfs_ioctl_dev_replace_args *args)
 642{
 643        args->result = __btrfs_dev_replace_cancel(fs_info);
 644        return 0;
 645}
 646
 647static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 648{
 649        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 650        struct btrfs_device *tgt_device = NULL;
 651        struct btrfs_trans_handle *trans;
 652        struct btrfs_root *root = fs_info->tree_root;
 653        u64 result;
 654        int ret;
 655
 656        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 657        btrfs_dev_replace_lock(dev_replace);
 658        switch (dev_replace->replace_state) {
 659        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 660        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 661        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 662                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
 663                btrfs_dev_replace_unlock(dev_replace);
 664                goto leave;
 665        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 666        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 667                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 668                tgt_device = dev_replace->tgtdev;
 669                dev_replace->tgtdev = NULL;
 670                dev_replace->srcdev = NULL;
 671                break;
 672        }
 673        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 674        dev_replace->time_stopped = btrfs_get_seconds_since_1970();
 675        dev_replace->item_needs_writeback = 1;
 676        btrfs_dev_replace_unlock(dev_replace);
 677        btrfs_scrub_cancel(fs_info);
 678
 679        trans = btrfs_start_transaction(root, 0);
 680        if (IS_ERR(trans)) {
 681                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 682                return PTR_ERR(trans);
 683        }
 684        ret = btrfs_commit_transaction(trans, root);
 685        WARN_ON(ret);
 686        if (tgt_device)
 687                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 688
 689leave:
 690        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 691        return result;
 692}
 693
 694void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 695{
 696        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 697
 698        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 699        btrfs_dev_replace_lock(dev_replace);
 700        switch (dev_replace->replace_state) {
 701        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 702        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 703        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 704        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 705                break;
 706        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 707                dev_replace->replace_state =
 708                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
 709                dev_replace->time_stopped = btrfs_get_seconds_since_1970();
 710                dev_replace->item_needs_writeback = 1;
 711                pr_info("btrfs: suspending dev_replace for unmount\n");
 712                break;
 713        }
 714
 715        btrfs_dev_replace_unlock(dev_replace);
 716        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 717}
 718
 719/* resume dev_replace procedure that was interrupted by unmount */
 720int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 721{
 722        struct task_struct *task;
 723        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 724
 725        btrfs_dev_replace_lock(dev_replace);
 726        switch (dev_replace->replace_state) {
 727        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 728        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 729        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 730                btrfs_dev_replace_unlock(dev_replace);
 731                return 0;
 732        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 733                break;
 734        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 735                dev_replace->replace_state =
 736                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 737                break;
 738        }
 739        if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
 740                pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
 741                        "btrfs: you may cancel the operation after 'mount -o degraded'\n");
 742                btrfs_dev_replace_unlock(dev_replace);
 743                return 0;
 744        }
 745        btrfs_dev_replace_unlock(dev_replace);
 746
 747        WARN_ON(atomic_xchg(
 748                &fs_info->mutually_exclusive_operation_running, 1));
 749        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
 750        return PTR_RET(task);
 751}
 752
 753static int btrfs_dev_replace_kthread(void *data)
 754{
 755        struct btrfs_fs_info *fs_info = data;
 756        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 757        struct btrfs_ioctl_dev_replace_args *status_args;
 758        u64 progress;
 759
 760        status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
 761        if (status_args) {
 762                btrfs_dev_replace_status(fs_info, status_args);
 763                progress = status_args->status.progress_1000;
 764                kfree(status_args);
 765                do_div(progress, 10);
 766                printk_in_rcu(KERN_INFO
 767                              "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 768                              dev_replace->srcdev->missing ? "<missing disk>" :
 769                                rcu_str_deref(dev_replace->srcdev->name),
 770                              dev_replace->srcdev->devid,
 771                              dev_replace->tgtdev ?
 772                                rcu_str_deref(dev_replace->tgtdev->name) :
 773                                "<missing target disk>",
 774                              (unsigned int)progress);
 775        }
 776        btrfs_dev_replace_continue_on_mount(fs_info);
 777        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 778
 779        return 0;
 780}
 781
 782static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
 783{
 784        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 785        int ret;
 786
 787        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
 788                              dev_replace->committed_cursor_left,
 789                              dev_replace->srcdev->total_bytes,
 790                              &dev_replace->scrub_progress, 0, 1);
 791        ret = btrfs_dev_replace_finishing(fs_info, ret);
 792        WARN_ON(ret);
 793        return 0;
 794}
 795
 796int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 797{
 798        if (!dev_replace->is_valid)
 799                return 0;
 800
 801        switch (dev_replace->replace_state) {
 802        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 803        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 804        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 805                return 0;
 806        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 807        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 808                /*
 809                 * return true even if tgtdev is missing (this is
 810                 * something that can happen if the dev_replace
 811                 * procedure is suspended by an umount and then
 812                 * the tgtdev is missing (or "btrfs dev scan") was
 813                 * not called and the the filesystem is remounted
 814                 * in degraded state. This does not stop the
 815                 * dev_replace procedure. It needs to be canceled
 816                 * manually if the cancelation is wanted.
 817                 */
 818                break;
 819        }
 820        return 1;
 821}
 822
 823void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
 824{
 825        /* the beginning is just an optimization for the typical case */
 826        if (atomic_read(&dev_replace->nesting_level) == 0) {
 827acquire_lock:
 828                /* this is not a nested case where the same thread
 829                 * is trying to acqurire the same lock twice */
 830                mutex_lock(&dev_replace->lock);
 831                mutex_lock(&dev_replace->lock_management_lock);
 832                dev_replace->lock_owner = current->pid;
 833                atomic_inc(&dev_replace->nesting_level);
 834                mutex_unlock(&dev_replace->lock_management_lock);
 835                return;
 836        }
 837
 838        mutex_lock(&dev_replace->lock_management_lock);
 839        if (atomic_read(&dev_replace->nesting_level) > 0 &&
 840            dev_replace->lock_owner == current->pid) {
 841                WARN_ON(!mutex_is_locked(&dev_replace->lock));
 842                atomic_inc(&dev_replace->nesting_level);
 843                mutex_unlock(&dev_replace->lock_management_lock);
 844                return;
 845        }
 846
 847        mutex_unlock(&dev_replace->lock_management_lock);
 848        goto acquire_lock;
 849}
 850
 851void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
 852{
 853        WARN_ON(!mutex_is_locked(&dev_replace->lock));
 854        mutex_lock(&dev_replace->lock_management_lock);
 855        WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
 856        WARN_ON(dev_replace->lock_owner != current->pid);
 857        atomic_dec(&dev_replace->nesting_level);
 858        if (atomic_read(&dev_replace->nesting_level) == 0) {
 859                dev_replace->lock_owner = 0;
 860                mutex_unlock(&dev_replace->lock_management_lock);
 861                mutex_unlock(&dev_replace->lock);
 862        } else {
 863                mutex_unlock(&dev_replace->lock_management_lock);
 864        }
 865}
 866