linux/fs/btrfs/dev-replace.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) STRATO AG 2012.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/bio.h>
  20#include <linux/slab.h>
  21#include <linux/buffer_head.h>
  22#include <linux/blkdev.h>
  23#include <linux/random.h>
  24#include <linux/iocontext.h>
  25#include <linux/capability.h>
  26#include <linux/kthread.h>
  27#include <linux/math64.h>
  28#include <asm/div64.h>
  29#include "ctree.h"
  30#include "extent_map.h"
  31#include "disk-io.h"
  32#include "transaction.h"
  33#include "print-tree.h"
  34#include "volumes.h"
  35#include "async-thread.h"
  36#include "check-integrity.h"
  37#include "rcu-string.h"
  38#include "dev-replace.h"
  39
  40static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  41                                       int scrub_ret);
  42static void btrfs_dev_replace_update_device_in_mapping_tree(
  43                                                struct btrfs_fs_info *fs_info,
  44                                                struct btrfs_device *srcdev,
  45                                                struct btrfs_device *tgtdev);
  46static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
  47                                         char *srcdev_name,
  48                                         struct btrfs_device **device);
  49static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
  50static int btrfs_dev_replace_kthread(void *data);
  51static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
  52
  53
  54int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  55{
  56        struct btrfs_key key;
  57        struct btrfs_root *dev_root = fs_info->dev_root;
  58        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  59        struct extent_buffer *eb;
  60        int slot;
  61        int ret = 0;
  62        struct btrfs_path *path = NULL;
  63        int item_size;
  64        struct btrfs_dev_replace_item *ptr;
  65        u64 src_devid;
  66
  67        path = btrfs_alloc_path();
  68        if (!path) {
  69                ret = -ENOMEM;
  70                goto out;
  71        }
  72
  73        key.objectid = 0;
  74        key.type = BTRFS_DEV_REPLACE_KEY;
  75        key.offset = 0;
  76        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  77        if (ret) {
  78no_valid_dev_replace_entry_found:
  79                ret = 0;
  80                dev_replace->replace_state =
  81                        BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
  82                dev_replace->cont_reading_from_srcdev_mode =
  83                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
  84                dev_replace->replace_state = 0;
  85                dev_replace->time_started = 0;
  86                dev_replace->time_stopped = 0;
  87                atomic64_set(&dev_replace->num_write_errors, 0);
  88                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  89                dev_replace->cursor_left = 0;
  90                dev_replace->committed_cursor_left = 0;
  91                dev_replace->cursor_left_last_write_of_item = 0;
  92                dev_replace->cursor_right = 0;
  93                dev_replace->srcdev = NULL;
  94                dev_replace->tgtdev = NULL;
  95                dev_replace->is_valid = 0;
  96                dev_replace->item_needs_writeback = 0;
  97                goto out;
  98        }
  99        slot = path->slots[0];
 100        eb = path->nodes[0];
 101        item_size = btrfs_item_size_nr(eb, slot);
 102        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 103
 104        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 105                pr_warn("btrfs: dev_replace entry found has unexpected size, ignore entry\n");
 106                goto no_valid_dev_replace_entry_found;
 107        }
 108
 109        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
 110        dev_replace->cont_reading_from_srcdev_mode =
 111                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
 112        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
 113        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
 114        dev_replace->time_stopped =
 115                btrfs_dev_replace_time_stopped(eb, ptr);
 116        atomic64_set(&dev_replace->num_write_errors,
 117                     btrfs_dev_replace_num_write_errors(eb, ptr));
 118        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
 119                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
 120        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
 121        dev_replace->committed_cursor_left = dev_replace->cursor_left;
 122        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
 123        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
 124        dev_replace->is_valid = 1;
 125
 126        dev_replace->item_needs_writeback = 0;
 127        switch (dev_replace->replace_state) {
 128        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 129        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 130        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 131                dev_replace->srcdev = NULL;
 132                dev_replace->tgtdev = NULL;
 133                break;
 134        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 135        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 136                dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
 137                                                        NULL, NULL);
 138                dev_replace->tgtdev = btrfs_find_device(fs_info,
 139                                                        BTRFS_DEV_REPLACE_DEVID,
 140                                                        NULL, NULL);
 141                /*
 142                 * allow 'btrfs dev replace_cancel' if src/tgt device is
 143                 * missing
 144                 */
 145                if (!dev_replace->srcdev &&
 146                    !btrfs_test_opt(dev_root, DEGRADED)) {
 147                        ret = -EIO;
 148                        pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
 149                                src_devid);
 150                }
 151                if (!dev_replace->tgtdev &&
 152                    !btrfs_test_opt(dev_root, DEGRADED)) {
 153                        ret = -EIO;
 154                        pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
 155                                BTRFS_DEV_REPLACE_DEVID);
 156                }
 157                if (dev_replace->tgtdev) {
 158                        if (dev_replace->srcdev) {
 159                                dev_replace->tgtdev->total_bytes =
 160                                        dev_replace->srcdev->total_bytes;
 161                                dev_replace->tgtdev->disk_total_bytes =
 162                                        dev_replace->srcdev->disk_total_bytes;
 163                                dev_replace->tgtdev->bytes_used =
 164                                        dev_replace->srcdev->bytes_used;
 165                        }
 166                        dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
 167                        btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
 168                                dev_replace->tgtdev);
 169                }
 170                break;
 171        }
 172
 173out:
 174        if (path)
 175                btrfs_free_path(path);
 176        return ret;
 177}
 178
 179/*
 180 * called from commit_transaction. Writes changed device replace state to
 181 * disk.
 182 */
 183int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 184                          struct btrfs_fs_info *fs_info)
 185{
 186        int ret;
 187        struct btrfs_root *dev_root = fs_info->dev_root;
 188        struct btrfs_path *path;
 189        struct btrfs_key key;
 190        struct extent_buffer *eb;
 191        struct btrfs_dev_replace_item *ptr;
 192        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 193
 194        btrfs_dev_replace_lock(dev_replace);
 195        if (!dev_replace->is_valid ||
 196            !dev_replace->item_needs_writeback) {
 197                btrfs_dev_replace_unlock(dev_replace);
 198                return 0;
 199        }
 200        btrfs_dev_replace_unlock(dev_replace);
 201
 202        key.objectid = 0;
 203        key.type = BTRFS_DEV_REPLACE_KEY;
 204        key.offset = 0;
 205
 206        path = btrfs_alloc_path();
 207        if (!path) {
 208                ret = -ENOMEM;
 209                goto out;
 210        }
 211        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 212        if (ret < 0) {
 213                pr_warn("btrfs: error %d while searching for dev_replace item!\n",
 214                        ret);
 215                goto out;
 216        }
 217
 218        if (ret == 0 &&
 219            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 220                /*
 221                 * need to delete old one and insert a new one.
 222                 * Since no attempt is made to recover any old state, if the
 223                 * dev_replace state is 'running', the data on the target
 224                 * drive is lost.
 225                 * It would be possible to recover the state: just make sure
 226                 * that the beginning of the item is never changed and always
 227                 * contains all the essential information. Then read this
 228                 * minimal set of information and use it as a base for the
 229                 * new state.
 230                 */
 231                ret = btrfs_del_item(trans, dev_root, path);
 232                if (ret != 0) {
 233                        pr_warn("btrfs: delete too small dev_replace item failed %d!\n",
 234                                ret);
 235                        goto out;
 236                }
 237                ret = 1;
 238        }
 239
 240        if (ret == 1) {
 241                /* need to insert a new item */
 242                btrfs_release_path(path);
 243                ret = btrfs_insert_empty_item(trans, dev_root, path,
 244                                              &key, sizeof(*ptr));
 245                if (ret < 0) {
 246                        pr_warn("btrfs: insert dev_replace item failed %d!\n",
 247                                ret);
 248                        goto out;
 249                }
 250        }
 251
 252        eb = path->nodes[0];
 253        ptr = btrfs_item_ptr(eb, path->slots[0],
 254                             struct btrfs_dev_replace_item);
 255
 256        btrfs_dev_replace_lock(dev_replace);
 257        if (dev_replace->srcdev)
 258                btrfs_set_dev_replace_src_devid(eb, ptr,
 259                        dev_replace->srcdev->devid);
 260        else
 261                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
 262        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
 263                dev_replace->cont_reading_from_srcdev_mode);
 264        btrfs_set_dev_replace_replace_state(eb, ptr,
 265                dev_replace->replace_state);
 266        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
 267        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
 268        btrfs_set_dev_replace_num_write_errors(eb, ptr,
 269                atomic64_read(&dev_replace->num_write_errors));
 270        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
 271                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
 272        dev_replace->cursor_left_last_write_of_item =
 273                dev_replace->cursor_left;
 274        btrfs_set_dev_replace_cursor_left(eb, ptr,
 275                dev_replace->cursor_left_last_write_of_item);
 276        btrfs_set_dev_replace_cursor_right(eb, ptr,
 277                dev_replace->cursor_right);
 278        dev_replace->item_needs_writeback = 0;
 279        btrfs_dev_replace_unlock(dev_replace);
 280
 281        btrfs_mark_buffer_dirty(eb);
 282
 283out:
 284        btrfs_free_path(path);
 285
 286        return ret;
 287}
 288
 289void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 290{
 291        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 292
 293        dev_replace->committed_cursor_left =
 294                dev_replace->cursor_left_last_write_of_item;
 295}
 296
 297int btrfs_dev_replace_start(struct btrfs_root *root,
 298                            struct btrfs_ioctl_dev_replace_args *args)
 299{
 300        struct btrfs_trans_handle *trans;
 301        struct btrfs_fs_info *fs_info = root->fs_info;
 302        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 303        int ret;
 304        struct btrfs_device *tgt_device = NULL;
 305        struct btrfs_device *src_device = NULL;
 306
 307        if (btrfs_fs_incompat(fs_info, RAID56)) {
 308                pr_warn("btrfs: dev_replace cannot yet handle RAID5/RAID6\n");
 309                return -EINVAL;
 310        }
 311
 312        switch (args->start.cont_reading_from_srcdev_mode) {
 313        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 314        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
 315                break;
 316        default:
 317                return -EINVAL;
 318        }
 319
 320        if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
 321            args->start.tgtdev_name[0] == '\0')
 322                return -EINVAL;
 323
 324        mutex_lock(&fs_info->volume_mutex);
 325        ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
 326                                            &tgt_device);
 327        if (ret) {
 328                pr_err("btrfs: target device %s is invalid!\n",
 329                       args->start.tgtdev_name);
 330                mutex_unlock(&fs_info->volume_mutex);
 331                return -EINVAL;
 332        }
 333
 334        ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
 335                                            args->start.srcdev_name,
 336                                            &src_device);
 337        mutex_unlock(&fs_info->volume_mutex);
 338        if (ret) {
 339                ret = -EINVAL;
 340                goto leave_no_lock;
 341        }
 342
 343        if (tgt_device->total_bytes < src_device->total_bytes) {
 344                pr_err("btrfs: target device is smaller than source device!\n");
 345                ret = -EINVAL;
 346                goto leave_no_lock;
 347        }
 348
 349        btrfs_dev_replace_lock(dev_replace);
 350        switch (dev_replace->replace_state) {
 351        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 352        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 353        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 354                break;
 355        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 356        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 357                args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
 358                goto leave;
 359        }
 360
 361        dev_replace->cont_reading_from_srcdev_mode =
 362                args->start.cont_reading_from_srcdev_mode;
 363        WARN_ON(!src_device);
 364        dev_replace->srcdev = src_device;
 365        WARN_ON(!tgt_device);
 366        dev_replace->tgtdev = tgt_device;
 367
 368        printk_in_rcu(KERN_INFO
 369                      "btrfs: dev_replace from %s (devid %llu) to %s started\n",
 370                      src_device->missing ? "<missing disk>" :
 371                        rcu_str_deref(src_device->name),
 372                      src_device->devid,
 373                      rcu_str_deref(tgt_device->name));
 374
 375        tgt_device->total_bytes = src_device->total_bytes;
 376        tgt_device->disk_total_bytes = src_device->disk_total_bytes;
 377        tgt_device->bytes_used = src_device->bytes_used;
 378
 379        /*
 380         * from now on, the writes to the srcdev are all duplicated to
 381         * go to the tgtdev as well (refer to btrfs_map_block()).
 382         */
 383        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 384        dev_replace->time_started = get_seconds();
 385        dev_replace->cursor_left = 0;
 386        dev_replace->committed_cursor_left = 0;
 387        dev_replace->cursor_left_last_write_of_item = 0;
 388        dev_replace->cursor_right = 0;
 389        dev_replace->is_valid = 1;
 390        dev_replace->item_needs_writeback = 1;
 391        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 392        btrfs_dev_replace_unlock(dev_replace);
 393
 394        btrfs_wait_ordered_roots(root->fs_info, -1);
 395
 396        /* force writing the updated state information to disk */
 397        trans = btrfs_start_transaction(root, 0);
 398        if (IS_ERR(trans)) {
 399                ret = PTR_ERR(trans);
 400                btrfs_dev_replace_lock(dev_replace);
 401                goto leave;
 402        }
 403
 404        ret = btrfs_commit_transaction(trans, root);
 405        WARN_ON(ret);
 406
 407        /* the disk copy procedure reuses the scrub code */
 408        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
 409                              src_device->total_bytes,
 410                              &dev_replace->scrub_progress, 0, 1);
 411
 412        ret = btrfs_dev_replace_finishing(root->fs_info, ret);
 413        WARN_ON(ret);
 414
 415        return 0;
 416
 417leave:
 418        dev_replace->srcdev = NULL;
 419        dev_replace->tgtdev = NULL;
 420        btrfs_dev_replace_unlock(dev_replace);
 421leave_no_lock:
 422        if (tgt_device)
 423                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 424        return ret;
 425}
 426
 427static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 428                                       int scrub_ret)
 429{
 430        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 431        struct btrfs_device *tgt_device;
 432        struct btrfs_device *src_device;
 433        struct btrfs_root *root = fs_info->tree_root;
 434        u8 uuid_tmp[BTRFS_UUID_SIZE];
 435        struct btrfs_trans_handle *trans;
 436        int ret = 0;
 437
 438        /* don't allow cancel or unmount to disturb the finishing procedure */
 439        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 440
 441        btrfs_dev_replace_lock(dev_replace);
 442        /* was the operation canceled, or is it finished? */
 443        if (dev_replace->replace_state !=
 444            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
 445                btrfs_dev_replace_unlock(dev_replace);
 446                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 447                return 0;
 448        }
 449
 450        tgt_device = dev_replace->tgtdev;
 451        src_device = dev_replace->srcdev;
 452        btrfs_dev_replace_unlock(dev_replace);
 453
 454        /* replace old device with new one in mapping tree */
 455        if (!scrub_ret)
 456                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
 457                                                                src_device,
 458                                                                tgt_device);
 459
 460        /*
 461         * flush all outstanding I/O and inode extent mappings before the
 462         * copy operation is declared as being finished
 463         */
 464        ret = btrfs_start_delalloc_roots(root->fs_info, 0);
 465        if (ret) {
 466                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 467                return ret;
 468        }
 469        btrfs_wait_ordered_roots(root->fs_info, -1);
 470
 471        trans = btrfs_start_transaction(root, 0);
 472        if (IS_ERR(trans)) {
 473                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 474                return PTR_ERR(trans);
 475        }
 476        ret = btrfs_commit_transaction(trans, root);
 477        WARN_ON(ret);
 478
 479        /* keep away write_all_supers() during the finishing procedure */
 480        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 481        btrfs_dev_replace_lock(dev_replace);
 482        dev_replace->replace_state =
 483                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 484                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 485        dev_replace->tgtdev = NULL;
 486        dev_replace->srcdev = NULL;
 487        dev_replace->time_stopped = get_seconds();
 488        dev_replace->item_needs_writeback = 1;
 489
 490        if (scrub_ret) {
 491                printk_in_rcu(KERN_ERR
 492                              "btrfs: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
 493                              src_device->missing ? "<missing disk>" :
 494                                rcu_str_deref(src_device->name),
 495                              src_device->devid,
 496                              rcu_str_deref(tgt_device->name), scrub_ret);
 497                btrfs_dev_replace_unlock(dev_replace);
 498                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 499                if (tgt_device)
 500                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 501                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 502
 503                return 0;
 504        }
 505
 506        printk_in_rcu(KERN_INFO
 507                      "btrfs: dev_replace from %s (devid %llu) to %s) finished\n",
 508                      src_device->missing ? "<missing disk>" :
 509                        rcu_str_deref(src_device->name),
 510                      src_device->devid,
 511                      rcu_str_deref(tgt_device->name));
 512        tgt_device->is_tgtdev_for_dev_replace = 0;
 513        tgt_device->devid = src_device->devid;
 514        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 515        tgt_device->bytes_used = src_device->bytes_used;
 516        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
 517        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
 518        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
 519        tgt_device->total_bytes = src_device->total_bytes;
 520        tgt_device->disk_total_bytes = src_device->disk_total_bytes;
 521        tgt_device->bytes_used = src_device->bytes_used;
 522        if (fs_info->sb->s_bdev == src_device->bdev)
 523                fs_info->sb->s_bdev = tgt_device->bdev;
 524        if (fs_info->fs_devices->latest_bdev == src_device->bdev)
 525                fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 526        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 527
 528        btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 529
 530        /*
 531         * this is again a consistent state where no dev_replace procedure
 532         * is running, the target device is part of the filesystem, the
 533         * source device is not part of the filesystem anymore and its 1st
 534         * superblock is scratched out so that it is no longer marked to
 535         * belong to this filesystem.
 536         */
 537        btrfs_dev_replace_unlock(dev_replace);
 538        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 539
 540        /* write back the superblocks */
 541        trans = btrfs_start_transaction(root, 0);
 542        if (!IS_ERR(trans))
 543                btrfs_commit_transaction(trans, root);
 544
 545        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 546
 547        return 0;
 548}
 549
 550static void btrfs_dev_replace_update_device_in_mapping_tree(
 551                                                struct btrfs_fs_info *fs_info,
 552                                                struct btrfs_device *srcdev,
 553                                                struct btrfs_device *tgtdev)
 554{
 555        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 556        struct extent_map *em;
 557        struct map_lookup *map;
 558        u64 start = 0;
 559        int i;
 560
 561        write_lock(&em_tree->lock);
 562        do {
 563                em = lookup_extent_mapping(em_tree, start, (u64)-1);
 564                if (!em)
 565                        break;
 566                map = (struct map_lookup *)em->bdev;
 567                for (i = 0; i < map->num_stripes; i++)
 568                        if (srcdev == map->stripes[i].dev)
 569                                map->stripes[i].dev = tgtdev;
 570                start = em->start + em->len;
 571                free_extent_map(em);
 572        } while (start);
 573        write_unlock(&em_tree->lock);
 574}
 575
 576static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
 577                                         char *srcdev_name,
 578                                         struct btrfs_device **device)
 579{
 580        int ret;
 581
 582        if (srcdevid) {
 583                ret = 0;
 584                *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
 585                                            NULL);
 586                if (!*device)
 587                        ret = -ENOENT;
 588        } else {
 589                ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
 590                                                           device);
 591        }
 592        return ret;
 593}
 594
 595void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 596                              struct btrfs_ioctl_dev_replace_args *args)
 597{
 598        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 599
 600        btrfs_dev_replace_lock(dev_replace);
 601        /* even if !dev_replace_is_valid, the values are good enough for
 602         * the replace_status ioctl */
 603        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 604        args->status.replace_state = dev_replace->replace_state;
 605        args->status.time_started = dev_replace->time_started;
 606        args->status.time_stopped = dev_replace->time_stopped;
 607        args->status.num_write_errors =
 608                atomic64_read(&dev_replace->num_write_errors);
 609        args->status.num_uncorrectable_read_errors =
 610                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
 611        switch (dev_replace->replace_state) {
 612        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 613        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 614                args->status.progress_1000 = 0;
 615                break;
 616        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 617                args->status.progress_1000 = 1000;
 618                break;
 619        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 620        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 621                args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
 622                        div64_u64(dev_replace->srcdev->total_bytes, 1000));
 623                break;
 624        }
 625        btrfs_dev_replace_unlock(dev_replace);
 626}
 627
 628int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 629                             struct btrfs_ioctl_dev_replace_args *args)
 630{
 631        args->result = __btrfs_dev_replace_cancel(fs_info);
 632        return 0;
 633}
 634
 635static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 636{
 637        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 638        struct btrfs_device *tgt_device = NULL;
 639        struct btrfs_trans_handle *trans;
 640        struct btrfs_root *root = fs_info->tree_root;
 641        u64 result;
 642        int ret;
 643
 644        if (fs_info->sb->s_flags & MS_RDONLY)
 645                return -EROFS;
 646
 647        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 648        btrfs_dev_replace_lock(dev_replace);
 649        switch (dev_replace->replace_state) {
 650        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 651        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 652        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 653                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
 654                btrfs_dev_replace_unlock(dev_replace);
 655                goto leave;
 656        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 657        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 658                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 659                tgt_device = dev_replace->tgtdev;
 660                dev_replace->tgtdev = NULL;
 661                dev_replace->srcdev = NULL;
 662                break;
 663        }
 664        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 665        dev_replace->time_stopped = get_seconds();
 666        dev_replace->item_needs_writeback = 1;
 667        btrfs_dev_replace_unlock(dev_replace);
 668        btrfs_scrub_cancel(fs_info);
 669
 670        trans = btrfs_start_transaction(root, 0);
 671        if (IS_ERR(trans)) {
 672                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 673                return PTR_ERR(trans);
 674        }
 675        ret = btrfs_commit_transaction(trans, root);
 676        WARN_ON(ret);
 677        if (tgt_device)
 678                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 679
 680leave:
 681        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 682        return result;
 683}
 684
 685void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 686{
 687        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 688
 689        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 690        btrfs_dev_replace_lock(dev_replace);
 691        switch (dev_replace->replace_state) {
 692        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 693        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 694        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 695        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 696                break;
 697        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 698                dev_replace->replace_state =
 699                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
 700                dev_replace->time_stopped = get_seconds();
 701                dev_replace->item_needs_writeback = 1;
 702                pr_info("btrfs: suspending dev_replace for unmount\n");
 703                break;
 704        }
 705
 706        btrfs_dev_replace_unlock(dev_replace);
 707        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 708}
 709
 710/* resume dev_replace procedure that was interrupted by unmount */
 711int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 712{
 713        struct task_struct *task;
 714        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 715
 716        btrfs_dev_replace_lock(dev_replace);
 717        switch (dev_replace->replace_state) {
 718        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 719        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 720        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 721                btrfs_dev_replace_unlock(dev_replace);
 722                return 0;
 723        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 724                break;
 725        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 726                dev_replace->replace_state =
 727                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 728                break;
 729        }
 730        if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
 731                pr_info("btrfs: cannot continue dev_replace, tgtdev is missing\n"
 732                        "btrfs: you may cancel the operation after 'mount -o degraded'\n");
 733                btrfs_dev_replace_unlock(dev_replace);
 734                return 0;
 735        }
 736        btrfs_dev_replace_unlock(dev_replace);
 737
 738        WARN_ON(atomic_xchg(
 739                &fs_info->mutually_exclusive_operation_running, 1));
 740        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
 741        return PTR_ERR_OR_ZERO(task);
 742}
 743
 744static int btrfs_dev_replace_kthread(void *data)
 745{
 746        struct btrfs_fs_info *fs_info = data;
 747        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 748        struct btrfs_ioctl_dev_replace_args *status_args;
 749        u64 progress;
 750
 751        status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
 752        if (status_args) {
 753                btrfs_dev_replace_status(fs_info, status_args);
 754                progress = status_args->status.progress_1000;
 755                kfree(status_args);
 756                do_div(progress, 10);
 757                printk_in_rcu(KERN_INFO
 758                              "btrfs: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 759                              dev_replace->srcdev->missing ? "<missing disk>" :
 760                                rcu_str_deref(dev_replace->srcdev->name),
 761                              dev_replace->srcdev->devid,
 762                              dev_replace->tgtdev ?
 763                                rcu_str_deref(dev_replace->tgtdev->name) :
 764                                "<missing target disk>",
 765                              (unsigned int)progress);
 766        }
 767        btrfs_dev_replace_continue_on_mount(fs_info);
 768        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 769
 770        return 0;
 771}
 772
 773static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
 774{
 775        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 776        int ret;
 777
 778        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
 779                              dev_replace->committed_cursor_left,
 780                              dev_replace->srcdev->total_bytes,
 781                              &dev_replace->scrub_progress, 0, 1);
 782        ret = btrfs_dev_replace_finishing(fs_info, ret);
 783        WARN_ON(ret);
 784        return 0;
 785}
 786
 787int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 788{
 789        if (!dev_replace->is_valid)
 790                return 0;
 791
 792        switch (dev_replace->replace_state) {
 793        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 794        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 795        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 796                return 0;
 797        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 798        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 799                /*
 800                 * return true even if tgtdev is missing (this is
 801                 * something that can happen if the dev_replace
 802                 * procedure is suspended by an umount and then
 803                 * the tgtdev is missing (or "btrfs dev scan") was
 804                 * not called and the the filesystem is remounted
 805                 * in degraded state. This does not stop the
 806                 * dev_replace procedure. It needs to be canceled
 807                 * manually if the cancelation is wanted.
 808                 */
 809                break;
 810        }
 811        return 1;
 812}
 813
 814void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
 815{
 816        /* the beginning is just an optimization for the typical case */
 817        if (atomic_read(&dev_replace->nesting_level) == 0) {
 818acquire_lock:
 819                /* this is not a nested case where the same thread
 820                 * is trying to acqurire the same lock twice */
 821                mutex_lock(&dev_replace->lock);
 822                mutex_lock(&dev_replace->lock_management_lock);
 823                dev_replace->lock_owner = current->pid;
 824                atomic_inc(&dev_replace->nesting_level);
 825                mutex_unlock(&dev_replace->lock_management_lock);
 826                return;
 827        }
 828
 829        mutex_lock(&dev_replace->lock_management_lock);
 830        if (atomic_read(&dev_replace->nesting_level) > 0 &&
 831            dev_replace->lock_owner == current->pid) {
 832                WARN_ON(!mutex_is_locked(&dev_replace->lock));
 833                atomic_inc(&dev_replace->nesting_level);
 834                mutex_unlock(&dev_replace->lock_management_lock);
 835                return;
 836        }
 837
 838        mutex_unlock(&dev_replace->lock_management_lock);
 839        goto acquire_lock;
 840}
 841
 842void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
 843{
 844        WARN_ON(!mutex_is_locked(&dev_replace->lock));
 845        mutex_lock(&dev_replace->lock_management_lock);
 846        WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
 847        WARN_ON(dev_replace->lock_owner != current->pid);
 848        atomic_dec(&dev_replace->nesting_level);
 849        if (atomic_read(&dev_replace->nesting_level) == 0) {
 850                dev_replace->lock_owner = 0;
 851                mutex_unlock(&dev_replace->lock_management_lock);
 852                mutex_unlock(&dev_replace->lock);
 853        } else {
 854                mutex_unlock(&dev_replace->lock_management_lock);
 855        }
 856}
 857