linux/fs/btrfs/dev-replace.c
<<
>>
Prefs
   1/*
   2 * Copyright (C) STRATO AG 2012.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18#include <linux/sched.h>
  19#include <linux/bio.h>
  20#include <linux/slab.h>
  21#include <linux/buffer_head.h>
  22#include <linux/blkdev.h>
  23#include <linux/random.h>
  24#include <linux/iocontext.h>
  25#include <linux/capability.h>
  26#include <linux/kthread.h>
  27#include <linux/math64.h>
  28#include <asm/div64.h>
  29#include "ctree.h"
  30#include "extent_map.h"
  31#include "disk-io.h"
  32#include "transaction.h"
  33#include "print-tree.h"
  34#include "volumes.h"
  35#include "async-thread.h"
  36#include "check-integrity.h"
  37#include "rcu-string.h"
  38#include "dev-replace.h"
  39#include "sysfs.h"
  40
  41static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  42                                       int scrub_ret);
  43static void btrfs_dev_replace_update_device_in_mapping_tree(
  44                                                struct btrfs_fs_info *fs_info,
  45                                                struct btrfs_device *srcdev,
  46                                                struct btrfs_device *tgtdev);
  47static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
  48                                         char *srcdev_name,
  49                                         struct btrfs_device **device);
  50static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info);
  51static int btrfs_dev_replace_kthread(void *data);
  52static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info);
  53
  54
  55int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  56{
  57        struct btrfs_key key;
  58        struct btrfs_root *dev_root = fs_info->dev_root;
  59        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  60        struct extent_buffer *eb;
  61        int slot;
  62        int ret = 0;
  63        struct btrfs_path *path = NULL;
  64        int item_size;
  65        struct btrfs_dev_replace_item *ptr;
  66        u64 src_devid;
  67
  68        path = btrfs_alloc_path();
  69        if (!path) {
  70                ret = -ENOMEM;
  71                goto out;
  72        }
  73
  74        key.objectid = 0;
  75        key.type = BTRFS_DEV_REPLACE_KEY;
  76        key.offset = 0;
  77        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  78        if (ret) {
  79no_valid_dev_replace_entry_found:
  80                ret = 0;
  81                dev_replace->replace_state =
  82                        BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED;
  83                dev_replace->cont_reading_from_srcdev_mode =
  84                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
  85                dev_replace->replace_state = 0;
  86                dev_replace->time_started = 0;
  87                dev_replace->time_stopped = 0;
  88                atomic64_set(&dev_replace->num_write_errors, 0);
  89                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
  90                dev_replace->cursor_left = 0;
  91                dev_replace->committed_cursor_left = 0;
  92                dev_replace->cursor_left_last_write_of_item = 0;
  93                dev_replace->cursor_right = 0;
  94                dev_replace->srcdev = NULL;
  95                dev_replace->tgtdev = NULL;
  96                dev_replace->is_valid = 0;
  97                dev_replace->item_needs_writeback = 0;
  98                goto out;
  99        }
 100        slot = path->slots[0];
 101        eb = path->nodes[0];
 102        item_size = btrfs_item_size_nr(eb, slot);
 103        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 104
 105        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 106                btrfs_warn(fs_info,
 107                        "dev_replace entry found has unexpected size, ignore entry");
 108                goto no_valid_dev_replace_entry_found;
 109        }
 110
 111        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
 112        dev_replace->cont_reading_from_srcdev_mode =
 113                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
 114        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
 115        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
 116        dev_replace->time_stopped =
 117                btrfs_dev_replace_time_stopped(eb, ptr);
 118        atomic64_set(&dev_replace->num_write_errors,
 119                     btrfs_dev_replace_num_write_errors(eb, ptr));
 120        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
 121                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
 122        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
 123        dev_replace->committed_cursor_left = dev_replace->cursor_left;
 124        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
 125        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
 126        dev_replace->is_valid = 1;
 127
 128        dev_replace->item_needs_writeback = 0;
 129        switch (dev_replace->replace_state) {
 130        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 131        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 132        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 133                dev_replace->srcdev = NULL;
 134                dev_replace->tgtdev = NULL;
 135                break;
 136        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 137        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 138                dev_replace->srcdev = btrfs_find_device(fs_info, src_devid,
 139                                                        NULL, NULL);
 140                dev_replace->tgtdev = btrfs_find_device(fs_info,
 141                                                        BTRFS_DEV_REPLACE_DEVID,
 142                                                        NULL, NULL);
 143                /*
 144                 * allow 'btrfs dev replace_cancel' if src/tgt device is
 145                 * missing
 146                 */
 147                if (!dev_replace->srcdev &&
 148                    !btrfs_test_opt(dev_root, DEGRADED)) {
 149                        ret = -EIO;
 150                        btrfs_warn(fs_info,
 151                           "cannot mount because device replace operation is ongoing and");
 152                        btrfs_warn(fs_info,
 153                           "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 154                           src_devid);
 155                }
 156                if (!dev_replace->tgtdev &&
 157                    !btrfs_test_opt(dev_root, DEGRADED)) {
 158                        ret = -EIO;
 159                        btrfs_warn(fs_info,
 160                           "cannot mount because device replace operation is ongoing and");
 161                        btrfs_warn(fs_info,
 162                           "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 163                                BTRFS_DEV_REPLACE_DEVID);
 164                }
 165                if (dev_replace->tgtdev) {
 166                        if (dev_replace->srcdev) {
 167                                dev_replace->tgtdev->total_bytes =
 168                                        dev_replace->srcdev->total_bytes;
 169                                dev_replace->tgtdev->disk_total_bytes =
 170                                        dev_replace->srcdev->disk_total_bytes;
 171                                dev_replace->tgtdev->commit_total_bytes =
 172                                        dev_replace->srcdev->commit_total_bytes;
 173                                dev_replace->tgtdev->bytes_used =
 174                                        dev_replace->srcdev->bytes_used;
 175                                dev_replace->tgtdev->commit_bytes_used =
 176                                        dev_replace->srcdev->commit_bytes_used;
 177                        }
 178                        dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1;
 179                        btrfs_init_dev_replace_tgtdev_for_resume(fs_info,
 180                                dev_replace->tgtdev);
 181                }
 182                break;
 183        }
 184
 185out:
 186        if (path)
 187                btrfs_free_path(path);
 188        return ret;
 189}
 190
 191/*
 192 * called from commit_transaction. Writes changed device replace state to
 193 * disk.
 194 */
 195int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
 196                          struct btrfs_fs_info *fs_info)
 197{
 198        int ret;
 199        struct btrfs_root *dev_root = fs_info->dev_root;
 200        struct btrfs_path *path;
 201        struct btrfs_key key;
 202        struct extent_buffer *eb;
 203        struct btrfs_dev_replace_item *ptr;
 204        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 205
 206        btrfs_dev_replace_lock(dev_replace);
 207        if (!dev_replace->is_valid ||
 208            !dev_replace->item_needs_writeback) {
 209                btrfs_dev_replace_unlock(dev_replace);
 210                return 0;
 211        }
 212        btrfs_dev_replace_unlock(dev_replace);
 213
 214        key.objectid = 0;
 215        key.type = BTRFS_DEV_REPLACE_KEY;
 216        key.offset = 0;
 217
 218        path = btrfs_alloc_path();
 219        if (!path) {
 220                ret = -ENOMEM;
 221                goto out;
 222        }
 223        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 224        if (ret < 0) {
 225                btrfs_warn(fs_info, "error %d while searching for dev_replace item!",
 226                        ret);
 227                goto out;
 228        }
 229
 230        if (ret == 0 &&
 231            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 232                /*
 233                 * need to delete old one and insert a new one.
 234                 * Since no attempt is made to recover any old state, if the
 235                 * dev_replace state is 'running', the data on the target
 236                 * drive is lost.
 237                 * It would be possible to recover the state: just make sure
 238                 * that the beginning of the item is never changed and always
 239                 * contains all the essential information. Then read this
 240                 * minimal set of information and use it as a base for the
 241                 * new state.
 242                 */
 243                ret = btrfs_del_item(trans, dev_root, path);
 244                if (ret != 0) {
 245                        btrfs_warn(fs_info, "delete too small dev_replace item failed %d!",
 246                                ret);
 247                        goto out;
 248                }
 249                ret = 1;
 250        }
 251
 252        if (ret == 1) {
 253                /* need to insert a new item */
 254                btrfs_release_path(path);
 255                ret = btrfs_insert_empty_item(trans, dev_root, path,
 256                                              &key, sizeof(*ptr));
 257                if (ret < 0) {
 258                        btrfs_warn(fs_info, "insert dev_replace item failed %d!",
 259                                ret);
 260                        goto out;
 261                }
 262        }
 263
 264        eb = path->nodes[0];
 265        ptr = btrfs_item_ptr(eb, path->slots[0],
 266                             struct btrfs_dev_replace_item);
 267
 268        btrfs_dev_replace_lock(dev_replace);
 269        if (dev_replace->srcdev)
 270                btrfs_set_dev_replace_src_devid(eb, ptr,
 271                        dev_replace->srcdev->devid);
 272        else
 273                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
 274        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
 275                dev_replace->cont_reading_from_srcdev_mode);
 276        btrfs_set_dev_replace_replace_state(eb, ptr,
 277                dev_replace->replace_state);
 278        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
 279        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
 280        btrfs_set_dev_replace_num_write_errors(eb, ptr,
 281                atomic64_read(&dev_replace->num_write_errors));
 282        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
 283                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
 284        dev_replace->cursor_left_last_write_of_item =
 285                dev_replace->cursor_left;
 286        btrfs_set_dev_replace_cursor_left(eb, ptr,
 287                dev_replace->cursor_left_last_write_of_item);
 288        btrfs_set_dev_replace_cursor_right(eb, ptr,
 289                dev_replace->cursor_right);
 290        dev_replace->item_needs_writeback = 0;
 291        btrfs_dev_replace_unlock(dev_replace);
 292
 293        btrfs_mark_buffer_dirty(eb);
 294
 295out:
 296        btrfs_free_path(path);
 297
 298        return ret;
 299}
 300
 301void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
 302{
 303        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 304
 305        dev_replace->committed_cursor_left =
 306                dev_replace->cursor_left_last_write_of_item;
 307}
 308
 309int btrfs_dev_replace_start(struct btrfs_root *root,
 310                            struct btrfs_ioctl_dev_replace_args *args)
 311{
 312        struct btrfs_trans_handle *trans;
 313        struct btrfs_fs_info *fs_info = root->fs_info;
 314        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 315        int ret;
 316        struct btrfs_device *tgt_device = NULL;
 317        struct btrfs_device *src_device = NULL;
 318
 319        if (btrfs_fs_incompat(fs_info, RAID56)) {
 320                btrfs_warn(fs_info, "dev_replace cannot yet handle RAID5/RAID6");
 321                return -EOPNOTSUPP;
 322        }
 323
 324        switch (args->start.cont_reading_from_srcdev_mode) {
 325        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 326        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
 327                break;
 328        default:
 329                return -EINVAL;
 330        }
 331
 332        if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
 333            args->start.tgtdev_name[0] == '\0')
 334                return -EINVAL;
 335
 336        /*
 337         * Here we commit the transaction to make sure commit_total_bytes
 338         * of all the devices are updated.
 339         */
 340        trans = btrfs_attach_transaction(root);
 341        if (!IS_ERR(trans)) {
 342                ret = btrfs_commit_transaction(trans, root);
 343                if (ret)
 344                        return ret;
 345        } else if (PTR_ERR(trans) != -ENOENT) {
 346                return PTR_ERR(trans);
 347        }
 348
 349        /* the disk copy procedure reuses the scrub code */
 350        mutex_lock(&fs_info->volume_mutex);
 351        ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid,
 352                                            args->start.srcdev_name,
 353                                            &src_device);
 354        if (ret) {
 355                mutex_unlock(&fs_info->volume_mutex);
 356                return ret;
 357        }
 358
 359        ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name,
 360                                            src_device, &tgt_device);
 361        mutex_unlock(&fs_info->volume_mutex);
 362        if (ret)
 363                return ret;
 364
 365        btrfs_dev_replace_lock(dev_replace);
 366        switch (dev_replace->replace_state) {
 367        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 368        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 369        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 370                break;
 371        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 372        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 373                args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
 374                goto leave;
 375        }
 376
 377        dev_replace->cont_reading_from_srcdev_mode =
 378                args->start.cont_reading_from_srcdev_mode;
 379        WARN_ON(!src_device);
 380        dev_replace->srcdev = src_device;
 381        WARN_ON(!tgt_device);
 382        dev_replace->tgtdev = tgt_device;
 383
 384        printk_in_rcu(KERN_INFO
 385                      "BTRFS: dev_replace from %s (devid %llu) to %s started\n",
 386                      src_device->missing ? "<missing disk>" :
 387                        rcu_str_deref(src_device->name),
 388                      src_device->devid,
 389                      rcu_str_deref(tgt_device->name));
 390
 391        /*
 392         * from now on, the writes to the srcdev are all duplicated to
 393         * go to the tgtdev as well (refer to btrfs_map_block()).
 394         */
 395        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 396        dev_replace->time_started = get_seconds();
 397        dev_replace->cursor_left = 0;
 398        dev_replace->committed_cursor_left = 0;
 399        dev_replace->cursor_left_last_write_of_item = 0;
 400        dev_replace->cursor_right = 0;
 401        dev_replace->is_valid = 1;
 402        dev_replace->item_needs_writeback = 1;
 403        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 404        btrfs_dev_replace_unlock(dev_replace);
 405
 406        btrfs_wait_ordered_roots(root->fs_info, -1);
 407
 408        /* force writing the updated state information to disk */
 409        trans = btrfs_start_transaction(root, 0);
 410        if (IS_ERR(trans)) {
 411                ret = PTR_ERR(trans);
 412                btrfs_dev_replace_lock(dev_replace);
 413                goto leave;
 414        }
 415
 416        ret = btrfs_commit_transaction(trans, root);
 417        WARN_ON(ret);
 418
 419        /* the disk copy procedure reuses the scrub code */
 420        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
 421                              btrfs_device_get_total_bytes(src_device),
 422                              &dev_replace->scrub_progress, 0, 1);
 423
 424        ret = btrfs_dev_replace_finishing(root->fs_info, ret);
 425        WARN_ON(ret);
 426
 427        return 0;
 428
 429leave:
 430        dev_replace->srcdev = NULL;
 431        dev_replace->tgtdev = NULL;
 432        btrfs_dev_replace_unlock(dev_replace);
 433        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 434        return ret;
 435}
 436
 437/*
 438 * blocked until all flighting bios are finished.
 439 */
 440static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 441{
 442        s64 writers;
 443        DEFINE_WAIT(wait);
 444
 445        set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 446        do {
 447                prepare_to_wait(&fs_info->replace_wait, &wait,
 448                                TASK_UNINTERRUPTIBLE);
 449                writers = percpu_counter_sum(&fs_info->bio_counter);
 450                if (writers)
 451                        schedule();
 452                finish_wait(&fs_info->replace_wait, &wait);
 453        } while (writers);
 454}
 455
 456/*
 457 * we have removed target device, it is safe to allow new bios request.
 458 */
 459static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 460{
 461        clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 462        if (waitqueue_active(&fs_info->replace_wait))
 463                wake_up(&fs_info->replace_wait);
 464}
 465
 466static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 467                                       int scrub_ret)
 468{
 469        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 470        struct btrfs_device *tgt_device;
 471        struct btrfs_device *src_device;
 472        struct btrfs_root *root = fs_info->tree_root;
 473        u8 uuid_tmp[BTRFS_UUID_SIZE];
 474        struct btrfs_trans_handle *trans;
 475        int ret = 0;
 476
 477        /* don't allow cancel or unmount to disturb the finishing procedure */
 478        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 479
 480        btrfs_dev_replace_lock(dev_replace);
 481        /* was the operation canceled, or is it finished? */
 482        if (dev_replace->replace_state !=
 483            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
 484                btrfs_dev_replace_unlock(dev_replace);
 485                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 486                return 0;
 487        }
 488
 489        tgt_device = dev_replace->tgtdev;
 490        src_device = dev_replace->srcdev;
 491        btrfs_dev_replace_unlock(dev_replace);
 492
 493        /*
 494         * flush all outstanding I/O and inode extent mappings before the
 495         * copy operation is declared as being finished
 496         */
 497        ret = btrfs_start_delalloc_roots(root->fs_info, 0, -1);
 498        if (ret) {
 499                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 500                return ret;
 501        }
 502        btrfs_wait_ordered_roots(root->fs_info, -1);
 503
 504        trans = btrfs_start_transaction(root, 0);
 505        if (IS_ERR(trans)) {
 506                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 507                return PTR_ERR(trans);
 508        }
 509        ret = btrfs_commit_transaction(trans, root);
 510        WARN_ON(ret);
 511
 512        mutex_lock(&uuid_mutex);
 513        /* keep away write_all_supers() during the finishing procedure */
 514        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 515        mutex_lock(&root->fs_info->chunk_mutex);
 516        btrfs_dev_replace_lock(dev_replace);
 517        dev_replace->replace_state =
 518                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 519                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 520        dev_replace->tgtdev = NULL;
 521        dev_replace->srcdev = NULL;
 522        dev_replace->time_stopped = get_seconds();
 523        dev_replace->item_needs_writeback = 1;
 524
 525        /* replace old device with new one in mapping tree */
 526        if (!scrub_ret) {
 527                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
 528                                                                src_device,
 529                                                                tgt_device);
 530        } else {
 531                printk_in_rcu(KERN_ERR
 532                              "BTRFS: btrfs_scrub_dev(%s, %llu, %s) failed %d\n",
 533                              src_device->missing ? "<missing disk>" :
 534                                rcu_str_deref(src_device->name),
 535                              src_device->devid,
 536                              rcu_str_deref(tgt_device->name), scrub_ret);
 537                btrfs_dev_replace_unlock(dev_replace);
 538                mutex_unlock(&root->fs_info->chunk_mutex);
 539                mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 540                mutex_unlock(&uuid_mutex);
 541                if (tgt_device)
 542                        btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 543                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 544
 545                return 0;
 546        }
 547
 548        printk_in_rcu(KERN_INFO
 549                      "BTRFS: dev_replace from %s (devid %llu) to %s finished\n",
 550                      src_device->missing ? "<missing disk>" :
 551                        rcu_str_deref(src_device->name),
 552                      src_device->devid,
 553                      rcu_str_deref(tgt_device->name));
 554        tgt_device->is_tgtdev_for_dev_replace = 0;
 555        tgt_device->devid = src_device->devid;
 556        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 557        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
 558        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
 559        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
 560        btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
 561        btrfs_device_set_disk_total_bytes(tgt_device,
 562                                          src_device->disk_total_bytes);
 563        btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
 564        ASSERT(list_empty(&src_device->resized_list));
 565        tgt_device->commit_total_bytes = src_device->commit_total_bytes;
 566        tgt_device->commit_bytes_used = src_device->bytes_used;
 567        if (fs_info->sb->s_bdev == src_device->bdev)
 568                fs_info->sb->s_bdev = tgt_device->bdev;
 569        if (fs_info->fs_devices->latest_bdev == src_device->bdev)
 570                fs_info->fs_devices->latest_bdev = tgt_device->bdev;
 571        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
 572        fs_info->fs_devices->rw_devices++;
 573
 574        /* replace the sysfs entry */
 575        btrfs_kobj_rm_device(fs_info, src_device);
 576        btrfs_kobj_add_device(fs_info, tgt_device);
 577
 578        btrfs_dev_replace_unlock(dev_replace);
 579
 580        btrfs_rm_dev_replace_blocked(fs_info);
 581
 582        btrfs_rm_dev_replace_srcdev(fs_info, src_device);
 583
 584        btrfs_rm_dev_replace_unblocked(fs_info);
 585
 586        /*
 587         * this is again a consistent state where no dev_replace procedure
 588         * is running, the target device is part of the filesystem, the
 589         * source device is not part of the filesystem anymore and its 1st
 590         * superblock is scratched out so that it is no longer marked to
 591         * belong to this filesystem.
 592         */
 593        mutex_unlock(&root->fs_info->chunk_mutex);
 594        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 595        mutex_unlock(&uuid_mutex);
 596
 597        /* write back the superblocks */
 598        trans = btrfs_start_transaction(root, 0);
 599        if (!IS_ERR(trans))
 600                btrfs_commit_transaction(trans, root);
 601
 602        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 603
 604        return 0;
 605}
 606
 607static void btrfs_dev_replace_update_device_in_mapping_tree(
 608                                                struct btrfs_fs_info *fs_info,
 609                                                struct btrfs_device *srcdev,
 610                                                struct btrfs_device *tgtdev)
 611{
 612        struct extent_map_tree *em_tree = &fs_info->mapping_tree.map_tree;
 613        struct extent_map *em;
 614        struct map_lookup *map;
 615        u64 start = 0;
 616        int i;
 617
 618        write_lock(&em_tree->lock);
 619        do {
 620                em = lookup_extent_mapping(em_tree, start, (u64)-1);
 621                if (!em)
 622                        break;
 623                map = (struct map_lookup *)em->bdev;
 624                for (i = 0; i < map->num_stripes; i++)
 625                        if (srcdev == map->stripes[i].dev)
 626                                map->stripes[i].dev = tgtdev;
 627                start = em->start + em->len;
 628                free_extent_map(em);
 629        } while (start);
 630        write_unlock(&em_tree->lock);
 631}
 632
 633static int btrfs_dev_replace_find_srcdev(struct btrfs_root *root, u64 srcdevid,
 634                                         char *srcdev_name,
 635                                         struct btrfs_device **device)
 636{
 637        int ret;
 638
 639        if (srcdevid) {
 640                ret = 0;
 641                *device = btrfs_find_device(root->fs_info, srcdevid, NULL,
 642                                            NULL);
 643                if (!*device)
 644                        ret = -ENOENT;
 645        } else {
 646                ret = btrfs_find_device_missing_or_by_path(root, srcdev_name,
 647                                                           device);
 648        }
 649        return ret;
 650}
 651
 652void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
 653                              struct btrfs_ioctl_dev_replace_args *args)
 654{
 655        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 656        struct btrfs_device *srcdev;
 657
 658        btrfs_dev_replace_lock(dev_replace);
 659        /* even if !dev_replace_is_valid, the values are good enough for
 660         * the replace_status ioctl */
 661        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 662        args->status.replace_state = dev_replace->replace_state;
 663        args->status.time_started = dev_replace->time_started;
 664        args->status.time_stopped = dev_replace->time_stopped;
 665        args->status.num_write_errors =
 666                atomic64_read(&dev_replace->num_write_errors);
 667        args->status.num_uncorrectable_read_errors =
 668                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
 669        switch (dev_replace->replace_state) {
 670        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 671        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 672                args->status.progress_1000 = 0;
 673                break;
 674        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 675                args->status.progress_1000 = 1000;
 676                break;
 677        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 678        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 679                srcdev = dev_replace->srcdev;
 680                args->status.progress_1000 = div64_u64(dev_replace->cursor_left,
 681                        div64_u64(btrfs_device_get_total_bytes(srcdev), 1000));
 682                break;
 683        }
 684        btrfs_dev_replace_unlock(dev_replace);
 685}
 686
 687int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info,
 688                             struct btrfs_ioctl_dev_replace_args *args)
 689{
 690        args->result = __btrfs_dev_replace_cancel(fs_info);
 691        return 0;
 692}
 693
 694static u64 __btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
 695{
 696        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 697        struct btrfs_device *tgt_device = NULL;
 698        struct btrfs_trans_handle *trans;
 699        struct btrfs_root *root = fs_info->tree_root;
 700        u64 result;
 701        int ret;
 702
 703        if (fs_info->sb->s_flags & MS_RDONLY)
 704                return -EROFS;
 705
 706        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 707        btrfs_dev_replace_lock(dev_replace);
 708        switch (dev_replace->replace_state) {
 709        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 710        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 711        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 712                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
 713                btrfs_dev_replace_unlock(dev_replace);
 714                goto leave;
 715        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 716        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 717                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
 718                tgt_device = dev_replace->tgtdev;
 719                dev_replace->tgtdev = NULL;
 720                dev_replace->srcdev = NULL;
 721                break;
 722        }
 723        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
 724        dev_replace->time_stopped = get_seconds();
 725        dev_replace->item_needs_writeback = 1;
 726        btrfs_dev_replace_unlock(dev_replace);
 727        btrfs_scrub_cancel(fs_info);
 728
 729        trans = btrfs_start_transaction(root, 0);
 730        if (IS_ERR(trans)) {
 731                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 732                return PTR_ERR(trans);
 733        }
 734        ret = btrfs_commit_transaction(trans, root);
 735        WARN_ON(ret);
 736        if (tgt_device)
 737                btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device);
 738
 739leave:
 740        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 741        return result;
 742}
 743
 744void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
 745{
 746        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 747
 748        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 749        btrfs_dev_replace_lock(dev_replace);
 750        switch (dev_replace->replace_state) {
 751        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 752        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 753        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 754        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 755                break;
 756        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 757                dev_replace->replace_state =
 758                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
 759                dev_replace->time_stopped = get_seconds();
 760                dev_replace->item_needs_writeback = 1;
 761                btrfs_info(fs_info, "suspending dev_replace for unmount");
 762                break;
 763        }
 764
 765        btrfs_dev_replace_unlock(dev_replace);
 766        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 767}
 768
 769/* resume dev_replace procedure that was interrupted by unmount */
 770int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
 771{
 772        struct task_struct *task;
 773        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 774
 775        btrfs_dev_replace_lock(dev_replace);
 776        switch (dev_replace->replace_state) {
 777        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 778        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 779        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 780                btrfs_dev_replace_unlock(dev_replace);
 781                return 0;
 782        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 783                break;
 784        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 785                dev_replace->replace_state =
 786                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 787                break;
 788        }
 789        if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
 790                btrfs_info(fs_info, "cannot continue dev_replace, tgtdev is missing");
 791                btrfs_info(fs_info,
 792                        "you may cancel the operation after 'mount -o degraded'");
 793                btrfs_dev_replace_unlock(dev_replace);
 794                return 0;
 795        }
 796        btrfs_dev_replace_unlock(dev_replace);
 797
 798        WARN_ON(atomic_xchg(
 799                &fs_info->mutually_exclusive_operation_running, 1));
 800        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
 801        return PTR_ERR_OR_ZERO(task);
 802}
 803
 804static int btrfs_dev_replace_kthread(void *data)
 805{
 806        struct btrfs_fs_info *fs_info = data;
 807        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 808        struct btrfs_ioctl_dev_replace_args *status_args;
 809        u64 progress;
 810
 811        status_args = kzalloc(sizeof(*status_args), GFP_NOFS);
 812        if (status_args) {
 813                btrfs_dev_replace_status(fs_info, status_args);
 814                progress = status_args->status.progress_1000;
 815                kfree(status_args);
 816                do_div(progress, 10);
 817                printk_in_rcu(KERN_INFO
 818                        "BTRFS: continuing dev_replace from %s (devid %llu) to %s @%u%%\n",
 819                        dev_replace->srcdev->missing ? "<missing disk>" :
 820                        rcu_str_deref(dev_replace->srcdev->name),
 821                        dev_replace->srcdev->devid,
 822                        dev_replace->tgtdev ?
 823                        rcu_str_deref(dev_replace->tgtdev->name) :
 824                        "<missing target disk>",
 825                        (unsigned int)progress);
 826        }
 827        btrfs_dev_replace_continue_on_mount(fs_info);
 828        atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 829
 830        return 0;
 831}
 832
 833static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info)
 834{
 835        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 836        int ret;
 837
 838        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
 839                              dev_replace->committed_cursor_left,
 840                              btrfs_device_get_total_bytes(dev_replace->srcdev),
 841                              &dev_replace->scrub_progress, 0, 1);
 842        ret = btrfs_dev_replace_finishing(fs_info, ret);
 843        WARN_ON(ret);
 844        return 0;
 845}
 846
 847int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
 848{
 849        if (!dev_replace->is_valid)
 850                return 0;
 851
 852        switch (dev_replace->replace_state) {
 853        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 854        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 855        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 856                return 0;
 857        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 858        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 859                /*
 860                 * return true even if tgtdev is missing (this is
 861                 * something that can happen if the dev_replace
 862                 * procedure is suspended by an umount and then
 863                 * the tgtdev is missing (or "btrfs dev scan") was
 864                 * not called and the the filesystem is remounted
 865                 * in degraded state. This does not stop the
 866                 * dev_replace procedure. It needs to be canceled
 867                 * manually if the cancelation is wanted.
 868                 */
 869                break;
 870        }
 871        return 1;
 872}
 873
 874void btrfs_dev_replace_lock(struct btrfs_dev_replace *dev_replace)
 875{
 876        /* the beginning is just an optimization for the typical case */
 877        if (atomic_read(&dev_replace->nesting_level) == 0) {
 878acquire_lock:
 879                /* this is not a nested case where the same thread
 880                 * is trying to acqurire the same lock twice */
 881                mutex_lock(&dev_replace->lock);
 882                mutex_lock(&dev_replace->lock_management_lock);
 883                dev_replace->lock_owner = current->pid;
 884                atomic_inc(&dev_replace->nesting_level);
 885                mutex_unlock(&dev_replace->lock_management_lock);
 886                return;
 887        }
 888
 889        mutex_lock(&dev_replace->lock_management_lock);
 890        if (atomic_read(&dev_replace->nesting_level) > 0 &&
 891            dev_replace->lock_owner == current->pid) {
 892                WARN_ON(!mutex_is_locked(&dev_replace->lock));
 893                atomic_inc(&dev_replace->nesting_level);
 894                mutex_unlock(&dev_replace->lock_management_lock);
 895                return;
 896        }
 897
 898        mutex_unlock(&dev_replace->lock_management_lock);
 899        goto acquire_lock;
 900}
 901
 902void btrfs_dev_replace_unlock(struct btrfs_dev_replace *dev_replace)
 903{
 904        WARN_ON(!mutex_is_locked(&dev_replace->lock));
 905        mutex_lock(&dev_replace->lock_management_lock);
 906        WARN_ON(atomic_read(&dev_replace->nesting_level) < 1);
 907        WARN_ON(dev_replace->lock_owner != current->pid);
 908        atomic_dec(&dev_replace->nesting_level);
 909        if (atomic_read(&dev_replace->nesting_level) == 0) {
 910                dev_replace->lock_owner = 0;
 911                mutex_unlock(&dev_replace->lock_management_lock);
 912                mutex_unlock(&dev_replace->lock);
 913        } else {
 914                mutex_unlock(&dev_replace->lock_management_lock);
 915        }
 916}
 917
 918void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
 919{
 920        percpu_counter_inc(&fs_info->bio_counter);
 921}
 922
 923void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
 924{
 925        percpu_counter_dec(&fs_info->bio_counter);
 926
 927        if (waitqueue_active(&fs_info->replace_wait))
 928                wake_up(&fs_info->replace_wait);
 929}
 930
 931void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
 932{
 933        DEFINE_WAIT(wait);
 934again:
 935        percpu_counter_inc(&fs_info->bio_counter);
 936        if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) {
 937                btrfs_bio_counter_dec(fs_info);
 938                wait_event(fs_info->replace_wait,
 939                           !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
 940                                     &fs_info->fs_state));
 941                goto again;
 942        }
 943
 944}
 945