linux/fs/btrfs/dev-replace.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) STRATO AG 2012.  All rights reserved.
   4 */
   5
   6#include <linux/sched.h>
   7#include <linux/bio.h>
   8#include <linux/slab.h>
   9#include <linux/blkdev.h>
  10#include <linux/kthread.h>
  11#include <linux/math64.h>
  12#include "misc.h"
  13#include "ctree.h"
  14#include "extent_map.h"
  15#include "disk-io.h"
  16#include "transaction.h"
  17#include "print-tree.h"
  18#include "volumes.h"
  19#include "async-thread.h"
  20#include "check-integrity.h"
  21#include "rcu-string.h"
  22#include "dev-replace.h"
  23#include "sysfs.h"
  24#include "zoned.h"
  25#include "block-group.h"
  26
  27/*
  28 * Device replace overview
  29 *
  30 * [Objective]
  31 * To copy all extents (both new and on-disk) from source device to target
  32 * device, while still keeping the filesystem read-write.
  33 *
  34 * [Method]
  35 * There are two main methods involved:
  36 *
  37 * - Write duplication
  38 *
  39 *   All new writes will be written to both target and source devices, so even
  40 *   if replace gets canceled, the source device still contains up-to-date data.
  41 *
  42 *   Location:          handle_ops_on_dev_replace() from __btrfs_map_block()
  43 *   Start:             btrfs_dev_replace_start()
  44 *   End:               btrfs_dev_replace_finishing()
  45 *   Content:           Latest data/metadata
  46 *
  47 * - Copy existing extents
  48 *
  49 *   This happens by re-using scrub facility, as scrub also iterates through
  50 *   existing extents from commit root.
  51 *
  52 *   Location:          scrub_write_block_to_dev_replace() from
  53 *                      scrub_block_complete()
  54 *   Content:           Data/meta from commit root.
  55 *
  56 * Due to the content difference, we need to avoid nocow write when dev-replace
  57 * is happening.  This is done by marking the block group read-only and waiting
  58 * for NOCOW writes.
  59 *
  60 * After replace is done, the finishing part is done by swapping the target and
  61 * source devices.
  62 *
  63 *   Location:          btrfs_dev_replace_update_device_in_mapping_tree() from
  64 *                      btrfs_dev_replace_finishing()
  65 */
  66
  67static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
  68                                       int scrub_ret);
  69static int btrfs_dev_replace_kthread(void *data);
  70
  71int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
  72{
  73        struct btrfs_key key;
  74        struct btrfs_root *dev_root = fs_info->dev_root;
  75        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
  76        struct extent_buffer *eb;
  77        int slot;
  78        int ret = 0;
  79        struct btrfs_path *path = NULL;
  80        int item_size;
  81        struct btrfs_dev_replace_item *ptr;
  82        u64 src_devid;
  83
  84        if (!dev_root)
  85                return 0;
  86
  87        path = btrfs_alloc_path();
  88        if (!path) {
  89                ret = -ENOMEM;
  90                goto out;
  91        }
  92
  93        key.objectid = 0;
  94        key.type = BTRFS_DEV_REPLACE_KEY;
  95        key.offset = 0;
  96        ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
  97        if (ret) {
  98no_valid_dev_replace_entry_found:
  99                /*
 100                 * We don't have a replace item or it's corrupted.  If there is
 101                 * a replace target, fail the mount.
 102                 */
 103                if (btrfs_find_device(fs_info->fs_devices,
 104                                      BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
 105                        btrfs_err(fs_info,
 106                        "found replace target device without a valid replace item");
 107                        ret = -EUCLEAN;
 108                        goto out;
 109                }
 110                ret = 0;
 111                dev_replace->replace_state =
 112                        BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
 113                dev_replace->cont_reading_from_srcdev_mode =
 114                    BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
 115                dev_replace->time_started = 0;
 116                dev_replace->time_stopped = 0;
 117                atomic64_set(&dev_replace->num_write_errors, 0);
 118                atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
 119                dev_replace->cursor_left = 0;
 120                dev_replace->committed_cursor_left = 0;
 121                dev_replace->cursor_left_last_write_of_item = 0;
 122                dev_replace->cursor_right = 0;
 123                dev_replace->srcdev = NULL;
 124                dev_replace->tgtdev = NULL;
 125                dev_replace->is_valid = 0;
 126                dev_replace->item_needs_writeback = 0;
 127                goto out;
 128        }
 129        slot = path->slots[0];
 130        eb = path->nodes[0];
 131        item_size = btrfs_item_size_nr(eb, slot);
 132        ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
 133
 134        if (item_size != sizeof(struct btrfs_dev_replace_item)) {
 135                btrfs_warn(fs_info,
 136                        "dev_replace entry found has unexpected size, ignore entry");
 137                goto no_valid_dev_replace_entry_found;
 138        }
 139
 140        src_devid = btrfs_dev_replace_src_devid(eb, ptr);
 141        dev_replace->cont_reading_from_srcdev_mode =
 142                btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
 143        dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
 144        dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
 145        dev_replace->time_stopped =
 146                btrfs_dev_replace_time_stopped(eb, ptr);
 147        atomic64_set(&dev_replace->num_write_errors,
 148                     btrfs_dev_replace_num_write_errors(eb, ptr));
 149        atomic64_set(&dev_replace->num_uncorrectable_read_errors,
 150                     btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
 151        dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
 152        dev_replace->committed_cursor_left = dev_replace->cursor_left;
 153        dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
 154        dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
 155        dev_replace->is_valid = 1;
 156
 157        dev_replace->item_needs_writeback = 0;
 158        switch (dev_replace->replace_state) {
 159        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 160        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 161        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 162                /*
 163                 * We don't have an active replace item but if there is a
 164                 * replace target, fail the mount.
 165                 */
 166                if (btrfs_find_device(fs_info->fs_devices,
 167                                      BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
 168                        btrfs_err(fs_info,
 169                        "replace devid present without an active replace item");
 170                        ret = -EUCLEAN;
 171                } else {
 172                        dev_replace->srcdev = NULL;
 173                        dev_replace->tgtdev = NULL;
 174                }
 175                break;
 176        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 177        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 178                dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
 179                                                src_devid, NULL, NULL);
 180                dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
 181                                                        BTRFS_DEV_REPLACE_DEVID,
 182                                                        NULL, NULL);
 183                /*
 184                 * allow 'btrfs dev replace_cancel' if src/tgt device is
 185                 * missing
 186                 */
 187                if (!dev_replace->srcdev &&
 188                    !btrfs_test_opt(fs_info, DEGRADED)) {
 189                        ret = -EIO;
 190                        btrfs_warn(fs_info,
 191                           "cannot mount because device replace operation is ongoing and");
 192                        btrfs_warn(fs_info,
 193                           "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 194                           src_devid);
 195                }
 196                if (!dev_replace->tgtdev &&
 197                    !btrfs_test_opt(fs_info, DEGRADED)) {
 198                        ret = -EIO;
 199                        btrfs_warn(fs_info,
 200                           "cannot mount because device replace operation is ongoing and");
 201                        btrfs_warn(fs_info,
 202                           "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
 203                                BTRFS_DEV_REPLACE_DEVID);
 204                }
 205                if (dev_replace->tgtdev) {
 206                        if (dev_replace->srcdev) {
 207                                dev_replace->tgtdev->total_bytes =
 208                                        dev_replace->srcdev->total_bytes;
 209                                dev_replace->tgtdev->disk_total_bytes =
 210                                        dev_replace->srcdev->disk_total_bytes;
 211                                dev_replace->tgtdev->commit_total_bytes =
 212                                        dev_replace->srcdev->commit_total_bytes;
 213                                dev_replace->tgtdev->bytes_used =
 214                                        dev_replace->srcdev->bytes_used;
 215                                dev_replace->tgtdev->commit_bytes_used =
 216                                        dev_replace->srcdev->commit_bytes_used;
 217                        }
 218                        set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
 219                                &dev_replace->tgtdev->dev_state);
 220
 221                        WARN_ON(fs_info->fs_devices->rw_devices == 0);
 222                        dev_replace->tgtdev->io_width = fs_info->sectorsize;
 223                        dev_replace->tgtdev->io_align = fs_info->sectorsize;
 224                        dev_replace->tgtdev->sector_size = fs_info->sectorsize;
 225                        dev_replace->tgtdev->fs_info = fs_info;
 226                        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
 227                                &dev_replace->tgtdev->dev_state);
 228                }
 229                break;
 230        }
 231
 232out:
 233        btrfs_free_path(path);
 234        return ret;
 235}
 236
 237/*
 238 * Initialize a new device for device replace target from a given source dev
 239 * and path.
 240 *
 241 * Return 0 and new device in @device_out, otherwise return < 0
 242 */
 243static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
 244                                  const char *device_path,
 245                                  struct btrfs_device *srcdev,
 246                                  struct btrfs_device **device_out)
 247{
 248        struct btrfs_device *device;
 249        struct block_device *bdev;
 250        struct rcu_string *name;
 251        u64 devid = BTRFS_DEV_REPLACE_DEVID;
 252        int ret = 0;
 253
 254        *device_out = NULL;
 255        if (srcdev->fs_devices->seeding) {
 256                btrfs_err(fs_info, "the filesystem is a seed filesystem!");
 257                return -EINVAL;
 258        }
 259
 260        bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
 261                                  fs_info->bdev_holder);
 262        if (IS_ERR(bdev)) {
 263                btrfs_err(fs_info, "target device %s is invalid!", device_path);
 264                return PTR_ERR(bdev);
 265        }
 266
 267        if (!btrfs_check_device_zone_type(fs_info, bdev)) {
 268                btrfs_err(fs_info,
 269                "dev-replace: zoned type of target device mismatch with filesystem");
 270                ret = -EINVAL;
 271                goto error;
 272        }
 273
 274        sync_blockdev(bdev);
 275
 276        list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
 277                if (device->bdev == bdev) {
 278                        btrfs_err(fs_info,
 279                                  "target device is in the filesystem!");
 280                        ret = -EEXIST;
 281                        goto error;
 282                }
 283        }
 284
 285
 286        if (i_size_read(bdev->bd_inode) <
 287            btrfs_device_get_total_bytes(srcdev)) {
 288                btrfs_err(fs_info,
 289                          "target device is smaller than source device!");
 290                ret = -EINVAL;
 291                goto error;
 292        }
 293
 294
 295        device = btrfs_alloc_device(NULL, &devid, NULL);
 296        if (IS_ERR(device)) {
 297                ret = PTR_ERR(device);
 298                goto error;
 299        }
 300
 301        name = rcu_string_strdup(device_path, GFP_KERNEL);
 302        if (!name) {
 303                btrfs_free_device(device);
 304                ret = -ENOMEM;
 305                goto error;
 306        }
 307        rcu_assign_pointer(device->name, name);
 308
 309        set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 310        device->generation = 0;
 311        device->io_width = fs_info->sectorsize;
 312        device->io_align = fs_info->sectorsize;
 313        device->sector_size = fs_info->sectorsize;
 314        device->total_bytes = btrfs_device_get_total_bytes(srcdev);
 315        device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
 316        device->bytes_used = btrfs_device_get_bytes_used(srcdev);
 317        device->commit_total_bytes = srcdev->commit_total_bytes;
 318        device->commit_bytes_used = device->bytes_used;
 319        device->fs_info = fs_info;
 320        device->bdev = bdev;
 321        set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 322        set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
 323        device->mode = FMODE_EXCL;
 324        device->dev_stats_valid = 1;
 325        set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
 326        device->fs_devices = fs_info->fs_devices;
 327
 328        ret = btrfs_get_dev_zone_info(device);
 329        if (ret)
 330                goto error;
 331
 332        mutex_lock(&fs_info->fs_devices->device_list_mutex);
 333        list_add(&device->dev_list, &fs_info->fs_devices->devices);
 334        fs_info->fs_devices->num_devices++;
 335        fs_info->fs_devices->open_devices++;
 336        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 337
 338        *device_out = device;
 339        return 0;
 340
 341error:
 342        blkdev_put(bdev, FMODE_EXCL);
 343        return ret;
 344}
 345
 346/*
 347 * called from commit_transaction. Writes changed device replace state to
 348 * disk.
 349 */
 350int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
 351{
 352        struct btrfs_fs_info *fs_info = trans->fs_info;
 353        int ret;
 354        struct btrfs_root *dev_root = fs_info->dev_root;
 355        struct btrfs_path *path;
 356        struct btrfs_key key;
 357        struct extent_buffer *eb;
 358        struct btrfs_dev_replace_item *ptr;
 359        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 360
 361        down_read(&dev_replace->rwsem);
 362        if (!dev_replace->is_valid ||
 363            !dev_replace->item_needs_writeback) {
 364                up_read(&dev_replace->rwsem);
 365                return 0;
 366        }
 367        up_read(&dev_replace->rwsem);
 368
 369        key.objectid = 0;
 370        key.type = BTRFS_DEV_REPLACE_KEY;
 371        key.offset = 0;
 372
 373        path = btrfs_alloc_path();
 374        if (!path) {
 375                ret = -ENOMEM;
 376                goto out;
 377        }
 378        ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
 379        if (ret < 0) {
 380                btrfs_warn(fs_info,
 381                           "error %d while searching for dev_replace item!",
 382                           ret);
 383                goto out;
 384        }
 385
 386        if (ret == 0 &&
 387            btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
 388                /*
 389                 * need to delete old one and insert a new one.
 390                 * Since no attempt is made to recover any old state, if the
 391                 * dev_replace state is 'running', the data on the target
 392                 * drive is lost.
 393                 * It would be possible to recover the state: just make sure
 394                 * that the beginning of the item is never changed and always
 395                 * contains all the essential information. Then read this
 396                 * minimal set of information and use it as a base for the
 397                 * new state.
 398                 */
 399                ret = btrfs_del_item(trans, dev_root, path);
 400                if (ret != 0) {
 401                        btrfs_warn(fs_info,
 402                                   "delete too small dev_replace item failed %d!",
 403                                   ret);
 404                        goto out;
 405                }
 406                ret = 1;
 407        }
 408
 409        if (ret == 1) {
 410                /* need to insert a new item */
 411                btrfs_release_path(path);
 412                ret = btrfs_insert_empty_item(trans, dev_root, path,
 413                                              &key, sizeof(*ptr));
 414                if (ret < 0) {
 415                        btrfs_warn(fs_info,
 416                                   "insert dev_replace item failed %d!", ret);
 417                        goto out;
 418                }
 419        }
 420
 421        eb = path->nodes[0];
 422        ptr = btrfs_item_ptr(eb, path->slots[0],
 423                             struct btrfs_dev_replace_item);
 424
 425        down_write(&dev_replace->rwsem);
 426        if (dev_replace->srcdev)
 427                btrfs_set_dev_replace_src_devid(eb, ptr,
 428                        dev_replace->srcdev->devid);
 429        else
 430                btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
 431        btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
 432                dev_replace->cont_reading_from_srcdev_mode);
 433        btrfs_set_dev_replace_replace_state(eb, ptr,
 434                dev_replace->replace_state);
 435        btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
 436        btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
 437        btrfs_set_dev_replace_num_write_errors(eb, ptr,
 438                atomic64_read(&dev_replace->num_write_errors));
 439        btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
 440                atomic64_read(&dev_replace->num_uncorrectable_read_errors));
 441        dev_replace->cursor_left_last_write_of_item =
 442                dev_replace->cursor_left;
 443        btrfs_set_dev_replace_cursor_left(eb, ptr,
 444                dev_replace->cursor_left_last_write_of_item);
 445        btrfs_set_dev_replace_cursor_right(eb, ptr,
 446                dev_replace->cursor_right);
 447        dev_replace->item_needs_writeback = 0;
 448        up_write(&dev_replace->rwsem);
 449
 450        btrfs_mark_buffer_dirty(eb);
 451
 452out:
 453        btrfs_free_path(path);
 454
 455        return ret;
 456}
 457
 458static char* btrfs_dev_name(struct btrfs_device *device)
 459{
 460        if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
 461                return "<missing disk>";
 462        else
 463                return rcu_str_deref(device->name);
 464}
 465
 466static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
 467                                    struct btrfs_device *src_dev)
 468{
 469        struct btrfs_path *path;
 470        struct btrfs_key key;
 471        struct btrfs_key found_key;
 472        struct btrfs_root *root = fs_info->dev_root;
 473        struct btrfs_dev_extent *dev_extent = NULL;
 474        struct btrfs_block_group *cache;
 475        struct btrfs_trans_handle *trans;
 476        int ret = 0;
 477        u64 chunk_offset;
 478
 479        /* Do not use "to_copy" on non zoned filesystem for now */
 480        if (!btrfs_is_zoned(fs_info))
 481                return 0;
 482
 483        mutex_lock(&fs_info->chunk_mutex);
 484
 485        /* Ensure we don't have pending new block group */
 486        spin_lock(&fs_info->trans_lock);
 487        while (fs_info->running_transaction &&
 488               !list_empty(&fs_info->running_transaction->dev_update_list)) {
 489                spin_unlock(&fs_info->trans_lock);
 490                mutex_unlock(&fs_info->chunk_mutex);
 491                trans = btrfs_attach_transaction(root);
 492                if (IS_ERR(trans)) {
 493                        ret = PTR_ERR(trans);
 494                        mutex_lock(&fs_info->chunk_mutex);
 495                        if (ret == -ENOENT) {
 496                                spin_lock(&fs_info->trans_lock);
 497                                continue;
 498                        } else {
 499                                goto unlock;
 500                        }
 501                }
 502
 503                ret = btrfs_commit_transaction(trans);
 504                mutex_lock(&fs_info->chunk_mutex);
 505                if (ret)
 506                        goto unlock;
 507
 508                spin_lock(&fs_info->trans_lock);
 509        }
 510        spin_unlock(&fs_info->trans_lock);
 511
 512        path = btrfs_alloc_path();
 513        if (!path) {
 514                ret = -ENOMEM;
 515                goto unlock;
 516        }
 517
 518        path->reada = READA_FORWARD;
 519        path->search_commit_root = 1;
 520        path->skip_locking = 1;
 521
 522        key.objectid = src_dev->devid;
 523        key.type = BTRFS_DEV_EXTENT_KEY;
 524        key.offset = 0;
 525
 526        ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 527        if (ret < 0)
 528                goto free_path;
 529        if (ret > 0) {
 530                if (path->slots[0] >=
 531                    btrfs_header_nritems(path->nodes[0])) {
 532                        ret = btrfs_next_leaf(root, path);
 533                        if (ret < 0)
 534                                goto free_path;
 535                        if (ret > 0) {
 536                                ret = 0;
 537                                goto free_path;
 538                        }
 539                } else {
 540                        ret = 0;
 541                }
 542        }
 543
 544        while (1) {
 545                struct extent_buffer *leaf = path->nodes[0];
 546                int slot = path->slots[0];
 547
 548                btrfs_item_key_to_cpu(leaf, &found_key, slot);
 549
 550                if (found_key.objectid != src_dev->devid)
 551                        break;
 552
 553                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
 554                        break;
 555
 556                if (found_key.offset < key.offset)
 557                        break;
 558
 559                dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
 560
 561                chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
 562
 563                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
 564                if (!cache)
 565                        goto skip;
 566
 567                spin_lock(&cache->lock);
 568                cache->to_copy = 1;
 569                spin_unlock(&cache->lock);
 570
 571                btrfs_put_block_group(cache);
 572
 573skip:
 574                ret = btrfs_next_item(root, path);
 575                if (ret != 0) {
 576                        if (ret > 0)
 577                                ret = 0;
 578                        break;
 579                }
 580        }
 581
 582free_path:
 583        btrfs_free_path(path);
 584unlock:
 585        mutex_unlock(&fs_info->chunk_mutex);
 586
 587        return ret;
 588}
 589
 590bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 591                                      struct btrfs_block_group *cache,
 592                                      u64 physical)
 593{
 594        struct btrfs_fs_info *fs_info = cache->fs_info;
 595        struct extent_map *em;
 596        struct map_lookup *map;
 597        u64 chunk_offset = cache->start;
 598        int num_extents, cur_extent;
 599        int i;
 600
 601        /* Do not use "to_copy" on non zoned filesystem for now */
 602        if (!btrfs_is_zoned(fs_info))
 603                return true;
 604
 605        spin_lock(&cache->lock);
 606        if (cache->removed) {
 607                spin_unlock(&cache->lock);
 608                return true;
 609        }
 610        spin_unlock(&cache->lock);
 611
 612        em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
 613        ASSERT(!IS_ERR(em));
 614        map = em->map_lookup;
 615
 616        num_extents = cur_extent = 0;
 617        for (i = 0; i < map->num_stripes; i++) {
 618                /* We have more device extent to copy */
 619                if (srcdev != map->stripes[i].dev)
 620                        continue;
 621
 622                num_extents++;
 623                if (physical == map->stripes[i].physical)
 624                        cur_extent = i;
 625        }
 626
 627        free_extent_map(em);
 628
 629        if (num_extents > 1 && cur_extent < num_extents - 1) {
 630                /*
 631                 * Has more stripes on this device. Keep this block group
 632                 * readonly until we finish all the stripes.
 633                 */
 634                return false;
 635        }
 636
 637        /* Last stripe on this device */
 638        spin_lock(&cache->lock);
 639        cache->to_copy = 0;
 640        spin_unlock(&cache->lock);
 641
 642        return true;
 643}
 644
 645static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
 646                const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
 647                int read_src)
 648{
 649        struct btrfs_root *root = fs_info->dev_root;
 650        struct btrfs_trans_handle *trans;
 651        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 652        int ret;
 653        struct btrfs_device *tgt_device = NULL;
 654        struct btrfs_device *src_device = NULL;
 655
 656        src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
 657                                                  srcdev_name);
 658        if (IS_ERR(src_device))
 659                return PTR_ERR(src_device);
 660
 661        if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
 662                btrfs_warn_in_rcu(fs_info,
 663          "cannot replace device %s (devid %llu) due to active swapfile",
 664                        btrfs_dev_name(src_device), src_device->devid);
 665                return -ETXTBSY;
 666        }
 667
 668        /*
 669         * Here we commit the transaction to make sure commit_total_bytes
 670         * of all the devices are updated.
 671         */
 672        trans = btrfs_attach_transaction(root);
 673        if (!IS_ERR(trans)) {
 674                ret = btrfs_commit_transaction(trans);
 675                if (ret)
 676                        return ret;
 677        } else if (PTR_ERR(trans) != -ENOENT) {
 678                return PTR_ERR(trans);
 679        }
 680
 681        ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
 682                                            src_device, &tgt_device);
 683        if (ret)
 684                return ret;
 685
 686        ret = mark_block_group_to_copy(fs_info, src_device);
 687        if (ret)
 688                return ret;
 689
 690        down_write(&dev_replace->rwsem);
 691        switch (dev_replace->replace_state) {
 692        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
 693        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
 694        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
 695                break;
 696        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
 697        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
 698                ASSERT(0);
 699                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
 700                up_write(&dev_replace->rwsem);
 701                goto leave;
 702        }
 703
 704        dev_replace->cont_reading_from_srcdev_mode = read_src;
 705        dev_replace->srcdev = src_device;
 706        dev_replace->tgtdev = tgt_device;
 707
 708        btrfs_info_in_rcu(fs_info,
 709                      "dev_replace from %s (devid %llu) to %s started",
 710                      btrfs_dev_name(src_device),
 711                      src_device->devid,
 712                      rcu_str_deref(tgt_device->name));
 713
 714        /*
 715         * from now on, the writes to the srcdev are all duplicated to
 716         * go to the tgtdev as well (refer to btrfs_map_block()).
 717         */
 718        dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
 719        dev_replace->time_started = ktime_get_real_seconds();
 720        dev_replace->cursor_left = 0;
 721        dev_replace->committed_cursor_left = 0;
 722        dev_replace->cursor_left_last_write_of_item = 0;
 723        dev_replace->cursor_right = 0;
 724        dev_replace->is_valid = 1;
 725        dev_replace->item_needs_writeback = 1;
 726        atomic64_set(&dev_replace->num_write_errors, 0);
 727        atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
 728        up_write(&dev_replace->rwsem);
 729
 730        ret = btrfs_sysfs_add_device(tgt_device);
 731        if (ret)
 732                btrfs_err(fs_info, "kobj add dev failed %d", ret);
 733
 734        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 735
 736        /* Commit dev_replace state and reserve 1 item for it. */
 737        trans = btrfs_start_transaction(root, 1);
 738        if (IS_ERR(trans)) {
 739                ret = PTR_ERR(trans);
 740                down_write(&dev_replace->rwsem);
 741                dev_replace->replace_state =
 742                        BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
 743                dev_replace->srcdev = NULL;
 744                dev_replace->tgtdev = NULL;
 745                up_write(&dev_replace->rwsem);
 746                goto leave;
 747        }
 748
 749        ret = btrfs_commit_transaction(trans);
 750        WARN_ON(ret);
 751
 752        /* the disk copy procedure reuses the scrub code */
 753        ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
 754                              btrfs_device_get_total_bytes(src_device),
 755                              &dev_replace->scrub_progress, 0, 1);
 756
 757        ret = btrfs_dev_replace_finishing(fs_info, ret);
 758        if (ret == -EINPROGRESS)
 759                ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
 760
 761        return ret;
 762
 763leave:
 764        btrfs_destroy_dev_replace_tgtdev(tgt_device);
 765        return ret;
 766}
 767
 768int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
 769                            struct btrfs_ioctl_dev_replace_args *args)
 770{
 771        int ret;
 772
 773        switch (args->start.cont_reading_from_srcdev_mode) {
 774        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
 775        case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
 776                break;
 777        default:
 778                return -EINVAL;
 779        }
 780
 781        if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
 782            args->start.tgtdev_name[0] == '\0')
 783                return -EINVAL;
 784
 785        ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
 786                                        args->start.srcdevid,
 787                                        args->start.srcdev_name,
 788                                        args->start.cont_reading_from_srcdev_mode);
 789        args->result = ret;
 790        /* don't warn if EINPROGRESS, someone else might be running scrub */
 791        if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
 792            ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
 793                return 0;
 794
 795        return ret;
 796}
 797
 798/*
 799 * blocked until all in-flight bios operations are finished.
 800 */
 801static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
 802{
 803        set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 804        wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
 805                   &fs_info->dev_replace.bio_counter));
 806}
 807
 808/*
 809 * we have removed target device, it is safe to allow new bios request.
 810 */
 811static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
 812{
 813        clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
 814        wake_up(&fs_info->dev_replace.replace_wait);
 815}
 816
 817/*
 818 * When finishing the device replace, before swapping the source device with the
 819 * target device we must update the chunk allocation state in the target device,
 820 * as it is empty because replace works by directly copying the chunks and not
 821 * through the normal chunk allocation path.
 822 */
 823static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
 824                                        struct btrfs_device *tgtdev)
 825{
 826        struct extent_state *cached_state = NULL;
 827        u64 start = 0;
 828        u64 found_start;
 829        u64 found_end;
 830        int ret = 0;
 831
 832        lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
 833
 834        while (!find_first_extent_bit(&srcdev->alloc_state, start,
 835                                      &found_start, &found_end,
 836                                      CHUNK_ALLOCATED, &cached_state)) {
 837                ret = set_extent_bits(&tgtdev->alloc_state, found_start,
 838                                      found_end, CHUNK_ALLOCATED);
 839                if (ret)
 840                        break;
 841                start = found_end + 1;
 842        }
 843
 844        free_extent_state(cached_state);
 845        return ret;
 846}
 847
 848static void btrfs_dev_replace_update_device_in_mapping_tree(
 849                                                struct btrfs_fs_info *fs_info,
 850                                                struct btrfs_device *srcdev,
 851                                                struct btrfs_device *tgtdev)
 852{
 853        struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 854        struct extent_map *em;
 855        struct map_lookup *map;
 856        u64 start = 0;
 857        int i;
 858
 859        write_lock(&em_tree->lock);
 860        do {
 861                em = lookup_extent_mapping(em_tree, start, (u64)-1);
 862                if (!em)
 863                        break;
 864                map = em->map_lookup;
 865                for (i = 0; i < map->num_stripes; i++)
 866                        if (srcdev == map->stripes[i].dev)
 867                                map->stripes[i].dev = tgtdev;
 868                start = em->start + em->len;
 869                free_extent_map(em);
 870        } while (start);
 871        write_unlock(&em_tree->lock);
 872}
 873
 874static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 875                                       int scrub_ret)
 876{
            /*
             * Conclude a replace after the copy phase returned @scrub_ret.
             *
             * If the replace state is not STARTED, nothing is done and 0 is
             * returned.  With scrub_ret == 0 the target device takes over the
             * source's devid, uuid, size accounting and mapping-tree stripes,
             * the source is handed to btrfs_rm_dev_replace_remove_srcdev() and
             * later btrfs_rm_dev_replace_free_srcdev(), and 0 is returned.
             * With a non-zero scrub_ret, or if copying the allocation state
             * fails, the target device is destroyed and that error is returned.
             * A failure to start delalloc or to start a transaction in the
             * commit loop is returned directly.
             */
 877        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 878        struct btrfs_device *tgt_device;
 879        struct btrfs_device *src_device;
 880        struct btrfs_root *root = fs_info->tree_root;
 881        u8 uuid_tmp[BTRFS_UUID_SIZE];
 882        struct btrfs_trans_handle *trans;
 883        int ret = 0;
 884
 885        /* don't allow cancel or unmount to disturb the finishing procedure */
 886        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
 887
 888        down_read(&dev_replace->rwsem);
 889        /* was the operation canceled, or is it finished? */
 890        if (dev_replace->replace_state !=
 891            BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
 892                up_read(&dev_replace->rwsem);
 893                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 894                return 0;
 895        }
 896
 897        tgt_device = dev_replace->tgtdev;
 898        src_device = dev_replace->srcdev;
 899        up_read(&dev_replace->rwsem);
 900
 901        /*
 902         * flush all outstanding I/O and inode extent mappings before the
 903         * copy operation is declared as being finished
 904         */
 905        ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
 906        if (ret) {
 907                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 908                return ret;
 909        }
 910        btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 911
            /*
             * Only done when the copy succeeded; the failure exits below call
             * btrfs_reada_undo_remove_dev() regardless of scrub_ret.
             */
 912        if (!scrub_ret)
 913                btrfs_reada_remove_dev(src_device);
 914
 915        /*
 916         * We have to use this loop approach because at this point src_device
 917         * has to be available for transaction commit to complete, yet new
 918         * chunks shouldn't be allocated on the device.
 919         */
 920        while (1) {
 921                trans = btrfs_start_transaction(root, 0);
 922                if (IS_ERR(trans)) {
 923                        btrfs_reada_undo_remove_dev(src_device);
 924                        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 925                        return PTR_ERR(trans);
 926                }
 927                ret = btrfs_commit_transaction(trans);
 928                WARN_ON(ret);
 929
 930                /* Prevent write_all_supers() during the finishing procedure */
 931                mutex_lock(&fs_info->fs_devices->device_list_mutex);
 932                /* Prevent new chunks being allocated on the source device */
 933                mutex_lock(&fs_info->chunk_mutex);
 934
                    /*
                     * The source device still has entries on its
                     * post_commit_list: drop both mutexes and go around for
                     * another commit.  Otherwise leave the loop with both
                     * mutexes held.
                     */
 935                if (!list_empty(&src_device->post_commit_list)) {
 936                        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 937                        mutex_unlock(&fs_info->chunk_mutex);
 938                } else {
 939                        break;
 940                }
 941        }
 942
            /*
             * Record the outcome: CANCELED for any non-zero scrub_ret, FINISHED
             * otherwise.  NOTE(review): this is decided before the allocation
             * state copy below, so a failure there still leaves FINISHED
             * recorded.
             */
 943        down_write(&dev_replace->rwsem);
 944        dev_replace->replace_state =
 945                scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
 946                          : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
 947        dev_replace->tgtdev = NULL;
 948        dev_replace->srcdev = NULL;
 949        dev_replace->time_stopped = ktime_get_real_seconds();
 950        dev_replace->item_needs_writeback = 1;
 951
 952        /*
 953         * Update allocation state in the new device and replace the old device
 954         * with the new one in the mapping tree.
 955         */
 956        if (!scrub_ret) {
 957                scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
 958                if (scrub_ret)
 959                        goto error;
 960                btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
 961                                                                src_device,
 962                                                                tgt_device);
 963        } else {
 964                if (scrub_ret != -ECANCELED)
 965                        btrfs_err_in_rcu(fs_info,
 966                                 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
 967                                 btrfs_dev_name(src_device),
 968                                 src_device->devid,
 969                                 rcu_str_deref(tgt_device->name), scrub_ret);
 970error:
                    /*
                     * Shared failure exit, also entered by the goto in the
                     * success branch above.  The three locks are released in
                     * reverse order of acquisition, then the target device is
                     * destroyed while bios are blocked.
                     */
 971                up_write(&dev_replace->rwsem);
 972                mutex_unlock(&fs_info->chunk_mutex);
 973                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 974                btrfs_reada_undo_remove_dev(src_device);
 975                btrfs_rm_dev_replace_blocked(fs_info);
 976                if (tgt_device)
 977                        btrfs_destroy_dev_replace_tgtdev(tgt_device);
 978                btrfs_rm_dev_replace_unblocked(fs_info);
 979                mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 980
 981                return scrub_ret;
 982        }
 983
 984        btrfs_info_in_rcu(fs_info,
 985                          "dev_replace from %s (devid %llu) to %s finished",
 986                          btrfs_dev_name(src_device),
 987                          src_device->devid,
 988                          rcu_str_deref(tgt_device->name));
            /*
             * The target assumes the source's identity: devid, uuid and size
             * accounting.  The source is given BTRFS_DEV_REPLACE_DEVID and the
             * target's former uuid.
             */
 989        clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
 990        tgt_device->devid = src_device->devid;
 991        src_device->devid = BTRFS_DEV_REPLACE_DEVID;
 992        memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
 993        memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
 994        memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
 995        btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
 996        btrfs_device_set_disk_total_bytes(tgt_device,
 997                                          src_device->disk_total_bytes);
 998        btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
 999        tgt_device->commit_bytes_used = src_device->bytes_used;
1000
1001        btrfs_assign_next_active_device(src_device, tgt_device);
1002
            /* Put the target on the alloc_list and count it as an rw device. */
1003        list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
1004        fs_info->fs_devices->rw_devices++;
1005
1006        up_write(&dev_replace->rwsem);
1007        btrfs_rm_dev_replace_blocked(fs_info);
1008
1009        btrfs_rm_dev_replace_remove_srcdev(src_device);
1010
1011        btrfs_rm_dev_replace_unblocked(fs_info);
1012
1013        /*
1014         * Increment dev_stats_ccnt so that btrfs_run_dev_stats() will
1015         * update on-disk dev stats value during commit transaction
1016         */
1017        atomic_inc(&tgt_device->dev_stats_ccnt);
1018
1019        /*
1020         * this is again a consistent state where no dev_replace procedure
1021         * is running, the target device is part of the filesystem, the
1022         * source device is not part of the filesystem anymore and its 1st
1023         * superblock is scratched out so that it is no longer marked to
1024         * belong to this filesystem.
1025         */
1026        mutex_unlock(&fs_info->chunk_mutex);
1027        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1028
1029        /* replace the sysfs entry */
1030        btrfs_sysfs_remove_device(src_device);
1031        btrfs_sysfs_update_devid(tgt_device);
1032        if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
1033                btrfs_scratch_superblocks(fs_info, src_device->bdev,
1034                                          src_device->name->str);
1035
1036        /* write back the superblocks */
            /*
             * Best effort: a failure to start this transaction is ignored and
             * the commit's return value is not checked.
             */
1037        trans = btrfs_start_transaction(root, 0);
1038        if (!IS_ERR(trans))
1039                btrfs_commit_transaction(trans);
1040
1041        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1042
1043        btrfs_rm_dev_replace_free_srcdev(src_device);
1044
1045        return 0;
1046}
1047
1048/*
1049 * Read progress of device replace status according to the state and last
1050 * stored position. The value format is the same as for
1051 * btrfs_dev_replace::progress_1000
1052 */
1053static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
1054{
1055        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1056        u64 ret = 0;
1057
1058        switch (dev_replace->replace_state) {
1059        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1060        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1061                ret = 0;
1062                break;
1063        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1064                ret = 1000;
1065                break;
1066        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1067        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1068                ret = div64_u64(dev_replace->cursor_left,
1069                                div_u64(btrfs_device_get_total_bytes(
1070                                                dev_replace->srcdev), 1000));
1071                break;
1072        }
1073
1074        return ret;
1075}
1076
1077void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
1078                              struct btrfs_ioctl_dev_replace_args *args)
1079{
1080        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1081
1082        down_read(&dev_replace->rwsem);
1083        /* even if !dev_replace_is_valid, the values are good enough for
1084         * the replace_status ioctl */
1085        args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
1086        args->status.replace_state = dev_replace->replace_state;
1087        args->status.time_started = dev_replace->time_started;
1088        args->status.time_stopped = dev_replace->time_stopped;
1089        args->status.num_write_errors =
1090                atomic64_read(&dev_replace->num_write_errors);
1091        args->status.num_uncorrectable_read_errors =
1092                atomic64_read(&dev_replace->num_uncorrectable_read_errors);
1093        args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
1094        up_read(&dev_replace->rwsem);
1095}
1096
1097int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
1098{
            /*
             * Cancel a running or suspended replace.
             *
             * Returns -EROFS if the superblock is read-only, PTR_ERR() of a
             * failed transaction start in the SUSPENDED case, -EINVAL for an
             * unknown replace state, and otherwise one of the
             * BTRFS_IOCTL_DEV_REPLACE_RESULT_* codes NOT_STARTED or NO_ERROR.
             */
1099        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1100        struct btrfs_device *tgt_device = NULL;
1101        struct btrfs_device *src_device = NULL;
1102        struct btrfs_trans_handle *trans;
1103        struct btrfs_root *root = fs_info->tree_root;
1104        int result;
1105        int ret;
1106
1107        if (sb_rdonly(fs_info->sb))
1108                return -EROFS;
1109
            /*
             * The same mutex is taken by btrfs_dev_replace_finishing() and
             * btrfs_dev_replace_suspend_for_unmount().  Every switch arm below
             * drops the rwsem itself; the mutex is released at the end or on
             * the early return.
             */
1110        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
1111        down_write(&dev_replace->rwsem);
1112        switch (dev_replace->replace_state) {
1113        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1114        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1115        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
                    /* Nothing is running, so there is nothing to cancel. */
1116                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
1117                up_write(&dev_replace->rwsem);
1118                break;
1119        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
                    /*
                     * The device pointers are sampled only for the log message;
                     * the replace state is left untouched in this arm.
                     */
1120                tgt_device = dev_replace->tgtdev;
1121                src_device = dev_replace->srcdev;
1122                up_write(&dev_replace->rwsem);
1123                ret = btrfs_scrub_cancel(fs_info);
1124                if (ret < 0) {
1125                        result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
1126                } else {
1127                        result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
1128                        /*
1129                         * btrfs_dev_replace_finishing() will handle the
1130                         * cleanup part
1131                         */
1132                        btrfs_info_in_rcu(fs_info,
1133                                "dev_replace from %s (devid %llu) to %s canceled",
1134                                btrfs_dev_name(src_device), src_device->devid,
1135                                btrfs_dev_name(tgt_device));
1136                }
1137                break;
1138        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1139                /*
1140                 * Scrub doing the replace isn't running so we need to do the
1141                 * cleanup step of btrfs_dev_replace_finishing() here
1142                 */
1143                result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
1144                tgt_device = dev_replace->tgtdev;
1145                src_device = dev_replace->srcdev;
1146                dev_replace->tgtdev = NULL;
1147                dev_replace->srcdev = NULL;
1148                dev_replace->replace_state =
1149                                BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
1150                dev_replace->time_stopped = ktime_get_real_seconds();
1151                dev_replace->item_needs_writeback = 1;
1152
1153                up_write(&dev_replace->rwsem);
1154
1155                /* Scrub for replace must not be running in suspended state */
1156                ret = btrfs_scrub_cancel(fs_info);
1157                ASSERT(ret != -ENOTCONN);
1158
                    /*
                     * Commit a transaction now that item_needs_writeback is
                     * set; presumably this is what gets the CANCELED state
                     * written out (confirm against the commit path).
                     * NOTE(review): on failure this returns without destroying
                     * tgt_device, although the state was already switched to
                     * CANCELED above.
                     */
1159                trans = btrfs_start_transaction(root, 0);
1160                if (IS_ERR(trans)) {
1161                        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1162                        return PTR_ERR(trans);
1163                }
1164                ret = btrfs_commit_transaction(trans);
1165                WARN_ON(ret);
1166
1167                btrfs_info_in_rcu(fs_info,
1168                "suspended dev_replace from %s (devid %llu) to %s canceled",
1169                        btrfs_dev_name(src_device), src_device->devid,
1170                        btrfs_dev_name(tgt_device));
1171
1172                if (tgt_device)
1173                        btrfs_destroy_dev_replace_tgtdev(tgt_device);
1174                break;
1175        default:
1176                up_write(&dev_replace->rwsem);
1177                result = -EINVAL;
1178        }
1179
1180        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1181        return result;
1182}
1183
1184void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
1185{
1186        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1187
1188        mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
1189        down_write(&dev_replace->rwsem);
1190
1191        switch (dev_replace->replace_state) {
1192        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1193        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1194        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1195        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1196                break;
1197        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1198                dev_replace->replace_state =
1199                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1200                dev_replace->time_stopped = ktime_get_real_seconds();
1201                dev_replace->item_needs_writeback = 1;
1202                btrfs_info(fs_info, "suspending dev_replace for unmount");
1203                break;
1204        }
1205
1206        up_write(&dev_replace->rwsem);
1207        mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1208}
1209
1210/* resume dev_replace procedure that was interrupted by unmount */
1211int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
1212{
1213        struct task_struct *task;
1214        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1215
1216        down_write(&dev_replace->rwsem);
1217
1218        switch (dev_replace->replace_state) {
1219        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1220        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1221        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1222                up_write(&dev_replace->rwsem);
1223                return 0;
1224        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1225                break;
1226        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1227                dev_replace->replace_state =
1228                        BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
1229                break;
1230        }
1231        if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
1232                btrfs_info(fs_info,
1233                           "cannot continue dev_replace, tgtdev is missing");
1234                btrfs_info(fs_info,
1235                           "you may cancel the operation after 'mount -o degraded'");
1236                dev_replace->replace_state =
1237                                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1238                up_write(&dev_replace->rwsem);
1239                return 0;
1240        }
1241        up_write(&dev_replace->rwsem);
1242
1243        /*
1244         * This could collide with a paused balance, but the exclusive op logic
1245         * should never allow both to start and pause. We don't want to allow
1246         * dev-replace to start anyway.
1247         */
1248        if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
1249                down_write(&dev_replace->rwsem);
1250                dev_replace->replace_state =
1251                                        BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1252                up_write(&dev_replace->rwsem);
1253                btrfs_info(fs_info,
1254                "cannot resume dev-replace, other exclusive operation running");
1255                return 0;
1256        }
1257
1258        task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
1259        return PTR_ERR_OR_ZERO(task);
1260}
1261
1262static int btrfs_dev_replace_kthread(void *data)
1263{
1264        struct btrfs_fs_info *fs_info = data;
1265        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1266        u64 progress;
1267        int ret;
1268
1269        progress = btrfs_dev_replace_progress(fs_info);
1270        progress = div_u64(progress, 10);
1271        btrfs_info_in_rcu(fs_info,
1272                "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
1273                btrfs_dev_name(dev_replace->srcdev),
1274                dev_replace->srcdev->devid,
1275                btrfs_dev_name(dev_replace->tgtdev),
1276                (unsigned int)progress);
1277
1278        ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
1279                              dev_replace->committed_cursor_left,
1280                              btrfs_device_get_total_bytes(dev_replace->srcdev),
1281                              &dev_replace->scrub_progress, 0, 1);
1282        ret = btrfs_dev_replace_finishing(fs_info, ret);
1283        WARN_ON(ret && ret != -ECANCELED);
1284
1285        btrfs_exclop_finish(fs_info);
1286        return 0;
1287}
1288
1289int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
1290{
1291        if (!dev_replace->is_valid)
1292                return 0;
1293
1294        switch (dev_replace->replace_state) {
1295        case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1296        case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1297        case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1298                return 0;
1299        case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1300        case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1301                /*
1302                 * return true even if tgtdev is missing (this is
1303                 * something that can happen if the dev_replace
1304                 * procedure is suspended by an umount and then
1305                 * the tgtdev is missing (or "btrfs dev scan") was
1306                 * not called and the filesystem is remounted
1307                 * in degraded state. This does not stop the
1308                 * dev_replace procedure. It needs to be canceled
1309                 * manually if the cancellation is wanted.
1310                 */
1311                break;
1312        }
1313        return 1;
1314}
1315
1316void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
1317{
1318        percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1319}
1320
1321void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
1322{
1323        percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
1324        cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
1325}
1326
1327void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
1328{
1329        while (1) {
1330                percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1331                if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1332                                     &fs_info->fs_state)))
1333                        break;
1334
1335                btrfs_bio_counter_dec(fs_info);
1336                wait_event(fs_info->dev_replace.replace_wait,
1337                           !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1338                                     &fs_info->fs_state));
1339        }
1340}
1341