1
2
3
4
5
6#include <linux/sched.h>
7#include <linux/bio.h>
8#include <linux/slab.h>
9#include <linux/blkdev.h>
10#include <linux/kthread.h>
11#include <linux/math64.h>
12#include "misc.h"
13#include "ctree.h"
14#include "extent_map.h"
15#include "disk-io.h"
16#include "transaction.h"
17#include "print-tree.h"
18#include "volumes.h"
19#include "async-thread.h"
20#include "check-integrity.h"
21#include "rcu-string.h"
22#include "dev-replace.h"
23#include "sysfs.h"
24#include "zoned.h"
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
/*
 * Forward declarations: both helpers are defined further down in this file
 * but are referenced before their definitions.
 */
66static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
67 int scrub_ret);
68static int btrfs_dev_replace_kthread(void *data);
69
70int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
71{
72 struct btrfs_key key;
73 struct btrfs_root *dev_root = fs_info->dev_root;
74 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
75 struct extent_buffer *eb;
76 int slot;
77 int ret = 0;
78 struct btrfs_path *path = NULL;
79 int item_size;
80 struct btrfs_dev_replace_item *ptr;
81 u64 src_devid;
82
83 path = btrfs_alloc_path();
84 if (!path) {
85 ret = -ENOMEM;
86 goto out;
87 }
88
89 key.objectid = 0;
90 key.type = BTRFS_DEV_REPLACE_KEY;
91 key.offset = 0;
92 ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
93 if (ret) {
94no_valid_dev_replace_entry_found:
95
96
97
98
99 if (btrfs_find_device(fs_info->fs_devices,
100 BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
101 btrfs_err(fs_info,
102 "found replace target device without a valid replace item");
103 ret = -EUCLEAN;
104 goto out;
105 }
106 ret = 0;
107 dev_replace->replace_state =
108 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
109 dev_replace->cont_reading_from_srcdev_mode =
110 BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS;
111 dev_replace->time_started = 0;
112 dev_replace->time_stopped = 0;
113 atomic64_set(&dev_replace->num_write_errors, 0);
114 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
115 dev_replace->cursor_left = 0;
116 dev_replace->committed_cursor_left = 0;
117 dev_replace->cursor_left_last_write_of_item = 0;
118 dev_replace->cursor_right = 0;
119 dev_replace->srcdev = NULL;
120 dev_replace->tgtdev = NULL;
121 dev_replace->is_valid = 0;
122 dev_replace->item_needs_writeback = 0;
123 goto out;
124 }
125 slot = path->slots[0];
126 eb = path->nodes[0];
127 item_size = btrfs_item_size_nr(eb, slot);
128 ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_replace_item);
129
130 if (item_size != sizeof(struct btrfs_dev_replace_item)) {
131 btrfs_warn(fs_info,
132 "dev_replace entry found has unexpected size, ignore entry");
133 goto no_valid_dev_replace_entry_found;
134 }
135
136 src_devid = btrfs_dev_replace_src_devid(eb, ptr);
137 dev_replace->cont_reading_from_srcdev_mode =
138 btrfs_dev_replace_cont_reading_from_srcdev_mode(eb, ptr);
139 dev_replace->replace_state = btrfs_dev_replace_replace_state(eb, ptr);
140 dev_replace->time_started = btrfs_dev_replace_time_started(eb, ptr);
141 dev_replace->time_stopped =
142 btrfs_dev_replace_time_stopped(eb, ptr);
143 atomic64_set(&dev_replace->num_write_errors,
144 btrfs_dev_replace_num_write_errors(eb, ptr));
145 atomic64_set(&dev_replace->num_uncorrectable_read_errors,
146 btrfs_dev_replace_num_uncorrectable_read_errors(eb, ptr));
147 dev_replace->cursor_left = btrfs_dev_replace_cursor_left(eb, ptr);
148 dev_replace->committed_cursor_left = dev_replace->cursor_left;
149 dev_replace->cursor_left_last_write_of_item = dev_replace->cursor_left;
150 dev_replace->cursor_right = btrfs_dev_replace_cursor_right(eb, ptr);
151 dev_replace->is_valid = 1;
152
153 dev_replace->item_needs_writeback = 0;
154 switch (dev_replace->replace_state) {
155 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
156 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
157 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
158
159
160
161
162 if (btrfs_find_device(fs_info->fs_devices,
163 BTRFS_DEV_REPLACE_DEVID, NULL, NULL)) {
164 btrfs_err(fs_info,
165 "replace devid present without an active replace item");
166 ret = -EUCLEAN;
167 } else {
168 dev_replace->srcdev = NULL;
169 dev_replace->tgtdev = NULL;
170 }
171 break;
172 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
173 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
174 dev_replace->srcdev = btrfs_find_device(fs_info->fs_devices,
175 src_devid, NULL, NULL);
176 dev_replace->tgtdev = btrfs_find_device(fs_info->fs_devices,
177 BTRFS_DEV_REPLACE_DEVID,
178 NULL, NULL);
179
180
181
182
183 if (!dev_replace->srcdev &&
184 !btrfs_test_opt(fs_info, DEGRADED)) {
185 ret = -EIO;
186 btrfs_warn(fs_info,
187 "cannot mount because device replace operation is ongoing and");
188 btrfs_warn(fs_info,
189 "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
190 src_devid);
191 }
192 if (!dev_replace->tgtdev &&
193 !btrfs_test_opt(fs_info, DEGRADED)) {
194 ret = -EIO;
195 btrfs_warn(fs_info,
196 "cannot mount because device replace operation is ongoing and");
197 btrfs_warn(fs_info,
198 "tgtdev (devid %llu) is missing, need to run 'btrfs dev scan'?",
199 BTRFS_DEV_REPLACE_DEVID);
200 }
201 if (dev_replace->tgtdev) {
202 if (dev_replace->srcdev) {
203 dev_replace->tgtdev->total_bytes =
204 dev_replace->srcdev->total_bytes;
205 dev_replace->tgtdev->disk_total_bytes =
206 dev_replace->srcdev->disk_total_bytes;
207 dev_replace->tgtdev->commit_total_bytes =
208 dev_replace->srcdev->commit_total_bytes;
209 dev_replace->tgtdev->bytes_used =
210 dev_replace->srcdev->bytes_used;
211 dev_replace->tgtdev->commit_bytes_used =
212 dev_replace->srcdev->commit_bytes_used;
213 }
214 set_bit(BTRFS_DEV_STATE_REPLACE_TGT,
215 &dev_replace->tgtdev->dev_state);
216
217 WARN_ON(fs_info->fs_devices->rw_devices == 0);
218 dev_replace->tgtdev->io_width = fs_info->sectorsize;
219 dev_replace->tgtdev->io_align = fs_info->sectorsize;
220 dev_replace->tgtdev->sector_size = fs_info->sectorsize;
221 dev_replace->tgtdev->fs_info = fs_info;
222 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
223 &dev_replace->tgtdev->dev_state);
224 }
225 break;
226 }
227
228out:
229 btrfs_free_path(path);
230 return ret;
231}
232
233
234
235
236
237
238
239static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
240 const char *device_path,
241 struct btrfs_device *srcdev,
242 struct btrfs_device **device_out)
243{
244 struct btrfs_device *device;
245 struct block_device *bdev;
246 struct rcu_string *name;
247 u64 devid = BTRFS_DEV_REPLACE_DEVID;
248 int ret = 0;
249
250 *device_out = NULL;
251 if (srcdev->fs_devices->seeding) {
252 btrfs_err(fs_info, "the filesystem is a seed filesystem!");
253 return -EINVAL;
254 }
255
256 bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
257 fs_info->bdev_holder);
258 if (IS_ERR(bdev)) {
259 btrfs_err(fs_info, "target device %s is invalid!", device_path);
260 return PTR_ERR(bdev);
261 }
262
263 if (!btrfs_check_device_zone_type(fs_info, bdev)) {
264 btrfs_err(fs_info,
265 "dev-replace: zoned type of target device mismatch with filesystem");
266 ret = -EINVAL;
267 goto error;
268 }
269
270 sync_blockdev(bdev);
271
272 list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
273 if (device->bdev == bdev) {
274 btrfs_err(fs_info,
275 "target device is in the filesystem!");
276 ret = -EEXIST;
277 goto error;
278 }
279 }
280
281
282 if (i_size_read(bdev->bd_inode) <
283 btrfs_device_get_total_bytes(srcdev)) {
284 btrfs_err(fs_info,
285 "target device is smaller than source device!");
286 ret = -EINVAL;
287 goto error;
288 }
289
290
291 device = btrfs_alloc_device(NULL, &devid, NULL);
292 if (IS_ERR(device)) {
293 ret = PTR_ERR(device);
294 goto error;
295 }
296
297 name = rcu_string_strdup(device_path, GFP_KERNEL);
298 if (!name) {
299 btrfs_free_device(device);
300 ret = -ENOMEM;
301 goto error;
302 }
303 rcu_assign_pointer(device->name, name);
304
305 set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
306 device->generation = 0;
307 device->io_width = fs_info->sectorsize;
308 device->io_align = fs_info->sectorsize;
309 device->sector_size = fs_info->sectorsize;
310 device->total_bytes = btrfs_device_get_total_bytes(srcdev);
311 device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev);
312 device->bytes_used = btrfs_device_get_bytes_used(srcdev);
313 device->commit_total_bytes = srcdev->commit_total_bytes;
314 device->commit_bytes_used = device->bytes_used;
315 device->fs_info = fs_info;
316 device->bdev = bdev;
317 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
318 set_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
319 device->mode = FMODE_EXCL;
320 device->dev_stats_valid = 1;
321 set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
322 device->fs_devices = fs_info->fs_devices;
323
324 ret = btrfs_get_dev_zone_info(device);
325 if (ret)
326 goto error;
327
328 mutex_lock(&fs_info->fs_devices->device_list_mutex);
329 list_add(&device->dev_list, &fs_info->fs_devices->devices);
330 fs_info->fs_devices->num_devices++;
331 fs_info->fs_devices->open_devices++;
332 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
333
334 *device_out = device;
335 return 0;
336
337error:
338 blkdev_put(bdev, FMODE_EXCL);
339 return ret;
340}
341
342
343
344
345
346int btrfs_run_dev_replace(struct btrfs_trans_handle *trans)
347{
348 struct btrfs_fs_info *fs_info = trans->fs_info;
349 int ret;
350 struct btrfs_root *dev_root = fs_info->dev_root;
351 struct btrfs_path *path;
352 struct btrfs_key key;
353 struct extent_buffer *eb;
354 struct btrfs_dev_replace_item *ptr;
355 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
356
357 down_read(&dev_replace->rwsem);
358 if (!dev_replace->is_valid ||
359 !dev_replace->item_needs_writeback) {
360 up_read(&dev_replace->rwsem);
361 return 0;
362 }
363 up_read(&dev_replace->rwsem);
364
365 key.objectid = 0;
366 key.type = BTRFS_DEV_REPLACE_KEY;
367 key.offset = 0;
368
369 path = btrfs_alloc_path();
370 if (!path) {
371 ret = -ENOMEM;
372 goto out;
373 }
374 ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
375 if (ret < 0) {
376 btrfs_warn(fs_info,
377 "error %d while searching for dev_replace item!",
378 ret);
379 goto out;
380 }
381
382 if (ret == 0 &&
383 btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
384
385
386
387
388
389
390
391
392
393
394
395 ret = btrfs_del_item(trans, dev_root, path);
396 if (ret != 0) {
397 btrfs_warn(fs_info,
398 "delete too small dev_replace item failed %d!",
399 ret);
400 goto out;
401 }
402 ret = 1;
403 }
404
405 if (ret == 1) {
406
407 btrfs_release_path(path);
408 ret = btrfs_insert_empty_item(trans, dev_root, path,
409 &key, sizeof(*ptr));
410 if (ret < 0) {
411 btrfs_warn(fs_info,
412 "insert dev_replace item failed %d!", ret);
413 goto out;
414 }
415 }
416
417 eb = path->nodes[0];
418 ptr = btrfs_item_ptr(eb, path->slots[0],
419 struct btrfs_dev_replace_item);
420
421 down_write(&dev_replace->rwsem);
422 if (dev_replace->srcdev)
423 btrfs_set_dev_replace_src_devid(eb, ptr,
424 dev_replace->srcdev->devid);
425 else
426 btrfs_set_dev_replace_src_devid(eb, ptr, (u64)-1);
427 btrfs_set_dev_replace_cont_reading_from_srcdev_mode(eb, ptr,
428 dev_replace->cont_reading_from_srcdev_mode);
429 btrfs_set_dev_replace_replace_state(eb, ptr,
430 dev_replace->replace_state);
431 btrfs_set_dev_replace_time_started(eb, ptr, dev_replace->time_started);
432 btrfs_set_dev_replace_time_stopped(eb, ptr, dev_replace->time_stopped);
433 btrfs_set_dev_replace_num_write_errors(eb, ptr,
434 atomic64_read(&dev_replace->num_write_errors));
435 btrfs_set_dev_replace_num_uncorrectable_read_errors(eb, ptr,
436 atomic64_read(&dev_replace->num_uncorrectable_read_errors));
437 dev_replace->cursor_left_last_write_of_item =
438 dev_replace->cursor_left;
439 btrfs_set_dev_replace_cursor_left(eb, ptr,
440 dev_replace->cursor_left_last_write_of_item);
441 btrfs_set_dev_replace_cursor_right(eb, ptr,
442 dev_replace->cursor_right);
443 dev_replace->item_needs_writeback = 0;
444 up_write(&dev_replace->rwsem);
445
446 btrfs_mark_buffer_dirty(eb);
447
448out:
449 btrfs_free_path(path);
450
451 return ret;
452}
453
454static char* btrfs_dev_name(struct btrfs_device *device)
455{
456 if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
457 return "<missing disk>";
458 else
459 return rcu_str_deref(device->name);
460}
461
/*
 * Start a device replace from the source device (selected by srcdevid or
 * srcdev_name) onto the block device at tgtdev_name.
 *
 * Steps, in order: resolve the source device, refuse if it is pinned by an
 * active swapfile, commit a transaction if one can be attached, set up the
 * target device, mark the replace as STARTED under dev_replace->rwsem,
 * register the target in sysfs, wait for ordered extents, commit a fresh
 * transaction, then run btrfs_scrub_dev() over the whole source device and
 * pass its result to btrfs_dev_replace_finishing().
 *
 * read_src is stored as dev_replace->cont_reading_from_srcdev_mode.
 *
 * Returns the result of btrfs_dev_replace_finishing() (with -EINPROGRESS
 * mapped to BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS), a negative
 * errno from one of the setup steps, or
 * BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED.
 */
462static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
463 const char *tgtdev_name, u64 srcdevid, const char *srcdev_name,
464 int read_src)
465{
466 struct btrfs_root *root = fs_info->dev_root;
467 struct btrfs_trans_handle *trans;
468 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
469 int ret;
470 struct btrfs_device *tgt_device = NULL;
471 struct btrfs_device *src_device = NULL;
472
473 src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
474 srcdev_name);
475 if (IS_ERR(src_device))
476 return PTR_ERR(src_device);
477
478 if (btrfs_pinned_by_swapfile(fs_info, src_device)) {
479 btrfs_warn_in_rcu(fs_info,
480 "cannot replace device %s (devid %llu) due to active swapfile",
481 btrfs_dev_name(src_device), src_device->devid);
482 return -ETXTBSY;
483 }
484
485
486
487
488
	/*
	 * Commit a transaction before the target is set up, if one can be
	 * attached. -ENOENT from btrfs_attach_transaction() is tolerated
	 * (presumably: nothing is running -- confirm against its
	 * documentation); every other error aborts the start.
	 */
489 trans = btrfs_attach_transaction(root);
490 if (!IS_ERR(trans)) {
491 ret = btrfs_commit_transaction(trans);
492 if (ret)
493 return ret;
494 } else if (PTR_ERR(trans) != -ENOENT) {
495 return PTR_ERR(trans);
496 }
497
498 ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
499 src_device, &tgt_device);
500 if (ret)
501 return ret;
502
	/* Only an idle replace state may transition to STARTED. */
503 down_write(&dev_replace->rwsem);
504 switch (dev_replace->replace_state) {
505 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
506 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
507 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
508 break;
509 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
510 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
	/* Not expected to be reachable; handled gracefully if ASSERT is off. */
511 ASSERT(0);
512 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
513 up_write(&dev_replace->rwsem);
514 goto leave;
515 }
516
517 dev_replace->cont_reading_from_srcdev_mode = read_src;
518 dev_replace->srcdev = src_device;
519 dev_replace->tgtdev = tgt_device;
520
521 btrfs_info_in_rcu(fs_info,
522 "dev_replace from %s (devid %llu) to %s started",
523 btrfs_dev_name(src_device),
524 src_device->devid,
525 rcu_str_deref(tgt_device->name));
526
527
528
529
530
	/*
	 * Reset all progress and error counters and flag the item for
	 * writeback so the new state reaches the device tree.
	 */
531 dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
532 dev_replace->time_started = ktime_get_real_seconds();
533 dev_replace->cursor_left = 0;
534 dev_replace->committed_cursor_left = 0;
535 dev_replace->cursor_left_last_write_of_item = 0;
536 dev_replace->cursor_right = 0;
537 dev_replace->is_valid = 1;
538 dev_replace->item_needs_writeback = 1;
539 atomic64_set(&dev_replace->num_write_errors, 0);
540 atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
541 up_write(&dev_replace->rwsem);
542
	/* A sysfs failure is only logged; ret is overwritten further down. */
543 ret = btrfs_sysfs_add_device(tgt_device);
544 if (ret)
545 btrfs_err(fs_info, "kobj add dev failed %d", ret);
546
547 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
548
549
550 trans = btrfs_start_transaction(root, 1);
551 if (IS_ERR(trans)) {
	/* Roll the in-memory state back and release the target device. */
552 ret = PTR_ERR(trans);
553 down_write(&dev_replace->rwsem);
554 dev_replace->replace_state =
555 BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
556 dev_replace->srcdev = NULL;
557 dev_replace->tgtdev = NULL;
558 up_write(&dev_replace->rwsem);
559 goto leave;
560 }
561
	/* A commit failure only triggers a warning; the replace continues. */
562 ret = btrfs_commit_transaction(trans);
563 WARN_ON(ret);
564
565
	/* Copy pass over the source device, from offset 0 to its total size. */
566 ret = btrfs_scrub_dev(fs_info, src_device->devid, 0,
567 btrfs_device_get_total_bytes(src_device),
568 &dev_replace->scrub_progress, 0, 1);
569
570 ret = btrfs_dev_replace_finishing(fs_info, ret);
571 if (ret == -EINPROGRESS)
572 ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS;
573
574 return ret;
575
576leave:
	/* Failure after the target was set up: tear the target down again. */
577 btrfs_destroy_dev_replace_tgtdev(tgt_device);
578 return ret;
579}
580
581int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
582 struct btrfs_ioctl_dev_replace_args *args)
583{
584 int ret;
585
586 switch (args->start.cont_reading_from_srcdev_mode) {
587 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS:
588 case BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID:
589 break;
590 default:
591 return -EINVAL;
592 }
593
594 if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
595 args->start.tgtdev_name[0] == '\0')
596 return -EINVAL;
597
598 ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
599 args->start.srcdevid,
600 args->start.srcdev_name,
601 args->start.cont_reading_from_srcdev_mode);
602 args->result = ret;
603
604 if (ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS ||
605 ret == BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR)
606 return 0;
607
608 return ret;
609}
610
611
612
613
614static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
615{
616 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
617 wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
618 &fs_info->dev_replace.bio_counter));
619}
620
621
622
623
624static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
625{
626 clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
627 wake_up(&fs_info->dev_replace.replace_wait);
628}
629
630
631
632
633
634
635
636static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
637 struct btrfs_device *tgtdev)
638{
639 struct extent_state *cached_state = NULL;
640 u64 start = 0;
641 u64 found_start;
642 u64 found_end;
643 int ret = 0;
644
645 lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
646
647 while (!find_first_extent_bit(&srcdev->alloc_state, start,
648 &found_start, &found_end,
649 CHUNK_ALLOCATED, &cached_state)) {
650 ret = set_extent_bits(&tgtdev->alloc_state, found_start,
651 found_end, CHUNK_ALLOCATED);
652 if (ret)
653 break;
654 start = found_end + 1;
655 }
656
657 free_extent_state(cached_state);
658 return ret;
659}
660
661static void btrfs_dev_replace_update_device_in_mapping_tree(
662 struct btrfs_fs_info *fs_info,
663 struct btrfs_device *srcdev,
664 struct btrfs_device *tgtdev)
665{
666 struct extent_map_tree *em_tree = &fs_info->mapping_tree;
667 struct extent_map *em;
668 struct map_lookup *map;
669 u64 start = 0;
670 int i;
671
672 write_lock(&em_tree->lock);
673 do {
674 em = lookup_extent_mapping(em_tree, start, (u64)-1);
675 if (!em)
676 break;
677 map = em->map_lookup;
678 for (i = 0; i < map->num_stripes; i++)
679 if (srcdev == map->stripes[i].dev)
680 map->stripes[i].dev = tgtdev;
681 start = em->start + em->len;
682 free_extent_map(em);
683 } while (start);
684 write_unlock(&em_tree->lock);
685}
686
/*
 * Finish, or tear down, a device replace after the copy pass has returned.
 *
 * scrub_ret is the result of the copy pass: 0 selects the "finished" path in
 * which the target device takes over the identity (devid, uuid, sizes) of the
 * source device; any other value selects the "canceled" path in which the
 * target device is destroyed.
 *
 * The whole function runs under dev_replace->lock_finishing_cancel_unmount,
 * which is released on every return path. If the replace state is not
 * STARTED, nothing is done and 0 is returned.
 *
 * Returns 0 on success, the error of btrfs_start_delalloc_roots() or
 * btrfs_start_transaction(), or -- on the failure path -- the non-zero
 * scrub_ret / the error of btrfs_set_target_alloc_state().
 */
687static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
688 int scrub_ret)
689{
690 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
691 struct btrfs_device *tgt_device;
692 struct btrfs_device *src_device;
693 struct btrfs_root *root = fs_info->tree_root;
694 u8 uuid_tmp[BTRFS_UUID_SIZE];
695 struct btrfs_trans_handle *trans;
696 int ret = 0;
697
698
699 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
700
701 down_read(&dev_replace->rwsem);
702
	/* Only a replace in STARTED state can be finished. */
703 if (dev_replace->replace_state !=
704 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED) {
705 up_read(&dev_replace->rwsem);
706 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
707 return 0;
708 }
709
710 tgt_device = dev_replace->tgtdev;
711 src_device = dev_replace->srcdev;
712 up_read(&dev_replace->rwsem);
713
714
715
716
717
	/* Start delalloc writeback on all roots and wait for ordered extents. */
718 ret = btrfs_start_delalloc_roots(fs_info, U64_MAX, false);
719 if (ret) {
720 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
721 return ret;
722 }
723 btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
724
	/* Readahead is detached from the source only if the copy succeeded. */
725 if (!scrub_ret)
726 btrfs_reada_remove_dev(src_device);
727
728
729
730
731
732
	/*
	 * Commit a transaction, then take device_list_mutex and chunk_mutex.
	 * If the source device is still on a post-commit list, drop both
	 * locks and try again; otherwise leave the loop with both held.
	 */
733 while (1) {
734 trans = btrfs_start_transaction(root, 0);
735 if (IS_ERR(trans)) {
736 btrfs_reada_undo_remove_dev(src_device);
737 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
738 return PTR_ERR(trans);
739 }
740 ret = btrfs_commit_transaction(trans);
741 WARN_ON(ret);
742
743
744 mutex_lock(&fs_info->fs_devices->device_list_mutex);
745
746 mutex_lock(&fs_info->chunk_mutex);
747
748 if (!list_empty(&src_device->post_commit_list)) {
749 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
750 mutex_unlock(&fs_info->chunk_mutex);
751 } else {
752 break;
753 }
754 }
755
	/*
	 * Record the final state. It is derived from scrub_ret as it is at
	 * this point: FINISHED for 0, CANCELED otherwise.
	 */
756 down_write(&dev_replace->rwsem);
757 dev_replace->replace_state =
758 scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
759 : BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED;
760 dev_replace->tgtdev = NULL;
761 dev_replace->srcdev = NULL;
762 dev_replace->time_stopped = ktime_get_real_seconds();
763 dev_replace->item_needs_writeback = 1;
764
765
766
767
768
	/*
	 * Success: copy the allocation state to the target and switch all
	 * chunk mappings over to it.
	 * NOTE(review): if btrfs_set_target_alloc_state() fails, control jumps
	 * to the error label below while replace_state was already set to
	 * FINISHED above -- confirm this is intended.
	 */
769 if (!scrub_ret) {
770 scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
771 if (scrub_ret)
772 goto error;
773 btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
774 src_device,
775 tgt_device);
776 } else {
	/* -ECANCELED is not logged as an error. */
777 if (scrub_ret != -ECANCELED)
778 btrfs_err_in_rcu(fs_info,
779 "btrfs_scrub_dev(%s, %llu, %s) failed %d",
780 btrfs_dev_name(src_device),
781 src_device->devid,
782 rcu_str_deref(tgt_device->name), scrub_ret);
	/*
	 * Failure path (also entered via goto from the branch above): drop
	 * all locks, undo the readahead removal and destroy the target device
	 * while the DEV_REPLACING state bit is set.
	 */
783error:
784 up_write(&dev_replace->rwsem);
785 mutex_unlock(&fs_info->chunk_mutex);
786 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
787 btrfs_reada_undo_remove_dev(src_device);
788 btrfs_rm_dev_replace_blocked(fs_info);
789 if (tgt_device)
790 btrfs_destroy_dev_replace_tgtdev(tgt_device);
791 btrfs_rm_dev_replace_unblocked(fs_info);
792 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
793
794 return scrub_ret;
795 }
796
797 btrfs_info_in_rcu(fs_info,
798 "dev_replace from %s (devid %llu) to %s finished",
799 btrfs_dev_name(src_device),
800 src_device->devid,
801 rcu_str_deref(tgt_device->name));
	/*
	 * Identity swap: the target gets the source's devid, uuid and size
	 * accounting; the source gets BTRFS_DEV_REPLACE_DEVID and the target's
	 * former uuid.
	 */
802 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
803 tgt_device->devid = src_device->devid;
804 src_device->devid = BTRFS_DEV_REPLACE_DEVID;
805 memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp));
806 memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid));
807 memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid));
808 btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes);
809 btrfs_device_set_disk_total_bytes(tgt_device,
810 src_device->disk_total_bytes);
811 btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used);
812 tgt_device->commit_bytes_used = src_device->bytes_used;
813
814 btrfs_assign_next_active_device(src_device, tgt_device);
815
	/* The target becomes a regular allocatable, writable device. */
816 list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
817 fs_info->fs_devices->rw_devices++;
818
819 up_write(&dev_replace->rwsem);
	/* Remove the source device while the DEV_REPLACING state bit is set. */
820 btrfs_rm_dev_replace_blocked(fs_info);
821
822 btrfs_rm_dev_replace_remove_srcdev(src_device);
823
824 btrfs_rm_dev_replace_unblocked(fs_info);
825
826
827
828
829
	/* Bump the target's dev-stats change counter. */
830 atomic_inc(&tgt_device->dev_stats_ccnt);
831
832
833
834
835
836
837
838
839 mutex_unlock(&fs_info->chunk_mutex);
840 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
841
842
	/* sysfs update; superblocks are scratched only on a writeable source. */
843 btrfs_sysfs_remove_device(src_device);
844 btrfs_sysfs_update_devid(tgt_device);
845 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state))
846 btrfs_scratch_superblocks(fs_info, src_device->bdev,
847 src_device->name->str);
848
849
	/* Best effort: failures to start or commit here are not reported. */
850 trans = btrfs_start_transaction(root, 0);
851 if (!IS_ERR(trans))
852 btrfs_commit_transaction(trans);
853
854 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
855
856 btrfs_rm_dev_replace_free_srcdev(src_device);
857
858 return 0;
859}
860
861
862
863
864
865
866static u64 btrfs_dev_replace_progress(struct btrfs_fs_info *fs_info)
867{
868 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
869 u64 ret = 0;
870
871 switch (dev_replace->replace_state) {
872 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
873 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
874 ret = 0;
875 break;
876 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
877 ret = 1000;
878 break;
879 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
880 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
881 ret = div64_u64(dev_replace->cursor_left,
882 div_u64(btrfs_device_get_total_bytes(
883 dev_replace->srcdev), 1000));
884 break;
885 }
886
887 return ret;
888}
889
890void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info,
891 struct btrfs_ioctl_dev_replace_args *args)
892{
893 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
894
895 down_read(&dev_replace->rwsem);
896
897
898 args->result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
899 args->status.replace_state = dev_replace->replace_state;
900 args->status.time_started = dev_replace->time_started;
901 args->status.time_stopped = dev_replace->time_stopped;
902 args->status.num_write_errors =
903 atomic64_read(&dev_replace->num_write_errors);
904 args->status.num_uncorrectable_read_errors =
905 atomic64_read(&dev_replace->num_uncorrectable_read_errors);
906 args->status.progress_1000 = btrfs_dev_replace_progress(fs_info);
907 up_read(&dev_replace->rwsem);
908}
909
/*
 * Cancel a device replace.
 *
 * Runs under dev_replace->lock_finishing_cancel_unmount. What happens depends
 * on the current replace state:
 *  - never started / finished / canceled: nothing to do, result NOT_STARTED;
 *  - started: the running scrub is cancelled via btrfs_scrub_cancel();
 *  - suspended: the state is switched to CANCELED here, a transaction is
 *    committed and the target device is destroyed;
 *  - anything else: result -EINVAL.
 *
 * Returns -EROFS on a read-only superblock, the error of
 * btrfs_start_transaction() in the suspended case, otherwise the result
 * value described above.
 */
910int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
911{
912 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
913 struct btrfs_device *tgt_device = NULL;
914 struct btrfs_device *src_device = NULL;
915 struct btrfs_trans_handle *trans;
916 struct btrfs_root *root = fs_info->tree_root;
917 int result;
918 int ret;
919
920 if (sb_rdonly(fs_info->sb))
921 return -EROFS;
922
923 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
924 down_write(&dev_replace->rwsem);
925 switch (dev_replace->replace_state) {
926 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
927 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
928 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
929 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
930 up_write(&dev_replace->rwsem);
931 break;
932 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
	/*
	 * Running replace: only the scrub is cancelled here; the replace
	 * state itself is not modified in this branch.
	 */
933 tgt_device = dev_replace->tgtdev;
934 src_device = dev_replace->srcdev;
935 up_write(&dev_replace->rwsem);
936 ret = btrfs_scrub_cancel(fs_info);
937 if (ret < 0) {
938 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED;
939 } else {
940 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
941
942
943
944
945 btrfs_info_in_rcu(fs_info,
946 "dev_replace from %s (devid %llu) to %s canceled",
947 btrfs_dev_name(src_device), src_device->devid,
948 btrfs_dev_name(tgt_device));
949 }
950 break;
951 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
952
953
954
955
	/*
	 * Suspended replace: mark it CANCELED directly, detach both devices
	 * from dev_replace and flag the item for writeback.
	 */
956 result = BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR;
957 tgt_device = dev_replace->tgtdev;
958 src_device = dev_replace->srcdev;
959 dev_replace->tgtdev = NULL;
960 dev_replace->srcdev = NULL;
961 dev_replace->replace_state =
962 BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED;
963 dev_replace->time_stopped = ktime_get_real_seconds();
964 dev_replace->item_needs_writeback = 1;
965
966 up_write(&dev_replace->rwsem);
967
968
969 ret = btrfs_scrub_cancel(fs_info);
970 ASSERT(ret != -ENOTCONN);
971
	/*
	 * NOTE(review): on this early return tgt_device is not destroyed
	 * although dev_replace->tgtdev was already cleared above -- confirm
	 * whether the target device is released elsewhere.
	 */
972 trans = btrfs_start_transaction(root, 0);
973 if (IS_ERR(trans)) {
974 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
975 return PTR_ERR(trans);
976 }
	/* A commit failure only triggers a warning. */
977 ret = btrfs_commit_transaction(trans);
978 WARN_ON(ret);
979
980 btrfs_info_in_rcu(fs_info,
981 "suspended dev_replace from %s (devid %llu) to %s canceled",
982 btrfs_dev_name(src_device), src_device->devid,
983 btrfs_dev_name(tgt_device));
984
985 if (tgt_device)
986 btrfs_destroy_dev_replace_tgtdev(tgt_device);
987 break;
988 default:
	/* Unknown replace state. */
989 up_write(&dev_replace->rwsem);
990 result = -EINVAL;
991 }
992
993 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
994 return result;
995}
996
997void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info)
998{
999 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1000
1001 mutex_lock(&dev_replace->lock_finishing_cancel_unmount);
1002 down_write(&dev_replace->rwsem);
1003
1004 switch (dev_replace->replace_state) {
1005 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1006 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1007 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1008 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1009 break;
1010 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1011 dev_replace->replace_state =
1012 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1013 dev_replace->time_stopped = ktime_get_real_seconds();
1014 dev_replace->item_needs_writeback = 1;
1015 btrfs_info(fs_info, "suspending dev_replace for unmount");
1016 break;
1017 }
1018
1019 up_write(&dev_replace->rwsem);
1020 mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
1021}
1022
1023
1024int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info)
1025{
1026 struct task_struct *task;
1027 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1028
1029 down_write(&dev_replace->rwsem);
1030
1031 switch (dev_replace->replace_state) {
1032 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1033 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1034 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1035 up_write(&dev_replace->rwsem);
1036 return 0;
1037 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1038 break;
1039 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1040 dev_replace->replace_state =
1041 BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED;
1042 break;
1043 }
1044 if (!dev_replace->tgtdev || !dev_replace->tgtdev->bdev) {
1045 btrfs_info(fs_info,
1046 "cannot continue dev_replace, tgtdev is missing");
1047 btrfs_info(fs_info,
1048 "you may cancel the operation after 'mount -o degraded'");
1049 dev_replace->replace_state =
1050 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1051 up_write(&dev_replace->rwsem);
1052 return 0;
1053 }
1054 up_write(&dev_replace->rwsem);
1055
1056
1057
1058
1059
1060
1061 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
1062 down_write(&dev_replace->rwsem);
1063 dev_replace->replace_state =
1064 BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED;
1065 up_write(&dev_replace->rwsem);
1066 btrfs_info(fs_info,
1067 "cannot resume dev-replace, other exclusive operation running");
1068 return 0;
1069 }
1070
1071 task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl");
1072 return PTR_ERR_OR_ZERO(task);
1073}
1074
1075static int btrfs_dev_replace_kthread(void *data)
1076{
1077 struct btrfs_fs_info *fs_info = data;
1078 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
1079 u64 progress;
1080 int ret;
1081
1082 progress = btrfs_dev_replace_progress(fs_info);
1083 progress = div_u64(progress, 10);
1084 btrfs_info_in_rcu(fs_info,
1085 "continuing dev_replace from %s (devid %llu) to target %s @%u%%",
1086 btrfs_dev_name(dev_replace->srcdev),
1087 dev_replace->srcdev->devid,
1088 btrfs_dev_name(dev_replace->tgtdev),
1089 (unsigned int)progress);
1090
1091 ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid,
1092 dev_replace->committed_cursor_left,
1093 btrfs_device_get_total_bytes(dev_replace->srcdev),
1094 &dev_replace->scrub_progress, 0, 1);
1095 ret = btrfs_dev_replace_finishing(fs_info, ret);
1096 WARN_ON(ret && ret != -ECANCELED);
1097
1098 btrfs_exclop_finish(fs_info);
1099 return 0;
1100}
1101
1102int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
1103{
1104 if (!dev_replace->is_valid)
1105 return 0;
1106
1107 switch (dev_replace->replace_state) {
1108 case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
1109 case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED:
1110 case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED:
1111 return 0;
1112 case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
1113 case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124 break;
1125 }
1126 return 1;
1127}
1128
1129void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
1130{
1131 percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1132}
1133
1134void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
1135{
1136 percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
1137 cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
1138}
1139
1140void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
1141{
1142 while (1) {
1143 percpu_counter_inc(&fs_info->dev_replace.bio_counter);
1144 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1145 &fs_info->fs_state)))
1146 break;
1147
1148 btrfs_bio_counter_dec(fs_info);
1149 wait_event(fs_info->dev_replace.replace_wait,
1150 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
1151 &fs_info->fs_state));
1152 }
1153}
1154