linux/fs/btrfs/scrub.c
<<
>>
Prefs
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   4 */
   5
   6#include <linux/blkdev.h>
   7#include <linux/ratelimit.h>
   8#include <linux/sched/mm.h>
   9#include "ctree.h"
  10#include "volumes.h"
  11#include "disk-io.h"
  12#include "ordered-data.h"
  13#include "transaction.h"
  14#include "backref.h"
  15#include "extent_io.h"
  16#include "dev-replace.h"
  17#include "check-integrity.h"
  18#include "rcu-string.h"
  19#include "raid56.h"
  20
  21/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
  26 *
  27 * Future enhancements:
  28 *  - In case an unrepairable extent is encountered, track which files are
  29 *    affected and report them
  30 *  - track and record media errors, throw out bad devices
  31 *  - add a mode to also read unallocated space
  32 */
  33
  34struct scrub_block;
  35struct scrub_ctx;
  36
  37/*
  38 * the following three values only influence the performance.
  39 * The last one configures the number of parallel and outstanding I/O
  40 * operations. The first two values configure an upper limit for the number
  41 * of (dynamically allocated) pages that are added to a bio.
  42 */
  43#define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  44#define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  45#define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  46
  47/*
  48 * the following value times PAGE_SIZE needs to be large enough to match the
  49 * largest node/leaf/sector size that shall be supported.
  50 * Values larger than BTRFS_STRIPE_LEN are not supported.
  51 */
  52#define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  53
  54struct scrub_recover {
  55        refcount_t              refs;
  56        struct btrfs_bio        *bbio;
  57        u64                     map_length;
  58};
  59
  60struct scrub_page {
  61        struct scrub_block      *sblock;
  62        struct page             *page;
  63        struct btrfs_device     *dev;
  64        struct list_head        list;
  65        u64                     flags;  /* extent flags */
  66        u64                     generation;
  67        u64                     logical;
  68        u64                     physical;
  69        u64                     physical_for_dev_replace;
  70        atomic_t                refs;
  71        struct {
  72                unsigned int    mirror_num:8;
  73                unsigned int    have_csum:1;
  74                unsigned int    io_error:1;
  75        };
  76        u8                      csum[BTRFS_CSUM_SIZE];
  77
  78        struct scrub_recover    *recover;
  79};
  80
  81struct scrub_bio {
  82        int                     index;
  83        struct scrub_ctx        *sctx;
  84        struct btrfs_device     *dev;
  85        struct bio              *bio;
  86        blk_status_t            status;
  87        u64                     logical;
  88        u64                     physical;
  89#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
  90        struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
  91#else
  92        struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
  93#endif
  94        int                     page_count;
  95        int                     next_free;
  96        struct btrfs_work       work;
  97};
  98
  99struct scrub_block {
 100        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 101        int                     page_count;
 102        atomic_t                outstanding_pages;
 103        refcount_t              refs; /* free mem on transition to zero */
 104        struct scrub_ctx        *sctx;
 105        struct scrub_parity     *sparity;
 106        struct {
 107                unsigned int    header_error:1;
 108                unsigned int    checksum_error:1;
 109                unsigned int    no_io_error_seen:1;
 110                unsigned int    generation_error:1; /* also sets header_error */
 111
 112                /* The following is for the data used to check parity */
 113                /* It is for the data with checksum */
 114                unsigned int    data_corrected:1;
 115        };
 116        struct btrfs_work       work;
 117};
 118
 119/* Used for the chunks with parity stripe such RAID5/6 */
 120struct scrub_parity {
 121        struct scrub_ctx        *sctx;
 122
 123        struct btrfs_device     *scrub_dev;
 124
 125        u64                     logic_start;
 126
 127        u64                     logic_end;
 128
 129        int                     nsectors;
 130
 131        u64                     stripe_len;
 132
 133        refcount_t              refs;
 134
 135        struct list_head        spages;
 136
 137        /* Work of parity check and repair */
 138        struct btrfs_work       work;
 139
 140        /* Mark the parity blocks which have data */
 141        unsigned long           *dbitmap;
 142
 143        /*
 144         * Mark the parity blocks which have data, but errors happen when
 145         * read data or check data
 146         */
 147        unsigned long           *ebitmap;
 148
 149        unsigned long           bitmap[0];
 150};
 151
 152struct scrub_ctx {
 153        struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 154        struct btrfs_fs_info    *fs_info;
 155        int                     first_free;
 156        int                     curr;
 157        atomic_t                bios_in_flight;
 158        atomic_t                workers_pending;
 159        spinlock_t              list_lock;
 160        wait_queue_head_t       list_wait;
 161        u16                     csum_size;
 162        struct list_head        csum_list;
 163        atomic_t                cancel_req;
 164        int                     readonly;
 165        int                     pages_per_rd_bio;
 166
 167        int                     is_dev_replace;
 168
 169        struct scrub_bio        *wr_curr_bio;
 170        struct mutex            wr_lock;
 171        int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 172        struct btrfs_device     *wr_tgtdev;
 173        bool                    flush_all_writes;
 174
 175        /*
 176         * statistics
 177         */
 178        struct btrfs_scrub_progress stat;
 179        spinlock_t              stat_lock;
 180
 181        /*
 182         * Use a ref counter to avoid use-after-free issues. Scrub workers
 183         * decrement bios_in_flight and workers_pending and then do a wakeup
 184         * on the list_wait wait queue. We must ensure the main scrub task
 185         * doesn't free the scrub context before or while the workers are
 186         * doing the wakeup() call.
 187         */
 188        refcount_t              refs;
 189};
 190
 191struct scrub_fixup_nodatasum {
 192        struct scrub_ctx        *sctx;
 193        struct btrfs_device     *dev;
 194        u64                     logical;
 195        struct btrfs_root       *root;
 196        struct btrfs_work       work;
 197        int                     mirror_num;
 198};
 199
 200struct scrub_nocow_inode {
 201        u64                     inum;
 202        u64                     offset;
 203        u64                     root;
 204        struct list_head        list;
 205};
 206
 207struct scrub_copy_nocow_ctx {
 208        struct scrub_ctx        *sctx;
 209        u64                     logical;
 210        u64                     len;
 211        int                     mirror_num;
 212        u64                     physical_for_dev_replace;
 213        struct list_head        inodes;
 214        struct btrfs_work       work;
 215};
 216
 217struct scrub_warning {
 218        struct btrfs_path       *path;
 219        u64                     extent_item_size;
 220        const char              *errstr;
 221        u64                     physical;
 222        u64                     logical;
 223        struct btrfs_device     *dev;
 224};
 225
 226struct full_stripe_lock {
 227        struct rb_node node;
 228        u64 logical;
 229        u64 refs;
 230        struct mutex mutex;
 231};
 232
 233static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 234static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 235static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 236static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 237static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 238static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 239                                     struct scrub_block *sblocks_for_recheck);
 240static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 241                                struct scrub_block *sblock,
 242                                int retry_failed_mirror);
 243static void scrub_recheck_block_checksum(struct scrub_block *sblock);
 244static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 245                                             struct scrub_block *sblock_good);
 246static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 247                                            struct scrub_block *sblock_good,
 248                                            int page_num, int force_write);
 249static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 250static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 251                                           int page_num);
 252static int scrub_checksum_data(struct scrub_block *sblock);
 253static int scrub_checksum_tree_block(struct scrub_block *sblock);
 254static int scrub_checksum_super(struct scrub_block *sblock);
 255static void scrub_block_get(struct scrub_block *sblock);
 256static void scrub_block_put(struct scrub_block *sblock);
 257static void scrub_page_get(struct scrub_page *spage);
 258static void scrub_page_put(struct scrub_page *spage);
 259static void scrub_parity_get(struct scrub_parity *sparity);
 260static void scrub_parity_put(struct scrub_parity *sparity);
 261static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 262                                    struct scrub_page *spage);
 263static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 264                       u64 physical, struct btrfs_device *dev, u64 flags,
 265                       u64 gen, int mirror_num, u8 *csum, int force,
 266                       u64 physical_for_dev_replace);
 267static void scrub_bio_end_io(struct bio *bio);
 268static void scrub_bio_end_io_worker(struct btrfs_work *work);
 269static void scrub_block_complete(struct scrub_block *sblock);
 270static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 271                               u64 extent_logical, u64 extent_len,
 272                               u64 *extent_physical,
 273                               struct btrfs_device **extent_dev,
 274                               int *extent_mirror_num);
 275static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 276                                    struct scrub_page *spage);
 277static void scrub_wr_submit(struct scrub_ctx *sctx);
 278static void scrub_wr_bio_end_io(struct bio *bio);
 279static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 280static int write_page_nocow(struct scrub_ctx *sctx,
 281                            u64 physical_for_dev_replace, struct page *page);
 282static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 283                                      struct scrub_copy_nocow_ctx *ctx);
 284static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 285                            int mirror_num, u64 physical_for_dev_replace);
 286static void copy_nocow_pages_worker(struct btrfs_work *work);
 287static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 288static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 289static void scrub_put_ctx(struct scrub_ctx *sctx);
 290
 291static inline int scrub_is_page_on_raid56(struct scrub_page *page)
 292{
 293        return page->recover &&
 294               (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 295}
 296
 297static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 298{
 299        refcount_inc(&sctx->refs);
 300        atomic_inc(&sctx->bios_in_flight);
 301}
 302
 303static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 304{
 305        atomic_dec(&sctx->bios_in_flight);
 306        wake_up(&sctx->list_wait);
 307        scrub_put_ctx(sctx);
 308}
 309
 310static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 311{
 312        while (atomic_read(&fs_info->scrub_pause_req)) {
 313                mutex_unlock(&fs_info->scrub_lock);
 314                wait_event(fs_info->scrub_pause_wait,
 315                   atomic_read(&fs_info->scrub_pause_req) == 0);
 316                mutex_lock(&fs_info->scrub_lock);
 317        }
 318}
 319
 320static void scrub_pause_on(struct btrfs_fs_info *fs_info)
 321{
 322        atomic_inc(&fs_info->scrubs_paused);
 323        wake_up(&fs_info->scrub_pause_wait);
 324}
 325
 326static void scrub_pause_off(struct btrfs_fs_info *fs_info)
 327{
 328        mutex_lock(&fs_info->scrub_lock);
 329        __scrub_blocked_if_needed(fs_info);
 330        atomic_dec(&fs_info->scrubs_paused);
 331        mutex_unlock(&fs_info->scrub_lock);
 332
 333        wake_up(&fs_info->scrub_pause_wait);
 334}
 335
 336static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 337{
 338        scrub_pause_on(fs_info);
 339        scrub_pause_off(fs_info);
 340}
 341
 342/*
 343 * Insert new full stripe lock into full stripe locks tree
 344 *
 345 * Return pointer to existing or newly inserted full_stripe_lock structure if
 346 * everything works well.
 347 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 348 *
 349 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 350 * function
 351 */
 352static struct full_stripe_lock *insert_full_stripe_lock(
 353                struct btrfs_full_stripe_locks_tree *locks_root,
 354                u64 fstripe_logical)
 355{
 356        struct rb_node **p;
 357        struct rb_node *parent = NULL;
 358        struct full_stripe_lock *entry;
 359        struct full_stripe_lock *ret;
 360
 361        lockdep_assert_held(&locks_root->lock);
 362
 363        p = &locks_root->root.rb_node;
 364        while (*p) {
 365                parent = *p;
 366                entry = rb_entry(parent, struct full_stripe_lock, node);
 367                if (fstripe_logical < entry->logical) {
 368                        p = &(*p)->rb_left;
 369                } else if (fstripe_logical > entry->logical) {
 370                        p = &(*p)->rb_right;
 371                } else {
 372                        entry->refs++;
 373                        return entry;
 374                }
 375        }
 376
 377        /* Insert new lock */
 378        ret = kmalloc(sizeof(*ret), GFP_KERNEL);
 379        if (!ret)
 380                return ERR_PTR(-ENOMEM);
 381        ret->logical = fstripe_logical;
 382        ret->refs = 1;
 383        mutex_init(&ret->mutex);
 384
 385        rb_link_node(&ret->node, parent, p);
 386        rb_insert_color(&ret->node, &locks_root->root);
 387        return ret;
 388}
 389
 390/*
 391 * Search for a full stripe lock of a block group
 392 *
 393 * Return pointer to existing full stripe lock if found
 394 * Return NULL if not found
 395 */
 396static struct full_stripe_lock *search_full_stripe_lock(
 397                struct btrfs_full_stripe_locks_tree *locks_root,
 398                u64 fstripe_logical)
 399{
 400        struct rb_node *node;
 401        struct full_stripe_lock *entry;
 402
 403        lockdep_assert_held(&locks_root->lock);
 404
 405        node = locks_root->root.rb_node;
 406        while (node) {
 407                entry = rb_entry(node, struct full_stripe_lock, node);
 408                if (fstripe_logical < entry->logical)
 409                        node = node->rb_left;
 410                else if (fstripe_logical > entry->logical)
 411                        node = node->rb_right;
 412                else
 413                        return entry;
 414        }
 415        return NULL;
 416}
 417
 418/*
 419 * Helper to get full stripe logical from a normal bytenr.
 420 *
 421 * Caller must ensure @cache is a RAID56 block group.
 422 */
 423static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
 424                                   u64 bytenr)
 425{
 426        u64 ret;
 427
 428        /*
 429         * Due to chunk item size limit, full stripe length should not be
 430         * larger than U32_MAX. Just a sanity check here.
 431         */
 432        WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
 433
 434        /*
 435         * round_down() can only handle power of 2, while RAID56 full
 436         * stripe length can be 64KiB * n, so we need to manually round down.
 437         */
 438        ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
 439                cache->full_stripe_len + cache->key.objectid;
 440        return ret;
 441}
 442
 443/*
 444 * Lock a full stripe to avoid concurrency of recovery and read
 445 *
 446 * It's only used for profiles with parities (RAID5/6), for other profiles it
 447 * does nothing.
 448 *
 449 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 450 * So caller must call unlock_full_stripe() at the same context.
 451 *
 452 * Return <0 if encounters error.
 453 */
 454static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 455                            bool *locked_ret)
 456{
 457        struct btrfs_block_group_cache *bg_cache;
 458        struct btrfs_full_stripe_locks_tree *locks_root;
 459        struct full_stripe_lock *existing;
 460        u64 fstripe_start;
 461        int ret = 0;
 462
 463        *locked_ret = false;
 464        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 465        if (!bg_cache) {
 466                ASSERT(0);
 467                return -ENOENT;
 468        }
 469
 470        /* Profiles not based on parity don't need full stripe lock */
 471        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 472                goto out;
 473        locks_root = &bg_cache->full_stripe_locks_root;
 474
 475        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 476
 477        /* Now insert the full stripe lock */
 478        mutex_lock(&locks_root->lock);
 479        existing = insert_full_stripe_lock(locks_root, fstripe_start);
 480        mutex_unlock(&locks_root->lock);
 481        if (IS_ERR(existing)) {
 482                ret = PTR_ERR(existing);
 483                goto out;
 484        }
 485        mutex_lock(&existing->mutex);
 486        *locked_ret = true;
 487out:
 488        btrfs_put_block_group(bg_cache);
 489        return ret;
 490}
 491
 492/*
 493 * Unlock a full stripe.
 494 *
 495 * NOTE: Caller must ensure it's the same context calling corresponding
 496 * lock_full_stripe().
 497 *
 498 * Return 0 if we unlock full stripe without problem.
 499 * Return <0 for error
 500 */
 501static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 502                              bool locked)
 503{
 504        struct btrfs_block_group_cache *bg_cache;
 505        struct btrfs_full_stripe_locks_tree *locks_root;
 506        struct full_stripe_lock *fstripe_lock;
 507        u64 fstripe_start;
 508        bool freeit = false;
 509        int ret = 0;
 510
 511        /* If we didn't acquire full stripe lock, no need to continue */
 512        if (!locked)
 513                return 0;
 514
 515        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 516        if (!bg_cache) {
 517                ASSERT(0);
 518                return -ENOENT;
 519        }
 520        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 521                goto out;
 522
 523        locks_root = &bg_cache->full_stripe_locks_root;
 524        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 525
 526        mutex_lock(&locks_root->lock);
 527        fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
 528        /* Unpaired unlock_full_stripe() detected */
 529        if (!fstripe_lock) {
 530                WARN_ON(1);
 531                ret = -ENOENT;
 532                mutex_unlock(&locks_root->lock);
 533                goto out;
 534        }
 535
 536        if (fstripe_lock->refs == 0) {
 537                WARN_ON(1);
 538                btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
 539                        fstripe_lock->logical);
 540        } else {
 541                fstripe_lock->refs--;
 542        }
 543
 544        if (fstripe_lock->refs == 0) {
 545                rb_erase(&fstripe_lock->node, &locks_root->root);
 546                freeit = true;
 547        }
 548        mutex_unlock(&locks_root->lock);
 549
 550        mutex_unlock(&fstripe_lock->mutex);
 551        if (freeit)
 552                kfree(fstripe_lock);
 553out:
 554        btrfs_put_block_group(bg_cache);
 555        return ret;
 556}
 557
 558/*
 559 * used for workers that require transaction commits (i.e., for the
 560 * NOCOW case)
 561 */
 562static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 563{
 564        struct btrfs_fs_info *fs_info = sctx->fs_info;
 565
 566        refcount_inc(&sctx->refs);
 567        /*
 568         * increment scrubs_running to prevent cancel requests from
 569         * completing as long as a worker is running. we must also
 570         * increment scrubs_paused to prevent deadlocking on pause
 571         * requests used for transactions commits (as the worker uses a
 572         * transaction context). it is safe to regard the worker
 573         * as paused for all matters practical. effectively, we only
 574         * avoid cancellation requests from completing.
 575         */
 576        mutex_lock(&fs_info->scrub_lock);
 577        atomic_inc(&fs_info->scrubs_running);
 578        atomic_inc(&fs_info->scrubs_paused);
 579        mutex_unlock(&fs_info->scrub_lock);
 580
 581        /*
 582         * check if @scrubs_running=@scrubs_paused condition
 583         * inside wait_event() is not an atomic operation.
 584         * which means we may inc/dec @scrub_running/paused
 585         * at any time. Let's wake up @scrub_pause_wait as
 586         * much as we can to let commit transaction blocked less.
 587         */
 588        wake_up(&fs_info->scrub_pause_wait);
 589
 590        atomic_inc(&sctx->workers_pending);
 591}
 592
 593/* used for workers that require transaction commits */
 594static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 595{
 596        struct btrfs_fs_info *fs_info = sctx->fs_info;
 597
 598        /*
 599         * see scrub_pending_trans_workers_inc() why we're pretending
 600         * to be paused in the scrub counters
 601         */
 602        mutex_lock(&fs_info->scrub_lock);
 603        atomic_dec(&fs_info->scrubs_running);
 604        atomic_dec(&fs_info->scrubs_paused);
 605        mutex_unlock(&fs_info->scrub_lock);
 606        atomic_dec(&sctx->workers_pending);
 607        wake_up(&fs_info->scrub_pause_wait);
 608        wake_up(&sctx->list_wait);
 609        scrub_put_ctx(sctx);
 610}
 611
 612static void scrub_free_csums(struct scrub_ctx *sctx)
 613{
 614        while (!list_empty(&sctx->csum_list)) {
 615                struct btrfs_ordered_sum *sum;
 616                sum = list_first_entry(&sctx->csum_list,
 617                                       struct btrfs_ordered_sum, list);
 618                list_del(&sum->list);
 619                kfree(sum);
 620        }
 621}
 622
 623static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 624{
 625        int i;
 626
 627        if (!sctx)
 628                return;
 629
 630        /* this can happen when scrub is cancelled */
 631        if (sctx->curr != -1) {
 632                struct scrub_bio *sbio = sctx->bios[sctx->curr];
 633
 634                for (i = 0; i < sbio->page_count; i++) {
 635                        WARN_ON(!sbio->pagev[i]->page);
 636                        scrub_block_put(sbio->pagev[i]->sblock);
 637                }
 638                bio_put(sbio->bio);
 639        }
 640
 641        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 642                struct scrub_bio *sbio = sctx->bios[i];
 643
 644                if (!sbio)
 645                        break;
 646                kfree(sbio);
 647        }
 648
 649        kfree(sctx->wr_curr_bio);
 650        scrub_free_csums(sctx);
 651        kfree(sctx);
 652}
 653
 654static void scrub_put_ctx(struct scrub_ctx *sctx)
 655{
 656        if (refcount_dec_and_test(&sctx->refs))
 657                scrub_free_ctx(sctx);
 658}
 659
 660static noinline_for_stack
 661struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 662{
 663        struct scrub_ctx *sctx;
 664        int             i;
 665        struct btrfs_fs_info *fs_info = dev->fs_info;
 666
 667        sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 668        if (!sctx)
 669                goto nomem;
 670        refcount_set(&sctx->refs, 1);
 671        sctx->is_dev_replace = is_dev_replace;
 672        sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 673        sctx->curr = -1;
 674        sctx->fs_info = dev->fs_info;
 675        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 676                struct scrub_bio *sbio;
 677
 678                sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
 679                if (!sbio)
 680                        goto nomem;
 681                sctx->bios[i] = sbio;
 682
 683                sbio->index = i;
 684                sbio->sctx = sctx;
 685                sbio->page_count = 0;
 686                btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 687                                scrub_bio_end_io_worker, NULL, NULL);
 688
 689                if (i != SCRUB_BIOS_PER_SCTX - 1)
 690                        sctx->bios[i]->next_free = i + 1;
 691                else
 692                        sctx->bios[i]->next_free = -1;
 693        }
 694        sctx->first_free = 0;
 695        atomic_set(&sctx->bios_in_flight, 0);
 696        atomic_set(&sctx->workers_pending, 0);
 697        atomic_set(&sctx->cancel_req, 0);
 698        sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 699        INIT_LIST_HEAD(&sctx->csum_list);
 700
 701        spin_lock_init(&sctx->list_lock);
 702        spin_lock_init(&sctx->stat_lock);
 703        init_waitqueue_head(&sctx->list_wait);
 704
 705        WARN_ON(sctx->wr_curr_bio != NULL);
 706        mutex_init(&sctx->wr_lock);
 707        sctx->wr_curr_bio = NULL;
 708        if (is_dev_replace) {
 709                WARN_ON(!fs_info->dev_replace.tgtdev);
 710                sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 711                sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 712                sctx->flush_all_writes = false;
 713        }
 714
 715        return sctx;
 716
 717nomem:
 718        scrub_free_ctx(sctx);
 719        return ERR_PTR(-ENOMEM);
 720}
 721
 722static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 723                                     void *warn_ctx)
 724{
 725        u64 isize;
 726        u32 nlink;
 727        int ret;
 728        int i;
 729        unsigned nofs_flag;
 730        struct extent_buffer *eb;
 731        struct btrfs_inode_item *inode_item;
 732        struct scrub_warning *swarn = warn_ctx;
 733        struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
 734        struct inode_fs_paths *ipath = NULL;
 735        struct btrfs_root *local_root;
 736        struct btrfs_key root_key;
 737        struct btrfs_key key;
 738
 739        root_key.objectid = root;
 740        root_key.type = BTRFS_ROOT_ITEM_KEY;
 741        root_key.offset = (u64)-1;
 742        local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 743        if (IS_ERR(local_root)) {
 744                ret = PTR_ERR(local_root);
 745                goto err;
 746        }
 747
 748        /*
 749         * this makes the path point to (inum INODE_ITEM ioff)
 750         */
 751        key.objectid = inum;
 752        key.type = BTRFS_INODE_ITEM_KEY;
 753        key.offset = 0;
 754
 755        ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 756        if (ret) {
 757                btrfs_release_path(swarn->path);
 758                goto err;
 759        }
 760
 761        eb = swarn->path->nodes[0];
 762        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 763                                        struct btrfs_inode_item);
 764        isize = btrfs_inode_size(eb, inode_item);
 765        nlink = btrfs_inode_nlink(eb, inode_item);
 766        btrfs_release_path(swarn->path);
 767
 768        /*
 769         * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
 770         * uses GFP_NOFS in this context, so we keep it consistent but it does
 771         * not seem to be strictly necessary.
 772         */
 773        nofs_flag = memalloc_nofs_save();
 774        ipath = init_ipath(4096, local_root, swarn->path);
 775        memalloc_nofs_restore(nofs_flag);
 776        if (IS_ERR(ipath)) {
 777                ret = PTR_ERR(ipath);
 778                ipath = NULL;
 779                goto err;
 780        }
 781        ret = paths_from_inode(inum, ipath);
 782
 783        if (ret < 0)
 784                goto err;
 785
 786        /*
 787         * we deliberately ignore the bit ipath might have been too small to
 788         * hold all of the paths here
 789         */
 790        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 791                btrfs_warn_in_rcu(fs_info,
 792"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
 793                                  swarn->errstr, swarn->logical,
 794                                  rcu_str_deref(swarn->dev->name),
 795                                  swarn->physical,
 796                                  root, inum, offset,
 797                                  min(isize - offset, (u64)PAGE_SIZE), nlink,
 798                                  (char *)(unsigned long)ipath->fspath->val[i]);
 799
 800        free_ipath(ipath);
 801        return 0;
 802
 803err:
 804        btrfs_warn_in_rcu(fs_info,
 805                          "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
 806                          swarn->errstr, swarn->logical,
 807                          rcu_str_deref(swarn->dev->name),
 808                          swarn->physical,
 809                          root, inum, offset, ret);
 810
 811        free_ipath(ipath);
 812        return 0;
 813}
 814
 815static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 816{
 817        struct btrfs_device *dev;
 818        struct btrfs_fs_info *fs_info;
 819        struct btrfs_path *path;
 820        struct btrfs_key found_key;
 821        struct extent_buffer *eb;
 822        struct btrfs_extent_item *ei;
 823        struct scrub_warning swarn;
 824        unsigned long ptr = 0;
 825        u64 extent_item_pos;
 826        u64 flags = 0;
 827        u64 ref_root;
 828        u32 item_size;
 829        u8 ref_level = 0;
 830        int ret;
 831
 832        WARN_ON(sblock->page_count < 1);
 833        dev = sblock->pagev[0]->dev;
 834        fs_info = sblock->sctx->fs_info;
 835
 836        path = btrfs_alloc_path();
 837        if (!path)
 838                return;
 839
 840        swarn.physical = sblock->pagev[0]->physical;
 841        swarn.logical = sblock->pagev[0]->logical;
 842        swarn.errstr = errstr;
 843        swarn.dev = NULL;
 844
 845        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 846                                  &flags);
 847        if (ret < 0)
 848                goto out;
 849
 850        extent_item_pos = swarn.logical - found_key.objectid;
 851        swarn.extent_item_size = found_key.offset;
 852
 853        eb = path->nodes[0];
 854        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 855        item_size = btrfs_item_size_nr(eb, path->slots[0]);
 856
 857        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 858                do {
 859                        ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 860                                                      item_size, &ref_root,
 861                                                      &ref_level);
 862                        btrfs_warn_in_rcu(fs_info,
 863"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
 864                                errstr, swarn.logical,
 865                                rcu_str_deref(dev->name),
 866                                swarn.physical,
 867                                ref_level ? "node" : "leaf",
 868                                ret < 0 ? -1 : ref_level,
 869                                ret < 0 ? -1 : ref_root);
 870                } while (ret != 1);
 871                btrfs_release_path(path);
 872        } else {
 873                btrfs_release_path(path);
 874                swarn.path = path;
 875                swarn.dev = dev;
 876                iterate_extent_inodes(fs_info, found_key.objectid,
 877                                        extent_item_pos, 1,
 878                                        scrub_print_warning_inode, &swarn, false);
 879        }
 880
 881out:
 882        btrfs_free_path(path);
 883}
 884
 885static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 886{
 887        struct page *page = NULL;
 888        unsigned long index;
 889        struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 890        int ret;
 891        int corrected = 0;
 892        struct btrfs_key key;
 893        struct inode *inode = NULL;
 894        struct btrfs_fs_info *fs_info;
 895        u64 end = offset + PAGE_SIZE - 1;
 896        struct btrfs_root *local_root;
 897        int srcu_index;
 898
 899        key.objectid = root;
 900        key.type = BTRFS_ROOT_ITEM_KEY;
 901        key.offset = (u64)-1;
 902
 903        fs_info = fixup->root->fs_info;
 904        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 905
 906        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 907        if (IS_ERR(local_root)) {
 908                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 909                return PTR_ERR(local_root);
 910        }
 911
 912        key.type = BTRFS_INODE_ITEM_KEY;
 913        key.objectid = inum;
 914        key.offset = 0;
 915        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 916        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 917        if (IS_ERR(inode))
 918                return PTR_ERR(inode);
 919
 920        index = offset >> PAGE_SHIFT;
 921
 922        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 923        if (!page) {
 924                ret = -ENOMEM;
 925                goto out;
 926        }
 927
 928        if (PageUptodate(page)) {
 929                if (PageDirty(page)) {
 930                        /*
 931                         * we need to write the data to the defect sector. the
 932                         * data that was in that sector is not in memory,
 933                         * because the page was modified. we must not write the
 934                         * modified page to that sector.
 935                         *
 936                         * TODO: what could be done here: wait for the delalloc
 937                         *       runner to write out that page (might involve
 938                         *       COW) and see whether the sector is still
 939                         *       referenced afterwards.
 940                         *
 941                         * For the meantime, we'll treat this error
 942                         * incorrectable, although there is a chance that a
 943                         * later scrub will find the bad sector again and that
 944                         * there's no dirty page in memory, then.
 945                         */
 946                        ret = -EIO;
 947                        goto out;
 948                }
 949                ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
 950                                        fixup->logical, page,
 951                                        offset - page_offset(page),
 952                                        fixup->mirror_num);
 953                unlock_page(page);
 954                corrected = !ret;
 955        } else {
 956                /*
 957                 * we need to get good data first. the general readpage path
 958                 * will call repair_io_failure for us, we just have to make
 959                 * sure we read the bad mirror.
 960                 */
 961                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 962                                        EXTENT_DAMAGED);
 963                if (ret) {
 964                        /* set_extent_bits should give proper error */
 965                        WARN_ON(ret > 0);
 966                        if (ret > 0)
 967                                ret = -EFAULT;
 968                        goto out;
 969                }
 970
 971                ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 972                                                btrfs_get_extent,
 973                                                fixup->mirror_num);
 974                wait_on_page_locked(page);
 975
 976                corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 977                                                end, EXTENT_DAMAGED, 0, NULL);
 978                if (!corrected)
 979                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 980                                                EXTENT_DAMAGED);
 981        }
 982
 983out:
 984        if (page)
 985                put_page(page);
 986
 987        iput(inode);
 988
 989        if (ret < 0)
 990                return ret;
 991
 992        if (ret == 0 && corrected) {
 993                /*
 994                 * we only need to call readpage for one of the inodes belonging
 995                 * to this extent. so make iterate_extent_inodes stop
 996                 */
 997                return 1;
 998        }
 999
1000        return -EIO;
1001}
1002
1003static void scrub_fixup_nodatasum(struct btrfs_work *work)
1004{
1005        struct btrfs_fs_info *fs_info;
1006        int ret;
1007        struct scrub_fixup_nodatasum *fixup;
1008        struct scrub_ctx *sctx;
1009        struct btrfs_trans_handle *trans = NULL;
1010        struct btrfs_path *path;
1011        int uncorrectable = 0;
1012
1013        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
1014        sctx = fixup->sctx;
1015        fs_info = fixup->root->fs_info;
1016
1017        path = btrfs_alloc_path();
1018        if (!path) {
1019                spin_lock(&sctx->stat_lock);
1020                ++sctx->stat.malloc_errors;
1021                spin_unlock(&sctx->stat_lock);
1022                uncorrectable = 1;
1023                goto out;
1024        }
1025
1026        trans = btrfs_join_transaction(fixup->root);
1027        if (IS_ERR(trans)) {
1028                uncorrectable = 1;
1029                goto out;
1030        }
1031
1032        /*
1033         * the idea is to trigger a regular read through the standard path. we
1034         * read a page from the (failed) logical address by specifying the
1035         * corresponding copynum of the failed sector. thus, that readpage is
1036         * expected to fail.
1037         * that is the point where on-the-fly error correction will kick in
1038         * (once it's finished) and rewrite the failed sector if a good copy
1039         * can be found.
1040         */
1041        ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1042                                          scrub_fixup_readpage, fixup, false);
1043        if (ret < 0) {
1044                uncorrectable = 1;
1045                goto out;
1046        }
1047        WARN_ON(ret != 1);
1048
1049        spin_lock(&sctx->stat_lock);
1050        ++sctx->stat.corrected_errors;
1051        spin_unlock(&sctx->stat_lock);
1052
1053out:
1054        if (trans && !IS_ERR(trans))
1055                btrfs_end_transaction(trans);
1056        if (uncorrectable) {
1057                spin_lock(&sctx->stat_lock);
1058                ++sctx->stat.uncorrectable_errors;
1059                spin_unlock(&sctx->stat_lock);
1060                btrfs_dev_replace_stats_inc(
1061                        &fs_info->dev_replace.num_uncorrectable_read_errors);
1062                btrfs_err_rl_in_rcu(fs_info,
1063                    "unable to fixup (nodatasum) error at logical %llu on dev %s",
1064                        fixup->logical, rcu_str_deref(fixup->dev->name));
1065        }
1066
1067        btrfs_free_path(path);
1068        kfree(fixup);
1069
1070        scrub_pending_trans_workers_dec(sctx);
1071}
1072
1073static inline void scrub_get_recover(struct scrub_recover *recover)
1074{
1075        refcount_inc(&recover->refs);
1076}
1077
1078static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1079                                     struct scrub_recover *recover)
1080{
1081        if (refcount_dec_and_test(&recover->refs)) {
1082                btrfs_bio_counter_dec(fs_info);
1083                btrfs_put_bbio(recover->bbio);
1084                kfree(recover);
1085        }
1086}
1087
1088/*
1089 * scrub_handle_errored_block gets called when either verification of the
1090 * pages failed or the bio failed to read, e.g. with EIO. In the latter
1091 * case, this function handles all pages in the bio, even though only one
1092 * may be bad.
1093 * The goal of this function is to repair the errored block by using the
1094 * contents of one of the mirrors.
1095 */
1096static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1097{
1098        struct scrub_ctx *sctx = sblock_to_check->sctx;
1099        struct btrfs_device *dev;
1100        struct btrfs_fs_info *fs_info;
1101        u64 logical;
1102        unsigned int failed_mirror_index;
1103        unsigned int is_metadata;
1104        unsigned int have_csum;
1105        struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1106        struct scrub_block *sblock_bad;
1107        int ret;
1108        int mirror_index;
1109        int page_num;
1110        int success;
1111        bool full_stripe_locked;
1112        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1113                                      DEFAULT_RATELIMIT_BURST);
1114
1115        BUG_ON(sblock_to_check->page_count < 1);
1116        fs_info = sctx->fs_info;
1117        if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1118                /*
1119                 * if we find an error in a super block, we just report it.
1120                 * They will get written with the next transaction commit
1121                 * anyway
1122                 */
1123                spin_lock(&sctx->stat_lock);
1124                ++sctx->stat.super_errors;
1125                spin_unlock(&sctx->stat_lock);
1126                return 0;
1127        }
1128        logical = sblock_to_check->pagev[0]->logical;
1129        BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1130        failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1131        is_metadata = !(sblock_to_check->pagev[0]->flags &
1132                        BTRFS_EXTENT_FLAG_DATA);
1133        have_csum = sblock_to_check->pagev[0]->have_csum;
1134        dev = sblock_to_check->pagev[0]->dev;
1135
1136        /*
1137         * For RAID5/6, race can happen for a different device scrub thread.
1138         * For data corruption, Parity and Data threads will both try
1139         * to recovery the data.
1140         * Race can lead to doubly added csum error, or even unrecoverable
1141         * error.
1142         */
1143        ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1144        if (ret < 0) {
1145                spin_lock(&sctx->stat_lock);
1146                if (ret == -ENOMEM)
1147                        sctx->stat.malloc_errors++;
1148                sctx->stat.read_errors++;
1149                sctx->stat.uncorrectable_errors++;
1150                spin_unlock(&sctx->stat_lock);
1151                return ret;
1152        }
1153
1154        /*
1155         * read all mirrors one after the other. This includes to
1156         * re-read the extent or metadata block that failed (that was
1157         * the cause that this fixup code is called) another time,
1158         * page by page this time in order to know which pages
1159         * caused I/O errors and which ones are good (for all mirrors).
1160         * It is the goal to handle the situation when more than one
1161         * mirror contains I/O errors, but the errors do not
1162         * overlap, i.e. the data can be repaired by selecting the
1163         * pages from those mirrors without I/O error on the
1164         * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1165         * would be that mirror #1 has an I/O error on the first page,
1166         * the second page is good, and mirror #2 has an I/O error on
1167         * the second page, but the first page is good.
1168         * Then the first page of the first mirror can be repaired by
1169         * taking the first page of the second mirror, and the
1170         * second page of the second mirror can be repaired by
1171         * copying the contents of the 2nd page of the 1st mirror.
1172         * One more note: if the pages of one mirror contain I/O
1173         * errors, the checksum cannot be verified. In order to get
1174         * the best data for repairing, the first attempt is to find
1175         * a mirror without I/O errors and with a validated checksum.
1176         * Only if this is not possible, the pages are picked from
1177         * mirrors with I/O errors without considering the checksum.
1178         * If the latter is the case, at the end, the checksum of the
1179         * repaired area is verified in order to correctly maintain
1180         * the statistics.
1181         */
1182
1183        sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1184                                      sizeof(*sblocks_for_recheck), GFP_NOFS);
1185        if (!sblocks_for_recheck) {
1186                spin_lock(&sctx->stat_lock);
1187                sctx->stat.malloc_errors++;
1188                sctx->stat.read_errors++;
1189                sctx->stat.uncorrectable_errors++;
1190                spin_unlock(&sctx->stat_lock);
1191                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1192                goto out;
1193        }
1194
1195        /* setup the context, map the logical blocks and alloc the pages */
1196        ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1197        if (ret) {
1198                spin_lock(&sctx->stat_lock);
1199                sctx->stat.read_errors++;
1200                sctx->stat.uncorrectable_errors++;
1201                spin_unlock(&sctx->stat_lock);
1202                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1203                goto out;
1204        }
1205        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1206        sblock_bad = sblocks_for_recheck + failed_mirror_index;
1207
1208        /* build and submit the bios for the failed mirror, check checksums */
1209        scrub_recheck_block(fs_info, sblock_bad, 1);
1210
1211        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1212            sblock_bad->no_io_error_seen) {
1213                /*
1214                 * the error disappeared after reading page by page, or
1215                 * the area was part of a huge bio and other parts of the
1216                 * bio caused I/O errors, or the block layer merged several
1217                 * read requests into one and the error is caused by a
1218                 * different bio (usually one of the two latter cases is
1219                 * the cause)
1220                 */
1221                spin_lock(&sctx->stat_lock);
1222                sctx->stat.unverified_errors++;
1223                sblock_to_check->data_corrected = 1;
1224                spin_unlock(&sctx->stat_lock);
1225
1226                if (sctx->is_dev_replace)
1227                        scrub_write_block_to_dev_replace(sblock_bad);
1228                goto out;
1229        }
1230
1231        if (!sblock_bad->no_io_error_seen) {
1232                spin_lock(&sctx->stat_lock);
1233                sctx->stat.read_errors++;
1234                spin_unlock(&sctx->stat_lock);
1235                if (__ratelimit(&_rs))
1236                        scrub_print_warning("i/o error", sblock_to_check);
1237                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1238        } else if (sblock_bad->checksum_error) {
1239                spin_lock(&sctx->stat_lock);
1240                sctx->stat.csum_errors++;
1241                spin_unlock(&sctx->stat_lock);
1242                if (__ratelimit(&_rs))
1243                        scrub_print_warning("checksum error", sblock_to_check);
1244                btrfs_dev_stat_inc_and_print(dev,
1245                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
1246        } else if (sblock_bad->header_error) {
1247                spin_lock(&sctx->stat_lock);
1248                sctx->stat.verify_errors++;
1249                spin_unlock(&sctx->stat_lock);
1250                if (__ratelimit(&_rs))
1251                        scrub_print_warning("checksum/header error",
1252                                            sblock_to_check);
1253                if (sblock_bad->generation_error)
1254                        btrfs_dev_stat_inc_and_print(dev,
1255                                BTRFS_DEV_STAT_GENERATION_ERRS);
1256                else
1257                        btrfs_dev_stat_inc_and_print(dev,
1258                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
1259        }
1260
1261        if (sctx->readonly) {
1262                ASSERT(!sctx->is_dev_replace);
1263                goto out;
1264        }
1265
1266        /*
1267         * NOTE: Even for nodatasum case, it's still possible that it's a
1268         * compressed data extent, thus scrub_fixup_nodatasum(), which write
1269         * inode page cache onto disk, could cause serious data corruption.
1270         *
1271         * So here we could only read from disk, and hope our recovery could
1272         * reach disk before the newer write.
1273         */
1274        if (0 && !is_metadata && !have_csum) {
1275                struct scrub_fixup_nodatasum *fixup_nodatasum;
1276
1277                WARN_ON(sctx->is_dev_replace);
1278
1279                /*
1280                 * !is_metadata and !have_csum, this means that the data
1281                 * might not be COWed, that it might be modified
1282                 * concurrently. The general strategy to work on the
1283                 * commit root does not help in the case when COW is not
1284                 * used.
1285                 */
1286                fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1287                if (!fixup_nodatasum)
1288                        goto did_not_correct_error;
1289                fixup_nodatasum->sctx = sctx;
1290                fixup_nodatasum->dev = dev;
1291                fixup_nodatasum->logical = logical;
1292                fixup_nodatasum->root = fs_info->extent_root;
1293                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1294                scrub_pending_trans_workers_inc(sctx);
1295                btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1296                                scrub_fixup_nodatasum, NULL, NULL);
1297                btrfs_queue_work(fs_info->scrub_workers,
1298                                 &fixup_nodatasum->work);
1299                goto out;
1300        }
1301
1302        /*
1303         * now build and submit the bios for the other mirrors, check
1304         * checksums.
1305         * First try to pick the mirror which is completely without I/O
1306         * errors and also does not have a checksum error.
1307         * If one is found, and if a checksum is present, the full block
1308         * that is known to contain an error is rewritten. Afterwards
1309         * the block is known to be corrected.
1310         * If a mirror is found which is completely correct, and no
1311         * checksum is present, only those pages are rewritten that had
1312         * an I/O error in the block to be repaired, since it cannot be
1313         * determined, which copy of the other pages is better (and it
1314         * could happen otherwise that a correct page would be
1315         * overwritten by a bad one).
1316         */
1317        for (mirror_index = 0; ;mirror_index++) {
1318                struct scrub_block *sblock_other;
1319
1320                if (mirror_index == failed_mirror_index)
1321                        continue;
1322
1323                /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1324                if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1325                        if (mirror_index >= BTRFS_MAX_MIRRORS)
1326                                break;
1327                        if (!sblocks_for_recheck[mirror_index].page_count)
1328                                break;
1329
1330                        sblock_other = sblocks_for_recheck + mirror_index;
1331                } else {
1332                        struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1333                        int max_allowed = r->bbio->num_stripes -
1334                                                r->bbio->num_tgtdevs;
1335
1336                        if (mirror_index >= max_allowed)
1337                                break;
1338                        if (!sblocks_for_recheck[1].page_count)
1339                                break;
1340
1341                        ASSERT(failed_mirror_index == 0);
1342                        sblock_other = sblocks_for_recheck + 1;
1343                        sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1344                }
1345
1346                /* build and submit the bios, check checksums */
1347                scrub_recheck_block(fs_info, sblock_other, 0);
1348
1349                if (!sblock_other->header_error &&
1350                    !sblock_other->checksum_error &&
1351                    sblock_other->no_io_error_seen) {
1352                        if (sctx->is_dev_replace) {
1353                                scrub_write_block_to_dev_replace(sblock_other);
1354                                goto corrected_error;
1355                        } else {
1356                                ret = scrub_repair_block_from_good_copy(
1357                                                sblock_bad, sblock_other);
1358                                if (!ret)
1359                                        goto corrected_error;
1360                        }
1361                }
1362        }
1363
1364        if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1365                goto did_not_correct_error;
1366
1367        /*
1368         * In case of I/O errors in the area that is supposed to be
1369         * repaired, continue by picking good copies of those pages.
1370         * Select the good pages from mirrors to rewrite bad pages from
1371         * the area to fix. Afterwards verify the checksum of the block
1372         * that is supposed to be repaired. This verification step is
1373         * only done for the purpose of statistic counting and for the
1374         * final scrub report, whether errors remain.
1375         * A perfect algorithm could make use of the checksum and try
1376         * all possible combinations of pages from the different mirrors
1377         * until the checksum verification succeeds. For example, when
1378         * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1379         * of mirror #2 is readable but the final checksum test fails,
1380         * then the 2nd page of mirror #3 could be tried, whether now
1381         * the final checksum succeeds. But this would be a rare
1382         * exception and is therefore not implemented. At least it is
1383         * avoided that the good copy is overwritten.
1384         * A more useful improvement would be to pick the sectors
1385         * without I/O error based on sector sizes (512 bytes on legacy
1386         * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1387         * mirror could be repaired by taking 512 byte of a different
1388         * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1389         * area are unreadable.
1390         */
1391        success = 1;
1392        for (page_num = 0; page_num < sblock_bad->page_count;
1393             page_num++) {
1394                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1395                struct scrub_block *sblock_other = NULL;
1396
1397                /* skip no-io-error page in scrub */
1398                if (!page_bad->io_error && !sctx->is_dev_replace)
1399                        continue;
1400
1401                if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1402                        /*
1403                         * In case of dev replace, if raid56 rebuild process
1404                         * didn't work out correct data, then copy the content
1405                         * in sblock_bad to make sure target device is identical
1406                         * to source device, instead of writing garbage data in
1407                         * sblock_for_recheck array to target device.
1408                         */
1409                        sblock_other = NULL;
1410                } else if (page_bad->io_error) {
1411                        /* try to find no-io-error page in mirrors */
1412                        for (mirror_index = 0;
1413                             mirror_index < BTRFS_MAX_MIRRORS &&
1414                             sblocks_for_recheck[mirror_index].page_count > 0;
1415                             mirror_index++) {
1416                                if (!sblocks_for_recheck[mirror_index].
1417                                    pagev[page_num]->io_error) {
1418                                        sblock_other = sblocks_for_recheck +
1419                                                       mirror_index;
1420                                        break;
1421                                }
1422                        }
1423                        if (!sblock_other)
1424                                success = 0;
1425                }
1426
1427                if (sctx->is_dev_replace) {
1428                        /*
1429                         * did not find a mirror to fetch the page
1430                         * from. scrub_write_page_to_dev_replace()
1431                         * handles this case (page->io_error), by
1432                         * filling the block with zeros before
1433                         * submitting the write request
1434                         */
1435                        if (!sblock_other)
1436                                sblock_other = sblock_bad;
1437
1438                        if (scrub_write_page_to_dev_replace(sblock_other,
1439                                                            page_num) != 0) {
1440                                btrfs_dev_replace_stats_inc(
1441                                        &fs_info->dev_replace.num_write_errors);
1442                                success = 0;
1443                        }
1444                } else if (sblock_other) {
1445                        ret = scrub_repair_page_from_good_copy(sblock_bad,
1446                                                               sblock_other,
1447                                                               page_num, 0);
1448                        if (0 == ret)
1449                                page_bad->io_error = 0;
1450                        else
1451                                success = 0;
1452                }
1453        }
1454
1455        if (success && !sctx->is_dev_replace) {
1456                if (is_metadata || have_csum) {
1457                        /*
1458                         * need to verify the checksum now that all
1459                         * sectors on disk are repaired (the write
1460                         * request for data to be repaired is on its way).
1461                         * Just be lazy and use scrub_recheck_block()
1462                         * which re-reads the data before the checksum
1463                         * is verified, but most likely the data comes out
1464                         * of the page cache.
1465                         */
1466                        scrub_recheck_block(fs_info, sblock_bad, 1);
1467                        if (!sblock_bad->header_error &&
1468                            !sblock_bad->checksum_error &&
1469                            sblock_bad->no_io_error_seen)
1470                                goto corrected_error;
1471                        else
1472                                goto did_not_correct_error;
1473                } else {
1474corrected_error:
1475                        spin_lock(&sctx->stat_lock);
1476                        sctx->stat.corrected_errors++;
1477                        sblock_to_check->data_corrected = 1;
1478                        spin_unlock(&sctx->stat_lock);
1479                        btrfs_err_rl_in_rcu(fs_info,
1480                                "fixed up error at logical %llu on dev %s",
1481                                logical, rcu_str_deref(dev->name));
1482                }
1483        } else {
1484did_not_correct_error:
1485                spin_lock(&sctx->stat_lock);
1486                sctx->stat.uncorrectable_errors++;
1487                spin_unlock(&sctx->stat_lock);
1488                btrfs_err_rl_in_rcu(fs_info,
1489                        "unable to fixup (regular) error at logical %llu on dev %s",
1490                        logical, rcu_str_deref(dev->name));
1491        }
1492
1493out:
1494        if (sblocks_for_recheck) {
1495                for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1496                     mirror_index++) {
1497                        struct scrub_block *sblock = sblocks_for_recheck +
1498                                                     mirror_index;
1499                        struct scrub_recover *recover;
1500                        int page_index;
1501
1502                        for (page_index = 0; page_index < sblock->page_count;
1503                             page_index++) {
1504                                sblock->pagev[page_index]->sblock = NULL;
1505                                recover = sblock->pagev[page_index]->recover;
1506                                if (recover) {
1507                                        scrub_put_recover(fs_info, recover);
1508                                        sblock->pagev[page_index]->recover =
1509                                                                        NULL;
1510                                }
1511                                scrub_page_put(sblock->pagev[page_index]);
1512                        }
1513                }
1514                kfree(sblocks_for_recheck);
1515        }
1516
1517        ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1518        if (ret < 0)
1519                return ret;
1520        return 0;
1521}
1522
1523static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1524{
1525        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1526                return 2;
1527        else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1528                return 3;
1529        else
1530                return (int)bbio->num_stripes;
1531}
1532
1533static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1534                                                 u64 *raid_map,
1535                                                 u64 mapped_length,
1536                                                 int nstripes, int mirror,
1537                                                 int *stripe_index,
1538                                                 u64 *stripe_offset)
1539{
1540        int i;
1541
1542        if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1543                /* RAID5/6 */
1544                for (i = 0; i < nstripes; i++) {
1545                        if (raid_map[i] == RAID6_Q_STRIPE ||
1546                            raid_map[i] == RAID5_P_STRIPE)
1547                                continue;
1548
1549                        if (logical >= raid_map[i] &&
1550                            logical < raid_map[i] + mapped_length)
1551                                break;
1552                }
1553
1554                *stripe_index = i;
1555                *stripe_offset = logical - raid_map[i];
1556        } else {
1557                /* The other RAID type */
1558                *stripe_index = mirror;
1559                *stripe_offset = 0;
1560        }
1561}
1562
1563static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1564                                     struct scrub_block *sblocks_for_recheck)
1565{
1566        struct scrub_ctx *sctx = original_sblock->sctx;
1567        struct btrfs_fs_info *fs_info = sctx->fs_info;
1568        u64 length = original_sblock->page_count * PAGE_SIZE;
1569        u64 logical = original_sblock->pagev[0]->logical;
1570        u64 generation = original_sblock->pagev[0]->generation;
1571        u64 flags = original_sblock->pagev[0]->flags;
1572        u64 have_csum = original_sblock->pagev[0]->have_csum;
1573        struct scrub_recover *recover;
1574        struct btrfs_bio *bbio;
1575        u64 sublen;
1576        u64 mapped_length;
1577        u64 stripe_offset;
1578        int stripe_index;
1579        int page_index = 0;
1580        int mirror_index;
1581        int nmirrors;
1582        int ret;
1583
1584        /*
1585         * note: the two members refs and outstanding_pages
1586         * are not used (and not set) in the blocks that are used for
1587         * the recheck procedure
1588         */
1589
1590        while (length > 0) {
1591                sublen = min_t(u64, length, PAGE_SIZE);
1592                mapped_length = sublen;
1593                bbio = NULL;
1594
1595                /*
1596                 * with a length of PAGE_SIZE, each returned stripe
1597                 * represents one mirror
1598                 */
1599                btrfs_bio_counter_inc_blocked(fs_info);
1600                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1601                                logical, &mapped_length, &bbio);
1602                if (ret || !bbio || mapped_length < sublen) {
1603                        btrfs_put_bbio(bbio);
1604                        btrfs_bio_counter_dec(fs_info);
1605                        return -EIO;
1606                }
1607
1608                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1609                if (!recover) {
1610                        btrfs_put_bbio(bbio);
1611                        btrfs_bio_counter_dec(fs_info);
1612                        return -ENOMEM;
1613                }
1614
1615                refcount_set(&recover->refs, 1);
1616                recover->bbio = bbio;
1617                recover->map_length = mapped_length;
1618
1619                BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1620
1621                nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1622
1623                for (mirror_index = 0; mirror_index < nmirrors;
1624                     mirror_index++) {
1625                        struct scrub_block *sblock;
1626                        struct scrub_page *page;
1627
1628                        sblock = sblocks_for_recheck + mirror_index;
1629                        sblock->sctx = sctx;
1630
1631                        page = kzalloc(sizeof(*page), GFP_NOFS);
1632                        if (!page) {
1633leave_nomem:
1634                                spin_lock(&sctx->stat_lock);
1635                                sctx->stat.malloc_errors++;
1636                                spin_unlock(&sctx->stat_lock);
1637                                scrub_put_recover(fs_info, recover);
1638                                return -ENOMEM;
1639                        }
1640                        scrub_page_get(page);
1641                        sblock->pagev[page_index] = page;
1642                        page->sblock = sblock;
1643                        page->flags = flags;
1644                        page->generation = generation;
1645                        page->logical = logical;
1646                        page->have_csum = have_csum;
1647                        if (have_csum)
1648                                memcpy(page->csum,
1649                                       original_sblock->pagev[0]->csum,
1650                                       sctx->csum_size);
1651
1652                        scrub_stripe_index_and_offset(logical,
1653                                                      bbio->map_type,
1654                                                      bbio->raid_map,
1655                                                      mapped_length,
1656                                                      bbio->num_stripes -
1657                                                      bbio->num_tgtdevs,
1658                                                      mirror_index,
1659                                                      &stripe_index,
1660                                                      &stripe_offset);
1661                        page->physical = bbio->stripes[stripe_index].physical +
1662                                         stripe_offset;
1663                        page->dev = bbio->stripes[stripe_index].dev;
1664
1665                        BUG_ON(page_index >= original_sblock->page_count);
1666                        page->physical_for_dev_replace =
1667                                original_sblock->pagev[page_index]->
1668                                physical_for_dev_replace;
1669                        /* for missing devices, dev->bdev is NULL */
1670                        page->mirror_num = mirror_index + 1;
1671                        sblock->page_count++;
1672                        page->page = alloc_page(GFP_NOFS);
1673                        if (!page->page)
1674                                goto leave_nomem;
1675
1676                        scrub_get_recover(recover);
1677                        page->recover = recover;
1678                }
1679                scrub_put_recover(fs_info, recover);
1680                length -= sublen;
1681                logical += sublen;
1682                page_index++;
1683        }
1684
1685        return 0;
1686}
1687
1688static void scrub_bio_wait_endio(struct bio *bio)
1689{
1690        complete(bio->bi_private);
1691}
1692
1693static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1694                                        struct bio *bio,
1695                                        struct scrub_page *page)
1696{
1697        DECLARE_COMPLETION_ONSTACK(done);
1698        int ret;
1699        int mirror_num;
1700
1701        bio->bi_iter.bi_sector = page->logical >> 9;
1702        bio->bi_private = &done;
1703        bio->bi_end_io = scrub_bio_wait_endio;
1704
1705        mirror_num = page->sblock->pagev[0]->mirror_num;
1706        ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1707                                    page->recover->map_length,
1708                                    mirror_num, 0);
1709        if (ret)
1710                return ret;
1711
1712        wait_for_completion_io(&done);
1713        return blk_status_to_errno(bio->bi_status);
1714}
1715
1716static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1717                                          struct scrub_block *sblock)
1718{
1719        struct scrub_page *first_page = sblock->pagev[0];
1720        struct bio *bio;
1721        int page_num;
1722
1723        /* All pages in sblock belong to the same stripe on the same device. */
1724        ASSERT(first_page->dev);
1725        if (!first_page->dev->bdev)
1726                goto out;
1727
1728        bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1729        bio_set_dev(bio, first_page->dev->bdev);
1730
1731        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1732                struct scrub_page *page = sblock->pagev[page_num];
1733
1734                WARN_ON(!page->page);
1735                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1736        }
1737
1738        if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1739                bio_put(bio);
1740                goto out;
1741        }
1742
1743        bio_put(bio);
1744
1745        scrub_recheck_block_checksum(sblock);
1746
1747        return;
1748out:
1749        for (page_num = 0; page_num < sblock->page_count; page_num++)
1750                sblock->pagev[page_num]->io_error = 1;
1751
1752        sblock->no_io_error_seen = 0;
1753}
1754
1755/*
1756 * this function will check the on disk data for checksum errors, header
1757 * errors and read I/O errors. If any I/O errors happen, the exact pages
1758 * which are errored are marked as being bad. The goal is to enable scrub
1759 * to take those pages that are not errored from all the mirrors so that
1760 * the pages that are errored in the just handled mirror can be repaired.
1761 */
1762static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1763                                struct scrub_block *sblock,
1764                                int retry_failed_mirror)
1765{
1766        int page_num;
1767
1768        sblock->no_io_error_seen = 1;
1769
1770        /* short cut for raid56 */
1771        if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1772                return scrub_recheck_block_on_raid56(fs_info, sblock);
1773
1774        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1775                struct bio *bio;
1776                struct scrub_page *page = sblock->pagev[page_num];
1777
1778                if (page->dev->bdev == NULL) {
1779                        page->io_error = 1;
1780                        sblock->no_io_error_seen = 0;
1781                        continue;
1782                }
1783
1784                WARN_ON(!page->page);
1785                bio = btrfs_io_bio_alloc(1);
1786                bio_set_dev(bio, page->dev->bdev);
1787
1788                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1789                bio->bi_iter.bi_sector = page->physical >> 9;
1790                bio->bi_opf = REQ_OP_READ;
1791
1792                if (btrfsic_submit_bio_wait(bio)) {
1793                        page->io_error = 1;
1794                        sblock->no_io_error_seen = 0;
1795                }
1796
1797                bio_put(bio);
1798        }
1799
1800        if (sblock->no_io_error_seen)
1801                scrub_recheck_block_checksum(sblock);
1802}
1803
1804static inline int scrub_check_fsid(u8 fsid[],
1805                                   struct scrub_page *spage)
1806{
1807        struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1808        int ret;
1809
1810        ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1811        return !ret;
1812}
1813
1814static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1815{
1816        sblock->header_error = 0;
1817        sblock->checksum_error = 0;
1818        sblock->generation_error = 0;
1819
1820        if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1821                scrub_checksum_data(sblock);
1822        else
1823                scrub_checksum_tree_block(sblock);
1824}
1825
1826static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1827                                             struct scrub_block *sblock_good)
1828{
1829        int page_num;
1830        int ret = 0;
1831
1832        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1833                int ret_sub;
1834
1835                ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1836                                                           sblock_good,
1837                                                           page_num, 1);
1838                if (ret_sub)
1839                        ret = ret_sub;
1840        }
1841
1842        return ret;
1843}
1844
1845static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1846                                            struct scrub_block *sblock_good,
1847                                            int page_num, int force_write)
1848{
1849        struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1850        struct scrub_page *page_good = sblock_good->pagev[page_num];
1851        struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1852
1853        BUG_ON(page_bad->page == NULL);
1854        BUG_ON(page_good->page == NULL);
1855        if (force_write || sblock_bad->header_error ||
1856            sblock_bad->checksum_error || page_bad->io_error) {
1857                struct bio *bio;
1858                int ret;
1859
1860                if (!page_bad->dev->bdev) {
1861                        btrfs_warn_rl(fs_info,
1862                                "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1863                        return -EIO;
1864                }
1865
1866                bio = btrfs_io_bio_alloc(1);
1867                bio_set_dev(bio, page_bad->dev->bdev);
1868                bio->bi_iter.bi_sector = page_bad->physical >> 9;
1869                bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1870
1871                ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1872                if (PAGE_SIZE != ret) {
1873                        bio_put(bio);
1874                        return -EIO;
1875                }
1876
1877                if (btrfsic_submit_bio_wait(bio)) {
1878                        btrfs_dev_stat_inc_and_print(page_bad->dev,
1879                                BTRFS_DEV_STAT_WRITE_ERRS);
1880                        btrfs_dev_replace_stats_inc(
1881                                &fs_info->dev_replace.num_write_errors);
1882                        bio_put(bio);
1883                        return -EIO;
1884                }
1885                bio_put(bio);
1886        }
1887
1888        return 0;
1889}
1890
1891static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1892{
1893        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1894        int page_num;
1895
1896        /*
1897         * This block is used for the check of the parity on the source device,
1898         * so the data needn't be written into the destination device.
1899         */
1900        if (sblock->sparity)
1901                return;
1902
1903        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1904                int ret;
1905
1906                ret = scrub_write_page_to_dev_replace(sblock, page_num);
1907                if (ret)
1908                        btrfs_dev_replace_stats_inc(
1909                                &fs_info->dev_replace.num_write_errors);
1910        }
1911}
1912
1913static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1914                                           int page_num)
1915{
1916        struct scrub_page *spage = sblock->pagev[page_num];
1917
1918        BUG_ON(spage->page == NULL);
1919        if (spage->io_error) {
1920                void *mapped_buffer = kmap_atomic(spage->page);
1921
1922                clear_page(mapped_buffer);
1923                flush_dcache_page(spage->page);
1924                kunmap_atomic(mapped_buffer);
1925        }
1926        return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1927}
1928
1929static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1930                                    struct scrub_page *spage)
1931{
1932        struct scrub_bio *sbio;
1933        int ret;
1934
1935        mutex_lock(&sctx->wr_lock);
1936again:
1937        if (!sctx->wr_curr_bio) {
1938                sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1939                                              GFP_KERNEL);
1940                if (!sctx->wr_curr_bio) {
1941                        mutex_unlock(&sctx->wr_lock);
1942                        return -ENOMEM;
1943                }
1944                sctx->wr_curr_bio->sctx = sctx;
1945                sctx->wr_curr_bio->page_count = 0;
1946        }
1947        sbio = sctx->wr_curr_bio;
1948        if (sbio->page_count == 0) {
1949                struct bio *bio;
1950
1951                sbio->physical = spage->physical_for_dev_replace;
1952                sbio->logical = spage->logical;
1953                sbio->dev = sctx->wr_tgtdev;
1954                bio = sbio->bio;
1955                if (!bio) {
1956                        bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1957                        sbio->bio = bio;
1958                }
1959
1960                bio->bi_private = sbio;
1961                bio->bi_end_io = scrub_wr_bio_end_io;
1962                bio_set_dev(bio, sbio->dev->bdev);
1963                bio->bi_iter.bi_sector = sbio->physical >> 9;
1964                bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1965                sbio->status = 0;
1966        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1967                   spage->physical_for_dev_replace ||
1968                   sbio->logical + sbio->page_count * PAGE_SIZE !=
1969                   spage->logical) {
1970                scrub_wr_submit(sctx);
1971                goto again;
1972        }
1973
1974        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1975        if (ret != PAGE_SIZE) {
1976                if (sbio->page_count < 1) {
1977                        bio_put(sbio->bio);
1978                        sbio->bio = NULL;
1979                        mutex_unlock(&sctx->wr_lock);
1980                        return -EIO;
1981                }
1982                scrub_wr_submit(sctx);
1983                goto again;
1984        }
1985
1986        sbio->pagev[sbio->page_count] = spage;
1987        scrub_page_get(spage);
1988        sbio->page_count++;
1989        if (sbio->page_count == sctx->pages_per_wr_bio)
1990                scrub_wr_submit(sctx);
1991        mutex_unlock(&sctx->wr_lock);
1992
1993        return 0;
1994}
1995
1996static void scrub_wr_submit(struct scrub_ctx *sctx)
1997{
1998        struct scrub_bio *sbio;
1999
2000        if (!sctx->wr_curr_bio)
2001                return;
2002
2003        sbio = sctx->wr_curr_bio;
2004        sctx->wr_curr_bio = NULL;
2005        WARN_ON(!sbio->bio->bi_disk);
2006        scrub_pending_bio_inc(sctx);
2007        /* process all writes in a single worker thread. Then the block layer
2008         * orders the requests before sending them to the driver which
2009         * doubled the write performance on spinning disks when measured
2010         * with Linux 3.5 */
2011        btrfsic_submit_bio(sbio->bio);
2012}
2013
2014static void scrub_wr_bio_end_io(struct bio *bio)
2015{
2016        struct scrub_bio *sbio = bio->bi_private;
2017        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2018
2019        sbio->status = bio->bi_status;
2020        sbio->bio = bio;
2021
2022        btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
2023                         scrub_wr_bio_end_io_worker, NULL, NULL);
2024        btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
2025}
2026
2027static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2028{
2029        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2030        struct scrub_ctx *sctx = sbio->sctx;
2031        int i;
2032
2033        WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2034        if (sbio->status) {
2035                struct btrfs_dev_replace *dev_replace =
2036                        &sbio->sctx->fs_info->dev_replace;
2037
2038                for (i = 0; i < sbio->page_count; i++) {
2039                        struct scrub_page *spage = sbio->pagev[i];
2040
2041                        spage->io_error = 1;
2042                        btrfs_dev_replace_stats_inc(&dev_replace->
2043                                                    num_write_errors);
2044                }
2045        }
2046
2047        for (i = 0; i < sbio->page_count; i++)
2048                scrub_page_put(sbio->pagev[i]);
2049
2050        bio_put(sbio->bio);
2051        kfree(sbio);
2052        scrub_pending_bio_dec(sctx);
2053}
2054
2055static int scrub_checksum(struct scrub_block *sblock)
2056{
2057        u64 flags;
2058        int ret;
2059
2060        /*
2061         * No need to initialize these stats currently,
2062         * because this function only use return value
2063         * instead of these stats value.
2064         *
2065         * Todo:
2066         * always use stats
2067         */
2068        sblock->header_error = 0;
2069        sblock->generation_error = 0;
2070        sblock->checksum_error = 0;
2071
2072        WARN_ON(sblock->page_count < 1);
2073        flags = sblock->pagev[0]->flags;
2074        ret = 0;
2075        if (flags & BTRFS_EXTENT_FLAG_DATA)
2076                ret = scrub_checksum_data(sblock);
2077        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2078                ret = scrub_checksum_tree_block(sblock);
2079        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2080                (void)scrub_checksum_super(sblock);
2081        else
2082                WARN_ON(1);
2083        if (ret)
2084                scrub_handle_errored_block(sblock);
2085
2086        return ret;
2087}
2088
2089static int scrub_checksum_data(struct scrub_block *sblock)
2090{
2091        struct scrub_ctx *sctx = sblock->sctx;
2092        u8 csum[BTRFS_CSUM_SIZE];
2093        u8 *on_disk_csum;
2094        struct page *page;
2095        void *buffer;
2096        u32 crc = ~(u32)0;
2097        u64 len;
2098        int index;
2099
2100        BUG_ON(sblock->page_count < 1);
2101        if (!sblock->pagev[0]->have_csum)
2102                return 0;
2103
2104        on_disk_csum = sblock->pagev[0]->csum;
2105        page = sblock->pagev[0]->page;
2106        buffer = kmap_atomic(page);
2107
2108        len = sctx->fs_info->sectorsize;
2109        index = 0;
2110        for (;;) {
2111                u64 l = min_t(u64, len, PAGE_SIZE);
2112
2113                crc = btrfs_csum_data(buffer, crc, l);
2114                kunmap_atomic(buffer);
2115                len -= l;
2116                if (len == 0)
2117                        break;
2118                index++;
2119                BUG_ON(index >= sblock->page_count);
2120                BUG_ON(!sblock->pagev[index]->page);
2121                page = sblock->pagev[index]->page;
2122                buffer = kmap_atomic(page);
2123        }
2124
2125        btrfs_csum_final(crc, csum);
2126        if (memcmp(csum, on_disk_csum, sctx->csum_size))
2127                sblock->checksum_error = 1;
2128
2129        return sblock->checksum_error;
2130}
2131
2132static int scrub_checksum_tree_block(struct scrub_block *sblock)
2133{
2134        struct scrub_ctx *sctx = sblock->sctx;
2135        struct btrfs_header *h;
2136        struct btrfs_fs_info *fs_info = sctx->fs_info;
2137        u8 calculated_csum[BTRFS_CSUM_SIZE];
2138        u8 on_disk_csum[BTRFS_CSUM_SIZE];
2139        struct page *page;
2140        void *mapped_buffer;
2141        u64 mapped_size;
2142        void *p;
2143        u32 crc = ~(u32)0;
2144        u64 len;
2145        int index;
2146
2147        BUG_ON(sblock->page_count < 1);
2148        page = sblock->pagev[0]->page;
2149        mapped_buffer = kmap_atomic(page);
2150        h = (struct btrfs_header *)mapped_buffer;
2151        memcpy(on_disk_csum, h->csum, sctx->csum_size);
2152
2153        /*
2154         * we don't use the getter functions here, as we
2155         * a) don't have an extent buffer and
2156         * b) the page is already kmapped
2157         */
2158        if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2159                sblock->header_error = 1;
2160
2161        if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2162                sblock->header_error = 1;
2163                sblock->generation_error = 1;
2164        }
2165
2166        if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2167                sblock->header_error = 1;
2168
2169        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2170                   BTRFS_UUID_SIZE))
2171                sblock->header_error = 1;
2172
2173        len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2174        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2175        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2176        index = 0;
2177        for (;;) {
2178                u64 l = min_t(u64, len, mapped_size);
2179
2180                crc = btrfs_csum_data(p, crc, l);
2181                kunmap_atomic(mapped_buffer);
2182                len -= l;
2183                if (len == 0)
2184                        break;
2185                index++;
2186                BUG_ON(index >= sblock->page_count);
2187                BUG_ON(!sblock->pagev[index]->page);
2188                page = sblock->pagev[index]->page;
2189                mapped_buffer = kmap_atomic(page);
2190                mapped_size = PAGE_SIZE;
2191                p = mapped_buffer;
2192        }
2193
2194        btrfs_csum_final(crc, calculated_csum);
2195        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2196                sblock->checksum_error = 1;
2197
2198        return sblock->header_error || sblock->checksum_error;
2199}
2200
2201static int scrub_checksum_super(struct scrub_block *sblock)
2202{
2203        struct btrfs_super_block *s;
2204        struct scrub_ctx *sctx = sblock->sctx;
2205        u8 calculated_csum[BTRFS_CSUM_SIZE];
2206        u8 on_disk_csum[BTRFS_CSUM_SIZE];
2207        struct page *page;
2208        void *mapped_buffer;
2209        u64 mapped_size;
2210        void *p;
2211        u32 crc = ~(u32)0;
2212        int fail_gen = 0;
2213        int fail_cor = 0;
2214        u64 len;
2215        int index;
2216
2217        BUG_ON(sblock->page_count < 1);
2218        page = sblock->pagev[0]->page;
2219        mapped_buffer = kmap_atomic(page);
2220        s = (struct btrfs_super_block *)mapped_buffer;
2221        memcpy(on_disk_csum, s->csum, sctx->csum_size);
2222
2223        if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2224                ++fail_cor;
2225
2226        if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2227                ++fail_gen;
2228
2229        if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2230                ++fail_cor;
2231
2232        len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2233        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2234        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2235        index = 0;
2236        for (;;) {
2237                u64 l = min_t(u64, len, mapped_size);
2238
2239                crc = btrfs_csum_data(p, crc, l);
2240                kunmap_atomic(mapped_buffer);
2241                len -= l;
2242                if (len == 0)
2243                        break;
2244                index++;
2245                BUG_ON(index >= sblock->page_count);
2246                BUG_ON(!sblock->pagev[index]->page);
2247                page = sblock->pagev[index]->page;
2248                mapped_buffer = kmap_atomic(page);
2249                mapped_size = PAGE_SIZE;
2250                p = mapped_buffer;
2251        }
2252
2253        btrfs_csum_final(crc, calculated_csum);
2254        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2255                ++fail_cor;
2256
2257        if (fail_cor + fail_gen) {
2258                /*
2259                 * if we find an error in a super block, we just report it.
2260                 * They will get written with the next transaction commit
2261                 * anyway
2262                 */
2263                spin_lock(&sctx->stat_lock);
2264                ++sctx->stat.super_errors;
2265                spin_unlock(&sctx->stat_lock);
2266                if (fail_cor)
2267                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2268                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
2269                else
2270                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2271                                BTRFS_DEV_STAT_GENERATION_ERRS);
2272        }
2273
2274        return fail_cor + fail_gen;
2275}
2276
2277static void scrub_block_get(struct scrub_block *sblock)
2278{
2279        refcount_inc(&sblock->refs);
2280}
2281
2282static void scrub_block_put(struct scrub_block *sblock)
2283{
2284        if (refcount_dec_and_test(&sblock->refs)) {
2285                int i;
2286
2287                if (sblock->sparity)
2288                        scrub_parity_put(sblock->sparity);
2289
2290                for (i = 0; i < sblock->page_count; i++)
2291                        scrub_page_put(sblock->pagev[i]);
2292                kfree(sblock);
2293        }
2294}
2295
2296static void scrub_page_get(struct scrub_page *spage)
2297{
2298        atomic_inc(&spage->refs);
2299}
2300
2301static void scrub_page_put(struct scrub_page *spage)
2302{
2303        if (atomic_dec_and_test(&spage->refs)) {
2304                if (spage->page)
2305                        __free_page(spage->page);
2306                kfree(spage);
2307        }
2308}
2309
2310static void scrub_submit(struct scrub_ctx *sctx)
2311{
2312        struct scrub_bio *sbio;
2313
2314        if (sctx->curr == -1)
2315                return;
2316
2317        sbio = sctx->bios[sctx->curr];
2318        sctx->curr = -1;
2319        scrub_pending_bio_inc(sctx);
2320        btrfsic_submit_bio(sbio->bio);
2321}
2322
2323static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2324                                    struct scrub_page *spage)
2325{
2326        struct scrub_block *sblock = spage->sblock;
2327        struct scrub_bio *sbio;
2328        int ret;
2329
2330again:
2331        /*
2332         * grab a fresh bio or wait for one to become available
2333         */
2334        while (sctx->curr == -1) {
2335                spin_lock(&sctx->list_lock);
2336                sctx->curr = sctx->first_free;
2337                if (sctx->curr != -1) {
2338                        sctx->first_free = sctx->bios[sctx->curr]->next_free;
2339                        sctx->bios[sctx->curr]->next_free = -1;
2340                        sctx->bios[sctx->curr]->page_count = 0;
2341                        spin_unlock(&sctx->list_lock);
2342                } else {
2343                        spin_unlock(&sctx->list_lock);
2344                        wait_event(sctx->list_wait, sctx->first_free != -1);
2345                }
2346        }
2347        sbio = sctx->bios[sctx->curr];
2348        if (sbio->page_count == 0) {
2349                struct bio *bio;
2350
2351                sbio->physical = spage->physical;
2352                sbio->logical = spage->logical;
2353                sbio->dev = spage->dev;
2354                bio = sbio->bio;
2355                if (!bio) {
2356                        bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
2357                        sbio->bio = bio;
2358                }
2359
2360                bio->bi_private = sbio;
2361                bio->bi_end_io = scrub_bio_end_io;
2362                bio_set_dev(bio, sbio->dev->bdev);
2363                bio->bi_iter.bi_sector = sbio->physical >> 9;
2364                bio_set_op_attrs(bio, REQ_OP_READ, 0);
2365                sbio->status = 0;
2366        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2367                   spage->physical ||
2368                   sbio->logical + sbio->page_count * PAGE_SIZE !=
2369                   spage->logical ||
2370                   sbio->dev != spage->dev) {
2371                scrub_submit(sctx);
2372                goto again;
2373        }
2374
2375        sbio->pagev[sbio->page_count] = spage;
2376        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2377        if (ret != PAGE_SIZE) {
2378                if (sbio->page_count < 1) {
2379                        bio_put(sbio->bio);
2380                        sbio->bio = NULL;
2381                        return -EIO;
2382                }
2383                scrub_submit(sctx);
2384                goto again;
2385        }
2386
2387        scrub_block_get(sblock); /* one for the page added to the bio */
2388        atomic_inc(&sblock->outstanding_pages);
2389        sbio->page_count++;
2390        if (sbio->page_count == sctx->pages_per_rd_bio)
2391                scrub_submit(sctx);
2392
2393        return 0;
2394}
2395
2396static void scrub_missing_raid56_end_io(struct bio *bio)
2397{
2398        struct scrub_block *sblock = bio->bi_private;
2399        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2400
2401        if (bio->bi_status)
2402                sblock->no_io_error_seen = 0;
2403
2404        bio_put(bio);
2405
2406        btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2407}
2408
2409static void scrub_missing_raid56_worker(struct btrfs_work *work)
2410{
2411        struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2412        struct scrub_ctx *sctx = sblock->sctx;
2413        struct btrfs_fs_info *fs_info = sctx->fs_info;
2414        u64 logical;
2415        struct btrfs_device *dev;
2416
2417        logical = sblock->pagev[0]->logical;
2418        dev = sblock->pagev[0]->dev;
2419
2420        if (sblock->no_io_error_seen)
2421                scrub_recheck_block_checksum(sblock);
2422
2423        if (!sblock->no_io_error_seen) {
2424                spin_lock(&sctx->stat_lock);
2425                sctx->stat.read_errors++;
2426                spin_unlock(&sctx->stat_lock);
2427                btrfs_err_rl_in_rcu(fs_info,
2428                        "IO error rebuilding logical %llu for dev %s",
2429                        logical, rcu_str_deref(dev->name));
2430        } else if (sblock->header_error || sblock->checksum_error) {
2431                spin_lock(&sctx->stat_lock);
2432                sctx->stat.uncorrectable_errors++;
2433                spin_unlock(&sctx->stat_lock);
2434                btrfs_err_rl_in_rcu(fs_info,
2435                        "failed to rebuild valid logical %llu for dev %s",
2436                        logical, rcu_str_deref(dev->name));
2437        } else {
2438                scrub_write_block_to_dev_replace(sblock);
2439        }
2440
2441        scrub_block_put(sblock);
2442
2443        if (sctx->is_dev_replace && sctx->flush_all_writes) {
2444                mutex_lock(&sctx->wr_lock);
2445                scrub_wr_submit(sctx);
2446                mutex_unlock(&sctx->wr_lock);
2447        }
2448
2449        scrub_pending_bio_dec(sctx);
2450}
2451
2452static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2453{
2454        struct scrub_ctx *sctx = sblock->sctx;
2455        struct btrfs_fs_info *fs_info = sctx->fs_info;
2456        u64 length = sblock->page_count * PAGE_SIZE;
2457        u64 logical = sblock->pagev[0]->logical;
2458        struct btrfs_bio *bbio = NULL;
2459        struct bio *bio;
2460        struct btrfs_raid_bio *rbio;
2461        int ret;
2462        int i;
2463
2464        btrfs_bio_counter_inc_blocked(fs_info);
2465        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2466                        &length, &bbio);
2467        if (ret || !bbio || !bbio->raid_map)
2468                goto bbio_out;
2469
2470        if (WARN_ON(!sctx->is_dev_replace ||
2471                    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2472                /*
2473                 * We shouldn't be scrubbing a missing device. Even for dev
2474                 * replace, we should only get here for RAID 5/6. We either
2475                 * managed to mount something with no mirrors remaining or
2476                 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2477                 */
2478                goto bbio_out;
2479        }
2480
2481        bio = btrfs_io_bio_alloc(0);
2482        bio->bi_iter.bi_sector = logical >> 9;
2483        bio->bi_private = sblock;
2484        bio->bi_end_io = scrub_missing_raid56_end_io;
2485
2486        rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2487        if (!rbio)
2488                goto rbio_out;
2489
2490        for (i = 0; i < sblock->page_count; i++) {
2491                struct scrub_page *spage = sblock->pagev[i];
2492
2493                raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2494        }
2495
2496        btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2497                        scrub_missing_raid56_worker, NULL, NULL);
2498        scrub_block_get(sblock);
2499        scrub_pending_bio_inc(sctx);
2500        raid56_submit_missing_rbio(rbio);
2501        return;
2502
2503rbio_out:
2504        bio_put(bio);
2505bbio_out:
2506        btrfs_bio_counter_dec(fs_info);
2507        btrfs_put_bbio(bbio);
2508        spin_lock(&sctx->stat_lock);
2509        sctx->stat.malloc_errors++;
2510        spin_unlock(&sctx->stat_lock);
2511}
2512
2513static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2514                       u64 physical, struct btrfs_device *dev, u64 flags,
2515                       u64 gen, int mirror_num, u8 *csum, int force,
2516                       u64 physical_for_dev_replace)
2517{
2518        struct scrub_block *sblock;
2519        int index;
2520
2521        sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2522        if (!sblock) {
2523                spin_lock(&sctx->stat_lock);
2524                sctx->stat.malloc_errors++;
2525                spin_unlock(&sctx->stat_lock);
2526                return -ENOMEM;
2527        }
2528
2529        /* one ref inside this function, plus one for each page added to
2530         * a bio later on */
2531        refcount_set(&sblock->refs, 1);
2532        sblock->sctx = sctx;
2533        sblock->no_io_error_seen = 1;
2534
2535        for (index = 0; len > 0; index++) {
2536                struct scrub_page *spage;
2537                u64 l = min_t(u64, len, PAGE_SIZE);
2538
2539                spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2540                if (!spage) {
2541leave_nomem:
2542                        spin_lock(&sctx->stat_lock);
2543                        sctx->stat.malloc_errors++;
2544                        spin_unlock(&sctx->stat_lock);
2545                        scrub_block_put(sblock);
2546                        return -ENOMEM;
2547                }
2548                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2549                scrub_page_get(spage);
2550                sblock->pagev[index] = spage;
2551                spage->sblock = sblock;
2552                spage->dev = dev;
2553                spage->flags = flags;
2554                spage->generation = gen;
2555                spage->logical = logical;
2556                spage->physical = physical;
2557                spage->physical_for_dev_replace = physical_for_dev_replace;
2558                spage->mirror_num = mirror_num;
2559                if (csum) {
2560                        spage->have_csum = 1;
2561                        memcpy(spage->csum, csum, sctx->csum_size);
2562                } else {
2563                        spage->have_csum = 0;
2564                }
2565                sblock->page_count++;
2566                spage->page = alloc_page(GFP_KERNEL);
2567                if (!spage->page)
2568                        goto leave_nomem;
2569                len -= l;
2570                logical += l;
2571                physical += l;
2572                physical_for_dev_replace += l;
2573        }
2574
2575        WARN_ON(sblock->page_count == 0);
2576        if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2577                /*
2578                 * This case should only be hit for RAID 5/6 device replace. See
2579                 * the comment in scrub_missing_raid56_pages() for details.
2580                 */
2581                scrub_missing_raid56_pages(sblock);
2582        } else {
2583                for (index = 0; index < sblock->page_count; index++) {
2584                        struct scrub_page *spage = sblock->pagev[index];
2585                        int ret;
2586
2587                        ret = scrub_add_page_to_rd_bio(sctx, spage);
2588                        if (ret) {
2589                                scrub_block_put(sblock);
2590                                return ret;
2591                        }
2592                }
2593
2594                if (force)
2595                        scrub_submit(sctx);
2596        }
2597
2598        /* last one frees, either here or in bio completion for last page */
2599        scrub_block_put(sblock);
2600        return 0;
2601}
2602
2603static void scrub_bio_end_io(struct bio *bio)
2604{
2605        struct scrub_bio *sbio = bio->bi_private;
2606        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2607
2608        sbio->status = bio->bi_status;
2609        sbio->bio = bio;
2610
2611        btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2612}
2613
2614static void scrub_bio_end_io_worker(struct btrfs_work *work)
2615{
2616        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2617        struct scrub_ctx *sctx = sbio->sctx;
2618        int i;
2619
2620        BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2621        if (sbio->status) {
2622                for (i = 0; i < sbio->page_count; i++) {
2623                        struct scrub_page *spage = sbio->pagev[i];
2624
2625                        spage->io_error = 1;
2626                        spage->sblock->no_io_error_seen = 0;
2627                }
2628        }
2629
2630        /* now complete the scrub_block items that have all pages completed */
2631        for (i = 0; i < sbio->page_count; i++) {
2632                struct scrub_page *spage = sbio->pagev[i];
2633                struct scrub_block *sblock = spage->sblock;
2634
2635                if (atomic_dec_and_test(&sblock->outstanding_pages))
2636                        scrub_block_complete(sblock);
2637                scrub_block_put(sblock);
2638        }
2639
2640        bio_put(sbio->bio);
2641        sbio->bio = NULL;
2642        spin_lock(&sctx->list_lock);
2643        sbio->next_free = sctx->first_free;
2644        sctx->first_free = sbio->index;
2645        spin_unlock(&sctx->list_lock);
2646
2647        if (sctx->is_dev_replace && sctx->flush_all_writes) {
2648                mutex_lock(&sctx->wr_lock);
2649                scrub_wr_submit(sctx);
2650                mutex_unlock(&sctx->wr_lock);
2651        }
2652
2653        scrub_pending_bio_dec(sctx);
2654}
2655
2656static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2657                                       unsigned long *bitmap,
2658                                       u64 start, u64 len)
2659{
2660        u64 offset;
2661        u64 nsectors64;
2662        u32 nsectors;
2663        int sectorsize = sparity->sctx->fs_info->sectorsize;
2664
2665        if (len >= sparity->stripe_len) {
2666                bitmap_set(bitmap, 0, sparity->nsectors);
2667                return;
2668        }
2669
2670        start -= sparity->logic_start;
2671        start = div64_u64_rem(start, sparity->stripe_len, &offset);
2672        offset = div_u64(offset, sectorsize);
2673        nsectors64 = div_u64(len, sectorsize);
2674
2675        ASSERT(nsectors64 < UINT_MAX);
2676        nsectors = (u32)nsectors64;
2677
2678        if (offset + nsectors <= sparity->nsectors) {
2679                bitmap_set(bitmap, offset, nsectors);
2680                return;
2681        }
2682
2683        bitmap_set(bitmap, offset, sparity->nsectors - offset);
2684        bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2685}
2686
2687static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2688                                                   u64 start, u64 len)
2689{
2690        __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2691}
2692
2693static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2694                                                  u64 start, u64 len)
2695{
2696        __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2697}
2698
2699static void scrub_block_complete(struct scrub_block *sblock)
2700{
2701        int corrupted = 0;
2702
2703        if (!sblock->no_io_error_seen) {
2704                corrupted = 1;
2705                scrub_handle_errored_block(sblock);
2706        } else {
2707                /*
2708                 * if has checksum error, write via repair mechanism in
2709                 * dev replace case, otherwise write here in dev replace
2710                 * case.
2711                 */
2712                corrupted = scrub_checksum(sblock);
2713                if (!corrupted && sblock->sctx->is_dev_replace)
2714                        scrub_write_block_to_dev_replace(sblock);
2715        }
2716
2717        if (sblock->sparity && corrupted && !sblock->data_corrected) {
2718                u64 start = sblock->pagev[0]->logical;
2719                u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2720                          PAGE_SIZE;
2721
2722                scrub_parity_mark_sectors_error(sblock->sparity,
2723                                                start, end - start);
2724        }
2725}
2726
2727static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2728{
2729        struct btrfs_ordered_sum *sum = NULL;
2730        unsigned long index;
2731        unsigned long num_sectors;
2732
2733        while (!list_empty(&sctx->csum_list)) {
2734                sum = list_first_entry(&sctx->csum_list,
2735                                       struct btrfs_ordered_sum, list);
2736                if (sum->bytenr > logical)
2737                        return 0;
2738                if (sum->bytenr + sum->len > logical)
2739                        break;
2740
2741                ++sctx->stat.csum_discards;
2742                list_del(&sum->list);
2743                kfree(sum);
2744                sum = NULL;
2745        }
2746        if (!sum)
2747                return 0;
2748
2749        index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2750        ASSERT(index < UINT_MAX);
2751
2752        num_sectors = sum->len / sctx->fs_info->sectorsize;
2753        memcpy(csum, sum->sums + index, sctx->csum_size);
2754        if (index == num_sectors - 1) {
2755                list_del(&sum->list);
2756                kfree(sum);
2757        }
2758        return 1;
2759}
2760
2761/* scrub extent tries to collect up to 64 kB for each bio */
2762static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2763                        u64 logical, u64 len,
2764                        u64 physical, struct btrfs_device *dev, u64 flags,
2765                        u64 gen, int mirror_num, u64 physical_for_dev_replace)
2766{
2767        int ret;
2768        u8 csum[BTRFS_CSUM_SIZE];
2769        u32 blocksize;
2770
2771        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2772                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2773                        blocksize = map->stripe_len;
2774                else
2775                        blocksize = sctx->fs_info->sectorsize;
2776                spin_lock(&sctx->stat_lock);
2777                sctx->stat.data_extents_scrubbed++;
2778                sctx->stat.data_bytes_scrubbed += len;
2779                spin_unlock(&sctx->stat_lock);
2780        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2781                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2782                        blocksize = map->stripe_len;
2783                else
2784                        blocksize = sctx->fs_info->nodesize;
2785                spin_lock(&sctx->stat_lock);
2786                sctx->stat.tree_extents_scrubbed++;
2787                sctx->stat.tree_bytes_scrubbed += len;
2788                spin_unlock(&sctx->stat_lock);
2789        } else {
2790                blocksize = sctx->fs_info->sectorsize;
2791                WARN_ON(1);
2792        }
2793
2794        while (len) {
2795                u64 l = min_t(u64, len, blocksize);
2796                int have_csum = 0;
2797
2798                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2799                        /* push csums to sbio */
2800                        have_csum = scrub_find_csum(sctx, logical, csum);
2801                        if (have_csum == 0)
2802                                ++sctx->stat.no_csum;
2803                        if (0 && sctx->is_dev_replace && !have_csum) {
2804                                ret = copy_nocow_pages(sctx, logical, l,
2805                                                       mirror_num,
2806                                                      physical_for_dev_replace);
2807                                goto behind_scrub_pages;
2808                        }
2809                }
2810                ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2811                                  mirror_num, have_csum ? csum : NULL, 0,
2812                                  physical_for_dev_replace);
2813behind_scrub_pages:
2814                if (ret)
2815                        return ret;
2816                len -= l;
2817                logical += l;
2818                physical += l;
2819                physical_for_dev_replace += l;
2820        }
2821        return 0;
2822}
2823
2824static int scrub_pages_for_parity(struct scrub_parity *sparity,
2825                                  u64 logical, u64 len,
2826                                  u64 physical, struct btrfs_device *dev,
2827                                  u64 flags, u64 gen, int mirror_num, u8 *csum)
2828{
2829        struct scrub_ctx *sctx = sparity->sctx;
2830        struct scrub_block *sblock;
2831        int index;
2832
2833        sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2834        if (!sblock) {
2835                spin_lock(&sctx->stat_lock);
2836                sctx->stat.malloc_errors++;
2837                spin_unlock(&sctx->stat_lock);
2838                return -ENOMEM;
2839        }
2840
2841        /* one ref inside this function, plus one for each page added to
2842         * a bio later on */
2843        refcount_set(&sblock->refs, 1);
2844        sblock->sctx = sctx;
2845        sblock->no_io_error_seen = 1;
2846        sblock->sparity = sparity;
2847        scrub_parity_get(sparity);
2848
2849        for (index = 0; len > 0; index++) {
2850                struct scrub_page *spage;
2851                u64 l = min_t(u64, len, PAGE_SIZE);
2852
2853                spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2854                if (!spage) {
2855leave_nomem:
2856                        spin_lock(&sctx->stat_lock);
2857                        sctx->stat.malloc_errors++;
2858                        spin_unlock(&sctx->stat_lock);
2859                        scrub_block_put(sblock);
2860                        return -ENOMEM;
2861                }
2862                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2863                /* For scrub block */
2864                scrub_page_get(spage);
2865                sblock->pagev[index] = spage;
2866                /* For scrub parity */
2867                scrub_page_get(spage);
2868                list_add_tail(&spage->list, &sparity->spages);
2869                spage->sblock = sblock;
2870                spage->dev = dev;
2871                spage->flags = flags;
2872                spage->generation = gen;
2873                spage->logical = logical;
2874                spage->physical = physical;
2875                spage->mirror_num = mirror_num;
2876                if (csum) {
2877                        spage->have_csum = 1;
2878                        memcpy(spage->csum, csum, sctx->csum_size);
2879                } else {
2880                        spage->have_csum = 0;
2881                }
2882                sblock->page_count++;
2883                spage->page = alloc_page(GFP_KERNEL);
2884                if (!spage->page)
2885                        goto leave_nomem;
2886                len -= l;
2887                logical += l;
2888                physical += l;
2889        }
2890
2891        WARN_ON(sblock->page_count == 0);
2892        for (index = 0; index < sblock->page_count; index++) {
2893                struct scrub_page *spage = sblock->pagev[index];
2894                int ret;
2895
2896                ret = scrub_add_page_to_rd_bio(sctx, spage);
2897                if (ret) {
2898                        scrub_block_put(sblock);
2899                        return ret;
2900                }
2901        }
2902
2903        /* last one frees, either here or in bio completion for last page */
2904        scrub_block_put(sblock);
2905        return 0;
2906}
2907
2908static int scrub_extent_for_parity(struct scrub_parity *sparity,
2909                                   u64 logical, u64 len,
2910                                   u64 physical, struct btrfs_device *dev,
2911                                   u64 flags, u64 gen, int mirror_num)
2912{
2913        struct scrub_ctx *sctx = sparity->sctx;
2914        int ret;
2915        u8 csum[BTRFS_CSUM_SIZE];
2916        u32 blocksize;
2917
2918        if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2919                scrub_parity_mark_sectors_error(sparity, logical, len);
2920                return 0;
2921        }
2922
2923        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2924                blocksize = sparity->stripe_len;
2925        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2926                blocksize = sparity->stripe_len;
2927        } else {
2928                blocksize = sctx->fs_info->sectorsize;
2929                WARN_ON(1);
2930        }
2931
2932        while (len) {
2933                u64 l = min_t(u64, len, blocksize);
2934                int have_csum = 0;
2935
2936                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2937                        /* push csums to sbio */
2938                        have_csum = scrub_find_csum(sctx, logical, csum);
2939                        if (have_csum == 0)
2940                                goto skip;
2941                }
2942                ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2943                                             flags, gen, mirror_num,
2944                                             have_csum ? csum : NULL);
2945                if (ret)
2946                        return ret;
2947skip:
2948                len -= l;
2949                logical += l;
2950                physical += l;
2951        }
2952        return 0;
2953}
2954
2955/*
2956 * Given a physical address, this will calculate it's
2957 * logical offset. if this is a parity stripe, it will return
2958 * the most left data stripe's logical offset.
2959 *
2960 * return 0 if it is a data stripe, 1 means parity stripe.
2961 */
2962static int get_raid56_logic_offset(u64 physical, int num,
2963                                   struct map_lookup *map, u64 *offset,
2964                                   u64 *stripe_start)
2965{
2966        int i;
2967        int j = 0;
2968        u64 stripe_nr;
2969        u64 last_offset;
2970        u32 stripe_index;
2971        u32 rot;
2972
2973        last_offset = (physical - map->stripes[num].physical) *
2974                      nr_data_stripes(map);
2975        if (stripe_start)
2976                *stripe_start = last_offset;
2977
2978        *offset = last_offset;
2979        for (i = 0; i < nr_data_stripes(map); i++) {
2980                *offset = last_offset + i * map->stripe_len;
2981
2982                stripe_nr = div64_u64(*offset, map->stripe_len);
2983                stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2984
2985                /* Work out the disk rotation on this stripe-set */
2986                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2987                /* calculate which stripe this data locates */
2988                rot += i;
2989                stripe_index = rot % map->num_stripes;
2990                if (stripe_index == num)
2991                        return 0;
2992                if (stripe_index < num)
2993                        j++;
2994        }
2995        *offset = last_offset + j * map->stripe_len;
2996        return 1;
2997}
2998
2999static void scrub_free_parity(struct scrub_parity *sparity)
3000{
3001        struct scrub_ctx *sctx = sparity->sctx;
3002        struct scrub_page *curr, *next;
3003        int nbits;
3004
3005        nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
3006        if (nbits) {
3007                spin_lock(&sctx->stat_lock);
3008                sctx->stat.read_errors += nbits;
3009                sctx->stat.uncorrectable_errors += nbits;
3010                spin_unlock(&sctx->stat_lock);
3011        }
3012
3013        list_for_each_entry_safe(curr, next, &sparity->spages, list) {
3014                list_del_init(&curr->list);
3015                scrub_page_put(curr);
3016        }
3017
3018        kfree(sparity);
3019}
3020
3021static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
3022{
3023        struct scrub_parity *sparity = container_of(work, struct scrub_parity,
3024                                                    work);
3025        struct scrub_ctx *sctx = sparity->sctx;
3026
3027        scrub_free_parity(sparity);
3028        scrub_pending_bio_dec(sctx);
3029}
3030
3031static void scrub_parity_bio_endio(struct bio *bio)
3032{
3033        struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
3034        struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
3035
3036        if (bio->bi_status)
3037                bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3038                          sparity->nsectors);
3039
3040        bio_put(bio);
3041
3042        btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3043                        scrub_parity_bio_endio_worker, NULL, NULL);
3044        btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3045}
3046
3047static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3048{
3049        struct scrub_ctx *sctx = sparity->sctx;
3050        struct btrfs_fs_info *fs_info = sctx->fs_info;
3051        struct bio *bio;
3052        struct btrfs_raid_bio *rbio;
3053        struct btrfs_bio *bbio = NULL;
3054        u64 length;
3055        int ret;
3056
3057        if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3058                           sparity->nsectors))
3059                goto out;
3060
3061        length = sparity->logic_end - sparity->logic_start;
3062
3063        btrfs_bio_counter_inc_blocked(fs_info);
3064        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3065                               &length, &bbio);
3066        if (ret || !bbio || !bbio->raid_map)
3067                goto bbio_out;
3068
3069        bio = btrfs_io_bio_alloc(0);
3070        bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3071        bio->bi_private = sparity;
3072        bio->bi_end_io = scrub_parity_bio_endio;
3073
3074        rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3075                                              length, sparity->scrub_dev,
3076                                              sparity->dbitmap,
3077                                              sparity->nsectors);
3078        if (!rbio)
3079                goto rbio_out;
3080
3081        scrub_pending_bio_inc(sctx);
3082        raid56_parity_submit_scrub_rbio(rbio);
3083        return;
3084
3085rbio_out:
3086        bio_put(bio);
3087bbio_out:
3088        btrfs_bio_counter_dec(fs_info);
3089        btrfs_put_bbio(bbio);
3090        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3091                  sparity->nsectors);
3092        spin_lock(&sctx->stat_lock);
3093        sctx->stat.malloc_errors++;
3094        spin_unlock(&sctx->stat_lock);
3095out:
3096        scrub_free_parity(sparity);
3097}
3098
3099static inline int scrub_calc_parity_bitmap_len(int nsectors)
3100{
3101        return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3102}
3103
3104static void scrub_parity_get(struct scrub_parity *sparity)
3105{
3106        refcount_inc(&sparity->refs);
3107}
3108
3109static void scrub_parity_put(struct scrub_parity *sparity)
3110{
3111        if (!refcount_dec_and_test(&sparity->refs))
3112                return;
3113
3114        scrub_parity_check_and_repair(sparity);
3115}
3116
3117static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3118                                                  struct map_lookup *map,
3119                                                  struct btrfs_device *sdev,
3120                                                  struct btrfs_path *path,
3121                                                  u64 logic_start,
3122                                                  u64 logic_end)
3123{
3124        struct btrfs_fs_info *fs_info = sctx->fs_info;
3125        struct btrfs_root *root = fs_info->extent_root;
3126        struct btrfs_root *csum_root = fs_info->csum_root;
3127        struct btrfs_extent_item *extent;
3128        struct btrfs_bio *bbio = NULL;
3129        u64 flags;
3130        int ret;
3131        int slot;
3132        struct extent_buffer *l;
3133        struct btrfs_key key;
3134        u64 generation;
3135        u64 extent_logical;
3136        u64 extent_physical;
3137        u64 extent_len;
3138        u64 mapped_length;
3139        struct btrfs_device *extent_dev;
3140        struct scrub_parity *sparity;
3141        int nsectors;
3142        int bitmap_len;
3143        int extent_mirror_num;
3144        int stop_loop = 0;
3145
3146        nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3147        bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3148        sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3149                          GFP_NOFS);
3150        if (!sparity) {
3151                spin_lock(&sctx->stat_lock);
3152                sctx->stat.malloc_errors++;
3153                spin_unlock(&sctx->stat_lock);
3154                return -ENOMEM;
3155        }
3156
3157        sparity->stripe_len = map->stripe_len;
3158        sparity->nsectors = nsectors;
3159        sparity->sctx = sctx;
3160        sparity->scrub_dev = sdev;
3161        sparity->logic_start = logic_start;
3162        sparity->logic_end = logic_end;
3163        refcount_set(&sparity->refs, 1);
3164        INIT_LIST_HEAD(&sparity->spages);
3165        sparity->dbitmap = sparity->bitmap;
3166        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3167
3168        ret = 0;
3169        while (logic_start < logic_end) {
3170                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3171                        key.type = BTRFS_METADATA_ITEM_KEY;
3172                else
3173                        key.type = BTRFS_EXTENT_ITEM_KEY;
3174                key.objectid = logic_start;
3175                key.offset = (u64)-1;
3176
3177                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3178                if (ret < 0)
3179                        goto out;
3180
3181                if (ret > 0) {
3182                        ret = btrfs_previous_extent_item(root, path, 0);
3183                        if (ret < 0)
3184                                goto out;
3185                        if (ret > 0) {
3186                                btrfs_release_path(path);
3187                                ret = btrfs_search_slot(NULL, root, &key,
3188                                                        path, 0, 0);
3189                                if (ret < 0)
3190                                        goto out;
3191                        }
3192                }
3193
3194                stop_loop = 0;
3195                while (1) {
3196                        u64 bytes;
3197
3198                        l = path->nodes[0];
3199                        slot = path->slots[0];
3200                        if (slot >= btrfs_header_nritems(l)) {
3201                                ret = btrfs_next_leaf(root, path);
3202                                if (ret == 0)
3203                                        continue;
3204                                if (ret < 0)
3205                                        goto out;
3206
3207                                stop_loop = 1;
3208                                break;
3209                        }
3210                        btrfs_item_key_to_cpu(l, &key, slot);
3211
3212                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3213                            key.type != BTRFS_METADATA_ITEM_KEY)
3214                                goto next;
3215
3216                        if (key.type == BTRFS_METADATA_ITEM_KEY)
3217                                bytes = fs_info->nodesize;
3218                        else
3219                                bytes = key.offset;
3220
3221                        if (key.objectid + bytes <= logic_start)
3222                                goto next;
3223
3224                        if (key.objectid >= logic_end) {
3225                                stop_loop = 1;
3226                                break;
3227                        }
3228
3229                        while (key.objectid >= logic_start + map->stripe_len)
3230                                logic_start += map->stripe_len;
3231
3232                        extent = btrfs_item_ptr(l, slot,
3233                                                struct btrfs_extent_item);
3234                        flags = btrfs_extent_flags(l, extent);
3235                        generation = btrfs_extent_generation(l, extent);
3236
3237                        if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3238                            (key.objectid < logic_start ||
3239                             key.objectid + bytes >
3240                             logic_start + map->stripe_len)) {
3241                                btrfs_err(fs_info,
3242                                          "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3243                                          key.objectid, logic_start);
3244                                spin_lock(&sctx->stat_lock);
3245                                sctx->stat.uncorrectable_errors++;
3246                                spin_unlock(&sctx->stat_lock);
3247                                goto next;
3248                        }
3249again:
3250                        extent_logical = key.objectid;
3251                        extent_len = bytes;
3252
3253                        if (extent_logical < logic_start) {
3254                                extent_len -= logic_start - extent_logical;
3255                                extent_logical = logic_start;
3256                        }
3257
3258                        if (extent_logical + extent_len >
3259                            logic_start + map->stripe_len)
3260                                extent_len = logic_start + map->stripe_len -
3261                                             extent_logical;
3262
3263                        scrub_parity_mark_sectors_data(sparity, extent_logical,
3264                                                       extent_len);
3265
3266                        mapped_length = extent_len;
3267                        bbio = NULL;
3268                        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3269                                        extent_logical, &mapped_length, &bbio,
3270                                        0);
3271                        if (!ret) {
3272                                if (!bbio || mapped_length < extent_len)
3273                                        ret = -EIO;
3274                        }
3275                        if (ret) {
3276                                btrfs_put_bbio(bbio);
3277                                goto out;
3278                        }
3279                        extent_physical = bbio->stripes[0].physical;
3280                        extent_mirror_num = bbio->mirror_num;
3281                        extent_dev = bbio->stripes[0].dev;
3282                        btrfs_put_bbio(bbio);
3283
3284                        ret = btrfs_lookup_csums_range(csum_root,
3285                                                extent_logical,
3286                                                extent_logical + extent_len - 1,
3287                                                &sctx->csum_list, 1);
3288                        if (ret)
3289                                goto out;
3290
3291                        ret = scrub_extent_for_parity(sparity, extent_logical,
3292                                                      extent_len,
3293                                                      extent_physical,
3294                                                      extent_dev, flags,
3295                                                      generation,
3296                                                      extent_mirror_num);
3297
3298                        scrub_free_csums(sctx);
3299
3300                        if (ret)
3301                                goto out;
3302
3303                        if (extent_logical + extent_len <
3304                            key.objectid + bytes) {
3305                                logic_start += map->stripe_len;
3306
3307                                if (logic_start >= logic_end) {
3308                                        stop_loop = 1;
3309                                        break;
3310                                }
3311
3312                                if (logic_start < key.objectid + bytes) {
3313                                        cond_resched();
3314                                        goto again;
3315                                }
3316                        }
3317next:
3318                        path->slots[0]++;
3319                }
3320
3321                btrfs_release_path(path);
3322
3323                if (stop_loop)
3324                        break;
3325
3326                logic_start += map->stripe_len;
3327        }
3328out:
3329        if (ret < 0)
3330                scrub_parity_mark_sectors_error(sparity, logic_start,
3331                                                logic_end - logic_start);
3332        scrub_parity_put(sparity);
3333        scrub_submit(sctx);
3334        mutex_lock(&sctx->wr_lock);
3335        scrub_wr_submit(sctx);
3336        mutex_unlock(&sctx->wr_lock);
3337
3338        btrfs_release_path(path);
3339        return ret < 0 ? ret : 0;
3340}
3341
3342static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3343                                           struct map_lookup *map,
3344                                           struct btrfs_device *scrub_dev,
3345                                           int num, u64 base, u64 length,
3346                                           int is_dev_replace)
3347{
3348        struct btrfs_path *path, *ppath;
3349        struct btrfs_fs_info *fs_info = sctx->fs_info;
3350        struct btrfs_root *root = fs_info->extent_root;
3351        struct btrfs_root *csum_root = fs_info->csum_root;
3352        struct btrfs_extent_item *extent;
3353        struct blk_plug plug;
3354        u64 flags;
3355        int ret;
3356        int slot;
3357        u64 nstripes;
3358        struct extent_buffer *l;
3359        u64 physical;
3360        u64 logical;
3361        u64 logic_end;
3362        u64 physical_end;
3363        u64 generation;
3364        int mirror_num;
3365        struct reada_control *reada1;
3366        struct reada_control *reada2;
3367        struct btrfs_key key;
3368        struct btrfs_key key_end;
3369        u64 increment = map->stripe_len;
3370        u64 offset;
3371        u64 extent_logical;
3372        u64 extent_physical;
3373        u64 extent_len;
3374        u64 stripe_logical;
3375        u64 stripe_end;
3376        struct btrfs_device *extent_dev;
3377        int extent_mirror_num;
3378        int stop_loop = 0;
3379
3380        physical = map->stripes[num].physical;
3381        offset = 0;
3382        nstripes = div64_u64(length, map->stripe_len);
3383        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3384                offset = map->stripe_len * num;
3385                increment = map->stripe_len * map->num_stripes;
3386                mirror_num = 1;
3387        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3388                int factor = map->num_stripes / map->sub_stripes;
3389                offset = map->stripe_len * (num / map->sub_stripes);
3390                increment = map->stripe_len * factor;
3391                mirror_num = num % map->sub_stripes + 1;
3392        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3393                increment = map->stripe_len;
3394                mirror_num = num % map->num_stripes + 1;
3395        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3396                increment = map->stripe_len;
3397                mirror_num = num % map->num_stripes + 1;
3398        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3399                get_raid56_logic_offset(physical, num, map, &offset, NULL);
3400                increment = map->stripe_len * nr_data_stripes(map);
3401                mirror_num = 1;
3402        } else {
3403                increment = map->stripe_len;
3404                mirror_num = 1;
3405        }
3406
3407        path = btrfs_alloc_path();
3408        if (!path)
3409                return -ENOMEM;
3410
3411        ppath = btrfs_alloc_path();
3412        if (!ppath) {
3413                btrfs_free_path(path);
3414                return -ENOMEM;
3415        }
3416
3417        /*
3418         * work on commit root. The related disk blocks are static as
3419         * long as COW is applied. This means, it is save to rewrite
3420         * them to repair disk errors without any race conditions
3421         */
3422        path->search_commit_root = 1;
3423        path->skip_locking = 1;
3424
3425        ppath->search_commit_root = 1;
3426        ppath->skip_locking = 1;
3427        /*
3428         * trigger the readahead for extent tree csum tree and wait for
3429         * completion. During readahead, the scrub is officially paused
3430         * to not hold off transaction commits
3431         */
3432        logical = base + offset;
3433        physical_end = physical + nstripes * map->stripe_len;
3434        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3435                get_raid56_logic_offset(physical_end, num,
3436                                        map, &logic_end, NULL);
3437                logic_end += base;
3438        } else {
3439                logic_end = logical + increment * nstripes;
3440        }
3441        wait_event(sctx->list_wait,
3442                   atomic_read(&sctx->bios_in_flight) == 0);
3443        scrub_blocked_if_needed(fs_info);
3444
3445        /* FIXME it might be better to start readahead at commit root */
3446        key.objectid = logical;
3447        key.type = BTRFS_EXTENT_ITEM_KEY;
3448        key.offset = (u64)0;
3449        key_end.objectid = logic_end;
3450        key_end.type = BTRFS_METADATA_ITEM_KEY;
3451        key_end.offset = (u64)-1;
3452        reada1 = btrfs_reada_add(root, &key, &key_end);
3453
3454        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3455        key.type = BTRFS_EXTENT_CSUM_KEY;
3456        key.offset = logical;
3457        key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3458        key_end.type = BTRFS_EXTENT_CSUM_KEY;
3459        key_end.offset = logic_end;
3460        reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3461
3462        if (!IS_ERR(reada1))
3463                btrfs_reada_wait(reada1);
3464        if (!IS_ERR(reada2))
3465                btrfs_reada_wait(reada2);
3466
3467
3468        /*
3469         * collect all data csums for the stripe to avoid seeking during
3470         * the scrub. This might currently (crc32) end up to be about 1MB
3471         */
3472        blk_start_plug(&plug);
3473
3474        /*
3475         * now find all extents for each stripe and scrub them
3476         */
3477        ret = 0;
3478        while (physical < physical_end) {
3479                /*
3480                 * canceled?
3481                 */
3482                if (atomic_read(&fs_info->scrub_cancel_req) ||
3483                    atomic_read(&sctx->cancel_req)) {
3484                        ret = -ECANCELED;
3485                        goto out;
3486                }
3487                /*
3488                 * check to see if we have to pause
3489                 */
3490                if (atomic_read(&fs_info->scrub_pause_req)) {
3491                        /* push queued extents */
3492                        sctx->flush_all_writes = true;
3493                        scrub_submit(sctx);
3494                        mutex_lock(&sctx->wr_lock);
3495                        scrub_wr_submit(sctx);
3496                        mutex_unlock(&sctx->wr_lock);
3497                        wait_event(sctx->list_wait,
3498                                   atomic_read(&sctx->bios_in_flight) == 0);
3499                        sctx->flush_all_writes = false;
3500                        scrub_blocked_if_needed(fs_info);
3501                }
3502
3503                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3504                        ret = get_raid56_logic_offset(physical, num, map,
3505                                                      &logical,
3506                                                      &stripe_logical);
3507                        logical += base;
3508                        if (ret) {
3509                                /* it is parity strip */
3510                                stripe_logical += base;
3511                                stripe_end = stripe_logical + increment;
3512                                ret = scrub_raid56_parity(sctx, map, scrub_dev,
3513                                                          ppath, stripe_logical,
3514                                                          stripe_end);
3515                                if (ret)
3516                                        goto out;
3517                                goto skip;
3518                        }
3519                }
3520
3521                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3522                        key.type = BTRFS_METADATA_ITEM_KEY;
3523                else
3524                        key.type = BTRFS_EXTENT_ITEM_KEY;
3525                key.objectid = logical;
3526                key.offset = (u64)-1;
3527
3528                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3529                if (ret < 0)
3530                        goto out;
3531
3532                if (ret > 0) {
3533                        ret = btrfs_previous_extent_item(root, path, 0);
3534                        if (ret < 0)
3535                                goto out;
3536                        if (ret > 0) {
3537                                /* there's no smaller item, so stick with the
3538                                 * larger one */
3539                                btrfs_release_path(path);
3540                                ret = btrfs_search_slot(NULL, root, &key,
3541                                                        path, 0, 0);
3542                                if (ret < 0)
3543                                        goto out;
3544                        }
3545                }
3546
3547                stop_loop = 0;
3548                while (1) {
3549                        u64 bytes;
3550
3551                        l = path->nodes[0];
3552                        slot = path->slots[0];
3553                        if (slot >= btrfs_header_nritems(l)) {
3554                                ret = btrfs_next_leaf(root, path);
3555                                if (ret == 0)
3556                                        continue;
3557                                if (ret < 0)
3558                                        goto out;
3559
3560                                stop_loop = 1;
3561                                break;
3562                        }
3563                        btrfs_item_key_to_cpu(l, &key, slot);
3564
3565                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3566                            key.type != BTRFS_METADATA_ITEM_KEY)
3567                                goto next;
3568
3569                        if (key.type == BTRFS_METADATA_ITEM_KEY)
3570                                bytes = fs_info->nodesize;
3571                        else
3572                                bytes = key.offset;
3573
3574                        if (key.objectid + bytes <= logical)
3575                                goto next;
3576
3577                        if (key.objectid >= logical + map->stripe_len) {
3578                                /* out of this device extent */
3579                                if (key.objectid >= logic_end)
3580                                        stop_loop = 1;
3581                                break;
3582                        }
3583
3584                        extent = btrfs_item_ptr(l, slot,
3585                                                struct btrfs_extent_item);
3586                        flags = btrfs_extent_flags(l, extent);
3587                        generation = btrfs_extent_generation(l, extent);
3588
3589                        if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3590                            (key.objectid < logical ||
3591                             key.objectid + bytes >
3592                             logical + map->stripe_len)) {
3593                                btrfs_err(fs_info,
3594                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3595                                       key.objectid, logical);
3596                                spin_lock(&sctx->stat_lock);
3597                                sctx->stat.uncorrectable_errors++;
3598                                spin_unlock(&sctx->stat_lock);
3599                                goto next;
3600                        }
3601
3602again:
3603                        extent_logical = key.objectid;
3604                        extent_len = bytes;
3605
3606                        /*
3607                         * trim extent to this stripe
3608                         */
3609                        if (extent_logical < logical) {
3610                                extent_len -= logical - extent_logical;
3611                                extent_logical = logical;
3612                        }
3613                        if (extent_logical + extent_len >
3614                            logical + map->stripe_len) {
3615                                extent_len = logical + map->stripe_len -
3616                                             extent_logical;
3617                        }
3618
3619                        extent_physical = extent_logical - logical + physical;
3620                        extent_dev = scrub_dev;
3621                        extent_mirror_num = mirror_num;
3622                        if (is_dev_replace)
3623                                scrub_remap_extent(fs_info, extent_logical,
3624                                                   extent_len, &extent_physical,
3625                                                   &extent_dev,
3626                                                   &extent_mirror_num);
3627
3628                        ret = btrfs_lookup_csums_range(csum_root,
3629                                                       extent_logical,
3630                                                       extent_logical +
3631                                                       extent_len - 1,
3632                                                       &sctx->csum_list, 1);
3633                        if (ret)
3634                                goto out;
3635
3636                        ret = scrub_extent(sctx, map, extent_logical, extent_len,
3637                                           extent_physical, extent_dev, flags,
3638                                           generation, extent_mirror_num,
3639                                           extent_logical - logical + physical);
3640
3641                        scrub_free_csums(sctx);
3642
3643                        if (ret)
3644                                goto out;
3645
3646                        if (extent_logical + extent_len <
3647                            key.objectid + bytes) {
3648                                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3649                                        /*
3650                                         * loop until we find next data stripe
3651                                         * or we have finished all stripes.
3652                                         */
3653loop:
3654                                        physical += map->stripe_len;
3655                                        ret = get_raid56_logic_offset(physical,
3656                                                        num, map, &logical,
3657                                                        &stripe_logical);
3658                                        logical += base;
3659
3660                                        if (ret && physical < physical_end) {
3661                                                stripe_logical += base;
3662                                                stripe_end = stripe_logical +
3663                                                                increment;
3664                                                ret = scrub_raid56_parity(sctx,
3665                                                        map, scrub_dev, ppath,
3666                                                        stripe_logical,
3667                                                        stripe_end);
3668                                                if (ret)
3669                                                        goto out;
3670                                                goto loop;
3671                                        }
3672                                } else {
3673                                        physical += map->stripe_len;
3674                                        logical += increment;
3675                                }
3676                                if (logical < key.objectid + bytes) {
3677                                        cond_resched();
3678                                        goto again;
3679                                }
3680
3681                                if (physical >= physical_end) {
3682                                        stop_loop = 1;
3683                                        break;
3684                                }
3685                        }
3686next:
3687                        path->slots[0]++;
3688                }
3689                btrfs_release_path(path);
3690skip:
3691                logical += increment;
3692                physical += map->stripe_len;
3693                spin_lock(&sctx->stat_lock);
3694                if (stop_loop)
3695                        sctx->stat.last_physical = map->stripes[num].physical +
3696                                                   length;
3697                else
3698                        sctx->stat.last_physical = physical;
3699                spin_unlock(&sctx->stat_lock);
3700                if (stop_loop)
3701                        break;
3702        }
3703out:
3704        /* push queued extents */
3705        scrub_submit(sctx);
3706        mutex_lock(&sctx->wr_lock);
3707        scrub_wr_submit(sctx);
3708        mutex_unlock(&sctx->wr_lock);
3709
3710        blk_finish_plug(&plug);
3711        btrfs_free_path(path);
3712        btrfs_free_path(ppath);
3713        return ret < 0 ? ret : 0;
3714}
3715
3716static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3717                                          struct btrfs_device *scrub_dev,
3718                                          u64 chunk_offset, u64 length,
3719                                          u64 dev_offset,
3720                                          struct btrfs_block_group_cache *cache,
3721                                          int is_dev_replace)
3722{
3723        struct btrfs_fs_info *fs_info = sctx->fs_info;
3724        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3725        struct map_lookup *map;
3726        struct extent_map *em;
3727        int i;
3728        int ret = 0;
3729
3730        read_lock(&map_tree->map_tree.lock);
3731        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3732        read_unlock(&map_tree->map_tree.lock);
3733
3734        if (!em) {
3735                /*
3736                 * Might have been an unused block group deleted by the cleaner
3737                 * kthread or relocation.
3738                 */
3739                spin_lock(&cache->lock);
3740                if (!cache->removed)
3741                        ret = -EINVAL;
3742                spin_unlock(&cache->lock);
3743
3744                return ret;
3745        }
3746
3747        map = em->map_lookup;
3748        if (em->start != chunk_offset)
3749                goto out;
3750
3751        if (em->len < length)
3752                goto out;
3753
3754        for (i = 0; i < map->num_stripes; ++i) {
3755                if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3756                    map->stripes[i].physical == dev_offset) {
3757                        ret = scrub_stripe(sctx, map, scrub_dev, i,
3758                                           chunk_offset, length,
3759                                           is_dev_replace);
3760                        if (ret)
3761                                goto out;
3762                }
3763        }
3764out:
3765        free_extent_map(em);
3766
3767        return ret;
3768}
3769
3770static noinline_for_stack
3771int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3772                           struct btrfs_device *scrub_dev, u64 start, u64 end,
3773                           int is_dev_replace)
3774{
3775        struct btrfs_dev_extent *dev_extent = NULL;
3776        struct btrfs_path *path;
3777        struct btrfs_fs_info *fs_info = sctx->fs_info;
3778        struct btrfs_root *root = fs_info->dev_root;
3779        u64 length;
3780        u64 chunk_offset;
3781        int ret = 0;
3782        int ro_set;
3783        int slot;
3784        struct extent_buffer *l;
3785        struct btrfs_key key;
3786        struct btrfs_key found_key;
3787        struct btrfs_block_group_cache *cache;
3788        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3789
3790        path = btrfs_alloc_path();
3791        if (!path)
3792                return -ENOMEM;
3793
3794        path->reada = READA_FORWARD;
3795        path->search_commit_root = 1;
3796        path->skip_locking = 1;
3797
3798        key.objectid = scrub_dev->devid;
3799        key.offset = 0ull;
3800        key.type = BTRFS_DEV_EXTENT_KEY;
3801
3802        while (1) {
3803                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3804                if (ret < 0)
3805                        break;
3806                if (ret > 0) {
3807                        if (path->slots[0] >=
3808                            btrfs_header_nritems(path->nodes[0])) {
3809                                ret = btrfs_next_leaf(root, path);
3810                                if (ret < 0)
3811                                        break;
3812                                if (ret > 0) {
3813                                        ret = 0;
3814                                        break;
3815                                }
3816                        } else {
3817                                ret = 0;
3818                        }
3819                }
3820
3821                l = path->nodes[0];
3822                slot = path->slots[0];
3823
3824                btrfs_item_key_to_cpu(l, &found_key, slot);
3825
3826                if (found_key.objectid != scrub_dev->devid)
3827                        break;
3828
3829                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3830                        break;
3831
3832                if (found_key.offset >= end)
3833                        break;
3834
3835                if (found_key.offset < key.offset)
3836                        break;
3837
3838                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3839                length = btrfs_dev_extent_length(l, dev_extent);
3840
3841                if (found_key.offset + length <= start)
3842                        goto skip;
3843
3844                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3845
3846                /*
3847                 * get a reference on the corresponding block group to prevent
3848                 * the chunk from going away while we scrub it
3849                 */
3850                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3851
3852                /* some chunks are removed but not committed to disk yet,
3853                 * continue scrubbing */
3854                if (!cache)
3855                        goto skip;
3856
3857                /*
3858                 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3859                 * to avoid deadlock caused by:
3860                 * btrfs_inc_block_group_ro()
3861                 * -> btrfs_wait_for_commit()
3862                 * -> btrfs_commit_transaction()
3863                 * -> btrfs_scrub_pause()
3864                 */
3865                scrub_pause_on(fs_info);
3866                ret = btrfs_inc_block_group_ro(fs_info, cache);
3867                if (!ret && is_dev_replace) {
3868                        /*
3869                         * If we are doing a device replace wait for any tasks
3870                         * that started dellaloc right before we set the block
3871                         * group to RO mode, as they might have just allocated
3872                         * an extent from it or decided they could do a nocow
3873                         * write. And if any such tasks did that, wait for their
3874                         * ordered extents to complete and then commit the
3875                         * current transaction, so that we can later see the new
3876                         * extent items in the extent tree - the ordered extents
3877                         * create delayed data references (for cow writes) when
3878                         * they complete, which will be run and insert the
3879                         * corresponding extent items into the extent tree when
3880                         * we commit the transaction they used when running
3881                         * inode.c:btrfs_finish_ordered_io(). We later use
3882                         * the commit root of the extent tree to find extents
3883                         * to copy from the srcdev into the tgtdev, and we don't
3884                         * want to miss any new extents.
3885                         */
3886                        btrfs_wait_block_group_reservations(cache);
3887                        btrfs_wait_nocow_writers(cache);
3888                        ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3889                                                       cache->key.objectid,
3890                                                       cache->key.offset);
3891                        if (ret > 0) {
3892                                struct btrfs_trans_handle *trans;
3893
3894                                trans = btrfs_join_transaction(root);
3895                                if (IS_ERR(trans))
3896                                        ret = PTR_ERR(trans);
3897                                else
3898                                        ret = btrfs_commit_transaction(trans);
3899                                if (ret) {
3900                                        scrub_pause_off(fs_info);
3901                                        btrfs_put_block_group(cache);
3902                                        break;
3903                                }
3904                        }
3905                }
3906                scrub_pause_off(fs_info);
3907
3908                if (ret == 0) {
3909                        ro_set = 1;
3910                } else if (ret == -ENOSPC) {
3911                        /*
3912                         * btrfs_inc_block_group_ro return -ENOSPC when it
3913                         * failed in creating new chunk for metadata.
3914                         * It is not a problem for scrub/replace, because
3915                         * metadata are always cowed, and our scrub paused
3916                         * commit_transactions.
3917                         */
3918                        ro_set = 0;
3919                } else {
3920                        btrfs_warn(fs_info,
3921                                   "failed setting block group ro: %d", ret);
3922                        btrfs_put_block_group(cache);
3923                        break;
3924                }
3925
3926                btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3927                dev_replace->cursor_right = found_key.offset + length;
3928                dev_replace->cursor_left = found_key.offset;
3929                dev_replace->item_needs_writeback = 1;
3930                btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3931                ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3932                                  found_key.offset, cache, is_dev_replace);
3933
3934                /*
3935                 * flush, submit all pending read and write bios, afterwards
3936                 * wait for them.
3937                 * Note that in the dev replace case, a read request causes
3938                 * write requests that are submitted in the read completion
3939                 * worker. Therefore in the current situation, it is required
3940                 * that all write requests are flushed, so that all read and
3941                 * write requests are really completed when bios_in_flight
3942                 * changes to 0.
3943                 */
3944                sctx->flush_all_writes = true;
3945                scrub_submit(sctx);
3946                mutex_lock(&sctx->wr_lock);
3947                scrub_wr_submit(sctx);
3948                mutex_unlock(&sctx->wr_lock);
3949
3950                wait_event(sctx->list_wait,
3951                           atomic_read(&sctx->bios_in_flight) == 0);
3952
3953                scrub_pause_on(fs_info);
3954
3955                /*
3956                 * must be called before we decrease @scrub_paused.
3957                 * make sure we don't block transaction commit while
3958                 * we are waiting pending workers finished.
3959                 */
3960                wait_event(sctx->list_wait,
3961                           atomic_read(&sctx->workers_pending) == 0);
3962                sctx->flush_all_writes = false;
3963
3964                scrub_pause_off(fs_info);
3965
3966                btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3967                dev_replace->cursor_left = dev_replace->cursor_right;
3968                dev_replace->item_needs_writeback = 1;
3969                btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3970
3971                if (ro_set)
3972                        btrfs_dec_block_group_ro(cache);
3973
3974                /*
3975                 * We might have prevented the cleaner kthread from deleting
3976                 * this block group if it was already unused because we raced
3977                 * and set it to RO mode first. So add it back to the unused
3978                 * list, otherwise it might not ever be deleted unless a manual
3979                 * balance is triggered or it becomes used and unused again.
3980                 */
3981                spin_lock(&cache->lock);
3982                if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3983                    btrfs_block_group_used(&cache->item) == 0) {
3984                        spin_unlock(&cache->lock);
3985                        spin_lock(&fs_info->unused_bgs_lock);
3986                        if (list_empty(&cache->bg_list)) {
3987                                btrfs_get_block_group(cache);
3988                                trace_btrfs_add_unused_block_group(cache);
3989                                list_add_tail(&cache->bg_list,
3990                                              &fs_info->unused_bgs);
3991                        }
3992                        spin_unlock(&fs_info->unused_bgs_lock);
3993                } else {
3994                        spin_unlock(&cache->lock);
3995                }
3996
3997                btrfs_put_block_group(cache);
3998                if (ret)
3999                        break;
4000                if (is_dev_replace &&
4001                    atomic64_read(&dev_replace->num_write_errors) > 0) {
4002                        ret = -EIO;
4003                        break;
4004                }
4005                if (sctx->stat.malloc_errors > 0) {
4006                        ret = -ENOMEM;
4007                        break;
4008                }
4009skip:
4010                key.offset = found_key.offset + length;
4011                btrfs_release_path(path);
4012        }
4013
4014        btrfs_free_path(path);
4015
4016        return ret;
4017}
4018
4019static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4020                                           struct btrfs_device *scrub_dev)
4021{
4022        int     i;
4023        u64     bytenr;
4024        u64     gen;
4025        int     ret;
4026        struct btrfs_fs_info *fs_info = sctx->fs_info;
4027
4028        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4029                return -EIO;
4030
4031        /* Seed devices of a new filesystem has their own generation. */
4032        if (scrub_dev->fs_devices != fs_info->fs_devices)
4033                gen = scrub_dev->generation;
4034        else
4035                gen = fs_info->last_trans_committed;
4036
4037        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4038                bytenr = btrfs_sb_offset(i);
4039                if (bytenr + BTRFS_SUPER_INFO_SIZE >
4040                    scrub_dev->commit_total_bytes)
4041                        break;
4042
4043                ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4044                                  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4045                                  NULL, 1, bytenr);
4046                if (ret)
4047                        return ret;
4048        }
4049        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4050
4051        return 0;
4052}
4053
4054/*
4055 * get a reference count on fs_info->scrub_workers. start worker if necessary
4056 */
4057static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4058                                                int is_dev_replace)
4059{
4060        unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4061        int max_active = fs_info->thread_pool_size;
4062
4063        if (fs_info->scrub_workers_refcnt == 0) {
4064                fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4065                                flags, is_dev_replace ? 1 : max_active, 4);
4066                if (!fs_info->scrub_workers)
4067                        goto fail_scrub_workers;
4068
4069                fs_info->scrub_wr_completion_workers =
4070                        btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4071                                              max_active, 2);
4072                if (!fs_info->scrub_wr_completion_workers)
4073                        goto fail_scrub_wr_completion_workers;
4074
4075                fs_info->scrub_nocow_workers =
4076                        btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
4077                if (!fs_info->scrub_nocow_workers)
4078                        goto fail_scrub_nocow_workers;
4079                fs_info->scrub_parity_workers =
4080                        btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4081                                              max_active, 2);
4082                if (!fs_info->scrub_parity_workers)
4083                        goto fail_scrub_parity_workers;
4084        }
4085        ++fs_info->scrub_workers_refcnt;
4086        return 0;
4087
4088fail_scrub_parity_workers:
4089        btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4090fail_scrub_nocow_workers:
4091        btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4092fail_scrub_wr_completion_workers:
4093        btrfs_destroy_workqueue(fs_info->scrub_workers);
4094fail_scrub_workers:
4095        return -ENOMEM;
4096}
4097
4098static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
4099{
4100        if (--fs_info->scrub_workers_refcnt == 0) {
4101                btrfs_destroy_workqueue(fs_info->scrub_workers);
4102                btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4103                btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4104                btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
4105        }
4106        WARN_ON(fs_info->scrub_workers_refcnt < 0);
4107}
4108
4109int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4110                    u64 end, struct btrfs_scrub_progress *progress,
4111                    int readonly, int is_dev_replace)
4112{
4113        struct scrub_ctx *sctx;
4114        int ret;
4115        struct btrfs_device *dev;
4116        struct rcu_string *name;
4117
4118        if (btrfs_fs_closing(fs_info))
4119                return -EINVAL;
4120
4121        if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4122                /*
4123                 * in this case scrub is unable to calculate the checksum
4124                 * the way scrub is implemented. Do not handle this
4125                 * situation at all because it won't ever happen.
4126                 */
4127                btrfs_err(fs_info,
4128                           "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4129                       fs_info->nodesize,
4130                       BTRFS_STRIPE_LEN);
4131                return -EINVAL;
4132        }
4133
4134        if (fs_info->sectorsize != PAGE_SIZE) {
4135                /* not supported for data w/o checksums */
4136                btrfs_err_rl(fs_info,
4137                           "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
4138                       fs_info->sectorsize, PAGE_SIZE);
4139                return -EINVAL;
4140        }
4141
4142        if (fs_info->nodesize >
4143            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4144            fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4145                /*
4146                 * would exhaust the array bounds of pagev member in
4147                 * struct scrub_block
4148                 */
4149                btrfs_err(fs_info,
4150                          "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4151                       fs_info->nodesize,
4152                       SCRUB_MAX_PAGES_PER_BLOCK,
4153                       fs_info->sectorsize,
4154                       SCRUB_MAX_PAGES_PER_BLOCK);
4155                return -EINVAL;
4156        }
4157
4158
4159        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4160        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4161        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4162                     !is_dev_replace)) {
4163                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4164                return -ENODEV;
4165        }
4166
4167        if (!is_dev_replace && !readonly &&
4168            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4169                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4170                rcu_read_lock();
4171                name = rcu_dereference(dev->name);
4172                btrfs_err(fs_info, "scrub: device %s is not writable",
4173                          name->str);
4174                rcu_read_unlock();
4175                return -EROFS;
4176        }
4177
4178        mutex_lock(&fs_info->scrub_lock);
4179        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4180            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4181                mutex_unlock(&fs_info->scrub_lock);
4182                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4183                return -EIO;
4184        }
4185
4186        btrfs_dev_replace_read_lock(&fs_info->dev_replace);
4187        if (dev->scrub_ctx ||
4188            (!is_dev_replace &&
4189             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4190                btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4191                mutex_unlock(&fs_info->scrub_lock);
4192                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4193                return -EINPROGRESS;
4194        }
4195        btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4196
4197        ret = scrub_workers_get(fs_info, is_dev_replace);
4198        if (ret) {
4199                mutex_unlock(&fs_info->scrub_lock);
4200                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4201                return ret;
4202        }
4203
4204        sctx = scrub_setup_ctx(dev, is_dev_replace);
4205        if (IS_ERR(sctx)) {
4206                mutex_unlock(&fs_info->scrub_lock);
4207                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4208                scrub_workers_put(fs_info);
4209                return PTR_ERR(sctx);
4210        }
4211        sctx->readonly = readonly;
4212        dev->scrub_ctx = sctx;
4213        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4214
4215        /*
4216         * checking @scrub_pause_req here, we can avoid
4217         * race between committing transaction and scrubbing.
4218         */
4219        __scrub_blocked_if_needed(fs_info);
4220        atomic_inc(&fs_info->scrubs_running);
4221        mutex_unlock(&fs_info->scrub_lock);
4222
4223        if (!is_dev_replace) {
4224                /*
4225                 * by holding device list mutex, we can
4226                 * kick off writing super in log tree sync.
4227                 */
4228                mutex_lock(&fs_info->fs_devices->device_list_mutex);
4229                ret = scrub_supers(sctx, dev);
4230                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4231        }
4232
4233        if (!ret)
4234                ret = scrub_enumerate_chunks(sctx, dev, start, end,
4235                                             is_dev_replace);
4236
4237        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4238        atomic_dec(&fs_info->scrubs_running);
4239        wake_up(&fs_info->scrub_pause_wait);
4240
4241        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4242
4243        if (progress)
4244                memcpy(progress, &sctx->stat, sizeof(*progress));
4245
4246        mutex_lock(&fs_info->scrub_lock);
4247        dev->scrub_ctx = NULL;
4248        scrub_workers_put(fs_info);
4249        mutex_unlock(&fs_info->scrub_lock);
4250
4251        scrub_put_ctx(sctx);
4252
4253        return ret;
4254}
4255
4256void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4257{
4258        mutex_lock(&fs_info->scrub_lock);
4259        atomic_inc(&fs_info->scrub_pause_req);
4260        while (atomic_read(&fs_info->scrubs_paused) !=
4261               atomic_read(&fs_info->scrubs_running)) {
4262                mutex_unlock(&fs_info->scrub_lock);
4263                wait_event(fs_info->scrub_pause_wait,
4264                           atomic_read(&fs_info->scrubs_paused) ==
4265                           atomic_read(&fs_info->scrubs_running));
4266                mutex_lock(&fs_info->scrub_lock);
4267        }
4268        mutex_unlock(&fs_info->scrub_lock);
4269}
4270
4271void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4272{
4273        atomic_dec(&fs_info->scrub_pause_req);
4274        wake_up(&fs_info->scrub_pause_wait);
4275}
4276
4277int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4278{
4279        mutex_lock(&fs_info->scrub_lock);
4280        if (!atomic_read(&fs_info->scrubs_running)) {
4281                mutex_unlock(&fs_info->scrub_lock);
4282                return -ENOTCONN;
4283        }
4284
4285        atomic_inc(&fs_info->scrub_cancel_req);
4286        while (atomic_read(&fs_info->scrubs_running)) {
4287                mutex_unlock(&fs_info->scrub_lock);
4288                wait_event(fs_info->scrub_pause_wait,
4289                           atomic_read(&fs_info->scrubs_running) == 0);
4290                mutex_lock(&fs_info->scrub_lock);
4291        }
4292        atomic_dec(&fs_info->scrub_cancel_req);
4293        mutex_unlock(&fs_info->scrub_lock);
4294
4295        return 0;
4296}
4297
4298int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4299                           struct btrfs_device *dev)
4300{
4301        struct scrub_ctx *sctx;
4302
4303        mutex_lock(&fs_info->scrub_lock);
4304        sctx = dev->scrub_ctx;
4305        if (!sctx) {
4306                mutex_unlock(&fs_info->scrub_lock);
4307                return -ENOTCONN;
4308        }
4309        atomic_inc(&sctx->cancel_req);
4310        while (dev->scrub_ctx) {
4311                mutex_unlock(&fs_info->scrub_lock);
4312                wait_event(fs_info->scrub_pause_wait,
4313                           dev->scrub_ctx == NULL);
4314                mutex_lock(&fs_info->scrub_lock);
4315        }
4316        mutex_unlock(&fs_info->scrub_lock);
4317
4318        return 0;
4319}
4320
4321int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4322                         struct btrfs_scrub_progress *progress)
4323{
4324        struct btrfs_device *dev;
4325        struct scrub_ctx *sctx = NULL;
4326
4327        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4328        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4329        if (dev)
4330                sctx = dev->scrub_ctx;
4331        if (sctx)
4332                memcpy(progress, &sctx->stat, sizeof(*progress));
4333        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4334
4335        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4336}
4337
4338static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4339                               u64 extent_logical, u64 extent_len,
4340                               u64 *extent_physical,
4341                               struct btrfs_device **extent_dev,
4342                               int *extent_mirror_num)
4343{
4344        u64 mapped_length;
4345        struct btrfs_bio *bbio = NULL;
4346        int ret;
4347
4348        mapped_length = extent_len;
4349        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4350                              &mapped_length, &bbio, 0);
4351        if (ret || !bbio || mapped_length < extent_len ||
4352            !bbio->stripes[0].dev->bdev) {
4353                btrfs_put_bbio(bbio);
4354                return;
4355        }
4356
4357        *extent_physical = bbio->stripes[0].physical;
4358        *extent_mirror_num = bbio->mirror_num;
4359        *extent_dev = bbio->stripes[0].dev;
4360        btrfs_put_bbio(bbio);
4361}
4362
4363static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4364                            int mirror_num, u64 physical_for_dev_replace)
4365{
4366        struct scrub_copy_nocow_ctx *nocow_ctx;
4367        struct btrfs_fs_info *fs_info = sctx->fs_info;
4368
4369        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4370        if (!nocow_ctx) {
4371                spin_lock(&sctx->stat_lock);
4372                sctx->stat.malloc_errors++;
4373                spin_unlock(&sctx->stat_lock);
4374                return -ENOMEM;
4375        }
4376
4377        scrub_pending_trans_workers_inc(sctx);
4378
4379        nocow_ctx->sctx = sctx;
4380        nocow_ctx->logical = logical;
4381        nocow_ctx->len = len;
4382        nocow_ctx->mirror_num = mirror_num;
4383        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4384        btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4385                        copy_nocow_pages_worker, NULL, NULL);
4386        INIT_LIST_HEAD(&nocow_ctx->inodes);
4387        btrfs_queue_work(fs_info->scrub_nocow_workers,
4388                         &nocow_ctx->work);
4389
4390        return 0;
4391}
4392
4393static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4394{
4395        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4396        struct scrub_nocow_inode *nocow_inode;
4397
4398        nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4399        if (!nocow_inode)
4400                return -ENOMEM;
4401        nocow_inode->inum = inum;
4402        nocow_inode->offset = offset;
4403        nocow_inode->root = root;
4404        list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4405        return 0;
4406}
4407
4408#define COPY_COMPLETE 1
4409
4410static void copy_nocow_pages_worker(struct btrfs_work *work)
4411{
4412        struct scrub_copy_nocow_ctx *nocow_ctx =
4413                container_of(work, struct scrub_copy_nocow_ctx, work);
4414        struct scrub_ctx *sctx = nocow_ctx->sctx;
4415        struct btrfs_fs_info *fs_info = sctx->fs_info;
4416        struct btrfs_root *root = fs_info->extent_root;
4417        u64 logical = nocow_ctx->logical;
4418        u64 len = nocow_ctx->len;
4419        int mirror_num = nocow_ctx->mirror_num;
4420        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4421        int ret;
4422        struct btrfs_trans_handle *trans = NULL;
4423        struct btrfs_path *path;
4424        int not_written = 0;
4425
4426        path = btrfs_alloc_path();
4427        if (!path) {
4428                spin_lock(&sctx->stat_lock);
4429                sctx->stat.malloc_errors++;
4430                spin_unlock(&sctx->stat_lock);
4431                not_written = 1;
4432                goto out;
4433        }
4434
4435        trans = btrfs_join_transaction(root);
4436        if (IS_ERR(trans)) {
4437                not_written = 1;
4438                goto out;
4439        }
4440
4441        ret = iterate_inodes_from_logical(logical, fs_info, path,
4442                        record_inode_for_nocow, nocow_ctx, false);
4443        if (ret != 0 && ret != -ENOENT) {
4444                btrfs_warn(fs_info,
4445                           "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4446                           logical, physical_for_dev_replace, len, mirror_num,
4447                           ret);
4448                not_written = 1;
4449                goto out;
4450        }
4451
4452        btrfs_end_transaction(trans);
4453        trans = NULL;
4454        while (!list_empty(&nocow_ctx->inodes)) {
4455                struct scrub_nocow_inode *entry;
4456                entry = list_first_entry(&nocow_ctx->inodes,
4457                                         struct scrub_nocow_inode,
4458                                         list);
4459                list_del_init(&entry->list);
4460                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4461                                                 entry->root, nocow_ctx);
4462                kfree(entry);
4463                if (ret == COPY_COMPLETE) {
4464                        ret = 0;
4465                        break;
4466                } else if (ret) {
4467                        break;
4468                }
4469        }
4470out:
4471        while (!list_empty(&nocow_ctx->inodes)) {
4472                struct scrub_nocow_inode *entry;
4473                entry = list_first_entry(&nocow_ctx->inodes,
4474                                         struct scrub_nocow_inode,
4475                                         list);
4476                list_del_init(&entry->list);
4477                kfree(entry);
4478        }
4479        if (trans && !IS_ERR(trans))
4480                btrfs_end_transaction(trans);
4481        if (not_written)
4482                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4483                                            num_uncorrectable_read_errors);
4484
4485        btrfs_free_path(path);
4486        kfree(nocow_ctx);
4487
4488        scrub_pending_trans_workers_dec(sctx);
4489}
4490
4491static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
4492                                 u64 logical)
4493{
4494        struct extent_state *cached_state = NULL;
4495        struct btrfs_ordered_extent *ordered;
4496        struct extent_io_tree *io_tree;
4497        struct extent_map *em;
4498        u64 lockstart = start, lockend = start + len - 1;
4499        int ret = 0;
4500
4501        io_tree = &inode->io_tree;
4502
4503        lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4504        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4505        if (ordered) {
4506                btrfs_put_ordered_extent(ordered);
4507                ret = 1;
4508                goto out_unlock;
4509        }
4510
4511        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4512        if (IS_ERR(em)) {
4513                ret = PTR_ERR(em);
4514                goto out_unlock;
4515        }
4516
4517        /*
4518         * This extent does not actually cover the logical extent anymore,
4519         * move on to the next inode.
4520         */
4521        if (em->block_start > logical ||
4522            em->block_start + em->block_len < logical + len ||
4523            test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4524                free_extent_map(em);
4525                ret = 1;
4526                goto out_unlock;
4527        }
4528        free_extent_map(em);
4529
4530out_unlock:
4531        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
4532        return ret;
4533}
4534
4535static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4536                                      struct scrub_copy_nocow_ctx *nocow_ctx)
4537{
4538        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4539        struct btrfs_key key;
4540        struct inode *inode;
4541        struct page *page;
4542        struct btrfs_root *local_root;
4543        struct extent_io_tree *io_tree;
4544        u64 physical_for_dev_replace;
4545        u64 nocow_ctx_logical;
4546        u64 len = nocow_ctx->len;
4547        unsigned long index;
4548        int srcu_index;
4549        int ret = 0;
4550        int err = 0;
4551
4552        key.objectid = root;
4553        key.type = BTRFS_ROOT_ITEM_KEY;
4554        key.offset = (u64)-1;
4555
4556        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4557
4558        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4559        if (IS_ERR(local_root)) {
4560                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4561                return PTR_ERR(local_root);
4562        }
4563
4564        key.type = BTRFS_INODE_ITEM_KEY;
4565        key.objectid = inum;
4566        key.offset = 0;
4567        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4568        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4569        if (IS_ERR(inode))
4570                return PTR_ERR(inode);
4571
4572        /* Avoid truncate/dio/punch hole.. */
4573        inode_lock(inode);
4574        inode_dio_wait(inode);
4575
4576        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4577        io_tree = &BTRFS_I(inode)->io_tree;
4578        nocow_ctx_logical = nocow_ctx->logical;
4579
4580        ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4581                        nocow_ctx_logical);
4582        if (ret) {
4583                ret = ret > 0 ? 0 : ret;
4584                goto out;
4585        }
4586
4587        while (len >= PAGE_SIZE) {
4588                index = offset >> PAGE_SHIFT;
4589again:
4590                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4591                if (!page) {
4592                        btrfs_err(fs_info, "find_or_create_page() failed");
4593                        ret = -ENOMEM;
4594                        goto out;
4595                }
4596
4597                if (PageUptodate(page)) {
4598                        if (PageDirty(page))
4599                                goto next_page;
4600                } else {
4601                        ClearPageError(page);
4602                        err = extent_read_full_page(io_tree, page,
4603                                                           btrfs_get_extent,
4604                                                           nocow_ctx->mirror_num);
4605                        if (err) {
4606                                ret = err;
4607                                goto next_page;
4608                        }
4609
4610                        lock_page(page);
4611                        /*
4612                         * If the page has been remove from the page cache,
4613                         * the data on it is meaningless, because it may be
4614                         * old one, the new data may be written into the new
4615                         * page in the page cache.
4616                         */
4617                        if (page->mapping != inode->i_mapping) {
4618                                unlock_page(page);
4619                                put_page(page);
4620                                goto again;
4621                        }
4622                        if (!PageUptodate(page)) {
4623                                ret = -EIO;
4624                                goto next_page;
4625                        }
4626                }
4627
4628                ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4629                                            nocow_ctx_logical);
4630                if (ret) {
4631                        ret = ret > 0 ? 0 : ret;
4632                        goto next_page;
4633                }
4634
4635                err = write_page_nocow(nocow_ctx->sctx,
4636                                       physical_for_dev_replace, page);
4637                if (err)
4638                        ret = err;
4639next_page:
4640                unlock_page(page);
4641                put_page(page);
4642
4643                if (ret)
4644                        break;
4645
4646                offset += PAGE_SIZE;
4647                physical_for_dev_replace += PAGE_SIZE;
4648                nocow_ctx_logical += PAGE_SIZE;
4649                len -= PAGE_SIZE;
4650        }
4651        ret = COPY_COMPLETE;
4652out:
4653        inode_unlock(inode);
4654        iput(inode);
4655        return ret;
4656}
4657
4658static int write_page_nocow(struct scrub_ctx *sctx,
4659                            u64 physical_for_dev_replace, struct page *page)
4660{
4661        struct bio *bio;
4662        struct btrfs_device *dev;
4663
4664        dev = sctx->wr_tgtdev;
4665        if (!dev)
4666                return -EIO;
4667        if (!dev->bdev) {
4668                btrfs_warn_rl(dev->fs_info,
4669                        "scrub write_page_nocow(bdev == NULL) is unexpected");
4670                return -EIO;
4671        }
4672        bio = btrfs_io_bio_alloc(1);
4673        bio->bi_iter.bi_size = 0;
4674        bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4675        bio_set_dev(bio, dev->bdev);
4676        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4677        /* bio_add_page won't fail on a freshly allocated bio */
4678        bio_add_page(bio, page, PAGE_SIZE, 0);
4679
4680        if (btrfsic_submit_bio_wait(bio)) {
4681                bio_put(bio);
4682                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4683                return -EIO;
4684        }
4685
4686        bio_put(bio);
4687        return 0;
4688}
4689