LXR linux/fs/btrfs/scrub.c

   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   4 */
   5
   6#include <linux/blkdev.h>
   7#include <linux/ratelimit.h>
   8#include <linux/sched/mm.h>
   9#include "ctree.h"
  10#include "volumes.h"
  11#include "disk-io.h"
  12#include "ordered-data.h"
  13#include "transaction.h"
  14#include "backref.h"
  15#include "extent_io.h"
  16#include "dev-replace.h"
  17#include "check-integrity.h"
  18#include "rcu-string.h"
  19#include "raid56.h"
  20
/*
 * This is only the first step towards a full-featured scrub. It reads all
 * extents and super blocks and verifies the checksums. In case a bad checksum
 * is found or the extent cannot be read, good data will be written back if
 * any can be found.
 *
 * Future enhancements:
 *  - In case an unrepairable extent is encountered, track which files are
 *    affected and report them
 *  - track and record media errors, throw out bad devices
 *  - add a mode to also read unallocated space
 */
  33
/*
 * Forward declarations: the structures below point at each other
 * (e.g. scrub_page -> scrub_block, scrub_bio -> scrub_ctx).
 */
struct scrub_block;
struct scrub_ctx;
  36
/*
 * The following three values only influence performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 *
 * NOTE(review): the per-line size notes appear to assume 4KiB pages
 * (32 pages = 128k, 64 bios * 128k = 8MB).
 */
#define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
#define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  46
/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 *
 * It also sizes the pagev[] array embedded in struct scrub_block.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  53
/*
 * Reference-counted holder of a btrfs_bio mapping, attached to scrub pages
 * through scrub_page::recover. scrub_is_page_on_raid56() inspects
 * bbio->map_type through it.
 *
 * NOTE(review): presumably filled in by the recheck setup code, which is not
 * visible in this part of the file - confirm there.
 */
struct scrub_recover {
        refcount_t              refs;
        struct btrfs_bio        *bbio;
        u64                     map_length;
};
  59
/*
 * State kept for a single page that is being scrubbed.
 *
 * NOTE(review): most field descriptions are inferred from the field names
 * and from the few users visible in this part of the file; confirm against
 * the code further down.
 */
struct scrub_page {
        struct scrub_block      *sblock;        /* block this page is part of */
        struct page             *page;
        struct btrfs_device     *dev;
        struct list_head        list;
        u64                     flags;  /* extent flags */
        u64                     generation;
        u64                     logical;
        u64                     physical;
        u64                     physical_for_dev_replace;
        atomic_t                refs;
        struct {
                unsigned int    mirror_num:8;
                unsigned int    have_csum:1;
                unsigned int    io_error:1;
        };
        u8                      csum[BTRFS_CSUM_SIZE];

        /* May be NULL; checked by scrub_is_page_on_raid56() before use */
        struct scrub_recover    *recover;
};
  80
/*
 * A scrub bio together with the scrub pages attached to it.
 *
 * scrub_setup_ctx() allocates SCRUB_BIOS_PER_SCTX of these per context and
 * chains them into a free list through @next_free.
 */
struct scrub_bio {
        int                     index;          /* position in scrub_ctx::bios[] */
        struct scrub_ctx        *sctx;          /* owning scrub context */
        struct btrfs_device     *dev;
        struct bio              *bio;
        blk_status_t            status;
        u64                     logical;
        u64                     physical;
        /* pagev[] is sized for the larger of the read and write bio limits */
#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
        struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
#else
        struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
#endif
        int                     page_count;     /* used entries in pagev[] */
        int                     next_free;      /* next free bio index, -1 ends the list */
        struct btrfs_work       work;
};
  98
/*
 * A block (node/leaf/sector) under scrub, made up of at most
 * SCRUB_MAX_PAGES_PER_BLOCK scrub pages, plus the error state collected
 * for it.
 */
struct scrub_block {
        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
        int                     page_count;     /* used entries in pagev[] */
        atomic_t                outstanding_pages;
        refcount_t              refs; /* free mem on transition to zero */
        struct scrub_ctx        *sctx;
        struct scrub_parity     *sparity;
        struct {
                unsigned int    header_error:1;
                unsigned int    checksum_error:1;
                unsigned int    no_io_error_seen:1;
                unsigned int    generation_error:1; /* also sets header_error */

                /* The following is for the data used to check parity */
                /* It is for the data with checksum */
                unsigned int    data_corrected:1;
        };
        struct btrfs_work       work;
};
 118
 119/* Used for the chunks with parity stripe such RAID5/6 */
 120struct scrub_parity {
 121        struct scrub_ctx        *sctx;
 122
 123        struct btrfs_device     *scrub_dev;
 124
 125        u64                     logic_start;
 126
 127        u64                     logic_end;
 128
 129        int                     nsectors;
 130
 131        u64                     stripe_len;
 132
 133        refcount_t              refs;
 134
 135        struct list_head        spages;
 136
 137        /* Work of parity check and repair */
 138        struct btrfs_work       work;
 139
 140        /* Mark the parity blocks which have data */
 141        unsigned long           *dbitmap;
 142
 143        /*
 144         * Mark the parity blocks which have data, but errors happen when
 145         * read data or check data
 146         */
 147        unsigned long           *ebitmap;
 148
 149        unsigned long           bitmap[0];
 150};
 151
/*
 * Scrub context: the pool of read bios, the dev-replace write state, the
 * progress statistics and the counters workers use to signal completion.
 * Created by scrub_setup_ctx() and released through scrub_put_ctx().
 */
struct scrub_ctx {
        struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
        struct btrfs_fs_info    *fs_info;
        int                     first_free;     /* head of the free bio list */
        int                     curr;           /* bio being filled, -1 if none */
        atomic_t                bios_in_flight;
        atomic_t                workers_pending;
        spinlock_t              list_lock;
        wait_queue_head_t       list_wait;
        u16                     csum_size;
        struct list_head        csum_list;      /* struct btrfs_ordered_sum entries */
        atomic_t                cancel_req;
        int                     readonly;
        int                     pages_per_rd_bio;

        int                     is_dev_replace;

        struct scrub_bio        *wr_curr_bio;
        struct mutex            wr_lock;
        int                     pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
        struct btrfs_device     *wr_tgtdev;
        bool                    flush_all_writes;

        /*
         * statistics
         */
        struct btrfs_scrub_progress stat;
        spinlock_t              stat_lock;

        /*
         * Use a ref counter to avoid use-after-free issues. Scrub workers
         * decrement bios_in_flight and workers_pending and then do a wakeup
         * on the list_wait wait queue. We must ensure the main scrub task
         * doesn't free the scrub context before or while the workers are
         * doing the wakeup() call.
         */
        refcount_t              refs;
};
 190
/*
 * Work item carrying the scrub context, device, logical address, root and
 * mirror number for a deferred fixup.
 *
 * NOTE(review): going by the name this is for extents without data
 * checksums; the users are not visible in this part of the file.
 */
struct scrub_fixup_nodatasum {
        struct scrub_ctx        *sctx;
        struct btrfs_device     *dev;
        u64                     logical;
        struct btrfs_root       *root;
        struct btrfs_work       work;
        int                     mirror_num;
};
 199
/*
 * List entry identifying an inode by (inum, offset, root).
 *
 * NOTE(review): presumably linked into scrub_copy_nocow_ctx::inodes -
 * confirm against the users further down.
 */
struct scrub_nocow_inode {
        u64                     inum;
        u64                     offset;
        u64                     root;
        struct list_head        list;
};
 206
/*
 * Work item describing a logical range (@logical, @len) to copy to
 * @physical_for_dev_replace, with a list of inodes attached.
 *
 * NOTE(review): presumably processed by copy_nocow_pages_worker() - confirm
 * against the definition further down.
 */
struct scrub_copy_nocow_ctx {
        struct scrub_ctx        *sctx;
        u64                     logical;
        u64                     len;
        int                     mirror_num;
        u64                     physical_for_dev_replace;
        struct list_head        inodes;
        struct btrfs_work       work;
};
 216
/*
 * Context handed to scrub_print_warning_inode() through its void *warn_ctx
 * argument: what went wrong (@errstr), where (@logical, @physical, @dev) and
 * a scratch path for tree searches.
 */
struct scrub_warning {
        struct btrfs_path       *path;
        u64                     extent_item_size;
        const char              *errstr;
        u64                     physical;
        u64                     logical;
        struct btrfs_device     *dev;
};
 225
/*
 * One entry of the full stripe locks rb-tree, keyed by @logical.
 *
 * @refs counts the lock_full_stripe() callers that currently reference the
 * entry; unlock_full_stripe() erases and frees it when the count reaches
 * zero. @mutex is the per-stripe lock itself.
 */
struct full_stripe_lock {
        struct rb_node node;
        u64 logical;
        u64 refs;
        struct mutex mutex;
};
 232
/*
 * Forward declarations of static helpers. Several are defined further down
 * in this part of the file; the rest are presumably defined later in the
 * file.
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
                                     struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
                                struct scrub_block *sblock,
                                int retry_failed_mirror);
static void scrub_recheck_block_checksum(struct scrub_block *sblock);
static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
                                             struct scrub_block *sblock_good);
static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
                                            struct scrub_block *sblock_good,
                                            int page_num, int force_write);
static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
                                           int page_num);
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
                                    struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
                       u64 physical, struct btrfs_device *dev, u64 flags,
                       u64 gen, int mirror_num, u8 *csum, int force,
                       u64 physical_for_dev_replace);
static void scrub_bio_end_io(struct bio *bio);
static void scrub_bio_end_io_worker(struct btrfs_work *work);
static void scrub_block_complete(struct scrub_block *sblock);
static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
                               u64 extent_logical, u64 extent_len,
                               u64 *extent_physical,
                               struct btrfs_device **extent_dev,
                               int *extent_mirror_num);
static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
                                    struct scrub_page *spage);
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
static int write_page_nocow(struct scrub_ctx *sctx,
                            u64 physical_for_dev_replace, struct page *page);
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
                                      struct scrub_copy_nocow_ctx *ctx);
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
                            int mirror_num, u64 physical_for_dev_replace);
static void copy_nocow_pages_worker(struct btrfs_work *work);
static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
 290
 291static inline int scrub_is_page_on_raid56(struct scrub_page *page)
 292{
 293        return page->recover &&
 294               (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
 295}
 296
/*
 * Account for one more bio in flight: take a reference on @sctx and bump
 * bios_in_flight. Undone by scrub_pending_bio_dec().
 */
static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
{
        refcount_inc(&sctx->refs);
        atomic_inc(&sctx->bios_in_flight);
}
 302
/*
 * Counterpart of scrub_pending_bio_inc(): drop bios_in_flight, wake up
 * waiters on list_wait and release the context reference.
 */
static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
{
        atomic_dec(&sctx->bios_in_flight);
        wake_up(&sctx->list_wait);
        /*
         * Drop the reference last: it is what keeps @sctx alive across the
         * wake_up() above (see the comment on scrub_ctx::refs).
         */
        scrub_put_ctx(sctx);
}
 309
 310static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 311{
 312        while (atomic_read(&fs_info->scrub_pause_req)) {
 313                mutex_unlock(&fs_info->scrub_lock);
 314                wait_event(fs_info->scrub_pause_wait,
 315                   atomic_read(&fs_info->scrub_pause_req) == 0);
 316                mutex_lock(&fs_info->scrub_lock);
 317        }
 318}
 319
/*
 * Mark this scrub as paused: bump fs_info->scrubs_paused and wake up
 * whoever waits on scrub_pause_wait. Undone by scrub_pause_off().
 */
static void scrub_pause_on(struct btrfs_fs_info *fs_info)
{
        atomic_inc(&fs_info->scrubs_paused);
        wake_up(&fs_info->scrub_pause_wait);
}
 325
/*
 * Leave the paused state entered by scrub_pause_on().
 *
 * Under scrub_lock, waits until no pause is requested any more, then drops
 * scrubs_paused and wakes up waiters on scrub_pause_wait.
 */
static void scrub_pause_off(struct btrfs_fs_info *fs_info)
{
        mutex_lock(&fs_info->scrub_lock);
        /* May temporarily drop scrub_lock while sleeping */
        __scrub_blocked_if_needed(fs_info);
        atomic_dec(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);

        wake_up(&fs_info->scrub_pause_wait);
}
 335
/*
 * Report this scrub as paused and then wait until any pending pause
 * request has been lifted (scrub_pause_on() followed by scrub_pause_off()).
 */
static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
{
        scrub_pause_on(fs_info);
        scrub_pause_off(fs_info);
}
 341
 342/*
 343 * Insert new full stripe lock into full stripe locks tree
 344 *
 345 * Return pointer to existing or newly inserted full_stripe_lock structure if
 346 * everything works well.
 347 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
 348 *
 349 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
 350 * function
 351 */
 352static struct full_stripe_lock *insert_full_stripe_lock(
 353                struct btrfs_full_stripe_locks_tree *locks_root,
 354                u64 fstripe_logical)
 355{
 356        struct rb_node **p;
 357        struct rb_node *parent = NULL;
 358        struct full_stripe_lock *entry;
 359        struct full_stripe_lock *ret;
 360
 361        lockdep_assert_held(&locks_root->lock);
 362
 363        p = &locks_root->root.rb_node;
 364        while (*p) {
 365                parent = *p;
 366                entry = rb_entry(parent, struct full_stripe_lock, node);
 367                if (fstripe_logical < entry->logical) {
 368                        p = &(*p)->rb_left;
 369                } else if (fstripe_logical > entry->logical) {
 370                        p = &(*p)->rb_right;
 371                } else {
 372                        entry->refs++;
 373                        return entry;
 374                }
 375        }
 376
 377        /* Insert new lock */
 378        ret = kmalloc(sizeof(*ret), GFP_KERNEL);
 379        if (!ret)
 380                return ERR_PTR(-ENOMEM);
 381        ret->logical = fstripe_logical;
 382        ret->refs = 1;
 383        mutex_init(&ret->mutex);
 384
 385        rb_link_node(&ret->node, parent, p);
 386        rb_insert_color(&ret->node, &locks_root->root);
 387        return ret;
 388}
 389
 390/*
 391 * Search for a full stripe lock of a block group
 392 *
 393 * Return pointer to existing full stripe lock if found
 394 * Return NULL if not found
 395 */
 396static struct full_stripe_lock *search_full_stripe_lock(
 397                struct btrfs_full_stripe_locks_tree *locks_root,
 398                u64 fstripe_logical)
 399{
 400        struct rb_node *node;
 401        struct full_stripe_lock *entry;
 402
 403        lockdep_assert_held(&locks_root->lock);
 404
 405        node = locks_root->root.rb_node;
 406        while (node) {
 407                entry = rb_entry(node, struct full_stripe_lock, node);
 408                if (fstripe_logical < entry->logical)
 409                        node = node->rb_left;
 410                else if (fstripe_logical > entry->logical)
 411                        node = node->rb_right;
 412                else
 413                        return entry;
 414        }
 415        return NULL;
 416}
 417
 418/*
 419 * Helper to get full stripe logical from a normal bytenr.
 420 *
 421 * Caller must ensure @cache is a RAID56 block group.
 422 */
 423static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache,
 424                                   u64 bytenr)
 425{
 426        u64 ret;
 427
 428        /*
 429         * Due to chunk item size limit, full stripe length should not be
 430         * larger than U32_MAX. Just a sanity check here.
 431         */
 432        WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
 433
 434        /*
 435         * round_down() can only handle power of 2, while RAID56 full
 436         * stripe length can be 64KiB * n, so we need to manually round down.
 437         */
 438        ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) *
 439                cache->full_stripe_len + cache->key.objectid;
 440        return ret;
 441}
 442
 443/*
 444 * Lock a full stripe to avoid concurrency of recovery and read
 445 *
 446 * It's only used for profiles with parities (RAID5/6), for other profiles it
 447 * does nothing.
 448 *
 449 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
 450 * So caller must call unlock_full_stripe() at the same context.
 451 *
 452 * Return <0 if encounters error.
 453 */
 454static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 455                            bool *locked_ret)
 456{
 457        struct btrfs_block_group_cache *bg_cache;
 458        struct btrfs_full_stripe_locks_tree *locks_root;
 459        struct full_stripe_lock *existing;
 460        u64 fstripe_start;
 461        int ret = 0;
 462
 463        *locked_ret = false;
 464        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 465        if (!bg_cache) {
 466                ASSERT(0);
 467                return -ENOENT;
 468        }
 469
 470        /* Profiles not based on parity don't need full stripe lock */
 471        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 472                goto out;
 473        locks_root = &bg_cache->full_stripe_locks_root;
 474
 475        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 476
 477        /* Now insert the full stripe lock */
 478        mutex_lock(&locks_root->lock);
 479        existing = insert_full_stripe_lock(locks_root, fstripe_start);
 480        mutex_unlock(&locks_root->lock);
 481        if (IS_ERR(existing)) {
 482                ret = PTR_ERR(existing);
 483                goto out;
 484        }
 485        mutex_lock(&existing->mutex);
 486        *locked_ret = true;
 487out:
 488        btrfs_put_block_group(bg_cache);
 489        return ret;
 490}
 491
 492/*
 493 * Unlock a full stripe.
 494 *
 495 * NOTE: Caller must ensure it's the same context calling corresponding
 496 * lock_full_stripe().
 497 *
 498 * Return 0 if we unlock full stripe without problem.
 499 * Return <0 for error
 500 */
 501static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
 502                              bool locked)
 503{
 504        struct btrfs_block_group_cache *bg_cache;
 505        struct btrfs_full_stripe_locks_tree *locks_root;
 506        struct full_stripe_lock *fstripe_lock;
 507        u64 fstripe_start;
 508        bool freeit = false;
 509        int ret = 0;
 510
 511        /* If we didn't acquire full stripe lock, no need to continue */
 512        if (!locked)
 513                return 0;
 514
 515        bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
 516        if (!bg_cache) {
 517                ASSERT(0);
 518                return -ENOENT;
 519        }
 520        if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
 521                goto out;
 522
 523        locks_root = &bg_cache->full_stripe_locks_root;
 524        fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
 525
 526        mutex_lock(&locks_root->lock);
 527        fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
 528        /* Unpaired unlock_full_stripe() detected */
 529        if (!fstripe_lock) {
 530                WARN_ON(1);
 531                ret = -ENOENT;
 532                mutex_unlock(&locks_root->lock);
 533                goto out;
 534        }
 535
 536        if (fstripe_lock->refs == 0) {
 537                WARN_ON(1);
 538                btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
 539                        fstripe_lock->logical);
 540        } else {
 541                fstripe_lock->refs--;
 542        }
 543
 544        if (fstripe_lock->refs == 0) {
 545                rb_erase(&fstripe_lock->node, &locks_root->root);
 546                freeit = true;
 547        }
 548        mutex_unlock(&locks_root->lock);
 549
 550        mutex_unlock(&fstripe_lock->mutex);
 551        if (freeit)
 552                kfree(fstripe_lock);
 553out:
 554        btrfs_put_block_group(bg_cache);
 555        return ret;
 556}
 557
/*
 * Used for workers that require transaction commits (i.e., for the
 * NOCOW case).
 *
 * Takes a reference on @sctx, counts the worker as both running and paused
 * in the fs-wide scrub counters and bumps sctx->workers_pending. Undone by
 * scrub_pending_trans_workers_dec().
 */
static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;

        refcount_inc(&sctx->refs);
        /*
         * increment scrubs_running to prevent cancel requests from
         * completing as long as a worker is running. we must also
         * increment scrubs_paused to prevent deadlocking on pause
         * requests used for transactions commits (as the worker uses a
         * transaction context). it is safe to regard the worker
         * as paused for all matters practical. effectively, we only
         * avoid cancellation requests from completing.
         */
        mutex_lock(&fs_info->scrub_lock);
        atomic_inc(&fs_info->scrubs_running);
        atomic_inc(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);

        /*
         * Checking the @scrubs_running == @scrubs_paused condition inside
         * wait_event() is not an atomic operation, which means
         * @scrubs_running/@scrubs_paused may be incremented or decremented
         * at any time. Wake up @scrub_pause_wait as often as we can so that
         * a transaction commit is blocked for as short as possible.
         */
        wake_up(&fs_info->scrub_pause_wait);

        atomic_inc(&sctx->workers_pending);
}
 592
/*
 * Used for workers that require transaction commits.
 *
 * Reverts scrub_pending_trans_workers_inc(): drops the running/paused
 * counters and workers_pending, wakes up both wait queues and finally
 * releases the context reference.
 */
static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
{
        struct btrfs_fs_info *fs_info = sctx->fs_info;

        /*
         * see scrub_pending_trans_workers_inc() why we're pretending
         * to be paused in the scrub counters
         */
        mutex_lock(&fs_info->scrub_lock);
        atomic_dec(&fs_info->scrubs_running);
        atomic_dec(&fs_info->scrubs_paused);
        mutex_unlock(&fs_info->scrub_lock);
        atomic_dec(&sctx->workers_pending);
        wake_up(&fs_info->scrub_pause_wait);
        wake_up(&sctx->list_wait);
        /* Must come last: the reference keeps @sctx valid for the wake-ups */
        scrub_put_ctx(sctx);
}
 611
 612static void scrub_free_csums(struct scrub_ctx *sctx)
 613{
 614        while (!list_empty(&sctx->csum_list)) {
 615                struct btrfs_ordered_sum *sum;
 616                sum = list_first_entry(&sctx->csum_list,
 617                                       struct btrfs_ordered_sum, list);
 618                list_del(&sum->list);
 619                kfree(sum);
 620        }
 621}
 622
 623static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 624{
 625        int i;
 626
 627        if (!sctx)
 628                return;
 629
 630        /* this can happen when scrub is cancelled */
 631        if (sctx->curr != -1) {
 632                struct scrub_bio *sbio = sctx->bios[sctx->curr];
 633
 634                for (i = 0; i < sbio->page_count; i++) {
 635                        WARN_ON(!sbio->pagev[i]->page);
 636                        scrub_block_put(sbio->pagev[i]->sblock);
 637                }
 638                bio_put(sbio->bio);
 639        }
 640
 641        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 642                struct scrub_bio *sbio = sctx->bios[i];
 643
 644                if (!sbio)
 645                        break;
 646                kfree(sbio);
 647        }
 648
 649        kfree(sctx->wr_curr_bio);
 650        scrub_free_csums(sctx);
 651        kfree(sctx);
 652}
 653
 654static void scrub_put_ctx(struct scrub_ctx *sctx)
 655{
 656        if (refcount_dec_and_test(&sctx->refs))
 657                scrub_free_ctx(sctx);
 658}
 659
 660static noinline_for_stack
 661struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 662{
 663        struct scrub_ctx *sctx;
 664        int             i;
 665        struct btrfs_fs_info *fs_info = dev->fs_info;
 666
 667        sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
 668        if (!sctx)
 669                goto nomem;
 670        refcount_set(&sctx->refs, 1);
 671        sctx->is_dev_replace = is_dev_replace;
 672        sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 673        sctx->curr = -1;
 674        sctx->fs_info = dev->fs_info;
 675        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 676                struct scrub_bio *sbio;
 677
 678                sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
 679                if (!sbio)
 680                        goto nomem;
 681                sctx->bios[i] = sbio;
 682
 683                sbio->index = i;
 684                sbio->sctx = sctx;
 685                sbio->page_count = 0;
 686                btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 687                                scrub_bio_end_io_worker, NULL, NULL);
 688
 689                if (i != SCRUB_BIOS_PER_SCTX - 1)
 690                        sctx->bios[i]->next_free = i + 1;
 691                else
 692                        sctx->bios[i]->next_free = -1;
 693        }
 694        sctx->first_free = 0;
 695        atomic_set(&sctx->bios_in_flight, 0);
 696        atomic_set(&sctx->workers_pending, 0);
 697        atomic_set(&sctx->cancel_req, 0);
 698        sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 699        INIT_LIST_HEAD(&sctx->csum_list);
 700
 701        spin_lock_init(&sctx->list_lock);
 702        spin_lock_init(&sctx->stat_lock);
 703        init_waitqueue_head(&sctx->list_wait);
 704
 705        WARN_ON(sctx->wr_curr_bio != NULL);
 706        mutex_init(&sctx->wr_lock);
 707        sctx->wr_curr_bio = NULL;
 708        if (is_dev_replace) {
 709                WARN_ON(!fs_info->dev_replace.tgtdev);
 710                sctx->pages_per_wr_bio = SCRUB_PAGES_PER_WR_BIO;
 711                sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
 712                sctx->flush_all_writes = false;
 713        }
 714
 715        return sctx;
 716
 717nomem:
 718        scrub_free_ctx(sctx);
 719        return ERR_PTR(-ENOMEM);
 720}
 721
 722static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 723                                     void *warn_ctx)
 724{
 725        u64 isize;
 726        u32 nlink;
 727        int ret;
 728        int i;
 729        unsigned nofs_flag;
 730        struct extent_buffer *eb;
 731        struct btrfs_inode_item *inode_item;
 732        struct scrub_warning *swarn = warn_ctx;
 733        struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
 734        struct inode_fs_paths *ipath = NULL;
 735        struct btrfs_root *local_root;
 736        struct btrfs_key root_key;
 737        struct btrfs_key key;
 738
 739        root_key.objectid = root;
 740        root_key.type = BTRFS_ROOT_ITEM_KEY;
 741        root_key.offset = (u64)-1;
 742        local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 743        if (IS_ERR(local_root)) {
 744                ret = PTR_ERR(local_root);
 745                goto err;
 746        }
 747
 748        /*
 749         * this makes the path point to (inum INODE_ITEM ioff)
 750         */
 751        key.objectid = inum;
 752        key.type = BTRFS_INODE_ITEM_KEY;
 753        key.offset = 0;
 754
 755        ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 756        if (ret) {
 757                btrfs_release_path(swarn->path);
 758                goto err;
 759        }
 760
 761        eb = swarn->path->nodes[0];
 762        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 763                                        struct btrfs_inode_item);
 764        isize = btrfs_inode_size(eb, inode_item);
 765        nlink = btrfs_inode_nlink(eb, inode_item);
 766        btrfs_release_path(swarn->path);
 767
 768        /*
 769         * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
 770         * uses GFP_NOFS in this context, so we keep it consistent but it does
 771         * not seem to be strictly necessary.
 772         */
 773        nofs_flag = memalloc_nofs_save();
 774        ipath = init_ipath(4096, local_root, swarn->path);
 775        memalloc_nofs_restore(nofs_flag);
 776        if (IS_ERR(ipath)) {
 777                ret = PTR_ERR(ipath);
 778                ipath = NULL;
 779                goto err;
 780        }
 781        ret = paths_from_inode(inum, ipath);
 782
 783        if (ret < 0)
 784                goto err;
 785
 786        /*
 787         * we deliberately ignore the bit ipath might have been too small to
 788         * hold all of the paths here
 789         */
 790        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 791                btrfs_warn_in_rcu(fs_info,
 792"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %llu, links %u (path: %s)",
 793                                  swarn->errstr, swarn->logical,
 794                                  rcu_str_deref(swarn->dev->name),
 795                                  swarn->physical,
 796                                  root, inum, offset,
 797                                  min(isize - offset, (u64)PAGE_SIZE), nlink,
 798                                  (char *)(unsigned long)ipath->fspath->val[i]);
 799
 800        free_ipath(ipath);
 801        return 0;
 802
 803err:
 804        btrfs_warn_in_rcu(fs_info,
 805                          "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
 806                          swarn->errstr, swarn->logical,
 807                          rcu_str_deref(swarn->dev->name),
 808                          swarn->physical,
 809                          root, inum, offset, ret);
 810
 811        free_ipath(ipath);
 812        return 0;
 813}
 814
 815static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 816{
 817        struct btrfs_device *dev;
 818        struct btrfs_fs_info *fs_info;
 819        struct btrfs_path *path;
 820        struct btrfs_key found_key;
 821        struct extent_buffer *eb;
 822        struct btrfs_extent_item *ei;
 823        struct scrub_warning swarn;
 824        unsigned long ptr = 0;
 825        u64 extent_item_pos;
 826        u64 flags = 0;
 827        u64 ref_root;
 828        u32 item_size;
 829        u8 ref_level = 0;
 830        int ret;
 831
 832        WARN_ON(sblock->page_count < 1);
 833        dev = sblock->pagev[0]->dev;
 834        fs_info = sblock->sctx->fs_info;
 835
 836        path = btrfs_alloc_path();
 837        if (!path)
 838                return;
 839
 840        swarn.physical = sblock->pagev[0]->physical;
 841        swarn.logical = sblock->pagev[0]->logical;
 842        swarn.errstr = errstr;
 843        swarn.dev = NULL;
 844
 845        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 846                                  &flags);
 847        if (ret < 0)
 848                goto out;
 849
 850        extent_item_pos = swarn.logical - found_key.objectid;
 851        swarn.extent_item_size = found_key.offset;
 852
 853        eb = path->nodes[0];
 854        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 855        item_size = btrfs_item_size_nr(eb, path->slots[0]);
 856
 857        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 858                do {
 859                        ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 860                                                      item_size, &ref_root,
 861                                                      &ref_level);
 862                        btrfs_warn_in_rcu(fs_info,
 863"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
 864                                errstr, swarn.logical,
 865                                rcu_str_deref(dev->name),
 866                                swarn.physical,
 867                                ref_level ? "node" : "leaf",
 868                                ret < 0 ? -1 : ref_level,
 869                                ret < 0 ? -1 : ref_root);
 870                } while (ret != 1);
 871                btrfs_release_path(path);
 872        } else {
 873                btrfs_release_path(path);
 874                swarn.path = path;
 875                swarn.dev = dev;
 876                iterate_extent_inodes(fs_info, found_key.objectid,
 877                                        extent_item_pos, 1,
 878                                        scrub_print_warning_inode, &swarn, false);
 879        }
 880
 881out:
 882        btrfs_free_path(path);
 883}
 884
 885static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 886{
 887        struct page *page = NULL;
 888        unsigned long index;
 889        struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 890        int ret;
 891        int corrected = 0;
 892        struct btrfs_key key;
 893        struct inode *inode = NULL;
 894        struct btrfs_fs_info *fs_info;
 895        u64 end = offset + PAGE_SIZE - 1;
 896        struct btrfs_root *local_root;
 897        int srcu_index;
 898
 899        key.objectid = root;
 900        key.type = BTRFS_ROOT_ITEM_KEY;
 901        key.offset = (u64)-1;
 902
 903        fs_info = fixup->root->fs_info;
 904        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 905
 906        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 907        if (IS_ERR(local_root)) {
 908                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 909                return PTR_ERR(local_root);
 910        }
 911
 912        key.type = BTRFS_INODE_ITEM_KEY;
 913        key.objectid = inum;
 914        key.offset = 0;
 915        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 916        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 917        if (IS_ERR(inode))
 918                return PTR_ERR(inode);
 919
 920        index = offset >> PAGE_SHIFT;
 921
 922        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 923        if (!page) {
 924                ret = -ENOMEM;
 925                goto out;
 926        }
 927
 928        if (PageUptodate(page)) {
 929                if (PageDirty(page)) {
 930                        /*
 931                         * we need to write the data to the defect sector. the
 932                         * data that was in that sector is not in memory,
 933                         * because the page was modified. we must not write the
 934                         * modified page to that sector.
 935                         *
 936                         * TODO: what could be done here: wait for the delalloc
 937                         *       runner to write out that page (might involve
 938                         *       COW) and see whether the sector is still
 939                         *       referenced afterwards.
 940                         *
 941                         * For the meantime, we'll treat this error
 942                         * incorrectable, although there is a chance that a
 943                         * later scrub will find the bad sector again and that
 944                         * there's no dirty page in memory, then.
 945                         */
 946                        ret = -EIO;
 947                        goto out;
 948                }
 949                ret = repair_io_failure(fs_info, inum, offset, PAGE_SIZE,
 950                                        fixup->logical, page,
 951                                        offset - page_offset(page),
 952                                        fixup->mirror_num);
 953                unlock_page(page);
 954                corrected = !ret;
 955        } else {
 956                /*
 957                 * we need to get good data first. the general readpage path
 958                 * will call repair_io_failure for us, we just have to make
 959                 * sure we read the bad mirror.
 960                 */
 961                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 962                                        EXTENT_DAMAGED);
 963                if (ret) {
 964                        /* set_extent_bits should give proper error */
 965                        WARN_ON(ret > 0);
 966                        if (ret > 0)
 967                                ret = -EFAULT;
 968                        goto out;
 969                }
 970
 971                ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 972                                                btrfs_get_extent,
 973                                                fixup->mirror_num);
 974                wait_on_page_locked(page);
 975
 976                corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 977                                                end, EXTENT_DAMAGED, 0, NULL);
 978                if (!corrected)
 979                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 980                                                EXTENT_DAMAGED);
 981        }
 982
 983out:
 984        if (page)
 985                put_page(page);
 986
 987        iput(inode);
 988
 989        if (ret < 0)
 990                return ret;
 991
 992        if (ret == 0 && corrected) {
 993                /*
 994                 * we only need to call readpage for one of the inodes belonging
 995                 * to this extent. so make iterate_extent_inodes stop
 996                 */
 997                return 1;
 998        }
 999
1000        return -EIO;

1001}
1002
1003static void scrub_fixup_nodatasum(struct btrfs_work *work)
1004{
1005        struct btrfs_fs_info *fs_info;
1006        int ret;
1007        struct scrub_fixup_nodatasum *fixup;
1008        struct scrub_ctx *sctx;
1009        struct btrfs_trans_handle *trans = NULL;
1010        struct btrfs_path *path;
1011        int uncorrectable = 0;
1012
1013        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
1014        sctx = fixup->sctx;
1015        fs_info = fixup->root->fs_info;
1016
1017        path = btrfs_alloc_path();
1018        if (!path) {
1019                spin_lock(&sctx->stat_lock);
1020                ++sctx->stat.malloc_errors;
1021                spin_unlock(&sctx->stat_lock);
1022                uncorrectable = 1;
1023                goto out;
1024        }
1025
1026        trans = btrfs_join_transaction(fixup->root);
1027        if (IS_ERR(trans)) {
1028                uncorrectable = 1;
1029                goto out;
1030        }
1031
1032        /*
1033         * the idea is to trigger a regular read through the standard path. we
1034         * read a page from the (failed) logical address by specifying the
1035         * corresponding copynum of the failed sector. thus, that readpage is
1036         * expected to fail.
1037         * that is the point where on-the-fly error correction will kick in
1038         * (once it's finished) and rewrite the failed sector if a good copy
1039         * can be found.
1040         */
1041        ret = iterate_inodes_from_logical(fixup->logical, fs_info, path,
1042                                          scrub_fixup_readpage, fixup, false);
1043        if (ret < 0) {
1044                uncorrectable = 1;
1045                goto out;
1046        }
1047        WARN_ON(ret != 1);
1048
1049        spin_lock(&sctx->stat_lock);
1050        ++sctx->stat.corrected_errors;
1051        spin_unlock(&sctx->stat_lock);
1052
1053out:
1054        if (trans && !IS_ERR(trans))
1055                btrfs_end_transaction(trans);
1056        if (uncorrectable) {
1057                spin_lock(&sctx->stat_lock);
1058                ++sctx->stat.uncorrectable_errors;
1059                spin_unlock(&sctx->stat_lock);
1060                btrfs_dev_replace_stats_inc(
1061                        &fs_info->dev_replace.num_uncorrectable_read_errors);
1062                btrfs_err_rl_in_rcu(fs_info,
1063                    "unable to fixup (nodatasum) error at logical %llu on dev %s",
1064                        fixup->logical, rcu_str_deref(fixup->dev->name));
1065        }
1066
1067        btrfs_free_path(path);
1068        kfree(fixup);
1069
1070        scrub_pending_trans_workers_dec(sctx);
1071}
1072
/* Take an additional reference on @recover. */
static inline void scrub_get_recover(struct scrub_recover *recover)
{
        refcount_inc(&recover->refs);
}
1077
1078static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
1079                                     struct scrub_recover *recover)
1080{
1081        if (refcount_dec_and_test(&recover->refs)) {
1082                btrfs_bio_counter_dec(fs_info);
1083                btrfs_put_bbio(recover->bbio);
1084                kfree(recover);
1085        }
1086}
1087
1088/*
1089 * scrub_handle_errored_block gets called when either verification of the
1090 * pages failed or the bio failed to read, e.g. with EIO. In the latter
1091 * case, this function handles all pages in the bio, even though only one
1092 * may be bad.
1093 * The goal of this function is to repair the errored block by using the
1094 * contents of one of the mirrors.
1095 */
1096static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1097{
1098        struct scrub_ctx *sctx = sblock_to_check->sctx;
1099        struct btrfs_device *dev;
1100        struct btrfs_fs_info *fs_info;
1101        u64 logical;
1102        unsigned int failed_mirror_index;
1103        unsigned int is_metadata;
1104        unsigned int have_csum;
1105        struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
1106        struct scrub_block *sblock_bad;
1107        int ret;
1108        int mirror_index;
1109        int page_num;
1110        int success;
1111        bool full_stripe_locked;
1112        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
1113                                      DEFAULT_RATELIMIT_BURST);
1114
1115        BUG_ON(sblock_to_check->page_count < 1);
1116        fs_info = sctx->fs_info;
1117        if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
1118                /*
1119                 * if we find an error in a super block, we just report it.
1120                 * They will get written with the next transaction commit
1121                 * anyway
1122                 */
1123                spin_lock(&sctx->stat_lock);
1124                ++sctx->stat.super_errors;
1125                spin_unlock(&sctx->stat_lock);
1126                return 0;
1127        }
1128        logical = sblock_to_check->pagev[0]->logical;
1129        BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
1130        failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
1131        is_metadata = !(sblock_to_check->pagev[0]->flags &
1132                        BTRFS_EXTENT_FLAG_DATA);
1133        have_csum = sblock_to_check->pagev[0]->have_csum;
1134        dev = sblock_to_check->pagev[0]->dev;
1135
1136        /*
1137         * For RAID5/6, race can happen for a different device scrub thread.
1138         * For data corruption, Parity and Data threads will both try
1139         * to recovery the data.
1140         * Race can lead to doubly added csum error, or even unrecoverable
1141         * error.
1142         */
1143        ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
1144        if (ret < 0) {
1145                spin_lock(&sctx->stat_lock);
1146                if (ret == -ENOMEM)
1147                        sctx->stat.malloc_errors++;
1148                sctx->stat.read_errors++;
1149                sctx->stat.uncorrectable_errors++;
1150                spin_unlock(&sctx->stat_lock);
1151                return ret;
1152        }
1153
1154        if (sctx->is_dev_replace && !is_metadata && !have_csum) {
1155                sblocks_for_recheck = NULL;
1156                goto nodatasum_case;
1157        }
1158
1159        /*
1160         * read all mirrors one after the other. This includes to
1161         * re-read the extent or metadata block that failed (that was
1162         * the cause that this fixup code is called) another time,
1163         * page by page this time in order to know which pages
1164         * caused I/O errors and which ones are good (for all mirrors).
1165         * It is the goal to handle the situation when more than one
1166         * mirror contains I/O errors, but the errors do not
1167         * overlap, i.e. the data can be repaired by selecting the
1168         * pages from those mirrors without I/O error on the
1169         * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
1170         * would be that mirror #1 has an I/O error on the first page,
1171         * the second page is good, and mirror #2 has an I/O error on
1172         * the second page, but the first page is good.
1173         * Then the first page of the first mirror can be repaired by
1174         * taking the first page of the second mirror, and the
1175         * second page of the second mirror can be repaired by
1176         * copying the contents of the 2nd page of the 1st mirror.
1177         * One more note: if the pages of one mirror contain I/O
1178         * errors, the checksum cannot be verified. In order to get
1179         * the best data for repairing, the first attempt is to find
1180         * a mirror without I/O errors and with a validated checksum.
1181         * Only if this is not possible, the pages are picked from
1182         * mirrors with I/O errors without considering the checksum.
1183         * If the latter is the case, at the end, the checksum of the
1184         * repaired area is verified in order to correctly maintain
1185         * the statistics.
1186         */
1187
1188        sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
1189                                      sizeof(*sblocks_for_recheck), GFP_NOFS);
1190        if (!sblocks_for_recheck) {
1191                spin_lock(&sctx->stat_lock);
1192                sctx->stat.malloc_errors++;
1193                sctx->stat.read_errors++;
1194                sctx->stat.uncorrectable_errors++;
1195                spin_unlock(&sctx->stat_lock);
1196                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1197                goto out;
1198        }
1199
1200        /* setup the context, map the logical blocks and alloc the pages */
1201        ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
1202        if (ret) {
1203                spin_lock(&sctx->stat_lock);
1204                sctx->stat.read_errors++;
1205                sctx->stat.uncorrectable_errors++;
1206                spin_unlock(&sctx->stat_lock);
1207                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1208                goto out;
1209        }
1210        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
1211        sblock_bad = sblocks_for_recheck + failed_mirror_index;
1212
1213        /* build and submit the bios for the failed mirror, check checksums */
1214        scrub_recheck_block(fs_info, sblock_bad, 1);
1215
1216        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
1217            sblock_bad->no_io_error_seen) {
1218                /*
1219                 * the error disappeared after reading page by page, or
1220                 * the area was part of a huge bio and other parts of the
1221                 * bio caused I/O errors, or the block layer merged several
1222                 * read requests into one and the error is caused by a
1223                 * different bio (usually one of the two latter cases is
1224                 * the cause)
1225                 */
1226                spin_lock(&sctx->stat_lock);
1227                sctx->stat.unverified_errors++;
1228                sblock_to_check->data_corrected = 1;
1229                spin_unlock(&sctx->stat_lock);
1230
1231                if (sctx->is_dev_replace)
1232                        scrub_write_block_to_dev_replace(sblock_bad);
1233                goto out;
1234        }
1235
1236        if (!sblock_bad->no_io_error_seen) {
1237                spin_lock(&sctx->stat_lock);
1238                sctx->stat.read_errors++;
1239                spin_unlock(&sctx->stat_lock);
1240                if (__ratelimit(&_rs))
1241                        scrub_print_warning("i/o error", sblock_to_check);
1242                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1243        } else if (sblock_bad->checksum_error) {
1244                spin_lock(&sctx->stat_lock);
1245                sctx->stat.csum_errors++;
1246                spin_unlock(&sctx->stat_lock);
1247                if (__ratelimit(&_rs))
1248                        scrub_print_warning("checksum error", sblock_to_check);
1249                btrfs_dev_stat_inc_and_print(dev,
1250                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
1251        } else if (sblock_bad->header_error) {
1252                spin_lock(&sctx->stat_lock);
1253                sctx->stat.verify_errors++;
1254                spin_unlock(&sctx->stat_lock);
1255                if (__ratelimit(&_rs))
1256                        scrub_print_warning("checksum/header error",
1257                                            sblock_to_check);
1258                if (sblock_bad->generation_error)
1259                        btrfs_dev_stat_inc_and_print(dev,
1260                                BTRFS_DEV_STAT_GENERATION_ERRS);
1261                else
1262                        btrfs_dev_stat_inc_and_print(dev,
1263                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
1264        }
1265
1266        if (sctx->readonly) {
1267                ASSERT(!sctx->is_dev_replace);
1268                goto out;
1269        }
1270
1271        if (!is_metadata && !have_csum) {
1272                struct scrub_fixup_nodatasum *fixup_nodatasum;
1273
1274                WARN_ON(sctx->is_dev_replace);
1275
1276nodatasum_case:
1277
1278                /*
1279                 * !is_metadata and !have_csum, this means that the data
1280                 * might not be COWed, that it might be modified
1281                 * concurrently. The general strategy to work on the
1282                 * commit root does not help in the case when COW is not
1283                 * used.
1284                 */
1285                fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1286                if (!fixup_nodatasum)
1287                        goto did_not_correct_error;
1288                fixup_nodatasum->sctx = sctx;
1289                fixup_nodatasum->dev = dev;
1290                fixup_nodatasum->logical = logical;
1291                fixup_nodatasum->root = fs_info->extent_root;
1292                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1293                scrub_pending_trans_workers_inc(sctx);
1294                btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1295                                scrub_fixup_nodatasum, NULL, NULL);
1296                btrfs_queue_work(fs_info->scrub_workers,
1297                                 &fixup_nodatasum->work);
1298                goto out;
1299        }
1300
1301        /*
1302         * now build and submit the bios for the other mirrors, check
1303         * checksums.
1304         * First try to pick the mirror which is completely without I/O
1305         * errors and also does not have a checksum error.
1306         * If one is found, and if a checksum is present, the full block
1307         * that is known to contain an error is rewritten. Afterwards
1308         * the block is known to be corrected.
1309         * If a mirror is found which is completely correct, and no
1310         * checksum is present, only those pages are rewritten that had
1311         * an I/O error in the block to be repaired, since it cannot be
1312         * determined, which copy of the other pages is better (and it
1313         * could happen otherwise that a correct page would be
1314         * overwritten by a bad one).
1315         */
1316        for (mirror_index = 0; ;mirror_index++) {
1317                struct scrub_block *sblock_other;
1318
1319                if (mirror_index == failed_mirror_index)
1320                        continue;
1321
1322                /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1323                if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1324                        if (mirror_index >= BTRFS_MAX_MIRRORS)
1325                                break;
1326                        if (!sblocks_for_recheck[mirror_index].page_count)
1327                                break;
1328
1329                        sblock_other = sblocks_for_recheck + mirror_index;
1330                } else {
1331                        struct scrub_recover *r = sblock_bad->pagev[0]->recover;
1332                        int max_allowed = r->bbio->num_stripes -
1333                                                r->bbio->num_tgtdevs;
1334
1335                        if (mirror_index >= max_allowed)
1336                                break;
1337                        if (!sblocks_for_recheck[1].page_count)
1338                                break;
1339
1340                        ASSERT(failed_mirror_index == 0);
1341                        sblock_other = sblocks_for_recheck + 1;
1342                        sblock_other->pagev[0]->mirror_num = 1 + mirror_index;
1343                }
1344
1345                /* build and submit the bios, check checksums */
1346                scrub_recheck_block(fs_info, sblock_other, 0);
1347
1348                if (!sblock_other->header_error &&
1349                    !sblock_other->checksum_error &&
1350                    sblock_other->no_io_error_seen) {
1351                        if (sctx->is_dev_replace) {
1352                                scrub_write_block_to_dev_replace(sblock_other);
1353                                goto corrected_error;
1354                        } else {
1355                                ret = scrub_repair_block_from_good_copy(
1356                                                sblock_bad, sblock_other);
1357                                if (!ret)
1358                                        goto corrected_error;
1359                        }
1360                }
1361        }
1362
1363        if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1364                goto did_not_correct_error;
1365
1366        /*
1367         * In case of I/O errors in the area that is supposed to be
1368         * repaired, continue by picking good copies of those pages.
1369         * Select the good pages from mirrors to rewrite bad pages from
1370         * the area to fix. Afterwards verify the checksum of the block
1371         * that is supposed to be repaired. This verification step is
1372         * only done for the purpose of statistic counting and for the
1373         * final scrub report, whether errors remain.
1374         * A perfect algorithm could make use of the checksum and try
1375         * all possible combinations of pages from the different mirrors
1376         * until the checksum verification succeeds. For example, when
1377         * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1378         * of mirror #2 is readable but the final checksum test fails,
1379         * then the 2nd page of mirror #3 could be tried, whether now
1380         * the final checksum succeeds. But this would be a rare
1381         * exception and is therefore not implemented. At least it is
1382         * avoided that the good copy is overwritten.
1383         * A more useful improvement would be to pick the sectors
1384         * without I/O error based on sector sizes (512 bytes on legacy
1385         * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
1386         * mirror could be repaired by taking 512 byte of a different
1387         * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1388         * area are unreadable.
1389         */
1390        success = 1;
1391        for (page_num = 0; page_num < sblock_bad->page_count;
1392             page_num++) {
1393                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1394                struct scrub_block *sblock_other = NULL;
1395
1396                /* skip no-io-error page in scrub */
1397                if (!page_bad->io_error && !sctx->is_dev_replace)
1398                        continue;
1399
1400                if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) {
1401                        /*
1402                         * In case of dev replace, if raid56 rebuild process
1403                         * didn't work out correct data, then copy the content
1404                         * in sblock_bad to make sure target device is identical
1405                         * to source device, instead of writing garbage data in
1406                         * sblock_for_recheck array to target device.
1407                         */
1408                        sblock_other = NULL;
1409                } else if (page_bad->io_error) {
1410                        /* try to find no-io-error page in mirrors */
1411                        for (mirror_index = 0;
1412                             mirror_index < BTRFS_MAX_MIRRORS &&
1413                             sblocks_for_recheck[mirror_index].page_count > 0;
1414                             mirror_index++) {
1415                                if (!sblocks_for_recheck[mirror_index].
1416                                    pagev[page_num]->io_error) {
1417                                        sblock_other = sblocks_for_recheck +
1418                                                       mirror_index;
1419                                        break;
1420                                }
1421                        }
1422                        if (!sblock_other)
1423                                success = 0;
1424                }
1425
1426                if (sctx->is_dev_replace) {
1427                        /*
1428                         * did not find a mirror to fetch the page
1429                         * from. scrub_write_page_to_dev_replace()
1430                         * handles this case (page->io_error), by
1431                         * filling the block with zeros before
1432                         * submitting the write request
1433                         */
1434                        if (!sblock_other)
1435                                sblock_other = sblock_bad;
1436
1437                        if (scrub_write_page_to_dev_replace(sblock_other,
1438                                                            page_num) != 0) {
1439                                btrfs_dev_replace_stats_inc(
1440                                        &fs_info->dev_replace.num_write_errors);
1441                                success = 0;
1442                        }
1443                } else if (sblock_other) {
1444                        ret = scrub_repair_page_from_good_copy(sblock_bad,
1445                                                               sblock_other,
1446                                                               page_num, 0);
1447                        if (0 == ret)
1448                                page_bad->io_error = 0;
1449                        else
1450                                success = 0;
1451                }
1452        }
1453
1454        if (success && !sctx->is_dev_replace) {
1455                if (is_metadata || have_csum) {
1456                        /*
1457                         * need to verify the checksum now that all
1458                         * sectors on disk are repaired (the write
1459                         * request for data to be repaired is on its way).
1460                         * Just be lazy and use scrub_recheck_block()
1461                         * which re-reads the data before the checksum
1462                         * is verified, but most likely the data comes out
1463                         * of the page cache.
1464                         */
1465                        scrub_recheck_block(fs_info, sblock_bad, 1);
1466                        if (!sblock_bad->header_error &&
1467                            !sblock_bad->checksum_error &&
1468                            sblock_bad->no_io_error_seen)
1469                                goto corrected_error;
1470                        else
1471                                goto did_not_correct_error;
1472                } else {
1473corrected_error:
1474                        spin_lock(&sctx->stat_lock);
1475                        sctx->stat.corrected_errors++;
1476                        sblock_to_check->data_corrected = 1;
1477                        spin_unlock(&sctx->stat_lock);
1478                        btrfs_err_rl_in_rcu(fs_info,
1479                                "fixed up error at logical %llu on dev %s",
1480                                logical, rcu_str_deref(dev->name));
1481                }
1482        } else {
1483did_not_correct_error:
1484                spin_lock(&sctx->stat_lock);
1485                sctx->stat.uncorrectable_errors++;
1486                spin_unlock(&sctx->stat_lock);
1487                btrfs_err_rl_in_rcu(fs_info,
1488                        "unable to fixup (regular) error at logical %llu on dev %s",
1489                        logical, rcu_str_deref(dev->name));
1490        }
1491
1492out:
1493        if (sblocks_for_recheck) {
1494                for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1495                     mirror_index++) {
1496                        struct scrub_block *sblock = sblocks_for_recheck +
1497                                                     mirror_index;
1498                        struct scrub_recover *recover;
1499                        int page_index;
1500
1501                        for (page_index = 0; page_index < sblock->page_count;
1502                             page_index++) {
1503                                sblock->pagev[page_index]->sblock = NULL;
1504                                recover = sblock->pagev[page_index]->recover;
1505                                if (recover) {
1506                                        scrub_put_recover(fs_info, recover);
1507                                        sblock->pagev[page_index]->recover =
1508                                                                        NULL;
1509                                }
1510                                scrub_page_put(sblock->pagev[page_index]);
1511                        }
1512                }
1513                kfree(sblocks_for_recheck);
1514        }
1515
1516        ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1517        if (ret < 0)
1518                return ret;
1519        return 0;
1520}
1521
1522static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1523{
1524        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1525                return 2;
1526        else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1527                return 3;
1528        else
1529                return (int)bbio->num_stripes;
1530}
1531
1532static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1533                                                 u64 *raid_map,
1534                                                 u64 mapped_length,
1535                                                 int nstripes, int mirror,
1536                                                 int *stripe_index,
1537                                                 u64 *stripe_offset)
1538{
1539        int i;
1540
1541        if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1542                /* RAID5/6 */
1543                for (i = 0; i < nstripes; i++) {
1544                        if (raid_map[i] == RAID6_Q_STRIPE ||
1545                            raid_map[i] == RAID5_P_STRIPE)
1546                                continue;
1547
1548                        if (logical >= raid_map[i] &&
1549                            logical < raid_map[i] + mapped_length)
1550                                break;
1551                }
1552
1553                *stripe_index = i;
1554                *stripe_offset = logical - raid_map[i];
1555        } else {
1556                /* The other RAID type */
1557                *stripe_index = mirror;
1558                *stripe_offset = 0;
1559        }
1560}
1561
1562static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1563                                     struct scrub_block *sblocks_for_recheck)
1564{
1565        struct scrub_ctx *sctx = original_sblock->sctx;
1566        struct btrfs_fs_info *fs_info = sctx->fs_info;
1567        u64 length = original_sblock->page_count * PAGE_SIZE;
1568        u64 logical = original_sblock->pagev[0]->logical;
1569        u64 generation = original_sblock->pagev[0]->generation;
1570        u64 flags = original_sblock->pagev[0]->flags;
1571        u64 have_csum = original_sblock->pagev[0]->have_csum;
1572        struct scrub_recover *recover;
1573        struct btrfs_bio *bbio;
1574        u64 sublen;
1575        u64 mapped_length;
1576        u64 stripe_offset;
1577        int stripe_index;
1578        int page_index = 0;
1579        int mirror_index;
1580        int nmirrors;
1581        int ret;
1582
1583        /*
1584         * note: the two members refs and outstanding_pages
1585         * are not used (and not set) in the blocks that are used for
1586         * the recheck procedure
1587         */
1588
1589        while (length > 0) {
1590                sublen = min_t(u64, length, PAGE_SIZE);
1591                mapped_length = sublen;
1592                bbio = NULL;
1593
1594                /*
1595                 * with a length of PAGE_SIZE, each returned stripe
1596                 * represents one mirror
1597                 */
1598                btrfs_bio_counter_inc_blocked(fs_info);
1599                ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1600                                logical, &mapped_length, &bbio);
1601                if (ret || !bbio || mapped_length < sublen) {
1602                        btrfs_put_bbio(bbio);
1603                        btrfs_bio_counter_dec(fs_info);
1604                        return -EIO;
1605                }
1606
1607                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1608                if (!recover) {
1609                        btrfs_put_bbio(bbio);
1610                        btrfs_bio_counter_dec(fs_info);
1611                        return -ENOMEM;
1612                }
1613
1614                refcount_set(&recover->refs, 1);
1615                recover->bbio = bbio;
1616                recover->map_length = mapped_length;
1617
1618                BUG_ON(page_index >= SCRUB_MAX_PAGES_PER_BLOCK);
1619
1620                nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1621
1622                for (mirror_index = 0; mirror_index < nmirrors;
1623                     mirror_index++) {
1624                        struct scrub_block *sblock;
1625                        struct scrub_page *page;
1626
1627                        sblock = sblocks_for_recheck + mirror_index;
1628                        sblock->sctx = sctx;
1629
1630                        page = kzalloc(sizeof(*page), GFP_NOFS);
1631                        if (!page) {
1632leave_nomem:
1633                                spin_lock(&sctx->stat_lock);
1634                                sctx->stat.malloc_errors++;
1635                                spin_unlock(&sctx->stat_lock);
1636                                scrub_put_recover(fs_info, recover);
1637                                return -ENOMEM;
1638                        }
1639                        scrub_page_get(page);
1640                        sblock->pagev[page_index] = page;
1641                        page->sblock = sblock;
1642                        page->flags = flags;
1643                        page->generation = generation;
1644                        page->logical = logical;
1645                        page->have_csum = have_csum;
1646                        if (have_csum)
1647                                memcpy(page->csum,
1648                                       original_sblock->pagev[0]->csum,
1649                                       sctx->csum_size);
1650
1651                        scrub_stripe_index_and_offset(logical,
1652                                                      bbio->map_type,
1653                                                      bbio->raid_map,
1654                                                      mapped_length,
1655                                                      bbio->num_stripes -
1656                                                      bbio->num_tgtdevs,
1657                                                      mirror_index,
1658                                                      &stripe_index,
1659                                                      &stripe_offset);
1660                        page->physical = bbio->stripes[stripe_index].physical +
1661                                         stripe_offset;
1662                        page->dev = bbio->stripes[stripe_index].dev;
1663
1664                        BUG_ON(page_index >= original_sblock->page_count);
1665                        page->physical_for_dev_replace =
1666                                original_sblock->pagev[page_index]->
1667                                physical_for_dev_replace;
1668                        /* for missing devices, dev->bdev is NULL */
1669                        page->mirror_num = mirror_index + 1;
1670                        sblock->page_count++;
1671                        page->page = alloc_page(GFP_NOFS);
1672                        if (!page->page)
1673                                goto leave_nomem;
1674
1675                        scrub_get_recover(recover);
1676                        page->recover = recover;
1677                }
1678                scrub_put_recover(fs_info, recover);
1679                length -= sublen;
1680                logical += sublen;
1681                page_index++;
1682        }
1683
1684        return 0;
1685}
1686
1687static void scrub_bio_wait_endio(struct bio *bio)
1688{
1689        complete(bio->bi_private);
1690}
1691
1692static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1693                                        struct bio *bio,
1694                                        struct scrub_page *page)
1695{
1696        DECLARE_COMPLETION_ONSTACK(done);
1697        int ret;
1698        int mirror_num;
1699
1700        bio->bi_iter.bi_sector = page->logical >> 9;
1701        bio->bi_private = &done;
1702        bio->bi_end_io = scrub_bio_wait_endio;
1703
1704        mirror_num = page->sblock->pagev[0]->mirror_num;
1705        ret = raid56_parity_recover(fs_info, bio, page->recover->bbio,
1706                                    page->recover->map_length,
1707                                    mirror_num, 0);
1708        if (ret)
1709                return ret;
1710
1711        wait_for_completion_io(&done);
1712        return blk_status_to_errno(bio->bi_status);
1713}
1714
1715static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1716                                          struct scrub_block *sblock)
1717{
1718        struct scrub_page *first_page = sblock->pagev[0];
1719        struct bio *bio;
1720        int page_num;
1721
1722        /* All pages in sblock belong to the same stripe on the same device. */
1723        ASSERT(first_page->dev);
1724        if (!first_page->dev->bdev)
1725                goto out;
1726
1727        bio = btrfs_io_bio_alloc(BIO_MAX_PAGES);
1728        bio_set_dev(bio, first_page->dev->bdev);
1729
1730        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1731                struct scrub_page *page = sblock->pagev[page_num];
1732
1733                WARN_ON(!page->page);
1734                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1735        }
1736
1737        if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) {
1738                bio_put(bio);
1739                goto out;
1740        }
1741
1742        bio_put(bio);
1743
1744        scrub_recheck_block_checksum(sblock);
1745
1746        return;
1747out:
1748        for (page_num = 0; page_num < sblock->page_count; page_num++)
1749                sblock->pagev[page_num]->io_error = 1;
1750
1751        sblock->no_io_error_seen = 0;
1752}
1753
1754/*
1755 * this function will check the on disk data for checksum errors, header
1756 * errors and read I/O errors. If any I/O errors happen, the exact pages
1757 * which are errored are marked as being bad. The goal is to enable scrub
1758 * to take those pages that are not errored from all the mirrors so that
1759 * the pages that are errored in the just handled mirror can be repaired.
1760 */
1761static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1762                                struct scrub_block *sblock,
1763                                int retry_failed_mirror)
1764{
1765        int page_num;
1766
1767        sblock->no_io_error_seen = 1;
1768
1769        /* short cut for raid56 */
1770        if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0]))
1771                return scrub_recheck_block_on_raid56(fs_info, sblock);
1772
1773        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1774                struct bio *bio;
1775                struct scrub_page *page = sblock->pagev[page_num];
1776
1777                if (page->dev->bdev == NULL) {
1778                        page->io_error = 1;
1779                        sblock->no_io_error_seen = 0;
1780                        continue;
1781                }
1782
1783                WARN_ON(!page->page);
1784                bio = btrfs_io_bio_alloc(1);
1785                bio_set_dev(bio, page->dev->bdev);
1786
1787                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1788                bio->bi_iter.bi_sector = page->physical >> 9;
1789                bio->bi_opf = REQ_OP_READ;
1790
1791                if (btrfsic_submit_bio_wait(bio)) {
1792                        page->io_error = 1;
1793                        sblock->no_io_error_seen = 0;
1794                }
1795
1796                bio_put(bio);
1797        }
1798
1799        if (sblock->no_io_error_seen)
1800                scrub_recheck_block_checksum(sblock);
1801}
1802
1803static inline int scrub_check_fsid(u8 fsid[],
1804                                   struct scrub_page *spage)
1805{
1806        struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1807        int ret;
1808
1809        ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1810        return !ret;
1811}
1812
1813static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1814{
1815        sblock->header_error = 0;
1816        sblock->checksum_error = 0;
1817        sblock->generation_error = 0;
1818
1819        if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1820                scrub_checksum_data(sblock);
1821        else
1822                scrub_checksum_tree_block(sblock);
1823}
1824
1825static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1826                                             struct scrub_block *sblock_good)
1827{
1828        int page_num;
1829        int ret = 0;
1830
1831        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1832                int ret_sub;
1833
1834                ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1835                                                           sblock_good,
1836                                                           page_num, 1);
1837                if (ret_sub)
1838                        ret = ret_sub;
1839        }
1840
1841        return ret;
1842}
1843
1844static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1845                                            struct scrub_block *sblock_good,
1846                                            int page_num, int force_write)
1847{
1848        struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1849        struct scrub_page *page_good = sblock_good->pagev[page_num];
1850        struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1851
1852        BUG_ON(page_bad->page == NULL);
1853        BUG_ON(page_good->page == NULL);
1854        if (force_write || sblock_bad->header_error ||
1855            sblock_bad->checksum_error || page_bad->io_error) {
1856                struct bio *bio;
1857                int ret;
1858
1859                if (!page_bad->dev->bdev) {
1860                        btrfs_warn_rl(fs_info,
1861                                "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1862                        return -EIO;
1863                }
1864
1865                bio = btrfs_io_bio_alloc(1);
1866                bio_set_dev(bio, page_bad->dev->bdev);
1867                bio->bi_iter.bi_sector = page_bad->physical >> 9;
1868                bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1869
1870                ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1871                if (PAGE_SIZE != ret) {
1872                        bio_put(bio);
1873                        return -EIO;
1874                }
1875
1876                if (btrfsic_submit_bio_wait(bio)) {
1877                        btrfs_dev_stat_inc_and_print(page_bad->dev,
1878                                BTRFS_DEV_STAT_WRITE_ERRS);
1879                        btrfs_dev_replace_stats_inc(
1880                                &fs_info->dev_replace.num_write_errors);
1881                        bio_put(bio);
1882                        return -EIO;
1883                }
1884                bio_put(bio);
1885        }
1886
1887        return 0;
1888}
1889
1890static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1891{
1892        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1893        int page_num;
1894
1895        /*
1896         * This block is used for the check of the parity on the source device,
1897         * so the data needn't be written into the destination device.
1898         */
1899        if (sblock->sparity)
1900                return;
1901
1902        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1903                int ret;
1904
1905                ret = scrub_write_page_to_dev_replace(sblock, page_num);
1906                if (ret)
1907                        btrfs_dev_replace_stats_inc(
1908                                &fs_info->dev_replace.num_write_errors);
1909        }
1910}
1911
1912static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1913                                           int page_num)
1914{
1915        struct scrub_page *spage = sblock->pagev[page_num];
1916
1917        BUG_ON(spage->page == NULL);
1918        if (spage->io_error) {
1919                void *mapped_buffer = kmap_atomic(spage->page);
1920
1921                clear_page(mapped_buffer);
1922                flush_dcache_page(spage->page);
1923                kunmap_atomic(mapped_buffer);
1924        }
1925        return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1926}
1927
1928static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1929                                    struct scrub_page *spage)
1930{
1931        struct scrub_bio *sbio;
1932        int ret;
1933
1934        mutex_lock(&sctx->wr_lock);
1935again:
1936        if (!sctx->wr_curr_bio) {
1937                sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1938                                              GFP_KERNEL);
1939                if (!sctx->wr_curr_bio) {
1940                        mutex_unlock(&sctx->wr_lock);
1941                        return -ENOMEM;
1942                }
1943                sctx->wr_curr_bio->sctx = sctx;
1944                sctx->wr_curr_bio->page_count = 0;
1945        }
1946        sbio = sctx->wr_curr_bio;
1947        if (sbio->page_count == 0) {
1948                struct bio *bio;
1949
1950                sbio->physical = spage->physical_for_dev_replace;
1951                sbio->logical = spage->logical;
1952                sbio->dev = sctx->wr_tgtdev;
1953                bio = sbio->bio;
1954                if (!bio) {
1955                        bio = btrfs_io_bio_alloc(sctx->pages_per_wr_bio);
1956                        sbio->bio = bio;
1957                }
1958
1959                bio->bi_private = sbio;
1960                bio->bi_end_io = scrub_wr_bio_end_io;
1961                bio_set_dev(bio, sbio->dev->bdev);
1962                bio->bi_iter.bi_sector = sbio->physical >> 9;
1963                bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1964                sbio->status = 0;
1965        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1966                   spage->physical_for_dev_replace ||
1967                   sbio->logical + sbio->page_count * PAGE_SIZE !=
1968                   spage->logical) {
1969                scrub_wr_submit(sctx);
1970                goto again;
1971        }
1972
1973        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1974        if (ret != PAGE_SIZE) {
1975                if (sbio->page_count < 1) {
1976                        bio_put(sbio->bio);
1977                        sbio->bio = NULL;
1978                        mutex_unlock(&sctx->wr_lock);
1979                        return -EIO;
1980                }
1981                scrub_wr_submit(sctx);
1982                goto again;
1983        }
1984
1985        sbio->pagev[sbio->page_count] = spage;
1986        scrub_page_get(spage);
1987        sbio->page_count++;
1988        if (sbio->page_count == sctx->pages_per_wr_bio)
1989                scrub_wr_submit(sctx);
1990        mutex_unlock(&sctx->wr_lock);
1991
1992        return 0;
1993}
1994
1995static void scrub_wr_submit(struct scrub_ctx *sctx)
1996{
1997        struct scrub_bio *sbio;
1998
1999        if (!sctx->wr_curr_bio)
2000                return;

2001
2002        sbio = sctx->wr_curr_bio;
2003        sctx->wr_curr_bio = NULL;
2004        WARN_ON(!sbio->bio->bi_disk);
2005        scrub_pending_bio_inc(sctx);
2006        /* process all writes in a single worker thread. Then the block layer
2007         * orders the requests before sending them to the driver which
2008         * doubled the write performance on spinning disks when measured
2009         * with Linux 3.5 */
2010        btrfsic_submit_bio(sbio->bio);
2011}
2012
2013static void scrub_wr_bio_end_io(struct bio *bio)
2014{
2015        struct scrub_bio *sbio = bio->bi_private;
2016        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2017
2018        sbio->status = bio->bi_status;
2019        sbio->bio = bio;
2020
2021        btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
2022                         scrub_wr_bio_end_io_worker, NULL, NULL);
2023        btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
2024}
2025
2026static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
2027{
2028        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2029        struct scrub_ctx *sctx = sbio->sctx;
2030        int i;
2031
2032        WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
2033        if (sbio->status) {
2034                struct btrfs_dev_replace *dev_replace =
2035                        &sbio->sctx->fs_info->dev_replace;
2036
2037                for (i = 0; i < sbio->page_count; i++) {
2038                        struct scrub_page *spage = sbio->pagev[i];
2039
2040                        spage->io_error = 1;
2041                        btrfs_dev_replace_stats_inc(&dev_replace->
2042                                                    num_write_errors);
2043                }
2044        }
2045
2046        for (i = 0; i < sbio->page_count; i++)
2047                scrub_page_put(sbio->pagev[i]);
2048
2049        bio_put(sbio->bio);
2050        kfree(sbio);
2051        scrub_pending_bio_dec(sctx);
2052}
2053
2054static int scrub_checksum(struct scrub_block *sblock)
2055{
2056        u64 flags;
2057        int ret;
2058
2059        /*
2060         * No need to initialize these stats currently,
2061         * because this function only use return value
2062         * instead of these stats value.
2063         *
2064         * Todo:
2065         * always use stats
2066         */
2067        sblock->header_error = 0;
2068        sblock->generation_error = 0;
2069        sblock->checksum_error = 0;
2070
2071        WARN_ON(sblock->page_count < 1);
2072        flags = sblock->pagev[0]->flags;
2073        ret = 0;
2074        if (flags & BTRFS_EXTENT_FLAG_DATA)
2075                ret = scrub_checksum_data(sblock);
2076        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
2077                ret = scrub_checksum_tree_block(sblock);
2078        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
2079                (void)scrub_checksum_super(sblock);
2080        else
2081                WARN_ON(1);
2082        if (ret)
2083                scrub_handle_errored_block(sblock);
2084
2085        return ret;
2086}
2087
2088static int scrub_checksum_data(struct scrub_block *sblock)
2089{
2090        struct scrub_ctx *sctx = sblock->sctx;
2091        u8 csum[BTRFS_CSUM_SIZE];
2092        u8 *on_disk_csum;
2093        struct page *page;
2094        void *buffer;
2095        u32 crc = ~(u32)0;
2096        u64 len;
2097        int index;
2098
2099        BUG_ON(sblock->page_count < 1);
2100        if (!sblock->pagev[0]->have_csum)
2101                return 0;
2102
2103        on_disk_csum = sblock->pagev[0]->csum;
2104        page = sblock->pagev[0]->page;
2105        buffer = kmap_atomic(page);
2106
2107        len = sctx->fs_info->sectorsize;
2108        index = 0;
2109        for (;;) {
2110                u64 l = min_t(u64, len, PAGE_SIZE);
2111
2112                crc = btrfs_csum_data(buffer, crc, l);
2113                kunmap_atomic(buffer);
2114                len -= l;
2115                if (len == 0)
2116                        break;
2117                index++;
2118                BUG_ON(index >= sblock->page_count);
2119                BUG_ON(!sblock->pagev[index]->page);
2120                page = sblock->pagev[index]->page;
2121                buffer = kmap_atomic(page);
2122        }
2123
2124        btrfs_csum_final(crc, csum);
2125        if (memcmp(csum, on_disk_csum, sctx->csum_size))
2126                sblock->checksum_error = 1;
2127
2128        return sblock->checksum_error;
2129}
2130
2131static int scrub_checksum_tree_block(struct scrub_block *sblock)
2132{
2133        struct scrub_ctx *sctx = sblock->sctx;
2134        struct btrfs_header *h;
2135        struct btrfs_fs_info *fs_info = sctx->fs_info;
2136        u8 calculated_csum[BTRFS_CSUM_SIZE];
2137        u8 on_disk_csum[BTRFS_CSUM_SIZE];
2138        struct page *page;
2139        void *mapped_buffer;
2140        u64 mapped_size;
2141        void *p;
2142        u32 crc = ~(u32)0;
2143        u64 len;
2144        int index;
2145
2146        BUG_ON(sblock->page_count < 1);
2147        page = sblock->pagev[0]->page;
2148        mapped_buffer = kmap_atomic(page);
2149        h = (struct btrfs_header *)mapped_buffer;
2150        memcpy(on_disk_csum, h->csum, sctx->csum_size);
2151
2152        /*
2153         * we don't use the getter functions here, as we
2154         * a) don't have an extent buffer and
2155         * b) the page is already kmapped
2156         */
2157        if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
2158                sblock->header_error = 1;
2159
2160        if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) {
2161                sblock->header_error = 1;
2162                sblock->generation_error = 1;
2163        }
2164
2165        if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
2166                sblock->header_error = 1;
2167
2168        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
2169                   BTRFS_UUID_SIZE))
2170                sblock->header_error = 1;
2171
2172        len = sctx->fs_info->nodesize - BTRFS_CSUM_SIZE;
2173        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2174        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2175        index = 0;
2176        for (;;) {
2177                u64 l = min_t(u64, len, mapped_size);
2178
2179                crc = btrfs_csum_data(p, crc, l);
2180                kunmap_atomic(mapped_buffer);
2181                len -= l;
2182                if (len == 0)
2183                        break;
2184                index++;
2185                BUG_ON(index >= sblock->page_count);
2186                BUG_ON(!sblock->pagev[index]->page);
2187                page = sblock->pagev[index]->page;
2188                mapped_buffer = kmap_atomic(page);
2189                mapped_size = PAGE_SIZE;
2190                p = mapped_buffer;
2191        }
2192
2193        btrfs_csum_final(crc, calculated_csum);
2194        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2195                sblock->checksum_error = 1;
2196
2197        return sblock->header_error || sblock->checksum_error;
2198}
2199
2200static int scrub_checksum_super(struct scrub_block *sblock)
2201{
2202        struct btrfs_super_block *s;
2203        struct scrub_ctx *sctx = sblock->sctx;
2204        u8 calculated_csum[BTRFS_CSUM_SIZE];
2205        u8 on_disk_csum[BTRFS_CSUM_SIZE];
2206        struct page *page;
2207        void *mapped_buffer;
2208        u64 mapped_size;
2209        void *p;
2210        u32 crc = ~(u32)0;
2211        int fail_gen = 0;
2212        int fail_cor = 0;
2213        u64 len;
2214        int index;
2215
2216        BUG_ON(sblock->page_count < 1);
2217        page = sblock->pagev[0]->page;
2218        mapped_buffer = kmap_atomic(page);
2219        s = (struct btrfs_super_block *)mapped_buffer;
2220        memcpy(on_disk_csum, s->csum, sctx->csum_size);
2221
2222        if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
2223                ++fail_cor;
2224
2225        if (sblock->pagev[0]->generation != btrfs_super_generation(s))
2226                ++fail_gen;
2227
2228        if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2229                ++fail_cor;
2230
2231        len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2232        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2233        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2234        index = 0;
2235        for (;;) {
2236                u64 l = min_t(u64, len, mapped_size);
2237
2238                crc = btrfs_csum_data(p, crc, l);
2239                kunmap_atomic(mapped_buffer);
2240                len -= l;
2241                if (len == 0)
2242                        break;
2243                index++;
2244                BUG_ON(index >= sblock->page_count);
2245                BUG_ON(!sblock->pagev[index]->page);
2246                page = sblock->pagev[index]->page;
2247                mapped_buffer = kmap_atomic(page);
2248                mapped_size = PAGE_SIZE;
2249                p = mapped_buffer;
2250        }
2251
2252        btrfs_csum_final(crc, calculated_csum);
2253        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2254                ++fail_cor;
2255
2256        if (fail_cor + fail_gen) {
2257                /*
2258                 * if we find an error in a super block, we just report it.
2259                 * They will get written with the next transaction commit
2260                 * anyway
2261                 */
2262                spin_lock(&sctx->stat_lock);
2263                ++sctx->stat.super_errors;
2264                spin_unlock(&sctx->stat_lock);
2265                if (fail_cor)
2266                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2267                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
2268                else
2269                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2270                                BTRFS_DEV_STAT_GENERATION_ERRS);
2271        }
2272
2273        return fail_cor + fail_gen;
2274}
2275
2276static void scrub_block_get(struct scrub_block *sblock)
2277{
2278        refcount_inc(&sblock->refs);
2279}
2280
2281static void scrub_block_put(struct scrub_block *sblock)
2282{
2283        if (refcount_dec_and_test(&sblock->refs)) {
2284                int i;
2285
2286                if (sblock->sparity)
2287                        scrub_parity_put(sblock->sparity);
2288
2289                for (i = 0; i < sblock->page_count; i++)
2290                        scrub_page_put(sblock->pagev[i]);
2291                kfree(sblock);
2292        }
2293}
2294
2295static void scrub_page_get(struct scrub_page *spage)
2296{
2297        atomic_inc(&spage->refs);
2298}
2299
2300static void scrub_page_put(struct scrub_page *spage)
2301{
2302        if (atomic_dec_and_test(&spage->refs)) {
2303                if (spage->page)
2304                        __free_page(spage->page);
2305                kfree(spage);
2306        }
2307}
2308
2309static void scrub_submit(struct scrub_ctx *sctx)
2310{
2311        struct scrub_bio *sbio;
2312
2313        if (sctx->curr == -1)
2314                return;
2315
2316        sbio = sctx->bios[sctx->curr];
2317        sctx->curr = -1;
2318        scrub_pending_bio_inc(sctx);
2319        btrfsic_submit_bio(sbio->bio);
2320}
2321
/*
 * Append @spage to the read bio currently being assembled in @sctx.
 *
 * If no bio is being assembled, one is taken from the context's free list,
 * sleeping on sctx->list_wait until a slot is available.  A page that does not
 * directly follow the pages already in the bio (physically, logically, and on
 * the same device) causes the current bio to be submitted and a new one to be
 * started.  The bio is also submitted once it holds sctx->pages_per_rd_bio
 * pages.
 *
 * For every page added, a reference on the owning scrub_block is taken and its
 * outstanding_pages counter is incremented.
 *
 * Returns 0 on success, or -EIO when bio_add_page() refuses the page even
 * though the bio is still empty.
 */
static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
                                    struct scrub_page *spage)
{
        struct scrub_block *sblock = spage->sblock;
        struct scrub_bio *sbio;
        int ret;

again:
        /*
         * grab a fresh bio or wait for one to become available
         */
        while (sctx->curr == -1) {
                spin_lock(&sctx->list_lock);
                sctx->curr = sctx->first_free;
                if (sctx->curr != -1) {
                        /* unlink the slot from the free list and reset it */
                        sctx->first_free = sctx->bios[sctx->curr]->next_free;
                        sctx->bios[sctx->curr]->next_free = -1;
                        sctx->bios[sctx->curr]->page_count = 0;
                        spin_unlock(&sctx->list_lock);
                } else {
                        /* free list is empty: sleep until a slot is returned */
                        spin_unlock(&sctx->list_lock);
                        wait_event(sctx->list_wait, sctx->first_free != -1);
                }
        }
        sbio = sctx->bios[sctx->curr];
        if (sbio->page_count == 0) {
                struct bio *bio;

                /* first page: it defines where the bio starts */
                sbio->physical = spage->physical;
                sbio->logical = spage->logical;
                sbio->dev = spage->dev;
                bio = sbio->bio;
                if (!bio) {
                        bio = btrfs_io_bio_alloc(sctx->pages_per_rd_bio);
                        sbio->bio = bio;
                }

                bio->bi_private = sbio;
                bio->bi_end_io = scrub_bio_end_io;
                bio_set_dev(bio, sbio->dev->bdev);
                /* bi_sector is expressed in 512-byte units */
                bio->bi_iter.bi_sector = sbio->physical >> 9;
                bio_set_op_attrs(bio, REQ_OP_READ, 0);
                sbio->status = 0;
        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
                   spage->physical ||
                   sbio->logical + sbio->page_count * PAGE_SIZE !=
                   spage->logical ||
                   sbio->dev != spage->dev) {
                /* not contiguous with the bio's contents: flush and retry */
                scrub_submit(sctx);
                goto again;
        }

        sbio->pagev[sbio->page_count] = spage;
        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
        if (ret != PAGE_SIZE) {
                if (sbio->page_count < 1) {
                        /* even an empty bio cannot take the page: give up */
                        bio_put(sbio->bio);
                        sbio->bio = NULL;
                        return -EIO;
                }
                /* bio cannot take more pages: flush it and retry */
                scrub_submit(sctx);
                goto again;
        }

        scrub_block_get(sblock); /* one for the page added to the bio */
        atomic_inc(&sblock->outstanding_pages);
        sbio->page_count++;
        if (sbio->page_count == sctx->pages_per_rd_bio)
                scrub_submit(sctx);

        return 0;
}
2394
2395static void scrub_missing_raid56_end_io(struct bio *bio)
2396{
2397        struct scrub_block *sblock = bio->bi_private;
2398        struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2399
2400        if (bio->bi_status)
2401                sblock->no_io_error_seen = 0;
2402
2403        bio_put(bio);
2404
2405        btrfs_queue_work(fs_info->scrub_workers, &sblock->work);
2406}
2407
2408static void scrub_missing_raid56_worker(struct btrfs_work *work)
2409{
2410        struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2411        struct scrub_ctx *sctx = sblock->sctx;
2412        struct btrfs_fs_info *fs_info = sctx->fs_info;
2413        u64 logical;
2414        struct btrfs_device *dev;
2415
2416        logical = sblock->pagev[0]->logical;
2417        dev = sblock->pagev[0]->dev;
2418
2419        if (sblock->no_io_error_seen)
2420                scrub_recheck_block_checksum(sblock);
2421
2422        if (!sblock->no_io_error_seen) {
2423                spin_lock(&sctx->stat_lock);
2424                sctx->stat.read_errors++;
2425                spin_unlock(&sctx->stat_lock);
2426                btrfs_err_rl_in_rcu(fs_info,
2427                        "IO error rebuilding logical %llu for dev %s",
2428                        logical, rcu_str_deref(dev->name));
2429        } else if (sblock->header_error || sblock->checksum_error) {
2430                spin_lock(&sctx->stat_lock);
2431                sctx->stat.uncorrectable_errors++;
2432                spin_unlock(&sctx->stat_lock);
2433                btrfs_err_rl_in_rcu(fs_info,
2434                        "failed to rebuild valid logical %llu for dev %s",
2435                        logical, rcu_str_deref(dev->name));
2436        } else {
2437                scrub_write_block_to_dev_replace(sblock);
2438        }
2439
2440        scrub_block_put(sblock);
2441
2442        if (sctx->is_dev_replace && sctx->flush_all_writes) {
2443                mutex_lock(&sctx->wr_lock);
2444                scrub_wr_submit(sctx);
2445                mutex_unlock(&sctx->wr_lock);
2446        }
2447
2448        scrub_pending_bio_dec(sctx);
2449}
2450
2451static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2452{
2453        struct scrub_ctx *sctx = sblock->sctx;
2454        struct btrfs_fs_info *fs_info = sctx->fs_info;
2455        u64 length = sblock->page_count * PAGE_SIZE;
2456        u64 logical = sblock->pagev[0]->logical;
2457        struct btrfs_bio *bbio = NULL;
2458        struct bio *bio;
2459        struct btrfs_raid_bio *rbio;
2460        int ret;
2461        int i;
2462
2463        btrfs_bio_counter_inc_blocked(fs_info);
2464        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2465                        &length, &bbio);
2466        if (ret || !bbio || !bbio->raid_map)
2467                goto bbio_out;
2468
2469        if (WARN_ON(!sctx->is_dev_replace ||
2470                    !(bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2471                /*
2472                 * We shouldn't be scrubbing a missing device. Even for dev
2473                 * replace, we should only get here for RAID 5/6. We either
2474                 * managed to mount something with no mirrors remaining or
2475                 * there's a bug in scrub_remap_extent()/btrfs_map_block().
2476                 */
2477                goto bbio_out;
2478        }
2479
2480        bio = btrfs_io_bio_alloc(0);
2481        bio->bi_iter.bi_sector = logical >> 9;
2482        bio->bi_private = sblock;
2483        bio->bi_end_io = scrub_missing_raid56_end_io;
2484
2485        rbio = raid56_alloc_missing_rbio(fs_info, bio, bbio, length);
2486        if (!rbio)
2487                goto rbio_out;
2488
2489        for (i = 0; i < sblock->page_count; i++) {
2490                struct scrub_page *spage = sblock->pagev[i];
2491
2492                raid56_add_scrub_pages(rbio, spage->page, spage->logical);
2493        }
2494
2495        btrfs_init_work(&sblock->work, btrfs_scrub_helper,
2496                        scrub_missing_raid56_worker, NULL, NULL);
2497        scrub_block_get(sblock);
2498        scrub_pending_bio_inc(sctx);
2499        raid56_submit_missing_rbio(rbio);
2500        return;
2501
2502rbio_out:
2503        bio_put(bio);
2504bbio_out:
2505        btrfs_bio_counter_dec(fs_info);
2506        btrfs_put_bbio(bbio);
2507        spin_lock(&sctx->stat_lock);
2508        sctx->stat.malloc_errors++;
2509        spin_unlock(&sctx->stat_lock);
2510}
2511
2512static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2513                       u64 physical, struct btrfs_device *dev, u64 flags,
2514                       u64 gen, int mirror_num, u8 *csum, int force,
2515                       u64 physical_for_dev_replace)
2516{
2517        struct scrub_block *sblock;
2518        int index;
2519
2520        sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2521        if (!sblock) {
2522                spin_lock(&sctx->stat_lock);
2523                sctx->stat.malloc_errors++;
2524                spin_unlock(&sctx->stat_lock);
2525                return -ENOMEM;
2526        }
2527
2528        /* one ref inside this function, plus one for each page added to
2529         * a bio later on */
2530        refcount_set(&sblock->refs, 1);
2531        sblock->sctx = sctx;
2532        sblock->no_io_error_seen = 1;
2533
2534        for (index = 0; len > 0; index++) {
2535                struct scrub_page *spage;
2536                u64 l = min_t(u64, len, PAGE_SIZE);
2537
2538                spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2539                if (!spage) {
2540leave_nomem:
2541                        spin_lock(&sctx->stat_lock);
2542                        sctx->stat.malloc_errors++;
2543                        spin_unlock(&sctx->stat_lock);
2544                        scrub_block_put(sblock);
2545                        return -ENOMEM;
2546                }
2547                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2548                scrub_page_get(spage);
2549                sblock->pagev[index] = spage;
2550                spage->sblock = sblock;
2551                spage->dev = dev;
2552                spage->flags = flags;
2553                spage->generation = gen;
2554                spage->logical = logical;
2555                spage->physical = physical;
2556                spage->physical_for_dev_replace = physical_for_dev_replace;
2557                spage->mirror_num = mirror_num;
2558                if (csum) {
2559                        spage->have_csum = 1;
2560                        memcpy(spage->csum, csum, sctx->csum_size);
2561                } else {
2562                        spage->have_csum = 0;
2563                }
2564                sblock->page_count++;
2565                spage->page = alloc_page(GFP_KERNEL);
2566                if (!spage->page)
2567                        goto leave_nomem;
2568                len -= l;
2569                logical += l;
2570                physical += l;
2571                physical_for_dev_replace += l;
2572        }
2573
2574        WARN_ON(sblock->page_count == 0);
2575        if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2576                /*
2577                 * This case should only be hit for RAID 5/6 device replace. See
2578                 * the comment in scrub_missing_raid56_pages() for details.
2579                 */
2580                scrub_missing_raid56_pages(sblock);
2581        } else {
2582                for (index = 0; index < sblock->page_count; index++) {
2583                        struct scrub_page *spage = sblock->pagev[index];
2584                        int ret;
2585
2586                        ret = scrub_add_page_to_rd_bio(sctx, spage);
2587                        if (ret) {
2588                                scrub_block_put(sblock);
2589                                return ret;
2590                        }
2591                }
2592
2593                if (force)
2594                        scrub_submit(sctx);
2595        }
2596
2597        /* last one frees, either here or in bio completion for last page */
2598        scrub_block_put(sblock);
2599        return 0;
2600}
2601
2602static void scrub_bio_end_io(struct bio *bio)
2603{
2604        struct scrub_bio *sbio = bio->bi_private;
2605        struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2606
2607        sbio->status = bio->bi_status;
2608        sbio->bio = bio;
2609
2610        btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2611}
2612
2613static void scrub_bio_end_io_worker(struct btrfs_work *work)
2614{
2615        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2616        struct scrub_ctx *sctx = sbio->sctx;
2617        int i;
2618
2619        BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2620        if (sbio->status) {
2621                for (i = 0; i < sbio->page_count; i++) {
2622                        struct scrub_page *spage = sbio->pagev[i];
2623
2624                        spage->io_error = 1;
2625                        spage->sblock->no_io_error_seen = 0;
2626                }
2627        }
2628
2629        /* now complete the scrub_block items that have all pages completed */
2630        for (i = 0; i < sbio->page_count; i++) {
2631                struct scrub_page *spage = sbio->pagev[i];
2632                struct scrub_block *sblock = spage->sblock;
2633
2634                if (atomic_dec_and_test(&sblock->outstanding_pages))
2635                        scrub_block_complete(sblock);
2636                scrub_block_put(sblock);
2637        }
2638
2639        bio_put(sbio->bio);
2640        sbio->bio = NULL;
2641        spin_lock(&sctx->list_lock);
2642        sbio->next_free = sctx->first_free;
2643        sctx->first_free = sbio->index;
2644        spin_unlock(&sctx->list_lock);
2645
2646        if (sctx->is_dev_replace && sctx->flush_all_writes) {
2647                mutex_lock(&sctx->wr_lock);
2648                scrub_wr_submit(sctx);
2649                mutex_unlock(&sctx->wr_lock);
2650        }
2651
2652        scrub_pending_bio_dec(sctx);
2653}
2654
2655static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2656                                       unsigned long *bitmap,
2657                                       u64 start, u64 len)
2658{
2659        u64 offset;
2660        u64 nsectors64;
2661        u32 nsectors;
2662        int sectorsize = sparity->sctx->fs_info->sectorsize;
2663
2664        if (len >= sparity->stripe_len) {
2665                bitmap_set(bitmap, 0, sparity->nsectors);
2666                return;
2667        }
2668
2669        start -= sparity->logic_start;
2670        start = div64_u64_rem(start, sparity->stripe_len, &offset);
2671        offset = div_u64(offset, sectorsize);
2672        nsectors64 = div_u64(len, sectorsize);
2673
2674        ASSERT(nsectors64 < UINT_MAX);
2675        nsectors = (u32)nsectors64;
2676
2677        if (offset + nsectors <= sparity->nsectors) {
2678                bitmap_set(bitmap, offset, nsectors);
2679                return;
2680        }
2681
2682        bitmap_set(bitmap, offset, sparity->nsectors - offset);
2683        bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2684}
2685
2686static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2687                                                   u64 start, u64 len)
2688{
2689        __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2690}
2691
2692static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2693                                                  u64 start, u64 len)
2694{
2695        __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2696}
2697
2698static void scrub_block_complete(struct scrub_block *sblock)
2699{
2700        int corrupted = 0;
2701
2702        if (!sblock->no_io_error_seen) {
2703                corrupted = 1;
2704                scrub_handle_errored_block(sblock);
2705        } else {
2706                /*
2707                 * if has checksum error, write via repair mechanism in
2708                 * dev replace case, otherwise write here in dev replace
2709                 * case.
2710                 */
2711                corrupted = scrub_checksum(sblock);
2712                if (!corrupted && sblock->sctx->is_dev_replace)
2713                        scrub_write_block_to_dev_replace(sblock);
2714        }
2715
2716        if (sblock->sparity && corrupted && !sblock->data_corrected) {
2717                u64 start = sblock->pagev[0]->logical;
2718                u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2719                          PAGE_SIZE;
2720
2721                scrub_parity_mark_sectors_error(sblock->sparity,
2722                                                start, end - start);
2723        }
2724}
2725
2726static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2727{
2728        struct btrfs_ordered_sum *sum = NULL;
2729        unsigned long index;
2730        unsigned long num_sectors;
2731
2732        while (!list_empty(&sctx->csum_list)) {
2733                sum = list_first_entry(&sctx->csum_list,
2734                                       struct btrfs_ordered_sum, list);
2735                if (sum->bytenr > logical)
2736                        return 0;
2737                if (sum->bytenr + sum->len > logical)
2738                        break;
2739
2740                ++sctx->stat.csum_discards;
2741                list_del(&sum->list);
2742                kfree(sum);
2743                sum = NULL;
2744        }
2745        if (!sum)
2746                return 0;
2747
2748        index = div_u64(logical - sum->bytenr, sctx->fs_info->sectorsize);
2749        ASSERT(index < UINT_MAX);
2750
2751        num_sectors = sum->len / sctx->fs_info->sectorsize;
2752        memcpy(csum, sum->sums + index, sctx->csum_size);
2753        if (index == num_sectors - 1) {
2754                list_del(&sum->list);
2755                kfree(sum);
2756        }
2757        return 1;
2758}
2759
2760/* scrub extent tries to collect up to 64 kB for each bio */
2761static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2762                        u64 logical, u64 len,
2763                        u64 physical, struct btrfs_device *dev, u64 flags,
2764                        u64 gen, int mirror_num, u64 physical_for_dev_replace)
2765{
2766        int ret;
2767        u8 csum[BTRFS_CSUM_SIZE];
2768        u32 blocksize;
2769
2770        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2771                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2772                        blocksize = map->stripe_len;
2773                else
2774                        blocksize = sctx->fs_info->sectorsize;
2775                spin_lock(&sctx->stat_lock);
2776                sctx->stat.data_extents_scrubbed++;
2777                sctx->stat.data_bytes_scrubbed += len;
2778                spin_unlock(&sctx->stat_lock);
2779        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2780                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2781                        blocksize = map->stripe_len;
2782                else
2783                        blocksize = sctx->fs_info->nodesize;
2784                spin_lock(&sctx->stat_lock);
2785                sctx->stat.tree_extents_scrubbed++;
2786                sctx->stat.tree_bytes_scrubbed += len;
2787                spin_unlock(&sctx->stat_lock);
2788        } else {
2789                blocksize = sctx->fs_info->sectorsize;
2790                WARN_ON(1);
2791        }
2792
2793        while (len) {
2794                u64 l = min_t(u64, len, blocksize);
2795                int have_csum = 0;
2796
2797                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2798                        /* push csums to sbio */
2799                        have_csum = scrub_find_csum(sctx, logical, csum);
2800                        if (have_csum == 0)
2801                                ++sctx->stat.no_csum;
2802                        if (sctx->is_dev_replace && !have_csum) {
2803                                ret = copy_nocow_pages(sctx, logical, l,
2804                                                       mirror_num,
2805                                                      physical_for_dev_replace);
2806                                goto behind_scrub_pages;
2807                        }
2808                }
2809                ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2810                                  mirror_num, have_csum ? csum : NULL, 0,
2811                                  physical_for_dev_replace);
2812behind_scrub_pages:
2813                if (ret)
2814                        return ret;
2815                len -= l;
2816                logical += l;
2817                physical += l;
2818                physical_for_dev_replace += l;
2819        }
2820        return 0;
2821}
2822
2823static int scrub_pages_for_parity(struct scrub_parity *sparity,
2824                                  u64 logical, u64 len,
2825                                  u64 physical, struct btrfs_device *dev,
2826                                  u64 flags, u64 gen, int mirror_num, u8 *csum)
2827{
2828        struct scrub_ctx *sctx = sparity->sctx;
2829        struct scrub_block *sblock;
2830        int index;
2831
2832        sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2833        if (!sblock) {
2834                spin_lock(&sctx->stat_lock);
2835                sctx->stat.malloc_errors++;
2836                spin_unlock(&sctx->stat_lock);
2837                return -ENOMEM;
2838        }
2839
2840        /* one ref inside this function, plus one for each page added to
2841         * a bio later on */
2842        refcount_set(&sblock->refs, 1);
2843        sblock->sctx = sctx;
2844        sblock->no_io_error_seen = 1;
2845        sblock->sparity = sparity;
2846        scrub_parity_get(sparity);
2847
2848        for (index = 0; len > 0; index++) {
2849                struct scrub_page *spage;
2850                u64 l = min_t(u64, len, PAGE_SIZE);
2851
2852                spage = kzalloc(sizeof(*spage), GFP_KERNEL);
2853                if (!spage) {
2854leave_nomem:
2855                        spin_lock(&sctx->stat_lock);
2856                        sctx->stat.malloc_errors++;
2857                        spin_unlock(&sctx->stat_lock);
2858                        scrub_block_put(sblock);
2859                        return -ENOMEM;
2860                }
2861                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2862                /* For scrub block */
2863                scrub_page_get(spage);
2864                sblock->pagev[index] = spage;
2865                /* For scrub parity */
2866                scrub_page_get(spage);
2867                list_add_tail(&spage->list, &sparity->spages);
2868                spage->sblock = sblock;
2869                spage->dev = dev;
2870                spage->flags = flags;
2871                spage->generation = gen;
2872                spage->logical = logical;
2873                spage->physical = physical;
2874                spage->mirror_num = mirror_num;
2875                if (csum) {
2876                        spage->have_csum = 1;
2877                        memcpy(spage->csum, csum, sctx->csum_size);
2878                } else {
2879                        spage->have_csum = 0;
2880                }
2881                sblock->page_count++;
2882                spage->page = alloc_page(GFP_KERNEL);
2883                if (!spage->page)
2884                        goto leave_nomem;
2885                len -= l;
2886                logical += l;
2887                physical += l;
2888        }
2889
2890        WARN_ON(sblock->page_count == 0);
2891        for (index = 0; index < sblock->page_count; index++) {
2892                struct scrub_page *spage = sblock->pagev[index];
2893                int ret;
2894
2895                ret = scrub_add_page_to_rd_bio(sctx, spage);
2896                if (ret) {
2897                        scrub_block_put(sblock);
2898                        return ret;
2899                }
2900        }
2901
2902        /* last one frees, either here or in bio completion for last page */
2903        scrub_block_put(sblock);
2904        return 0;
2905}
2906
2907static int scrub_extent_for_parity(struct scrub_parity *sparity,
2908                                   u64 logical, u64 len,
2909                                   u64 physical, struct btrfs_device *dev,
2910                                   u64 flags, u64 gen, int mirror_num)
2911{
2912        struct scrub_ctx *sctx = sparity->sctx;
2913        int ret;
2914        u8 csum[BTRFS_CSUM_SIZE];
2915        u32 blocksize;
2916
2917        if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2918                scrub_parity_mark_sectors_error(sparity, logical, len);
2919                return 0;
2920        }
2921
2922        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2923                blocksize = sparity->stripe_len;
2924        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2925                blocksize = sparity->stripe_len;
2926        } else {
2927                blocksize = sctx->fs_info->sectorsize;
2928                WARN_ON(1);
2929        }
2930
2931        while (len) {
2932                u64 l = min_t(u64, len, blocksize);
2933                int have_csum = 0;
2934
2935                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2936                        /* push csums to sbio */
2937                        have_csum = scrub_find_csum(sctx, logical, csum);
2938                        if (have_csum == 0)
2939                                goto skip;
2940                }
2941                ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2942                                             flags, gen, mirror_num,
2943                                             have_csum ? csum : NULL);
2944                if (ret)
2945                        return ret;
2946skip:
2947                len -= l;
2948                logical += l;
2949                physical += l;
2950        }
2951        return 0;
2952}
2953
2954/*
2955 * Given a physical address, this will calculate it's
2956 * logical offset. if this is a parity stripe, it will return
2957 * the most left data stripe's logical offset.
2958 *
2959 * return 0 if it is a data stripe, 1 means parity stripe.
2960 */
2961static int get_raid56_logic_offset(u64 physical, int num,
2962                                   struct map_lookup *map, u64 *offset,
2963                                   u64 *stripe_start)
2964{
2965        int i;
2966        int j = 0;
2967        u64 stripe_nr;
2968        u64 last_offset;
2969        u32 stripe_index;
2970        u32 rot;
2971
2972        last_offset = (physical - map->stripes[num].physical) *
2973                      nr_data_stripes(map);
2974        if (stripe_start)
2975                *stripe_start = last_offset;
2976
2977        *offset = last_offset;
2978        for (i = 0; i < nr_data_stripes(map); i++) {
2979                *offset = last_offset + i * map->stripe_len;
2980
2981                stripe_nr = div64_u64(*offset, map->stripe_len);
2982                stripe_nr = div_u64(stripe_nr, nr_data_stripes(map));
2983
2984                /* Work out the disk rotation on this stripe-set */
2985                stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2986                /* calculate which stripe this data locates */
2987                rot += i;
2988                stripe_index = rot % map->num_stripes;
2989                if (stripe_index == num)
2990                        return 0;
2991                if (stripe_index < num)
2992                        j++;
2993        }
2994        *offset = last_offset + j * map->stripe_len;
2995        return 1;
2996}
2997
2998static void scrub_free_parity(struct scrub_parity *sparity)
2999{
3000        struct scrub_ctx *sctx = sparity->sctx;

3001        struct scrub_page *curr, *next;
3002        int nbits;
3003
3004        nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
3005        if (nbits) {
3006                spin_lock(&sctx->stat_lock);
3007                sctx->stat.read_errors += nbits;
3008                sctx->stat.uncorrectable_errors += nbits;
3009                spin_unlock(&sctx->stat_lock);
3010        }
3011
3012        list_for_each_entry_safe(curr, next, &sparity->spages, list) {
3013                list_del_init(&curr->list);
3014                scrub_page_put(curr);
3015        }
3016
3017        kfree(sparity);
3018}
3019
3020static void scrub_parity_bio_endio_worker(struct btrfs_work *work)
3021{
3022        struct scrub_parity *sparity = container_of(work, struct scrub_parity,
3023                                                    work);
3024        struct scrub_ctx *sctx = sparity->sctx;
3025
3026        scrub_free_parity(sparity);
3027        scrub_pending_bio_dec(sctx);
3028}
3029
3030static void scrub_parity_bio_endio(struct bio *bio)
3031{
3032        struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
3033        struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
3034
3035        if (bio->bi_status)
3036                bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3037                          sparity->nsectors);
3038
3039        bio_put(bio);
3040
3041        btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
3042                        scrub_parity_bio_endio_worker, NULL, NULL);
3043        btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
3044}
3045
3046static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
3047{
3048        struct scrub_ctx *sctx = sparity->sctx;
3049        struct btrfs_fs_info *fs_info = sctx->fs_info;
3050        struct bio *bio;
3051        struct btrfs_raid_bio *rbio;
3052        struct btrfs_bio *bbio = NULL;
3053        u64 length;
3054        int ret;
3055
3056        if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
3057                           sparity->nsectors))
3058                goto out;
3059
3060        length = sparity->logic_end - sparity->logic_start;
3061
3062        btrfs_bio_counter_inc_blocked(fs_info);
3063        ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
3064                               &length, &bbio);
3065        if (ret || !bbio || !bbio->raid_map)
3066                goto bbio_out;
3067
3068        bio = btrfs_io_bio_alloc(0);
3069        bio->bi_iter.bi_sector = sparity->logic_start >> 9;
3070        bio->bi_private = sparity;
3071        bio->bi_end_io = scrub_parity_bio_endio;
3072
3073        rbio = raid56_parity_alloc_scrub_rbio(fs_info, bio, bbio,
3074                                              length, sparity->scrub_dev,
3075                                              sparity->dbitmap,
3076                                              sparity->nsectors);
3077        if (!rbio)
3078                goto rbio_out;
3079
3080        scrub_pending_bio_inc(sctx);
3081        raid56_parity_submit_scrub_rbio(rbio);
3082        return;
3083
3084rbio_out:
3085        bio_put(bio);
3086bbio_out:
3087        btrfs_bio_counter_dec(fs_info);
3088        btrfs_put_bbio(bbio);
3089        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
3090                  sparity->nsectors);
3091        spin_lock(&sctx->stat_lock);
3092        sctx->stat.malloc_errors++;
3093        spin_unlock(&sctx->stat_lock);
3094out:
3095        scrub_free_parity(sparity);
3096}
3097
3098static inline int scrub_calc_parity_bitmap_len(int nsectors)
3099{
3100        return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
3101}
3102
3103static void scrub_parity_get(struct scrub_parity *sparity)
3104{
3105        refcount_inc(&sparity->refs);
3106}
3107
3108static void scrub_parity_put(struct scrub_parity *sparity)
3109{
3110        if (!refcount_dec_and_test(&sparity->refs))
3111                return;
3112
3113        scrub_parity_check_and_repair(sparity);
3114}
3115
/*
 * Scrub the logical range covered by one RAID5/6 parity stripe.
 *
 * Walks the extent tree for every extent item overlapping
 * [@logic_start, @logic_end), one map->stripe_len window at a time.  Each
 * piece is recorded in the data bitmap of a freshly allocated scrub_parity,
 * mapped with btrfs_map_block() to find the device and physical offset to
 * read it from, and handed to scrub_extent_for_parity() together with its
 * checksums.
 *
 * @sctx:        scrub context; supplies fs_info, the csum list and the
 *               statistics counters
 * @map:         chunk mapping, used here for its stripe_len
 * @sdev:        stored as scrub_dev in the scrub_parity
 * @path:        caller-provided path used for the extent tree searches; it is
 *               released before returning
 * @logic_start: first logical address to cover
 * @logic_end:   logical address at which to stop (exclusive)
 *
 * Once the scrub_parity has been allocated, every exit path drops the
 * initial reference on it and submits the pending read and write bios.  On
 * failure the range from the current position up to @logic_end is flagged
 * through scrub_parity_mark_sectors_error() first.
 *
 * Return: 0 on success, -ENOMEM if the scrub_parity cannot be allocated, or
 * the negative error of the step that failed.
 */
3116static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3117                                                  struct map_lookup *map,
3118                                                  struct btrfs_device *sdev,
3119                                                  struct btrfs_path *path,
3120                                                  u64 logic_start,
3121                                                  u64 logic_end)
3122{
3123        struct btrfs_fs_info *fs_info = sctx->fs_info;
3124        struct btrfs_root *root = fs_info->extent_root;
3125        struct btrfs_root *csum_root = fs_info->csum_root;
3126        struct btrfs_extent_item *extent;
3127        struct btrfs_bio *bbio = NULL;
3128        u64 flags;
3129        int ret;
3130        int slot;
3131        struct extent_buffer *l;
3132        struct btrfs_key key;
3133        u64 generation;
3134        u64 extent_logical;
3135        u64 extent_physical;
3136        u64 extent_len;
3137        u64 mapped_length;
3138        struct btrfs_device *extent_dev;
3139        struct scrub_parity *sparity;
3140        int nsectors;
3141        int bitmap_len;
3142        int extent_mirror_num;
3143        int stop_loop = 0;
3144
        /*
         * One bit per sector of a stripe.  Storage for two such bitmaps is
         * allocated directly behind the structure.
         */
3145        nsectors = div_u64(map->stripe_len, fs_info->sectorsize);
3146        bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3147        sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3148                          GFP_NOFS);
3149        if (!sparity) {
3150                spin_lock(&sctx->stat_lock);
3151                sctx->stat.malloc_errors++;
3152                spin_unlock(&sctx->stat_lock);
3153                return -ENOMEM;
3154        }
3155
3156        sparity->stripe_len = map->stripe_len;
3157        sparity->nsectors = nsectors;
3158        sparity->sctx = sctx;
3159        sparity->scrub_dev = sdev;
3160        sparity->logic_start = logic_start;
3161        sparity->logic_end = logic_end;
        /* this initial reference is dropped by scrub_parity_put() at "out" */
3162        refcount_set(&sparity->refs, 1);
3163        INIT_LIST_HEAD(&sparity->spages);
        /* dbitmap takes the first bitmap_len bytes, ebitmap the following ones */
3164        sparity->dbitmap = sparity->bitmap;
3165        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3166
3167        ret = 0;
3168        while (logic_start < logic_end) {
3169                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3170                        key.type = BTRFS_METADATA_ITEM_KEY;
3171                else
3172                        key.type = BTRFS_EXTENT_ITEM_KEY;
3173                key.objectid = logic_start;
3174                key.offset = (u64)-1;
3175
3176                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3177                if (ret < 0)
3178                        goto out;
3179
                /*
                 * No exact match: step back to the preceding extent item,
                 * which may still reach into the current window.  If there
                 * is no such item (ret > 0), search again and carry on from
                 * the slot that search yields.
                 */
3180                if (ret > 0) {
3181                        ret = btrfs_previous_extent_item(root, path, 0);
3182                        if (ret < 0)
3183                                goto out;
3184                        if (ret > 0) {
3185                                btrfs_release_path(path);
3186                                ret = btrfs_search_slot(NULL, root, &key,
3187                                                        path, 0, 0);
3188                                if (ret < 0)
3189                                        goto out;
3190                        }
3191                }
3192
3193                stop_loop = 0;
3194                while (1) {
3195                        u64 bytes;
3196
3197                        l = path->nodes[0];
3198                        slot = path->slots[0];
                        /*
                         * Past the last item of this leaf: continue in the
                         * next leaf; a positive return from btrfs_next_leaf()
                         * ends the whole walk.
                         */
3199                        if (slot >= btrfs_header_nritems(l)) {
3200                                ret = btrfs_next_leaf(root, path);
3201                                if (ret == 0)
3202                                        continue;
3203                                if (ret < 0)
3204                                        goto out;
3205
3206                                stop_loop = 1;
3207                                break;
3208                        }
3209                        btrfs_item_key_to_cpu(l, &key, slot);
3210
3211                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3212                            key.type != BTRFS_METADATA_ITEM_KEY)
3213                                goto next;
3214
                        /*
                         * Extent length: nodesize for METADATA_ITEM keys,
                         * key.offset for EXTENT_ITEM keys.
                         */
3215                        if (key.type == BTRFS_METADATA_ITEM_KEY)
3216                                bytes = fs_info->nodesize;
3217                        else
3218                                bytes = key.offset;
3219
                        /* extent ends at or before the current position */
3220                        if (key.objectid + bytes <= logic_start)
3221                                goto next;
3222
                        /* extent starts at or beyond the end of the range */
3223                        if (key.objectid >= logic_end) {
3224                                stop_loop = 1;
3225                                break;
3226                        }
3227
                        /*
                         * Move the stripe_len window forward until it
                         * contains the start of this extent.
                         */
3228                        while (key.objectid >= logic_start + map->stripe_len)
3229                                logic_start += map->stripe_len;
3230
3231                        extent = btrfs_item_ptr(l, slot,
3232                                                struct btrfs_extent_item);
3233                        flags = btrfs_extent_flags(l, extent);
3234                        generation = btrfs_extent_generation(l, extent);
3235
                        /*
                         * A tree block has to lie entirely inside one
                         * stripe_len window; otherwise it is counted as an
                         * uncorrectable error and skipped.
                         */
3236                        if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3237                            (key.objectid < logic_start ||
3238                             key.objectid + bytes >
3239                             logic_start + map->stripe_len)) {
3240                                btrfs_err(fs_info,
3241                                          "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3242                                          key.objectid, logic_start);
3243                                spin_lock(&sctx->stat_lock);
3244                                sctx->stat.uncorrectable_errors++;
3245                                spin_unlock(&sctx->stat_lock);
3246                                goto next;
3247                        }
3248again:
                        /* clamp the extent to the current stripe_len window */
3249                        extent_logical = key.objectid;
3250                        extent_len = bytes;
3251
3252                        if (extent_logical < logic_start) {
3253                                extent_len -= logic_start - extent_logical;
3254                                extent_logical = logic_start;
3255                        }
3256
3257                        if (extent_logical + extent_len >
3258                            logic_start + map->stripe_len)
3259                                extent_len = logic_start + map->stripe_len -
3260                                             extent_logical;
3261
3262                        scrub_parity_mark_sectors_data(sparity, extent_logical,
3263                                                       extent_len);
3264
                        /*
                         * The piece must be covered by a single mapping; a
                         * missing bbio or a shorter mapping is treated as
                         * -EIO.  Only the first stripe of the mapping is used.
                         */
3265                        mapped_length = extent_len;
3266                        bbio = NULL;
3267                        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ,
3268                                        extent_logical, &mapped_length, &bbio,
3269                                        0);
3270                        if (!ret) {
3271                                if (!bbio || mapped_length < extent_len)
3272                                        ret = -EIO;
3273                        }
3274                        if (ret) {
3275                                btrfs_put_bbio(bbio);
3276                                goto out;
3277                        }
3278                        extent_physical = bbio->stripes[0].physical;
3279                        extent_mirror_num = bbio->mirror_num;
3280                        extent_dev = bbio->stripes[0].dev;
3281                        btrfs_put_bbio(bbio);
3282
                        /*
                         * Checksums for this piece are put on
                         * sctx->csum_list; scrub_free_csums() is called right
                         * after the scrub call below.
                         */
3283                        ret = btrfs_lookup_csums_range(csum_root,
3284                                                extent_logical,
3285                                                extent_logical + extent_len - 1,
3286                                                &sctx->csum_list, 1);
3287                        if (ret)
3288                                goto out;
3289
3290                        ret = scrub_extent_for_parity(sparity, extent_logical,
3291                                                      extent_len,
3292                                                      extent_physical,
3293                                                      extent_dev, flags,
3294                                                      generation,
3295                                                      extent_mirror_num);
3296
3297                        scrub_free_csums(sctx);
3298
3299                        if (ret)
3300                                goto out;
3301
                        /*
                         * The extent reaches past the window just handled:
                         * advance the window and, unless the end of the range
                         * has been reached, process the remainder of the same
                         * extent.
                         */
3302                        if (extent_logical + extent_len <
3303                            key.objectid + bytes) {
3304                                logic_start += map->stripe_len;
3305
3306                                if (logic_start >= logic_end) {
3307                                        stop_loop = 1;
3308                                        break;
3309                                }
3310
3311                                if (logic_start < key.objectid + bytes) {
3312                                        cond_resched();
3313                                        goto again;
3314                                }
3315                        }
3316next:
3317                        path->slots[0]++;
3318                }
3319
3320                btrfs_release_path(path);
3321
3322                if (stop_loop)
3323                        break;
3324
3325                logic_start += map->stripe_len;
3326        }
3327out:
        /* on error, flag everything from the current position to the end */
3328        if (ret < 0)
3329                scrub_parity_mark_sectors_error(sparity, logic_start,
3330                                                logic_end - logic_start);
3331        scrub_parity_put(sparity);
        /* push out whatever read and write bios are still queued */
3332        scrub_submit(sctx);
3333        mutex_lock(&sctx->wr_lock);
3334        scrub_wr_submit(sctx);
3335        mutex_unlock(&sctx->wr_lock);
3336
3337        btrfs_release_path(path);
        /* positive values left over from the tree walk are not errors */
3338        return ret < 0 ? ret : 0;
3339}
3340
/*
 * Scrub the part of a chunk that is stored on one stripe of its mapping.
 *
 * Starting at map->stripes[@num].physical, the function advances in
 * map->stripe_len units.  For every unit it looks up the extent items that
 * overlap the matching logical range in the commit root of the extent tree,
 * trims each extent to the unit, collects its checksums and passes it to
 * scrub_extent().  On RAID5/6 mappings, units that
 * get_raid56_logic_offset() reports with a non-zero return are handed to
 * scrub_raid56_parity() instead.
 *
 * @sctx:           scrub context
 * @map:            chunk mapping; its type selects how the logical address
 *                  advances from one unit to the next
 * @scrub_dev:      device being scrubbed
 * @num:            index of the stripe in map->stripes[]
 * @base:           logical address that the per-stripe offsets are added to
 * @length:         divided by map->stripe_len to get the number of units to
 *                  walk; also used for the final last_physical value
 * @is_dev_replace: when set, every extent is passed through
 *                  scrub_remap_extent() before it is scrubbed
 *
 * Cancel and pause requests are checked once per unit.  Queued read and
 * write bios are submitted before returning.
 *
 * Return: 0 on success, -ENOMEM if a path cannot be allocated, -ECANCELED
 * if a cancel request was seen, or the negative error of the failing step.
 */
3341static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3342                                           struct map_lookup *map,
3343                                           struct btrfs_device *scrub_dev,
3344                                           int num, u64 base, u64 length,
3345                                           int is_dev_replace)
3346{
3347        struct btrfs_path *path, *ppath;
3348        struct btrfs_fs_info *fs_info = sctx->fs_info;
3349        struct btrfs_root *root = fs_info->extent_root;
3350        struct btrfs_root *csum_root = fs_info->csum_root;
3351        struct btrfs_extent_item *extent;
3352        struct blk_plug plug;
3353        u64 flags;
3354        int ret;
3355        int slot;
3356        u64 nstripes;
3357        struct extent_buffer *l;
3358        u64 physical;
3359        u64 logical;
3360        u64 logic_end;
3361        u64 physical_end;
3362        u64 generation;
3363        int mirror_num;
3364        struct reada_control *reada1;
3365        struct reada_control *reada2;
3366        struct btrfs_key key;
3367        struct btrfs_key key_end;
3368        u64 increment = map->stripe_len;
3369        u64 offset;
3370        u64 extent_logical;
3371        u64 extent_physical;
3372        u64 extent_len;
3373        u64 stripe_logical;
3374        u64 stripe_end;
3375        struct btrfs_device *extent_dev;
3376        int extent_mirror_num;
3377        int stop_loop = 0;
3378
        /*
         * Per profile, work out: the offset of this stripe's first unit
         * relative to @base, how far the logical address advances per unit
         * (increment), and the mirror number to report for this stripe.
         */
3379        physical = map->stripes[num].physical;
3380        offset = 0;
3381        nstripes = div64_u64(length, map->stripe_len);
3382        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3383                offset = map->stripe_len * num;
3384                increment = map->stripe_len * map->num_stripes;
3385                mirror_num = 1;
3386        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3387                int factor = map->num_stripes / map->sub_stripes;
3388                offset = map->stripe_len * (num / map->sub_stripes);
3389                increment = map->stripe_len * factor;
3390                mirror_num = num % map->sub_stripes + 1;
3391        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3392                increment = map->stripe_len;
3393                mirror_num = num % map->num_stripes + 1;
3394        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3395                increment = map->stripe_len;
3396                mirror_num = num % map->num_stripes + 1;
3397        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3398                get_raid56_logic_offset(physical, num, map, &offset, NULL);
3399                increment = map->stripe_len * nr_data_stripes(map);
3400                mirror_num = 1;
3401        } else {
3402                increment = map->stripe_len;
3403                mirror_num = 1;
3404        }
3405
        /* path walks the extent tree here, ppath is lent to the parity scrub */
3406        path = btrfs_alloc_path();
3407        if (!path)
3408                return -ENOMEM;
3409
3410        ppath = btrfs_alloc_path();
3411        if (!ppath) {
3412                btrfs_free_path(path);
3413                return -ENOMEM;
3414        }
3415
3416        /*
3417         * work on commit root. The related disk blocks are static as
3418         * long as COW is applied. This means, it is safe to rewrite
3419         * them to repair disk errors without any race conditions
3420         */
3421        path->search_commit_root = 1;
3422        path->skip_locking = 1;
3423
3424        ppath->search_commit_root = 1;
3425        ppath->skip_locking = 1;
3426        /*
3427         * trigger the readahead for extent tree csum tree and wait for
3428         * completion. During readahead, the scrub is officially paused
3429         * to not hold off transaction commits
3430         */
3431        logical = base + offset;
3432        physical_end = physical + nstripes * map->stripe_len;
3433        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3434                get_raid56_logic_offset(physical_end, num,
3435                                        map, &logic_end, NULL);
3436                logic_end += base;
3437        } else {
3438                logic_end = logical + increment * nstripes;
3439        }
        /* let all bios that are still in flight complete first */
3440        wait_event(sctx->list_wait,
3441                   atomic_read(&sctx->bios_in_flight) == 0);
3442        scrub_blocked_if_needed(fs_info);
3443
3444        /* FIXME it might be better to start readahead at commit root */
3445        key.objectid = logical;
3446        key.type = BTRFS_EXTENT_ITEM_KEY;
3447        key.offset = (u64)0;
3448        key_end.objectid = logic_end;
3449        key_end.type = BTRFS_METADATA_ITEM_KEY;
3450        key_end.offset = (u64)-1;
3451        reada1 = btrfs_reada_add(root, &key, &key_end);
3452
3453        key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3454        key.type = BTRFS_EXTENT_CSUM_KEY;
3455        key.offset = logical;
3456        key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3457        key_end.type = BTRFS_EXTENT_CSUM_KEY;
3458        key_end.offset = logic_end;
3459        reada2 = btrfs_reada_add(csum_root, &key, &key_end);
3460
        /* a readahead that could not be set up is simply not waited for */
3461        if (!IS_ERR(reada1))
3462                btrfs_reada_wait(reada1);
3463        if (!IS_ERR(reada2))
3464                btrfs_reada_wait(reada2);
3465
3466
3467        /*
3468         * collect all data csums for the stripe to avoid seeking during
3469         * the scrub. This might currently (crc32) end up to be about 1MB
3470         */
        /*
         * NOTE(review): the comment above does not describe the
         * blk_start_plug() call that follows; csums are looked up per extent
         * inside the loop below.  It looks like a leftover - confirm and drop.
         */
3471        blk_start_plug(&plug);
3472
3473        /*
3474         * now find all extents for each stripe and scrub them
3475         */
3476        ret = 0;
3477        while (physical < physical_end) {
3478                /*
3479                 * canceled?
3480                 */
3481                if (atomic_read(&fs_info->scrub_cancel_req) ||
3482                    atomic_read(&sctx->cancel_req)) {
3483                        ret = -ECANCELED;
3484                        goto out;
3485                }
3486                /*
3487                 * check to see if we have to pause
3488                 */
3489                if (atomic_read(&fs_info->scrub_pause_req)) {
3490                        /* push queued extents */
3491                        sctx->flush_all_writes = true;
3492                        scrub_submit(sctx);
3493                        mutex_lock(&sctx->wr_lock);
3494                        scrub_wr_submit(sctx);
3495                        mutex_unlock(&sctx->wr_lock);
3496                        wait_event(sctx->list_wait,
3497                                   atomic_read(&sctx->bios_in_flight) == 0);
3498                        sctx->flush_all_writes = false;
3499                        scrub_blocked_if_needed(fs_info);
3500                }
3501
3502                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3503                        ret = get_raid56_logic_offset(physical, num, map,
3504                                                      &logical,
3505                                                      &stripe_logical);
3506                        logical += base;
3507                        if (ret) {
3508                                /* this is a parity stripe */
3509                                stripe_logical += base;
3510                                stripe_end = stripe_logical + increment;
3511                                ret = scrub_raid56_parity(sctx, map, scrub_dev,
3512                                                          ppath, stripe_logical,
3513                                                          stripe_end);
3514                                if (ret)
3515                                        goto out;
3516                                goto skip;
3517                        }
3518                }
3519
3520                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3521                        key.type = BTRFS_METADATA_ITEM_KEY;
3522                else
3523                        key.type = BTRFS_EXTENT_ITEM_KEY;
3524                key.objectid = logical;
3525                key.offset = (u64)-1;
3526
3527                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3528                if (ret < 0)
3529                        goto out;
3530
                /*
                 * No exact match: step back to the preceding extent item,
                 * which may still reach into this unit.
                 */
3531                if (ret > 0) {
3532                        ret = btrfs_previous_extent_item(root, path, 0);
3533                        if (ret < 0)
3534                                goto out;
3535                        if (ret > 0) {
3536                                /* there's no smaller item, so stick with the
3537                                 * larger one */
3538                                btrfs_release_path(path);
3539                                ret = btrfs_search_slot(NULL, root, &key,
3540                                                        path, 0, 0);
3541                                if (ret < 0)
3542                                        goto out;
3543                        }
3544                }
3545
3546                stop_loop = 0;
3547                while (1) {
3548                        u64 bytes;
3549
3550                        l = path->nodes[0];
3551                        slot = path->slots[0];
                        /*
                         * Past the last item of this leaf: continue in the
                         * next leaf; a positive return from btrfs_next_leaf()
                         * ends the whole walk.
                         */
3552                        if (slot >= btrfs_header_nritems(l)) {
3553                                ret = btrfs_next_leaf(root, path);
3554                                if (ret == 0)
3555                                        continue;
3556                                if (ret < 0)
3557                                        goto out;
3558
3559                                stop_loop = 1;
3560                                break;
3561                        }
3562                        btrfs_item_key_to_cpu(l, &key, slot);
3563
3564                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3565                            key.type != BTRFS_METADATA_ITEM_KEY)
3566                                goto next;
3567
                        /*
                         * Extent length: nodesize for METADATA_ITEM keys,
                         * key.offset for EXTENT_ITEM keys.
                         */
3568                        if (key.type == BTRFS_METADATA_ITEM_KEY)
3569                                bytes = fs_info->nodesize;
3570                        else
3571                                bytes = key.offset;
3572
                        /* extent ends at or before the start of this unit */
3573                        if (key.objectid + bytes <= logical)
3574                                goto next;
3575
3576                        if (key.objectid >= logical + map->stripe_len) {
3577                                /* out of this device extent */
3578                                if (key.objectid >= logic_end)
3579                                        stop_loop = 1;
3580                                break;
3581                        }
3582
3583                        extent = btrfs_item_ptr(l, slot,
3584                                                struct btrfs_extent_item);
3585                        flags = btrfs_extent_flags(l, extent);
3586                        generation = btrfs_extent_generation(l, extent);
3587
                        /*
                         * A tree block has to lie entirely inside one
                         * stripe_len unit; otherwise it is counted as an
                         * uncorrectable error and skipped.
                         */
3588                        if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3589                            (key.objectid < logical ||
3590                             key.objectid + bytes >
3591                             logical + map->stripe_len)) {
3592                                btrfs_err(fs_info,
3593                                           "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3594                                       key.objectid, logical);
3595                                spin_lock(&sctx->stat_lock);
3596                                sctx->stat.uncorrectable_errors++;
3597                                spin_unlock(&sctx->stat_lock);
3598                                goto next;
3599                        }
3600
3601again:
3602                        extent_logical = key.objectid;
3603                        extent_len = bytes;
3604
3605                        /*
3606                         * trim extent to this stripe
3607                         */
3608                        if (extent_logical < logical) {
3609                                extent_len -= logical - extent_logical;
3610                                extent_logical = logical;
3611                        }
3612                        if (extent_logical + extent_len >
3613                            logical + map->stripe_len) {
3614                                extent_len = logical + map->stripe_len -
3615                                             extent_logical;
3616                        }
3617
                        /*
                         * The piece sits at the same offset inside the unit
                         * on the device as it does in the logical range.  For
                         * device replace the read location may be remapped.
                         */
3618                        extent_physical = extent_logical - logical + physical;
3619                        extent_dev = scrub_dev;
3620                        extent_mirror_num = mirror_num;
3621                        if (is_dev_replace)
3622                                scrub_remap_extent(fs_info, extent_logical,
3623                                                   extent_len, &extent_physical,
3624                                                   &extent_dev,
3625                                                   &extent_mirror_num);
3626
                        /*
                         * Checksums for this piece are put on
                         * sctx->csum_list; scrub_free_csums() is called right
                         * after the scrub call below.
                         */
3627                        ret = btrfs_lookup_csums_range(csum_root,
3628                                                       extent_logical,
3629                                                       extent_logical +
3630                                                       extent_len - 1,
3631                                                       &sctx->csum_list, 1);
3632                        if (ret)
3633                                goto out;
3634
                        /*
                         * The last argument is the un-remapped physical
                         * offset, i.e. the value extent_physical had before
                         * scrub_remap_extent() could change it.
                         */
3635                        ret = scrub_extent(sctx, map, extent_logical, extent_len,
3636                                           extent_physical, extent_dev, flags,
3637                                           generation, extent_mirror_num,
3638                                           extent_logical - logical + physical);
3639
3640                        scrub_free_csums(sctx);
3641
3642                        if (ret)
3643                                goto out;
3644
                        /*
                         * The extent reaches past the unit just handled:
                         * advance to the next unit and, if the extent still
                         * overlaps it, process the remainder via "again".
                         */
3645                        if (extent_logical + extent_len <
3646                            key.objectid + bytes) {
3647                                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3648                                        /*
3649                                         * loop until we find next data stripe
3650                                         * or we have finished all stripes.
3651                                         */
3652loop:
3653                                        physical += map->stripe_len;
3654                                        ret = get_raid56_logic_offset(physical,
3655                                                        num, map, &logical,
3656                                                        &stripe_logical);
3657                                        logical += base;
3658
3659                                        if (ret && physical < physical_end) {
3660                                                stripe_logical += base;
3661                                                stripe_end = stripe_logical +
3662                                                                increment;
3663                                                ret = scrub_raid56_parity(sctx,
3664                                                        map, scrub_dev, ppath,
3665                                                        stripe_logical,
3666                                                        stripe_end);
3667                                                if (ret)
3668                                                        goto out;
3669                                                goto loop;
3670                                        }
3671                                } else {
3672                                        physical += map->stripe_len;
3673                                        logical += increment;
3674                                }
3675                                if (logical < key.objectid + bytes) {
3676                                        cond_resched();
3677                                        goto again;
3678                                }
3679
3680                                if (physical >= physical_end) {
3681                                        stop_loop = 1;
3682                                        break;
3683                                }
3684                        }
3685next:
3686                        path->slots[0]++;
3687                }
3688                btrfs_release_path(path);
3689skip:
                /*
                 * Advance to the next unit and publish the progress.  When
                 * the walk stops early, last_physical is set to
                 * map->stripes[num].physical + length instead.
                 */
3690                logical += increment;
3691                physical += map->stripe_len;
3692                spin_lock(&sctx->stat_lock);
3693                if (stop_loop)
3694                        sctx->stat.last_physical = map->stripes[num].physical +
3695                                                   length;
3696                else
3697                        sctx->stat.last_physical = physical;
3698                spin_unlock(&sctx->stat_lock);
3699                if (stop_loop)
3700                        break;
3701        }
3702out:
3703        /* push queued extents */
3704        scrub_submit(sctx);
3705        mutex_lock(&sctx->wr_lock);
3706        scrub_wr_submit(sctx);
3707        mutex_unlock(&sctx->wr_lock);
3708
3709        blk_finish_plug(&plug);
3710        btrfs_free_path(path);
3711        btrfs_free_path(ppath);
        /* positive values left over from the tree walk are not errors */
3712        return ret < 0 ? ret : 0;
3713}
3714
3715static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3716                                          struct btrfs_device *scrub_dev,
3717                                          u64 chunk_offset, u64 length,
3718                                          u64 dev_offset,
3719                                          struct btrfs_block_group_cache *cache,
3720                                          int is_dev_replace)
3721{
3722        struct btrfs_fs_info *fs_info = sctx->fs_info;
3723        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
3724        struct map_lookup *map;
3725        struct extent_map *em;
3726        int i;
3727        int ret = 0;
3728
3729        read_lock(&map_tree->map_tree.lock);
3730        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3731        read_unlock(&map_tree->map_tree.lock);
3732
3733        if (!em) {
3734                /*
3735                 * Might have been an unused block group deleted by the cleaner
3736                 * kthread or relocation.
3737                 */
3738                spin_lock(&cache->lock);
3739                if (!cache->removed)
3740                        ret = -EINVAL;
3741                spin_unlock(&cache->lock);
3742
3743                return ret;
3744        }
3745
3746        map = em->map_lookup;
3747        if (em->start != chunk_offset)
3748                goto out;
3749
3750        if (em->len < length)
3751                goto out;
3752
3753        for (i = 0; i < map->num_stripes; ++i) {
3754                if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3755                    map->stripes[i].physical == dev_offset) {
3756                        ret = scrub_stripe(sctx, map, scrub_dev, i,
3757                                           chunk_offset, length,
3758                                           is_dev_replace);
3759                        if (ret)
3760                                goto out;
3761                }
3762        }
3763out:
3764        free_extent_map(em);
3765
3766        return ret;
3767}
3768
3769static noinline_for_stack
3770int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3771                           struct btrfs_device *scrub_dev, u64 start, u64 end,
3772                           int is_dev_replace)
3773{
3774        struct btrfs_dev_extent *dev_extent = NULL;
3775        struct btrfs_path *path;
3776        struct btrfs_fs_info *fs_info = sctx->fs_info;
3777        struct btrfs_root *root = fs_info->dev_root;
3778        u64 length;
3779        u64 chunk_offset;
3780        int ret = 0;
3781        int ro_set;
3782        int slot;
3783        struct extent_buffer *l;
3784        struct btrfs_key key;
3785        struct btrfs_key found_key;
3786        struct btrfs_block_group_cache *cache;
3787        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3788
3789        path = btrfs_alloc_path();
3790        if (!path)
3791                return -ENOMEM;
3792
3793        path->reada = READA_FORWARD;
3794        path->search_commit_root = 1;
3795        path->skip_locking = 1;
3796
3797        key.objectid = scrub_dev->devid;
3798        key.offset = 0ull;
3799        key.type = BTRFS_DEV_EXTENT_KEY;
3800
3801        while (1) {
3802                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3803                if (ret < 0)
3804                        break;
3805                if (ret > 0) {
3806                        if (path->slots[0] >=
3807                            btrfs_header_nritems(path->nodes[0])) {
3808                                ret = btrfs_next_leaf(root, path);
3809                                if (ret < 0)
3810                                        break;
3811                                if (ret > 0) {
3812                                        ret = 0;
3813                                        break;
3814                                }
3815                        } else {
3816                                ret = 0;
3817                        }
3818                }
3819
3820                l = path->nodes[0];
3821                slot = path->slots[0];
3822
3823                btrfs_item_key_to_cpu(l, &found_key, slot);
3824
3825                if (found_key.objectid != scrub_dev->devid)
3826                        break;
3827
3828                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3829                        break;
3830
3831                if (found_key.offset >= end)
3832                        break;
3833
3834                if (found_key.offset < key.offset)
3835                        break;
3836
3837                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3838                length = btrfs_dev_extent_length(l, dev_extent);
3839
3840                if (found_key.offset + length <= start)
3841                        goto skip;
3842
3843                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3844
3845                /*
3846                 * get a reference on the corresponding block group to prevent
3847                 * the chunk from going away while we scrub it
3848                 */
3849                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3850
3851                /* some chunks are removed but not committed to disk yet,
3852                 * continue scrubbing */
3853                if (!cache)
3854                        goto skip;
3855
3856                /*
3857                 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3858                 * to avoid deadlock caused by:
3859                 * btrfs_inc_block_group_ro()
3860                 * -> btrfs_wait_for_commit()
3861                 * -> btrfs_commit_transaction()
3862                 * -> btrfs_scrub_pause()
3863                 */
3864                scrub_pause_on(fs_info);
3865                ret = btrfs_inc_block_group_ro(fs_info, cache);
3866                if (!ret && is_dev_replace) {
3867                        /*
3868                         * If we are doing a device replace wait for any tasks
3869                         * that started dellaloc right before we set the block
3870                         * group to RO mode, as they might have just allocated
3871                         * an extent from it or decided they could do a nocow
3872                         * write. And if any such tasks did that, wait for their
3873                         * ordered extents to complete and then commit the
3874                         * current transaction, so that we can later see the new
3875                         * extent items in the extent tree - the ordered extents
3876                         * create delayed data references (for cow writes) when
3877                         * they complete, which will be run and insert the
3878                         * corresponding extent items into the extent tree when
3879                         * we commit the transaction they used when running
3880                         * inode.c:btrfs_finish_ordered_io(). We later use
3881                         * the commit root of the extent tree to find extents
3882                         * to copy from the srcdev into the tgtdev, and we don't
3883                         * want to miss any new extents.
3884                         */
3885                        btrfs_wait_block_group_reservations(cache);
3886                        btrfs_wait_nocow_writers(cache);
3887                        ret = btrfs_wait_ordered_roots(fs_info, U64_MAX,
3888                                                       cache->key.objectid,
3889                                                       cache->key.offset);
3890                        if (ret > 0) {
3891                                struct btrfs_trans_handle *trans;
3892
3893                                trans = btrfs_join_transaction(root);
3894                                if (IS_ERR(trans))
3895                                        ret = PTR_ERR(trans);
3896                                else
3897                                        ret = btrfs_commit_transaction(trans);
3898                                if (ret) {
3899                                        scrub_pause_off(fs_info);
3900                                        btrfs_put_block_group(cache);
3901                                        break;
3902                                }
3903                        }
3904                }
3905                scrub_pause_off(fs_info);
3906
3907                if (ret == 0) {
3908                        ro_set = 1;
3909                } else if (ret == -ENOSPC) {
3910                        /*
3911                         * btrfs_inc_block_group_ro return -ENOSPC when it
3912                         * failed in creating new chunk for metadata.
3913                         * It is not a problem for scrub/replace, because
3914                         * metadata are always cowed, and our scrub paused
3915                         * commit_transactions.
3916                         */
3917                        ro_set = 0;
3918                } else {
3919                        btrfs_warn(fs_info,
3920                                   "failed setting block group ro: %d", ret);
3921                        btrfs_put_block_group(cache);
3922                        break;
3923                }
3924
3925                btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3926                dev_replace->cursor_right = found_key.offset + length;
3927                dev_replace->cursor_left = found_key.offset;
3928                dev_replace->item_needs_writeback = 1;
3929                btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3930                ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
3931                                  found_key.offset, cache, is_dev_replace);
3932
3933                /*
3934                 * flush, submit all pending read and write bios, afterwards
3935                 * wait for them.
3936                 * Note that in the dev replace case, a read request causes
3937                 * write requests that are submitted in the read completion
3938                 * worker. Therefore in the current situation, it is required
3939                 * that all write requests are flushed, so that all read and
3940                 * write requests are really completed when bios_in_flight
3941                 * changes to 0.
3942                 */
3943                sctx->flush_all_writes = true;
3944                scrub_submit(sctx);
3945                mutex_lock(&sctx->wr_lock);
3946                scrub_wr_submit(sctx);
3947                mutex_unlock(&sctx->wr_lock);
3948
3949                wait_event(sctx->list_wait,
3950                           atomic_read(&sctx->bios_in_flight) == 0);
3951
3952                scrub_pause_on(fs_info);
3953
3954                /*
3955                 * must be called before we decrease @scrub_paused.
3956                 * make sure we don't block transaction commit while
3957                 * we are waiting pending workers finished.
3958                 */
3959                wait_event(sctx->list_wait,
3960                           atomic_read(&sctx->workers_pending) == 0);
3961                sctx->flush_all_writes = false;
3962
3963                scrub_pause_off(fs_info);
3964
3965                btrfs_dev_replace_write_lock(&fs_info->dev_replace);
3966                dev_replace->cursor_left = dev_replace->cursor_right;
3967                dev_replace->item_needs_writeback = 1;
3968                btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
3969
3970                if (ro_set)
3971                        btrfs_dec_block_group_ro(cache);
3972
3973                /*
3974                 * We might have prevented the cleaner kthread from deleting
3975                 * this block group if it was already unused because we raced
3976                 * and set it to RO mode first. So add it back to the unused
3977                 * list, otherwise it might not ever be deleted unless a manual
3978                 * balance is triggered or it becomes used and unused again.
3979                 */
3980                spin_lock(&cache->lock);
3981                if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3982                    btrfs_block_group_used(&cache->item) == 0) {
3983                        spin_unlock(&cache->lock);
3984                        spin_lock(&fs_info->unused_bgs_lock);
3985                        if (list_empty(&cache->bg_list)) {
3986                                btrfs_get_block_group(cache);
3987                                list_add_tail(&cache->bg_list,
3988                                              &fs_info->unused_bgs);
3989                        }
3990                        spin_unlock(&fs_info->unused_bgs_lock);
3991                } else {
3992                        spin_unlock(&cache->lock);
3993                }
3994
3995                btrfs_put_block_group(cache);
3996                if (ret)
3997                        break;
3998                if (is_dev_replace &&
3999                    atomic64_read(&dev_replace->num_write_errors) > 0) {
4000                        ret = -EIO;

4001                        break;
4002                }
4003                if (sctx->stat.malloc_errors > 0) {
4004                        ret = -ENOMEM;
4005                        break;
4006                }
4007skip:
4008                key.offset = found_key.offset + length;
4009                btrfs_release_path(path);
4010        }
4011
4012        btrfs_free_path(path);
4013
4014        return ret;
4015}
4016
4017static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4018                                           struct btrfs_device *scrub_dev)
4019{
4020        int     i;
4021        u64     bytenr;
4022        u64     gen;
4023        int     ret;
4024        struct btrfs_fs_info *fs_info = sctx->fs_info;
4025
4026        if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
4027                return -EIO;
4028
4029        /* Seed devices of a new filesystem has their own generation. */
4030        if (scrub_dev->fs_devices != fs_info->fs_devices)
4031                gen = scrub_dev->generation;
4032        else
4033                gen = fs_info->last_trans_committed;
4034
4035        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4036                bytenr = btrfs_sb_offset(i);
4037                if (bytenr + BTRFS_SUPER_INFO_SIZE >
4038                    scrub_dev->commit_total_bytes)
4039                        break;
4040
4041                ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4042                                  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4043                                  NULL, 1, bytenr);
4044                if (ret)
4045                        return ret;
4046        }
4047        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4048
4049        return 0;
4050}
4051
4052/*
4053 * get a reference count on fs_info->scrub_workers. start worker if necessary
4054 */
4055static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4056                                                int is_dev_replace)
4057{
4058        unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4059        int max_active = fs_info->thread_pool_size;
4060
4061        if (fs_info->scrub_workers_refcnt == 0) {
4062                fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
4063                                flags, is_dev_replace ? 1 : max_active, 4);
4064                if (!fs_info->scrub_workers)
4065                        goto fail_scrub_workers;
4066
4067                fs_info->scrub_wr_completion_workers =
4068                        btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
4069                                              max_active, 2);
4070                if (!fs_info->scrub_wr_completion_workers)
4071                        goto fail_scrub_wr_completion_workers;
4072
4073                fs_info->scrub_nocow_workers =
4074                        btrfs_alloc_workqueue(fs_info, "scrubnc", flags, 1, 0);
4075                if (!fs_info->scrub_nocow_workers)
4076                        goto fail_scrub_nocow_workers;
4077                fs_info->scrub_parity_workers =
4078                        btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
4079                                              max_active, 2);
4080                if (!fs_info->scrub_parity_workers)
4081                        goto fail_scrub_parity_workers;
4082        }
4083        ++fs_info->scrub_workers_refcnt;
4084        return 0;
4085
4086fail_scrub_parity_workers:
4087        btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4088fail_scrub_nocow_workers:
4089        btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4090fail_scrub_wr_completion_workers:
4091        btrfs_destroy_workqueue(fs_info->scrub_workers);
4092fail_scrub_workers:
4093        return -ENOMEM;
4094}
4095
4096static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
4097{
4098        if (--fs_info->scrub_workers_refcnt == 0) {
4099                btrfs_destroy_workqueue(fs_info->scrub_workers);
4100                btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
4101                btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
4102                btrfs_destroy_workqueue(fs_info->scrub_parity_workers);
4103        }
4104        WARN_ON(fs_info->scrub_workers_refcnt < 0);
4105}
4106
4107int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4108                    u64 end, struct btrfs_scrub_progress *progress,
4109                    int readonly, int is_dev_replace)
4110{
4111        struct scrub_ctx *sctx;
4112        int ret;
4113        struct btrfs_device *dev;
4114        struct rcu_string *name;
4115
4116        if (btrfs_fs_closing(fs_info))
4117                return -EINVAL;
4118
4119        if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4120                /*
4121                 * in this case scrub is unable to calculate the checksum
4122                 * the way scrub is implemented. Do not handle this
4123                 * situation at all because it won't ever happen.
4124                 */
4125                btrfs_err(fs_info,
4126                           "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4127                       fs_info->nodesize,
4128                       BTRFS_STRIPE_LEN);
4129                return -EINVAL;
4130        }
4131
4132        if (fs_info->sectorsize != PAGE_SIZE) {
4133                /* not supported for data w/o checksums */
4134                btrfs_err_rl(fs_info,
4135                           "scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails",
4136                       fs_info->sectorsize, PAGE_SIZE);
4137                return -EINVAL;
4138        }
4139
4140        if (fs_info->nodesize >
4141            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
4142            fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
4143                /*
4144                 * would exhaust the array bounds of pagev member in
4145                 * struct scrub_block
4146                 */
4147                btrfs_err(fs_info,
4148                          "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
4149                       fs_info->nodesize,
4150                       SCRUB_MAX_PAGES_PER_BLOCK,
4151                       fs_info->sectorsize,
4152                       SCRUB_MAX_PAGES_PER_BLOCK);
4153                return -EINVAL;
4154        }
4155
4156
4157        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4158        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4159        if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4160                     !is_dev_replace)) {
4161                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4162                return -ENODEV;
4163        }
4164
4165        if (!is_dev_replace && !readonly &&
4166            !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4167                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4168                rcu_read_lock();
4169                name = rcu_dereference(dev->name);
4170                btrfs_err(fs_info, "scrub: device %s is not writable",
4171                          name->str);
4172                rcu_read_unlock();
4173                return -EROFS;
4174        }
4175
4176        mutex_lock(&fs_info->scrub_lock);
4177        if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4178            test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4179                mutex_unlock(&fs_info->scrub_lock);
4180                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4181                return -EIO;
4182        }
4183
4184        btrfs_dev_replace_read_lock(&fs_info->dev_replace);
4185        if (dev->scrub_ctx ||
4186            (!is_dev_replace &&
4187             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4188                btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4189                mutex_unlock(&fs_info->scrub_lock);
4190                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4191                return -EINPROGRESS;
4192        }
4193        btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
4194
4195        ret = scrub_workers_get(fs_info, is_dev_replace);
4196        if (ret) {
4197                mutex_unlock(&fs_info->scrub_lock);
4198                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4199                return ret;
4200        }
4201
4202        sctx = scrub_setup_ctx(dev, is_dev_replace);
4203        if (IS_ERR(sctx)) {
4204                mutex_unlock(&fs_info->scrub_lock);
4205                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4206                scrub_workers_put(fs_info);
4207                return PTR_ERR(sctx);
4208        }
4209        sctx->readonly = readonly;
4210        dev->scrub_ctx = sctx;
4211        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4212
4213        /*
4214         * checking @scrub_pause_req here, we can avoid
4215         * race between committing transaction and scrubbing.
4216         */
4217        __scrub_blocked_if_needed(fs_info);
4218        atomic_inc(&fs_info->scrubs_running);
4219        mutex_unlock(&fs_info->scrub_lock);
4220
4221        if (!is_dev_replace) {
4222                /*
4223                 * by holding device list mutex, we can
4224                 * kick off writing super in log tree sync.
4225                 */
4226                mutex_lock(&fs_info->fs_devices->device_list_mutex);
4227                ret = scrub_supers(sctx, dev);
4228                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4229        }
4230
4231        if (!ret)
4232                ret = scrub_enumerate_chunks(sctx, dev, start, end,
4233                                             is_dev_replace);
4234
4235        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4236        atomic_dec(&fs_info->scrubs_running);
4237        wake_up(&fs_info->scrub_pause_wait);
4238
4239        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4240
4241        if (progress)
4242                memcpy(progress, &sctx->stat, sizeof(*progress));
4243
4244        mutex_lock(&fs_info->scrub_lock);
4245        dev->scrub_ctx = NULL;
4246        scrub_workers_put(fs_info);
4247        mutex_unlock(&fs_info->scrub_lock);
4248
4249        scrub_put_ctx(sctx);
4250
4251        return ret;
4252}
4253
4254void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4255{
4256        mutex_lock(&fs_info->scrub_lock);
4257        atomic_inc(&fs_info->scrub_pause_req);
4258        while (atomic_read(&fs_info->scrubs_paused) !=
4259               atomic_read(&fs_info->scrubs_running)) {
4260                mutex_unlock(&fs_info->scrub_lock);
4261                wait_event(fs_info->scrub_pause_wait,
4262                           atomic_read(&fs_info->scrubs_paused) ==
4263                           atomic_read(&fs_info->scrubs_running));
4264                mutex_lock(&fs_info->scrub_lock);
4265        }
4266        mutex_unlock(&fs_info->scrub_lock);
4267}
4268
4269void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4270{
4271        atomic_dec(&fs_info->scrub_pause_req);
4272        wake_up(&fs_info->scrub_pause_wait);
4273}
4274
4275int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4276{
4277        mutex_lock(&fs_info->scrub_lock);
4278        if (!atomic_read(&fs_info->scrubs_running)) {
4279                mutex_unlock(&fs_info->scrub_lock);
4280                return -ENOTCONN;
4281        }
4282
4283        atomic_inc(&fs_info->scrub_cancel_req);
4284        while (atomic_read(&fs_info->scrubs_running)) {
4285                mutex_unlock(&fs_info->scrub_lock);
4286                wait_event(fs_info->scrub_pause_wait,
4287                           atomic_read(&fs_info->scrubs_running) == 0);
4288                mutex_lock(&fs_info->scrub_lock);
4289        }
4290        atomic_dec(&fs_info->scrub_cancel_req);
4291        mutex_unlock(&fs_info->scrub_lock);
4292
4293        return 0;
4294}
4295
4296int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
4297                           struct btrfs_device *dev)
4298{
4299        struct scrub_ctx *sctx;
4300
4301        mutex_lock(&fs_info->scrub_lock);
4302        sctx = dev->scrub_ctx;
4303        if (!sctx) {
4304                mutex_unlock(&fs_info->scrub_lock);
4305                return -ENOTCONN;
4306        }
4307        atomic_inc(&sctx->cancel_req);
4308        while (dev->scrub_ctx) {
4309                mutex_unlock(&fs_info->scrub_lock);
4310                wait_event(fs_info->scrub_pause_wait,
4311                           dev->scrub_ctx == NULL);
4312                mutex_lock(&fs_info->scrub_lock);
4313        }
4314        mutex_unlock(&fs_info->scrub_lock);
4315
4316        return 0;
4317}
4318
4319int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4320                         struct btrfs_scrub_progress *progress)
4321{
4322        struct btrfs_device *dev;
4323        struct scrub_ctx *sctx = NULL;
4324
4325        mutex_lock(&fs_info->fs_devices->device_list_mutex);
4326        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
4327        if (dev)
4328                sctx = dev->scrub_ctx;
4329        if (sctx)
4330                memcpy(progress, &sctx->stat, sizeof(*progress));
4331        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4332
4333        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4334}
4335
4336static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
4337                               u64 extent_logical, u64 extent_len,
4338                               u64 *extent_physical,
4339                               struct btrfs_device **extent_dev,
4340                               int *extent_mirror_num)
4341{
4342        u64 mapped_length;
4343        struct btrfs_bio *bbio = NULL;
4344        int ret;
4345
4346        mapped_length = extent_len;
4347        ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4348                              &mapped_length, &bbio, 0);
4349        if (ret || !bbio || mapped_length < extent_len ||
4350            !bbio->stripes[0].dev->bdev) {
4351                btrfs_put_bbio(bbio);
4352                return;
4353        }
4354
4355        *extent_physical = bbio->stripes[0].physical;
4356        *extent_mirror_num = bbio->mirror_num;
4357        *extent_dev = bbio->stripes[0].dev;
4358        btrfs_put_bbio(bbio);
4359}
4360
4361static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
4362                            int mirror_num, u64 physical_for_dev_replace)
4363{
4364        struct scrub_copy_nocow_ctx *nocow_ctx;
4365        struct btrfs_fs_info *fs_info = sctx->fs_info;
4366
4367        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
4368        if (!nocow_ctx) {
4369                spin_lock(&sctx->stat_lock);
4370                sctx->stat.malloc_errors++;
4371                spin_unlock(&sctx->stat_lock);
4372                return -ENOMEM;
4373        }
4374
4375        scrub_pending_trans_workers_inc(sctx);
4376
4377        nocow_ctx->sctx = sctx;
4378        nocow_ctx->logical = logical;
4379        nocow_ctx->len = len;
4380        nocow_ctx->mirror_num = mirror_num;
4381        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
4382        btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
4383                        copy_nocow_pages_worker, NULL, NULL);
4384        INIT_LIST_HEAD(&nocow_ctx->inodes);
4385        btrfs_queue_work(fs_info->scrub_nocow_workers,
4386                         &nocow_ctx->work);
4387
4388        return 0;
4389}
4390
4391static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
4392{
4393        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
4394        struct scrub_nocow_inode *nocow_inode;
4395
4396        nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
4397        if (!nocow_inode)
4398                return -ENOMEM;
4399        nocow_inode->inum = inum;
4400        nocow_inode->offset = offset;
4401        nocow_inode->root = root;
4402        list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
4403        return 0;
4404}
4405
4406#define COPY_COMPLETE 1
4407
4408static void copy_nocow_pages_worker(struct btrfs_work *work)
4409{
4410        struct scrub_copy_nocow_ctx *nocow_ctx =
4411                container_of(work, struct scrub_copy_nocow_ctx, work);
4412        struct scrub_ctx *sctx = nocow_ctx->sctx;
4413        struct btrfs_fs_info *fs_info = sctx->fs_info;
4414        struct btrfs_root *root = fs_info->extent_root;
4415        u64 logical = nocow_ctx->logical;
4416        u64 len = nocow_ctx->len;
4417        int mirror_num = nocow_ctx->mirror_num;
4418        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4419        int ret;
4420        struct btrfs_trans_handle *trans = NULL;
4421        struct btrfs_path *path;
4422        int not_written = 0;
4423
4424        path = btrfs_alloc_path();
4425        if (!path) {
4426                spin_lock(&sctx->stat_lock);
4427                sctx->stat.malloc_errors++;
4428                spin_unlock(&sctx->stat_lock);
4429                not_written = 1;
4430                goto out;
4431        }
4432
4433        trans = btrfs_join_transaction(root);
4434        if (IS_ERR(trans)) {
4435                not_written = 1;
4436                goto out;
4437        }
4438
4439        ret = iterate_inodes_from_logical(logical, fs_info, path,
4440                        record_inode_for_nocow, nocow_ctx, false);
4441        if (ret != 0 && ret != -ENOENT) {
4442                btrfs_warn(fs_info,
4443                           "iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d",
4444                           logical, physical_for_dev_replace, len, mirror_num,
4445                           ret);
4446                not_written = 1;
4447                goto out;
4448        }
4449
4450        btrfs_end_transaction(trans);
4451        trans = NULL;
4452        while (!list_empty(&nocow_ctx->inodes)) {
4453                struct scrub_nocow_inode *entry;
4454                entry = list_first_entry(&nocow_ctx->inodes,
4455                                         struct scrub_nocow_inode,
4456                                         list);
4457                list_del_init(&entry->list);
4458                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
4459                                                 entry->root, nocow_ctx);
4460                kfree(entry);
4461                if (ret == COPY_COMPLETE) {
4462                        ret = 0;
4463                        break;
4464                } else if (ret) {
4465                        break;
4466                }
4467        }
4468out:
4469        while (!list_empty(&nocow_ctx->inodes)) {
4470                struct scrub_nocow_inode *entry;
4471                entry = list_first_entry(&nocow_ctx->inodes,
4472                                         struct scrub_nocow_inode,
4473                                         list);
4474                list_del_init(&entry->list);
4475                kfree(entry);
4476        }
4477        if (trans && !IS_ERR(trans))
4478                btrfs_end_transaction(trans);
4479        if (not_written)
4480                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4481                                            num_uncorrectable_read_errors);
4482
4483        btrfs_free_path(path);
4484        kfree(nocow_ctx);
4485
4486        scrub_pending_trans_workers_dec(sctx);
4487}
4488
4489static int check_extent_to_block(struct btrfs_inode *inode, u64 start, u64 len,
4490                                 u64 logical)
4491{
4492        struct extent_state *cached_state = NULL;
4493        struct btrfs_ordered_extent *ordered;
4494        struct extent_io_tree *io_tree;
4495        struct extent_map *em;
4496        u64 lockstart = start, lockend = start + len - 1;
4497        int ret = 0;
4498
4499        io_tree = &inode->io_tree;
4500
4501        lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
4502        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4503        if (ordered) {
4504                btrfs_put_ordered_extent(ordered);
4505                ret = 1;
4506                goto out_unlock;
4507        }
4508
4509        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4510        if (IS_ERR(em)) {
4511                ret = PTR_ERR(em);
4512                goto out_unlock;
4513        }
4514
4515        /*
4516         * This extent does not actually cover the logical extent anymore,
4517         * move on to the next inode.
4518         */
4519        if (em->block_start > logical ||
4520            em->block_start + em->block_len < logical + len ||
4521            test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4522                free_extent_map(em);
4523                ret = 1;
4524                goto out_unlock;
4525        }
4526        free_extent_map(em);
4527
4528out_unlock:
4529        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state);
4530        return ret;
4531}
4532
4533static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4534                                      struct scrub_copy_nocow_ctx *nocow_ctx)
4535{
4536        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->fs_info;
4537        struct btrfs_key key;
4538        struct inode *inode;
4539        struct page *page;
4540        struct btrfs_root *local_root;
4541        struct extent_io_tree *io_tree;
4542        u64 physical_for_dev_replace;
4543        u64 nocow_ctx_logical;
4544        u64 len = nocow_ctx->len;
4545        unsigned long index;
4546        int srcu_index;
4547        int ret = 0;
4548        int err = 0;
4549
4550        key.objectid = root;
4551        key.type = BTRFS_ROOT_ITEM_KEY;
4552        key.offset = (u64)-1;
4553
4554        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4555
4556        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4557        if (IS_ERR(local_root)) {
4558                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4559                return PTR_ERR(local_root);
4560        }
4561
4562        key.type = BTRFS_INODE_ITEM_KEY;
4563        key.objectid = inum;
4564        key.offset = 0;
4565        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4566        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4567        if (IS_ERR(inode))
4568                return PTR_ERR(inode);
4569
4570        /* Avoid truncate/dio/punch hole.. */
4571        inode_lock(inode);
4572        inode_dio_wait(inode);
4573
4574        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4575        io_tree = &BTRFS_I(inode)->io_tree;
4576        nocow_ctx_logical = nocow_ctx->logical;
4577
4578        ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4579                        nocow_ctx_logical);
4580        if (ret) {
4581                ret = ret > 0 ? 0 : ret;
4582                goto out;
4583        }
4584
4585        while (len >= PAGE_SIZE) {
4586                index = offset >> PAGE_SHIFT;
4587again:
4588                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4589                if (!page) {
4590                        btrfs_err(fs_info, "find_or_create_page() failed");
4591                        ret = -ENOMEM;
4592                        goto out;
4593                }
4594
4595                if (PageUptodate(page)) {
4596                        if (PageDirty(page))
4597                                goto next_page;
4598                } else {
4599                        ClearPageError(page);
4600                        err = extent_read_full_page(io_tree, page,
4601                                                           btrfs_get_extent,
4602                                                           nocow_ctx->mirror_num);
4603                        if (err) {
4604                                ret = err;
4605                                goto next_page;
4606                        }
4607
4608                        lock_page(page);
4609                        /*
4610                         * If the page has been remove from the page cache,
4611                         * the data on it is meaningless, because it may be
4612                         * old one, the new data may be written into the new
4613                         * page in the page cache.
4614                         */
4615                        if (page->mapping != inode->i_mapping) {
4616                                unlock_page(page);
4617                                put_page(page);
4618                                goto again;
4619                        }
4620                        if (!PageUptodate(page)) {
4621                                ret = -EIO;
4622                                goto next_page;
4623                        }
4624                }
4625
4626                ret = check_extent_to_block(BTRFS_I(inode), offset, len,
4627                                            nocow_ctx_logical);
4628                if (ret) {
4629                        ret = ret > 0 ? 0 : ret;
4630                        goto next_page;
4631                }
4632
4633                err = write_page_nocow(nocow_ctx->sctx,
4634                                       physical_for_dev_replace, page);
4635                if (err)
4636                        ret = err;
4637next_page:
4638                unlock_page(page);
4639                put_page(page);
4640
4641                if (ret)
4642                        break;
4643
4644                offset += PAGE_SIZE;
4645                physical_for_dev_replace += PAGE_SIZE;
4646                nocow_ctx_logical += PAGE_SIZE;
4647                len -= PAGE_SIZE;
4648        }
4649        ret = COPY_COMPLETE;
4650out:
4651        inode_unlock(inode);
4652        iput(inode);
4653        return ret;
4654}
4655
4656static int write_page_nocow(struct scrub_ctx *sctx,
4657                            u64 physical_for_dev_replace, struct page *page)
4658{
4659        struct bio *bio;
4660        struct btrfs_device *dev;
4661
4662        dev = sctx->wr_tgtdev;
4663        if (!dev)
4664                return -EIO;
4665        if (!dev->bdev) {
4666                btrfs_warn_rl(dev->fs_info,
4667                        "scrub write_page_nocow(bdev == NULL) is unexpected");
4668                return -EIO;
4669        }
4670        bio = btrfs_io_bio_alloc(1);
4671        bio->bi_iter.bi_size = 0;
4672        bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4673        bio_set_dev(bio, dev->bdev);
4674        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC;
4675        /* bio_add_page won't fail on a freshly allocated bio */
4676        bio_add_page(bio, page, PAGE_SIZE, 0);
4677
4678        if (btrfsic_submit_bio_wait(bio)) {
4679                bio_put(bio);
4680                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4681                return -EIO;
4682        }
4683
4684        bio_put(bio);
4685        return 0;
4686}
4687