linux/fs/btrfs/scrub.c
   1/*
   2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/blkdev.h>
  20#include <linux/ratelimit.h>
  21#include "ctree.h"
  22#include "volumes.h"
  23#include "disk-io.h"
  24#include "ordered-data.h"
  25#include "transaction.h"
  26#include "backref.h"
  27#include "extent_io.h"
  28#include "dev-replace.h"
  29#include "check-integrity.h"
  30#include "rcu-string.h"
  31#include "raid56.h"
  32
  33/*
   34 * This is only the first step towards a full-featured scrub. It reads all
   35 * extents and super blocks and verifies the checksums. In case a bad checksum
  36 * is found or the extent cannot be read, good data will be written back if
  37 * any can be found.
  38 *
  39 * Future enhancements:
  40 *  - In case an unrepairable extent is encountered, track which files are
  41 *    affected and report them
  42 *  - track and record media errors, throw out bad devices
  43 *  - add a mode to also read unallocated space
  44 */
  45
  46struct scrub_block;
  47struct scrub_ctx;
  48
  49/*
   50 * The following three values only influence performance.
  51 * The last one configures the number of parallel and outstanding I/O
  52 * operations. The first two values configure an upper limit for the number
  53 * of (dynamically allocated) pages that are added to a bio.
  54 */
  55#define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  56#define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  57#define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
  58
  59/*
  60 * the following value times PAGE_SIZE needs to be large enough to match the
  61 * largest node/leaf/sector size that shall be supported.
  62 * Values larger than BTRFS_STRIPE_LEN are not supported.
  63 */
  64#define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  65
  66struct scrub_recover {
  67        atomic_t                refs;
  68        struct btrfs_bio        *bbio;
  69        u64                     map_length;
  70};
  71
  72struct scrub_page {
  73        struct scrub_block      *sblock;
  74        struct page             *page;
  75        struct btrfs_device     *dev;
  76        struct list_head        list;
  77        u64                     flags;  /* extent flags */
  78        u64                     generation;
  79        u64                     logical;
  80        u64                     physical;
  81        u64                     physical_for_dev_replace;
  82        atomic_t                refs;
  83        struct {
  84                unsigned int    mirror_num:8;
  85                unsigned int    have_csum:1;
  86                unsigned int    io_error:1;
  87        };
  88        u8                      csum[BTRFS_CSUM_SIZE];
  89
  90        struct scrub_recover    *recover;
  91};
  92
  93struct scrub_bio {
  94        int                     index;
  95        struct scrub_ctx        *sctx;
  96        struct btrfs_device     *dev;
  97        struct bio              *bio;
  98        int                     err;
  99        u64                     logical;
 100        u64                     physical;
 101#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
 102        struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
 103#else
 104        struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
 105#endif
 106        int                     page_count;
 107        int                     next_free;
 108        struct btrfs_work       work;
 109};
 110
 111struct scrub_block {
 112        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 113        int                     page_count;
 114        atomic_t                outstanding_pages;
 115        atomic_t                refs; /* free mem on transition to zero */
 116        struct scrub_ctx        *sctx;
 117        struct scrub_parity     *sparity;
 118        struct {
 119                unsigned int    header_error:1;
 120                unsigned int    checksum_error:1;
 121                unsigned int    no_io_error_seen:1;
 122                unsigned int    generation_error:1; /* also sets header_error */
 123
  124                /* The following is for the data used to check parity */
  125                /* It applies only to data that has a checksum */
 126                unsigned int    data_corrected:1;
 127        };
 128};
 129
  130/* Used for chunks with parity stripes, such as RAID5/6 */
 131struct scrub_parity {
 132        struct scrub_ctx        *sctx;
 133
 134        struct btrfs_device     *scrub_dev;
 135
 136        u64                     logic_start;
 137
 138        u64                     logic_end;
 139
 140        int                     nsectors;
 141
 142        int                     stripe_len;
 143
 144        atomic_t                refs;
 145
 146        struct list_head        spages;
 147
 148        /* Work of parity check and repair */
 149        struct btrfs_work       work;
 150
 151        /* Mark the parity blocks which have data */
 152        unsigned long           *dbitmap;
 153
 154        /*
  155         * Mark the parity blocks which have data, but where errors happened
  156         * when reading or checking the data
 157         */
 158        unsigned long           *ebitmap;
 159
 160        unsigned long           bitmap[0];
 161};
 162
 163struct scrub_wr_ctx {
 164        struct scrub_bio *wr_curr_bio;
 165        struct btrfs_device *tgtdev;
 166        int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 167        atomic_t flush_all_writes;
 168        struct mutex wr_lock;
 169};
 170
 171struct scrub_ctx {
 172        struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 173        struct btrfs_root       *dev_root;
 174        int                     first_free;
 175        int                     curr;
 176        atomic_t                bios_in_flight;
 177        atomic_t                workers_pending;
 178        spinlock_t              list_lock;
 179        wait_queue_head_t       list_wait;
 180        u16                     csum_size;
 181        struct list_head        csum_list;
 182        atomic_t                cancel_req;
 183        int                     readonly;
 184        int                     pages_per_rd_bio;
 185        u32                     sectorsize;
 186        u32                     nodesize;
 187
 188        int                     is_dev_replace;
 189        struct scrub_wr_ctx     wr_ctx;
 190
 191        /*
 192         * statistics
 193         */
 194        struct btrfs_scrub_progress stat;
 195        spinlock_t              stat_lock;
 196
 197        /*
 198         * Use a ref counter to avoid use-after-free issues. Scrub workers
 199         * decrement bios_in_flight and workers_pending and then do a wakeup
 200         * on the list_wait wait queue. We must ensure the main scrub task
 201         * doesn't free the scrub context before or while the workers are
 202         * doing the wakeup() call.
 203         */
 204        atomic_t                refs;
 205};
 206
 207struct scrub_fixup_nodatasum {
 208        struct scrub_ctx        *sctx;
 209        struct btrfs_device     *dev;
 210        u64                     logical;
 211        struct btrfs_root       *root;
 212        struct btrfs_work       work;
 213        int                     mirror_num;
 214};
 215
 216struct scrub_nocow_inode {
 217        u64                     inum;
 218        u64                     offset;
 219        u64                     root;
 220        struct list_head        list;
 221};
 222
 223struct scrub_copy_nocow_ctx {
 224        struct scrub_ctx        *sctx;
 225        u64                     logical;
 226        u64                     len;
 227        int                     mirror_num;
 228        u64                     physical_for_dev_replace;
 229        struct list_head        inodes;
 230        struct btrfs_work       work;
 231};
 232
 233struct scrub_warning {
 234        struct btrfs_path       *path;
 235        u64                     extent_item_size;
 236        const char              *errstr;
 237        sector_t                sector;
 238        u64                     logical;
 239        struct btrfs_device     *dev;
 240};
 241
 242static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 243static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 244static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 245static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 246static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 247static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
 248                                     struct scrub_block *sblocks_for_recheck);
 249static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 250                                struct scrub_block *sblock, int is_metadata,
 251                                int have_csum, u8 *csum, u64 generation,
 252                                u16 csum_size, int retry_failed_mirror);
 253static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 254                                         struct scrub_block *sblock,
 255                                         int is_metadata, int have_csum,
 256                                         const u8 *csum, u64 generation,
 257                                         u16 csum_size);
 258static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 259                                             struct scrub_block *sblock_good);
 260static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 261                                            struct scrub_block *sblock_good,
 262                                            int page_num, int force_write);
 263static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 264static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 265                                           int page_num);
 266static int scrub_checksum_data(struct scrub_block *sblock);
 267static int scrub_checksum_tree_block(struct scrub_block *sblock);
 268static int scrub_checksum_super(struct scrub_block *sblock);
 269static void scrub_block_get(struct scrub_block *sblock);
 270static void scrub_block_put(struct scrub_block *sblock);
 271static void scrub_page_get(struct scrub_page *spage);
 272static void scrub_page_put(struct scrub_page *spage);
 273static void scrub_parity_get(struct scrub_parity *sparity);
 274static void scrub_parity_put(struct scrub_parity *sparity);
 275static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 276                                    struct scrub_page *spage);
 277static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 278                       u64 physical, struct btrfs_device *dev, u64 flags,
 279                       u64 gen, int mirror_num, u8 *csum, int force,
 280                       u64 physical_for_dev_replace);
 281static void scrub_bio_end_io(struct bio *bio, int err);
 282static void scrub_bio_end_io_worker(struct btrfs_work *work);
 283static void scrub_block_complete(struct scrub_block *sblock);
 284static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 285                               u64 extent_logical, u64 extent_len,
 286                               u64 *extent_physical,
 287                               struct btrfs_device **extent_dev,
 288                               int *extent_mirror_num);
 289static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 290                              struct scrub_wr_ctx *wr_ctx,
 291                              struct btrfs_fs_info *fs_info,
 292                              struct btrfs_device *dev,
 293                              int is_dev_replace);
 294static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 295static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 296                                    struct scrub_page *spage);
 297static void scrub_wr_submit(struct scrub_ctx *sctx);
 298static void scrub_wr_bio_end_io(struct bio *bio, int err);
 299static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 300static int write_page_nocow(struct scrub_ctx *sctx,
 301                            u64 physical_for_dev_replace, struct page *page);
 302static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 303                                      struct scrub_copy_nocow_ctx *ctx);
 304static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 305                            int mirror_num, u64 physical_for_dev_replace);
 306static void copy_nocow_pages_worker(struct btrfs_work *work);
 307static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 308static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 309static void scrub_put_ctx(struct scrub_ctx *sctx);
 310
 311
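     /*
      * scrub_pending_bio_inc/dec account for scrub bios in flight: the inc
      * side also pins the scrub context so it cannot be freed while the bio
      * is outstanding, the dec side wakes up waiters on list_wait and drops
      * that reference again.
      */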
 312static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 313{
 314        atomic_inc(&sctx->refs);
 315        atomic_inc(&sctx->bios_in_flight);
 316}
 317
 318static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 319{
 320        atomic_dec(&sctx->bios_in_flight);
 321        wake_up(&sctx->list_wait);
 322        scrub_put_ctx(sctx);
 323}
 324
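     /*
      * wait until a pending scrub pause request has been released; the caller
      * must hold fs_info->scrub_lock, which is dropped while waiting and
      * re-taken before returning.
      */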
 325static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 326{
 327        while (atomic_read(&fs_info->scrub_pause_req)) {
 328                mutex_unlock(&fs_info->scrub_lock);
 329                wait_event(fs_info->scrub_pause_wait,
 330                   atomic_read(&fs_info->scrub_pause_req) == 0);
 331                mutex_lock(&fs_info->scrub_lock);
 332        }
 333}
 334
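     /*
      * account this scrub as paused, wait for any pause request to go away
      * and then continue as a running scrub again.
      */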
 335static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 336{
 337        atomic_inc(&fs_info->scrubs_paused);
 338        wake_up(&fs_info->scrub_pause_wait);
 339
 340        mutex_lock(&fs_info->scrub_lock);
 341        __scrub_blocked_if_needed(fs_info);
 342        atomic_dec(&fs_info->scrubs_paused);
 343        mutex_unlock(&fs_info->scrub_lock);
 344
 345        wake_up(&fs_info->scrub_pause_wait);
 346}
 347
 348/*
 349 * used for workers that require transaction commits (i.e., for the
 350 * NOCOW case)
 351 */
 352static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 353{
 354        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 355
 356        atomic_inc(&sctx->refs);
 357        /*
 358         * increment scrubs_running to prevent cancel requests from
 359         * completing as long as a worker is running. we must also
 360         * increment scrubs_paused to prevent deadlocking on pause
  361         * requests used for transaction commits (as the worker uses a
  362         * transaction context). it is safe to regard the worker
  363         * as paused for all practical matters. effectively, we only
  364         * prevent cancellation requests from completing.
 365         */
 366        mutex_lock(&fs_info->scrub_lock);
 367        atomic_inc(&fs_info->scrubs_running);
 368        atomic_inc(&fs_info->scrubs_paused);
 369        mutex_unlock(&fs_info->scrub_lock);
 370
 371        /*
  372         * checking the @scrubs_running == @scrubs_paused condition
  373         * inside wait_event() is not an atomic operation, which means
  374         * we may inc/dec @scrubs_running/@scrubs_paused at any time.
  375         * Wake up @scrub_pause_wait as often as we can so that the
  376         * transaction commit is blocked for as little time as possible.
 377         */
 378        wake_up(&fs_info->scrub_pause_wait);
 379
 380        atomic_inc(&sctx->workers_pending);
 381}
 382
 383/* used for workers that require transaction commits */
 384static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 385{
 386        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 387
 388        /*
  389         * see scrub_pending_trans_workers_inc() for why we're pretending
 390         * to be paused in the scrub counters
 391         */
 392        mutex_lock(&fs_info->scrub_lock);
 393        atomic_dec(&fs_info->scrubs_running);
 394        atomic_dec(&fs_info->scrubs_paused);
 395        mutex_unlock(&fs_info->scrub_lock);
 396        atomic_dec(&sctx->workers_pending);
 397        wake_up(&fs_info->scrub_pause_wait);
 398        wake_up(&sctx->list_wait);
 399        scrub_put_ctx(sctx);
 400}
 401
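     /* free all checksums still queued on the per-context csum_list */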
 402static void scrub_free_csums(struct scrub_ctx *sctx)
 403{
 404        while (!list_empty(&sctx->csum_list)) {
 405                struct btrfs_ordered_sum *sum;
 406                sum = list_first_entry(&sctx->csum_list,
 407                                       struct btrfs_ordered_sum, list);
 408                list_del(&sum->list);
 409                kfree(sum);
 410        }
 411}
 412
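     /*
      * tear down a scrub context: free the dev-replace write context, drop the
      * blocks of a partially filled bio (this can happen when scrub is
      * cancelled), free all scrub_bios and queued checksums, and finally the
      * context itself.
      */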
 413static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 414{
 415        int i;
 416
 417        if (!sctx)
 418                return;
 419
 420        scrub_free_wr_ctx(&sctx->wr_ctx);
 421
 422        /* this can happen when scrub is cancelled */
 423        if (sctx->curr != -1) {
 424                struct scrub_bio *sbio = sctx->bios[sctx->curr];
 425
 426                for (i = 0; i < sbio->page_count; i++) {
 427                        WARN_ON(!sbio->pagev[i]->page);
 428                        scrub_block_put(sbio->pagev[i]->sblock);
 429                }
 430                bio_put(sbio->bio);
 431        }
 432
 433        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 434                struct scrub_bio *sbio = sctx->bios[i];
 435
 436                if (!sbio)
 437                        break;
 438                kfree(sbio);
 439        }
 440
 441        scrub_free_csums(sctx);
 442        kfree(sctx);
 443}
 444
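     /* drop a reference on the scrub context and free it on the last put */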
 445static void scrub_put_ctx(struct scrub_ctx *sctx)
 446{
 447        if (atomic_dec_and_test(&sctx->refs))
 448                scrub_free_ctx(sctx);
 449}
 450
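     /*
      * allocate and initialize a scrub context for one device: the pool of
      * SCRUB_BIOS_PER_SCTX scrub_bios, the statistics and wait queue fields
      * and the write context used by dev-replace.
      */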
 451static noinline_for_stack
 452struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 453{
 454        struct scrub_ctx *sctx;
 455        int             i;
 456        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 457        int pages_per_rd_bio;
 458        int ret;
 459
 460        /*
 461         * the setting of pages_per_rd_bio is correct for scrub but might
 462         * be wrong for the dev_replace code where we might read from
 463         * different devices in the initial huge bios. However, that
 464         * code is able to correctly handle the case when adding a page
 465         * to a bio fails.
 466         */
 467        if (dev->bdev)
 468                pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 469                                         bio_get_nr_vecs(dev->bdev));
 470        else
 471                pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 472        sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 473        if (!sctx)
 474                goto nomem;
 475        atomic_set(&sctx->refs, 1);
 476        sctx->is_dev_replace = is_dev_replace;
 477        sctx->pages_per_rd_bio = pages_per_rd_bio;
 478        sctx->curr = -1;
 479        sctx->dev_root = dev->dev_root;
 480        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 481                struct scrub_bio *sbio;
 482
 483                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 484                if (!sbio)
 485                        goto nomem;
 486                sctx->bios[i] = sbio;
 487
 488                sbio->index = i;
 489                sbio->sctx = sctx;
 490                sbio->page_count = 0;
 491                btrfs_init_work(&sbio->work, btrfs_scrub_helper,
 492                                scrub_bio_end_io_worker, NULL, NULL);
 493
 494                if (i != SCRUB_BIOS_PER_SCTX - 1)
 495                        sctx->bios[i]->next_free = i + 1;
 496                else
 497                        sctx->bios[i]->next_free = -1;
 498        }
 499        sctx->first_free = 0;
 500        sctx->nodesize = dev->dev_root->nodesize;
 501        sctx->sectorsize = dev->dev_root->sectorsize;
 502        atomic_set(&sctx->bios_in_flight, 0);
 503        atomic_set(&sctx->workers_pending, 0);
 504        atomic_set(&sctx->cancel_req, 0);
 505        sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 506        INIT_LIST_HEAD(&sctx->csum_list);
 507
 508        spin_lock_init(&sctx->list_lock);
 509        spin_lock_init(&sctx->stat_lock);
 510        init_waitqueue_head(&sctx->list_wait);
 511
 512        ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 513                                 fs_info->dev_replace.tgtdev, is_dev_replace);
 514        if (ret) {
 515                scrub_free_ctx(sctx);
 516                return ERR_PTR(ret);
 517        }
 518        return sctx;
 519
 520nomem:
 521        scrub_free_ctx(sctx);
 522        return ERR_PTR(-ENOMEM);
 523}
 524
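     /*
      * callback for iterate_extent_inodes(): resolve one inode referencing a
      * damaged extent and print a warning line for every path that leads to
      * it.
      */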
 525static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 526                                     void *warn_ctx)
 527{
 528        u64 isize;
 529        u32 nlink;
 530        int ret;
 531        int i;
 532        struct extent_buffer *eb;
 533        struct btrfs_inode_item *inode_item;
 534        struct scrub_warning *swarn = warn_ctx;
 535        struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 536        struct inode_fs_paths *ipath = NULL;
 537        struct btrfs_root *local_root;
 538        struct btrfs_key root_key;
 539        struct btrfs_key key;
 540
 541        root_key.objectid = root;
 542        root_key.type = BTRFS_ROOT_ITEM_KEY;
 543        root_key.offset = (u64)-1;
 544        local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 545        if (IS_ERR(local_root)) {
 546                ret = PTR_ERR(local_root);
 547                goto err;
 548        }
 549
 550        /*
 551         * this makes the path point to (inum INODE_ITEM ioff)
 552         */
 553        key.objectid = inum;
 554        key.type = BTRFS_INODE_ITEM_KEY;
 555        key.offset = 0;
 556
 557        ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
 558        if (ret) {
 559                btrfs_release_path(swarn->path);
 560                goto err;
 561        }
 562
 563        eb = swarn->path->nodes[0];
 564        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 565                                        struct btrfs_inode_item);
 566        isize = btrfs_inode_size(eb, inode_item);
 567        nlink = btrfs_inode_nlink(eb, inode_item);
 568        btrfs_release_path(swarn->path);
 569
 570        ipath = init_ipath(4096, local_root, swarn->path);
 571        if (IS_ERR(ipath)) {
 572                ret = PTR_ERR(ipath);
 573                ipath = NULL;
 574                goto err;
 575        }
 576        ret = paths_from_inode(inum, ipath);
 577
 578        if (ret < 0)
 579                goto err;
 580
 581        /*
  582         * we deliberately ignore the fact that ipath might have been too small to
 583         * hold all of the paths here
 584         */
 585        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 586                printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 587                        "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 588                        "length %llu, links %u (path: %s)\n", swarn->errstr,
 589                        swarn->logical, rcu_str_deref(swarn->dev->name),
 590                        (unsigned long long)swarn->sector, root, inum, offset,
 591                        min(isize - offset, (u64)PAGE_SIZE), nlink,
 592                        (char *)(unsigned long)ipath->fspath->val[i]);
 593
 594        free_ipath(ipath);
 595        return 0;
 596
 597err:
 598        printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 599                "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 600                "resolving failed with ret=%d\n", swarn->errstr,
 601                swarn->logical, rcu_str_deref(swarn->dev->name),
 602                (unsigned long long)swarn->sector, root, inum, offset, ret);
 603
 604        free_ipath(ipath);
 605        return 0;
 606}
 607
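     /*
      * print a warning for a bad block: for metadata, report the owning tree
      * and level; for data, resolve and report the affected inodes and file
      * paths via the backref code.
      */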
 608static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 609{
 610        struct btrfs_device *dev;
 611        struct btrfs_fs_info *fs_info;
 612        struct btrfs_path *path;
 613        struct btrfs_key found_key;
 614        struct extent_buffer *eb;
 615        struct btrfs_extent_item *ei;
 616        struct scrub_warning swarn;
 617        unsigned long ptr = 0;
 618        u64 extent_item_pos;
 619        u64 flags = 0;
 620        u64 ref_root;
 621        u32 item_size;
 622        u8 ref_level;
 623        int ret;
 624
 625        WARN_ON(sblock->page_count < 1);
 626        dev = sblock->pagev[0]->dev;
 627        fs_info = sblock->sctx->dev_root->fs_info;
 628
 629        path = btrfs_alloc_path();
 630        if (!path)
 631                return;
 632
 633        swarn.sector = (sblock->pagev[0]->physical) >> 9;
 634        swarn.logical = sblock->pagev[0]->logical;
 635        swarn.errstr = errstr;
 636        swarn.dev = NULL;
 637
 638        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 639                                  &flags);
 640        if (ret < 0)
 641                goto out;
 642
 643        extent_item_pos = swarn.logical - found_key.objectid;
 644        swarn.extent_item_size = found_key.offset;
 645
 646        eb = path->nodes[0];
 647        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 648        item_size = btrfs_item_size_nr(eb, path->slots[0]);
 649
 650        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 651                do {
 652                        ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 653                                                      item_size, &ref_root,
 654                                                      &ref_level);
 655                        printk_in_rcu(KERN_WARNING
 656                                "BTRFS: %s at logical %llu on dev %s, "
 657                                "sector %llu: metadata %s (level %d) in tree "
 658                                "%llu\n", errstr, swarn.logical,
 659                                rcu_str_deref(dev->name),
 660                                (unsigned long long)swarn.sector,
 661                                ref_level ? "node" : "leaf",
 662                                ret < 0 ? -1 : ref_level,
 663                                ret < 0 ? -1 : ref_root);
 664                } while (ret != 1);
 665                btrfs_release_path(path);
 666        } else {
 667                btrfs_release_path(path);
 668                swarn.path = path;
 669                swarn.dev = dev;
 670                iterate_extent_inodes(fs_info, found_key.objectid,
 671                                        extent_item_pos, 1,
 672                                        scrub_print_warning_inode, &swarn);
 673        }
 674
 675out:
 676        btrfs_free_path(path);
 677}
 678
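     /*
      * callback for iterate_inodes_from_logical(): try to repair a nodatasum
      * sector through the page cache of one inode that references it; returns
      * 1 to stop the iteration once the sector has been corrected.
      */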
 679static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 680{
 681        struct page *page = NULL;
 682        unsigned long index;
 683        struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 684        int ret;
 685        int corrected = 0;
 686        struct btrfs_key key;
 687        struct inode *inode = NULL;
 688        struct btrfs_fs_info *fs_info;
 689        u64 end = offset + PAGE_SIZE - 1;
 690        struct btrfs_root *local_root;
 691        int srcu_index;
 692
 693        key.objectid = root;
 694        key.type = BTRFS_ROOT_ITEM_KEY;
 695        key.offset = (u64)-1;
 696
 697        fs_info = fixup->root->fs_info;
 698        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 699
 700        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 701        if (IS_ERR(local_root)) {
 702                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 703                return PTR_ERR(local_root);
 704        }
 705
 706        key.type = BTRFS_INODE_ITEM_KEY;
 707        key.objectid = inum;
 708        key.offset = 0;
 709        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 710        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 711        if (IS_ERR(inode))
 712                return PTR_ERR(inode);
 713
 714        index = offset >> PAGE_CACHE_SHIFT;
 715
 716        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 717        if (!page) {
 718                ret = -ENOMEM;
 719                goto out;
 720        }
 721
 722        if (PageUptodate(page)) {
 723                if (PageDirty(page)) {
 724                        /*
  725                         * we need to write the data to the defective sector. the
 726                         * data that was in that sector is not in memory,
 727                         * because the page was modified. we must not write the
 728                         * modified page to that sector.
 729                         *
 730                         * TODO: what could be done here: wait for the delalloc
 731                         *       runner to write out that page (might involve
 732                         *       COW) and see whether the sector is still
 733                         *       referenced afterwards.
 734                         *
  735                         * For the time being, we'll treat this error as
  736                         * uncorrectable, although there is a chance that a
  737                         * later scrub will find the bad sector again and that
  738                         * there's no dirty page in memory by then.
 739                         */
 740                        ret = -EIO;
 741                        goto out;
 742                }
 743                ret = repair_io_failure(inode, offset, PAGE_SIZE,
 744                                        fixup->logical, page,
 745                                        offset - page_offset(page),
 746                                        fixup->mirror_num);
 747                unlock_page(page);
 748                corrected = !ret;
 749        } else {
 750                /*
 751                 * we need to get good data first. the general readpage path
 752                 * will call repair_io_failure for us, we just have to make
 753                 * sure we read the bad mirror.
 754                 */
 755                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 756                                        EXTENT_DAMAGED, GFP_NOFS);
 757                if (ret) {
 758                        /* set_extent_bits should give proper error */
 759                        WARN_ON(ret > 0);
 760                        if (ret > 0)
 761                                ret = -EFAULT;
 762                        goto out;
 763                }
 764
 765                ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 766                                                btrfs_get_extent,
 767                                                fixup->mirror_num);
 768                wait_on_page_locked(page);
 769
 770                corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 771                                                end, EXTENT_DAMAGED, 0, NULL);
 772                if (!corrected)
 773                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 774                                                EXTENT_DAMAGED, GFP_NOFS);
 775        }
 776
 777out:
 778        if (page)
 779                put_page(page);
 780
 781        iput(inode);
 782
 783        if (ret < 0)
 784                return ret;
 785
 786        if (ret == 0 && corrected) {
 787                /*
 788                 * we only need to call readpage for one of the inodes belonging
 789                 * to this extent. so make iterate_extent_inodes stop
 790                 */
 791                return 1;
 792        }
 793
 794        return -EIO;
 795}
 796
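     /*
      * worker for repairing nodatasum (not checksummed, possibly not COWed)
      * data: join a transaction and trigger a repair through the regular
      * readpage path for the inodes referencing the bad logical address.
      */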
 797static void scrub_fixup_nodatasum(struct btrfs_work *work)
 798{
 799        int ret;
 800        struct scrub_fixup_nodatasum *fixup;
 801        struct scrub_ctx *sctx;
 802        struct btrfs_trans_handle *trans = NULL;
 803        struct btrfs_path *path;
 804        int uncorrectable = 0;
 805
 806        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 807        sctx = fixup->sctx;
 808
 809        path = btrfs_alloc_path();
 810        if (!path) {
 811                spin_lock(&sctx->stat_lock);
 812                ++sctx->stat.malloc_errors;
 813                spin_unlock(&sctx->stat_lock);
 814                uncorrectable = 1;
 815                goto out;
 816        }
 817
 818        trans = btrfs_join_transaction(fixup->root);
 819        if (IS_ERR(trans)) {
 820                uncorrectable = 1;
 821                goto out;
 822        }
 823
 824        /*
 825         * the idea is to trigger a regular read through the standard path. we
 826         * read a page from the (failed) logical address by specifying the
  827         * corresponding copy number (mirror) of the failed sector. thus, that
  828         * read is expected to fail.
  829         * that is the point where on-the-fly error correction will kick in
  830         * (once the read has finished) and rewrite the failed sector if a good copy
 831         * can be found.
 832         */
 833        ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 834                                                path, scrub_fixup_readpage,
 835                                                fixup);
 836        if (ret < 0) {
 837                uncorrectable = 1;
 838                goto out;
 839        }
 840        WARN_ON(ret != 1);
 841
 842        spin_lock(&sctx->stat_lock);
 843        ++sctx->stat.corrected_errors;
 844        spin_unlock(&sctx->stat_lock);
 845
 846out:
 847        if (trans && !IS_ERR(trans))
 848                btrfs_end_transaction(trans, fixup->root);
 849        if (uncorrectable) {
 850                spin_lock(&sctx->stat_lock);
 851                ++sctx->stat.uncorrectable_errors;
 852                spin_unlock(&sctx->stat_lock);
 853                btrfs_dev_replace_stats_inc(
 854                        &sctx->dev_root->fs_info->dev_replace.
 855                        num_uncorrectable_read_errors);
 856                printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
 857                    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 858                        fixup->logical, rcu_str_deref(fixup->dev->name));
 859        }
 860
 861        btrfs_free_path(path);
 862        kfree(fixup);
 863
 864        scrub_pending_trans_workers_dec(sctx);
 865}
 866
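     /*
      * reference counting for the recovery state shared by the pages of a
      * recheck block; the last put releases the btrfs_bio and the struct.
      */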
 867static inline void scrub_get_recover(struct scrub_recover *recover)
 868{
 869        atomic_inc(&recover->refs);
 870}
 871
 872static inline void scrub_put_recover(struct scrub_recover *recover)
 873{
 874        if (atomic_dec_and_test(&recover->refs)) {
 875                btrfs_put_bbio(recover->bbio);
 876                kfree(recover);
 877        }
 878}
 879
 880/*
 881 * scrub_handle_errored_block gets called when either verification of the
 882 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 883 * case, this function handles all pages in the bio, even though only one
 884 * may be bad.
 885 * The goal of this function is to repair the errored block by using the
 886 * contents of one of the mirrors.
 887 */
 888static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 889{
 890        struct scrub_ctx *sctx = sblock_to_check->sctx;
 891        struct btrfs_device *dev;
 892        struct btrfs_fs_info *fs_info;
 893        u64 length;
 894        u64 logical;
 895        u64 generation;
 896        unsigned int failed_mirror_index;
 897        unsigned int is_metadata;
 898        unsigned int have_csum;
 899        u8 *csum;
 900        struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 901        struct scrub_block *sblock_bad;
 902        int ret;
 903        int mirror_index;
 904        int page_num;
 905        int success;
 906        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 907                                      DEFAULT_RATELIMIT_BURST);
 908
 909        BUG_ON(sblock_to_check->page_count < 1);
 910        fs_info = sctx->dev_root->fs_info;
 911        if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 912                /*
 913                 * if we find an error in a super block, we just report it.
 914                 * They will get written with the next transaction commit
 915                 * anyway
 916                 */
 917                spin_lock(&sctx->stat_lock);
 918                ++sctx->stat.super_errors;
 919                spin_unlock(&sctx->stat_lock);
 920                return 0;
 921        }
 922        length = sblock_to_check->page_count * PAGE_SIZE;
 923        logical = sblock_to_check->pagev[0]->logical;
 924        generation = sblock_to_check->pagev[0]->generation;
 925        BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 926        failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 927        is_metadata = !(sblock_to_check->pagev[0]->flags &
 928                        BTRFS_EXTENT_FLAG_DATA);
 929        have_csum = sblock_to_check->pagev[0]->have_csum;
 930        csum = sblock_to_check->pagev[0]->csum;
 931        dev = sblock_to_check->pagev[0]->dev;
 932
 933        if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 934                sblocks_for_recheck = NULL;
 935                goto nodatasum_case;
 936        }
 937
 938        /*
  939         * read all mirrors one after the other. This includes re-reading
  940         * the extent or metadata block that failed (which is the reason
  941         * this fixup code was called), this time page by page, in order
  942         * to know which pages caused I/O errors and which ones are good
  943         * (for all mirrors).
  944         * The goal is to handle the situation when more than one
 945         * mirror contains I/O errors, but the errors do not
 946         * overlap, i.e. the data can be repaired by selecting the
 947         * pages from those mirrors without I/O error on the
 948         * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 949         * would be that mirror #1 has an I/O error on the first page,
 950         * the second page is good, and mirror #2 has an I/O error on
 951         * the second page, but the first page is good.
 952         * Then the first page of the first mirror can be repaired by
 953         * taking the first page of the second mirror, and the
 954         * second page of the second mirror can be repaired by
 955         * copying the contents of the 2nd page of the 1st mirror.
 956         * One more note: if the pages of one mirror contain I/O
 957         * errors, the checksum cannot be verified. In order to get
 958         * the best data for repairing, the first attempt is to find
 959         * a mirror without I/O errors and with a validated checksum.
 960         * Only if this is not possible, the pages are picked from
 961         * mirrors with I/O errors without considering the checksum.
 962         * If the latter is the case, at the end, the checksum of the
 963         * repaired area is verified in order to correctly maintain
 964         * the statistics.
 965         */
 966
 967        sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 968                                     sizeof(*sblocks_for_recheck),
 969                                     GFP_NOFS);
 970        if (!sblocks_for_recheck) {
 971                spin_lock(&sctx->stat_lock);
 972                sctx->stat.malloc_errors++;
 973                sctx->stat.read_errors++;
 974                sctx->stat.uncorrectable_errors++;
 975                spin_unlock(&sctx->stat_lock);
 976                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 977                goto out;
 978        }
 979
 980        /* setup the context, map the logical blocks and alloc the pages */
 981        ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
 982        if (ret) {
 983                spin_lock(&sctx->stat_lock);
 984                sctx->stat.read_errors++;
 985                sctx->stat.uncorrectable_errors++;
 986                spin_unlock(&sctx->stat_lock);
 987                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 988                goto out;
 989        }
 990        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 991        sblock_bad = sblocks_for_recheck + failed_mirror_index;
 992
 993        /* build and submit the bios for the failed mirror, check checksums */
 994        scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 995                            csum, generation, sctx->csum_size, 1);
 996
 997        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 998            sblock_bad->no_io_error_seen) {
 999                /*
1000                 * the error disappeared after reading page by page, or
1001                 * the area was part of a huge bio and other parts of the
1002                 * bio caused I/O errors, or the block layer merged several
1003                 * read requests into one and the error is caused by a
1004                 * different bio (usually one of the two latter cases is
1005                 * the cause)
1006                 */
1007                spin_lock(&sctx->stat_lock);
1008                sctx->stat.unverified_errors++;
1009                sblock_to_check->data_corrected = 1;
1010                spin_unlock(&sctx->stat_lock);
1011
1012                if (sctx->is_dev_replace)
1013                        scrub_write_block_to_dev_replace(sblock_bad);
1014                goto out;
1015        }
1016
1017        if (!sblock_bad->no_io_error_seen) {
1018                spin_lock(&sctx->stat_lock);
1019                sctx->stat.read_errors++;
1020                spin_unlock(&sctx->stat_lock);
1021                if (__ratelimit(&_rs))
1022                        scrub_print_warning("i/o error", sblock_to_check);
1023                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
1024        } else if (sblock_bad->checksum_error) {
1025                spin_lock(&sctx->stat_lock);
1026                sctx->stat.csum_errors++;
1027                spin_unlock(&sctx->stat_lock);
1028                if (__ratelimit(&_rs))
1029                        scrub_print_warning("checksum error", sblock_to_check);
1030                btrfs_dev_stat_inc_and_print(dev,
1031                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
1032        } else if (sblock_bad->header_error) {
1033                spin_lock(&sctx->stat_lock);
1034                sctx->stat.verify_errors++;
1035                spin_unlock(&sctx->stat_lock);
1036                if (__ratelimit(&_rs))
1037                        scrub_print_warning("checksum/header error",
1038                                            sblock_to_check);
1039                if (sblock_bad->generation_error)
1040                        btrfs_dev_stat_inc_and_print(dev,
1041                                BTRFS_DEV_STAT_GENERATION_ERRS);
1042                else
1043                        btrfs_dev_stat_inc_and_print(dev,
1044                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
1045        }
1046
1047        if (sctx->readonly) {
1048                ASSERT(!sctx->is_dev_replace);
1049                goto out;
1050        }
1051
1052        if (!is_metadata && !have_csum) {
1053                struct scrub_fixup_nodatasum *fixup_nodatasum;
1054
1055                WARN_ON(sctx->is_dev_replace);
1056
1057nodatasum_case:
1058
1059                /*
 1060                 * !is_metadata and !have_csum, this means that the data
 1061                 * might not be COW'ed, and that it might be modified
1062                 * concurrently. The general strategy to work on the
1063                 * commit root does not help in the case when COW is not
1064                 * used.
1065                 */
1066                fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
1067                if (!fixup_nodatasum)
1068                        goto did_not_correct_error;
1069                fixup_nodatasum->sctx = sctx;
1070                fixup_nodatasum->dev = dev;
1071                fixup_nodatasum->logical = logical;
1072                fixup_nodatasum->root = fs_info->extent_root;
1073                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
1074                scrub_pending_trans_workers_inc(sctx);
1075                btrfs_init_work(&fixup_nodatasum->work, btrfs_scrub_helper,
1076                                scrub_fixup_nodatasum, NULL, NULL);
1077                btrfs_queue_work(fs_info->scrub_workers,
1078                                 &fixup_nodatasum->work);
1079                goto out;
1080        }
1081
1082        /*
1083         * now build and submit the bios for the other mirrors, check
1084         * checksums.
1085         * First try to pick the mirror which is completely without I/O
1086         * errors and also does not have a checksum error.
1087         * If one is found, and if a checksum is present, the full block
1088         * that is known to contain an error is rewritten. Afterwards
1089         * the block is known to be corrected.
1090         * If a mirror is found which is completely correct, and no
1091         * checksum is present, only those pages are rewritten that had
1092         * an I/O error in the block to be repaired, since it cannot be
 1093         * determined which copy of the other pages is better (and it
1094         * could happen otherwise that a correct page would be
1095         * overwritten by a bad one).
1096         */
1097        for (mirror_index = 0;
1098             mirror_index < BTRFS_MAX_MIRRORS &&
1099             sblocks_for_recheck[mirror_index].page_count > 0;
1100             mirror_index++) {
1101                struct scrub_block *sblock_other;
1102
1103                if (mirror_index == failed_mirror_index)
1104                        continue;
1105                sblock_other = sblocks_for_recheck + mirror_index;
1106
1107                /* build and submit the bios, check checksums */
1108                scrub_recheck_block(fs_info, sblock_other, is_metadata,
1109                                    have_csum, csum, generation,
1110                                    sctx->csum_size, 0);
1111
1112                if (!sblock_other->header_error &&
1113                    !sblock_other->checksum_error &&
1114                    sblock_other->no_io_error_seen) {
1115                        if (sctx->is_dev_replace) {
1116                                scrub_write_block_to_dev_replace(sblock_other);
1117                                goto corrected_error;
1118                        } else {
1119                                ret = scrub_repair_block_from_good_copy(
1120                                                sblock_bad, sblock_other);
1121                                if (!ret)
1122                                        goto corrected_error;
1123                        }
1124                }
1125        }
1126
1127        if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1128                goto did_not_correct_error;
1129
1130        /*
1131         * In case of I/O errors in the area that is supposed to be
1132         * repaired, continue by picking good copies of those pages.
1133         * Select the good pages from mirrors to rewrite bad pages from
1134         * the area to fix. Afterwards verify the checksum of the block
1135         * that is supposed to be repaired. This verification step is
 1136         * only done for the purpose of statistics counting and for the
 1137         * final scrub report on whether errors remain.
1138         * A perfect algorithm could make use of the checksum and try
1139         * all possible combinations of pages from the different mirrors
1140         * until the checksum verification succeeds. For example, when
1141         * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1142         * of mirror #2 is readable but the final checksum test fails,
 1143         * then the 2nd page of mirror #3 could be tried to see whether
 1144         * the final checksum then succeeds. But this would be a rare
 1145         * exception and is therefore not implemented. At least we avoid
 1146         * overwriting the good copy.
 1147         * A more useful improvement would be to pick the sectors
 1148         * without I/O errors based on the sector size (512 bytes on legacy
 1149         * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
 1150         * mirror could be repaired by taking 512 bytes of a different
 1151         * mirror, even if other 512-byte sectors in the same PAGE_SIZE
1152         * area are unreadable.
1153         */
1154        success = 1;
1155        for (page_num = 0; page_num < sblock_bad->page_count;
1156             page_num++) {
1157                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1158                struct scrub_block *sblock_other = NULL;
1159
1160                /* skip no-io-error page in scrub */
1161                if (!page_bad->io_error && !sctx->is_dev_replace)
1162                        continue;
1163
1164                /* try to find no-io-error page in mirrors */
1165                if (page_bad->io_error) {
1166                        for (mirror_index = 0;
1167                             mirror_index < BTRFS_MAX_MIRRORS &&
1168                             sblocks_for_recheck[mirror_index].page_count > 0;
1169                             mirror_index++) {
1170                                if (!sblocks_for_recheck[mirror_index].
1171                                    pagev[page_num]->io_error) {
1172                                        sblock_other = sblocks_for_recheck +
1173                                                       mirror_index;
1174                                        break;
1175                                }
1176                        }
1177                        if (!sblock_other)
1178                                success = 0;
1179                }
1180
1181                if (sctx->is_dev_replace) {
1182                        /*
1183                         * did not find a mirror to fetch the page
1184                         * from. scrub_write_page_to_dev_replace()
1185                         * handles this case (page->io_error), by
1186                         * filling the block with zeros before
1187                         * submitting the write request
1188                         */
1189                        if (!sblock_other)
1190                                sblock_other = sblock_bad;
1191
1192                        if (scrub_write_page_to_dev_replace(sblock_other,
1193                                                            page_num) != 0) {
1194                                btrfs_dev_replace_stats_inc(
1195                                        &sctx->dev_root->
1196                                        fs_info->dev_replace.
1197                                        num_write_errors);
1198                                success = 0;
1199                        }
1200                } else if (sblock_other) {
1201                        ret = scrub_repair_page_from_good_copy(sblock_bad,
1202                                                               sblock_other,
1203                                                               page_num, 0);
1204                        if (0 == ret)
1205                                page_bad->io_error = 0;
1206                        else
1207                                success = 0;
1208                }
1209        }
1210
1211        if (success && !sctx->is_dev_replace) {
1212                if (is_metadata || have_csum) {
1213                        /*
1214                         * need to verify the checksum now that all
1215                         * sectors on disk are repaired (the write
1216                         * request for data to be repaired is on its way).
1217                         * Just be lazy and use scrub_recheck_block()
1218                         * which re-reads the data before the checksum
1219                         * is verified, but most likely the data comes out
1220                         * of the page cache.
1221                         */
1222                        scrub_recheck_block(fs_info, sblock_bad,
1223                                            is_metadata, have_csum, csum,
1224                                            generation, sctx->csum_size, 1);
1225                        if (!sblock_bad->header_error &&
1226                            !sblock_bad->checksum_error &&
1227                            sblock_bad->no_io_error_seen)
1228                                goto corrected_error;
1229                        else
1230                                goto did_not_correct_error;
1231                } else {
1232corrected_error:
1233                        spin_lock(&sctx->stat_lock);
1234                        sctx->stat.corrected_errors++;
1235                        sblock_to_check->data_corrected = 1;
1236                        spin_unlock(&sctx->stat_lock);
1237                        printk_ratelimited_in_rcu(KERN_ERR
1238                                "BTRFS: fixed up error at logical %llu on dev %s\n",
1239                                logical, rcu_str_deref(dev->name));
1240                }
1241        } else {
1242did_not_correct_error:
1243                spin_lock(&sctx->stat_lock);
1244                sctx->stat.uncorrectable_errors++;
1245                spin_unlock(&sctx->stat_lock);
1246                printk_ratelimited_in_rcu(KERN_ERR
1247                        "BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
1248                        logical, rcu_str_deref(dev->name));
1249        }
1250
1251out:
1252        if (sblocks_for_recheck) {
1253                for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1254                     mirror_index++) {
1255                        struct scrub_block *sblock = sblocks_for_recheck +
1256                                                     mirror_index;
1257                        struct scrub_recover *recover;
1258                        int page_index;
1259
1260                        for (page_index = 0; page_index < sblock->page_count;
1261                             page_index++) {
1262                                sblock->pagev[page_index]->sblock = NULL;
1263                                recover = sblock->pagev[page_index]->recover;
1264                                if (recover) {
1265                                        scrub_put_recover(recover);
1266                                        sblock->pagev[page_index]->recover =
1267                                                                        NULL;
1268                                }
1269                                scrub_page_put(sblock->pagev[page_index]);
1270                        }
1271                }
1272                kfree(sblocks_for_recheck);
1273        }
1274
1275        return 0;
1276}
1277
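     /*
      * number of ways a block can be read: 2 for RAID5 (the data stripe plus
      * a rebuild from parity), 3 for RAID6, otherwise the number of stripes
      * reported by the mapping code.
      */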
1278static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1279{
1280        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1281                return 2;
1282        else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1283                return 3;
1284        else
1285                return (int)bbio->num_stripes;
1286}
1287
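     /*
      * map a logical address to the stripe holding it: for RAID5/6 search the
      * raid_map for the data stripe that covers @logical (skipping P/Q
      * stripes), for all other profiles the mirror number selects the stripe
      * directly and the offset is 0.
      */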
1288static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1289                                                 u64 *raid_map,
1290                                                 u64 mapped_length,
1291                                                 int nstripes, int mirror,
1292                                                 int *stripe_index,
1293                                                 u64 *stripe_offset)
1294{
1295        int i;
1296
1297        if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1298                /* RAID5/6 */
1299                for (i = 0; i < nstripes; i++) {
1300                        if (raid_map[i] == RAID6_Q_STRIPE ||
1301                            raid_map[i] == RAID5_P_STRIPE)
1302                                continue;
1303
1304                        if (logical >= raid_map[i] &&
1305                            logical < raid_map[i] + mapped_length)
1306                                break;
1307                }
1308
1309                *stripe_index = i;
1310                *stripe_offset = logical - raid_map[i];
1311        } else {
1312                /* The other RAID type */
1313                *stripe_index = mirror;
1314                *stripe_offset = 0;
1315        }
1316}
1317
1318static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1319                                     struct scrub_block *sblocks_for_recheck)
1320{
1321        struct scrub_ctx *sctx = original_sblock->sctx;
1322        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1323        u64 length = original_sblock->page_count * PAGE_SIZE;
1324        u64 logical = original_sblock->pagev[0]->logical;
1325        struct scrub_recover *recover;
1326        struct btrfs_bio *bbio;
1327        u64 sublen;
1328        u64 mapped_length;
1329        u64 stripe_offset;
1330        int stripe_index;
1331        int page_index = 0;
1332        int mirror_index;
1333        int nmirrors;
1334        int ret;
1335
1336        /*
1337         * note: the two members refs and outstanding_pages
1338         * are not used (and not set) in the blocks that are used for
1339         * the recheck procedure
1340         */
1341
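            /*
             * For each PAGE_SIZE piece of the original block, map the logical
             * address with REQ_GET_READ_MIRRORS so that one stripe is returned
             * per mirror, and add one scrub_page per mirror. After the loop,
             * sblocks_for_recheck[mirror_index] describes a complete copy of
             * the block for that mirror, ready to be read and verified.
             */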
1342        while (length > 0) {
1343                sublen = min_t(u64, length, PAGE_SIZE);
1344                mapped_length = sublen;
1345                bbio = NULL;
1346
1347                /*
1348                 * with a length of PAGE_SIZE, each returned stripe
1349                 * represents one mirror
1350                 */
1351                ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1352                                       &mapped_length, &bbio, 0, 1);
1353                if (ret || !bbio || mapped_length < sublen) {
1354                        btrfs_put_bbio(bbio);
1355                        return -EIO;
1356                }
1357
1358                recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1359                if (!recover) {
1360                        btrfs_put_bbio(bbio);
1361                        return -ENOMEM;
1362                }
1363
1364                atomic_set(&recover->refs, 1);
1365                recover->bbio = bbio;
1366                recover->map_length = mapped_length;
1367
1368                BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1369
1370                nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1371
1372                for (mirror_index = 0; mirror_index < nmirrors;
1373                     mirror_index++) {
1374                        struct scrub_block *sblock;
1375                        struct scrub_page *page;
1376
1377                        sblock = sblocks_for_recheck + mirror_index;
1378                        sblock->sctx = sctx;
1379                        page = kzalloc(sizeof(*page), GFP_NOFS);
1380                        if (!page) {
1381leave_nomem:
1382                                spin_lock(&sctx->stat_lock);
1383                                sctx->stat.malloc_errors++;
1384                                spin_unlock(&sctx->stat_lock);
1385                                scrub_put_recover(recover);
1386                                return -ENOMEM;
1387                        }
1388                        scrub_page_get(page);
1389                        sblock->pagev[page_index] = page;
1390                        page->logical = logical;
1391
1392                        scrub_stripe_index_and_offset(logical,
1393                                                      bbio->map_type,
1394                                                      bbio->raid_map,
1395                                                      mapped_length,
1396                                                      bbio->num_stripes -
1397                                                      bbio->num_tgtdevs,
1398                                                      mirror_index,
1399                                                      &stripe_index,
1400                                                      &stripe_offset);
1401                        page->physical = bbio->stripes[stripe_index].physical +
1402                                         stripe_offset;
1403                        page->dev = bbio->stripes[stripe_index].dev;
1404
1405                        BUG_ON(page_index >= original_sblock->page_count);
1406                        page->physical_for_dev_replace =
1407                                original_sblock->pagev[page_index]->
1408                                physical_for_dev_replace;
1409                        /* for missing devices, dev->bdev is NULL */
1410                        page->mirror_num = mirror_index + 1;
1411                        sblock->page_count++;
1412                        page->page = alloc_page(GFP_NOFS);
1413                        if (!page->page)
1414                                goto leave_nomem;
1415
1416                        scrub_get_recover(recover);
1417                        page->recover = recover;
1418                }
1419                scrub_put_recover(recover);
1420                length -= sublen;
1421                logical += sublen;
1422                page_index++;
1423        }
1424
1425        return 0;
1426}
1427
1428struct scrub_bio_ret {
1429        struct completion event;
1430        int error;
1431};
1432
1433static void scrub_bio_wait_endio(struct bio *bio, int error)
1434{
1435        struct scrub_bio_ret *ret = bio->bi_private;
1436
1437        ret->error = error;
1438        complete(&ret->event);
1439}
1440
1441static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1442{
1443        return page->recover &&
1444               (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1445}
1446
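    /*
     * Read one page synchronously through the RAID5/6 recovery path: the bio
     * is handed to raid56_parity_recover() together with the bbio cached in
     * page->recover, and the caller sleeps on a completion until
     * scrub_bio_wait_endio() fires. Returns 0 on success, -EIO if the rebuilt
     * data could not be read.
     */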
1447static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1448                                        struct bio *bio,
1449                                        struct scrub_page *page)
1450{
1451        struct scrub_bio_ret done;
1452        int ret;
1453
1454        init_completion(&done.event);
1455        done.error = 0;
1456        bio->bi_iter.bi_sector = page->logical >> 9;
1457        bio->bi_private = &done;
1458        bio->bi_end_io = scrub_bio_wait_endio;
1459
1460        ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1461                                    page->recover->map_length,
1462                                    page->mirror_num, 0);
1463        if (ret)
1464                return ret;
1465
1466        wait_for_completion(&done.event);
1467        if (done.error)
1468                return -EIO;
1469
1470        return 0;
1471}
1472
1473/*
1474 * this function checks the on disk data for checksum errors, header
1475 * errors and read I/O errors. If any I/O error happens, the exact pages
1476 * that failed are marked as bad. The goal is to enable scrub to take the
1477 * pages that did not fail from all mirrors, so that the pages that failed
1478 * in the just handled mirror can be repaired.
1479 */
1480static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1481                                struct scrub_block *sblock, int is_metadata,
1482                                int have_csum, u8 *csum, u64 generation,
1483                                u16 csum_size, int retry_failed_mirror)
1484{
1485        int page_num;
1486
1487        sblock->no_io_error_seen = 1;
1488        sblock->header_error = 0;
1489        sblock->checksum_error = 0;
1490
1491        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1492                struct bio *bio;
1493                struct scrub_page *page = sblock->pagev[page_num];
1494
1495                if (page->dev->bdev == NULL) {
1496                        page->io_error = 1;
1497                        sblock->no_io_error_seen = 0;
1498                        continue;
1499                }
1500
1501                WARN_ON(!page->page);
1502                bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1503                if (!bio) {
1504                        page->io_error = 1;
1505                        sblock->no_io_error_seen = 0;
1506                        continue;
1507                }
1508                bio->bi_bdev = page->dev->bdev;
1509
1510                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1511                if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) {
1512                        if (scrub_submit_raid56_bio_wait(fs_info, bio, page))
1513                                sblock->no_io_error_seen = 0;
1514                } else {
1515                        bio->bi_iter.bi_sector = page->physical >> 9;
1516
1517                        if (btrfsic_submit_bio_wait(READ, bio))
1518                                sblock->no_io_error_seen = 0;
1519                }
1520
1521                bio_put(bio);
1522        }
1523
1524        if (sblock->no_io_error_seen)
1525                scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1526                                             have_csum, csum, generation,
1527                                             csum_size);
1528
1529        return;
1530}
1531
1532static inline int scrub_check_fsid(u8 fsid[],
1533                                   struct scrub_page *spage)
1534{
1535        struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices;
1536        int ret;
1537
1538        ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE);
1539        return !ret;
1540}
1541
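    /*
     * Verify a block that was read without I/O errors: compute the checksum
     * over all pages (for metadata, skipping the csum field embedded in the
     * header of the first page) and compare it against the stored value. For
     * metadata the bytenr, fsid, chunk tree uuid and generation in the header
     * are checked as well; mismatches set header_error/generation_error.
     */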
1542static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1543                                         struct scrub_block *sblock,
1544                                         int is_metadata, int have_csum,
1545                                         const u8 *csum, u64 generation,
1546                                         u16 csum_size)
1547{
1548        int page_num;
1549        u8 calculated_csum[BTRFS_CSUM_SIZE];
1550        u32 crc = ~(u32)0;
1551        void *mapped_buffer;
1552
1553        WARN_ON(!sblock->pagev[0]->page);
1554        if (is_metadata) {
1555                struct btrfs_header *h;
1556
1557                mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1558                h = (struct btrfs_header *)mapped_buffer;
1559
1560                if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
1561                    !scrub_check_fsid(h->fsid, sblock->pagev[0]) ||
1562                    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1563                           BTRFS_UUID_SIZE)) {
1564                        sblock->header_error = 1;
1565                } else if (generation != btrfs_stack_header_generation(h)) {
1566                        sblock->header_error = 1;
1567                        sblock->generation_error = 1;
1568                }
1569                csum = h->csum;
1570        } else {
1571                if (!have_csum)
1572                        return;
1573
1574                mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1575        }
1576
1577        for (page_num = 0;;) {
1578                if (page_num == 0 && is_metadata)
1579                        crc = btrfs_csum_data(
1580                                ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1581                                crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1582                else
1583                        crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
1584
1585                kunmap_atomic(mapped_buffer);
1586                page_num++;
1587                if (page_num >= sblock->page_count)
1588                        break;
1589                WARN_ON(!sblock->pagev[page_num]->page);
1590
1591                mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1592        }
1593
1594        btrfs_csum_final(crc, calculated_csum);
1595        if (memcmp(calculated_csum, csum, csum_size))
1596                sblock->checksum_error = 1;
1597}
1598
1599static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1600                                             struct scrub_block *sblock_good)
1601{
1602        int page_num;
1603        int ret = 0;
1604
1605        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1606                int ret_sub;
1607
1608                ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1609                                                           sblock_good,
1610                                                           page_num, 1);
1611                if (ret_sub)
1612                        ret = ret_sub;
1613        }
1614
1615        return ret;
1616}
1617
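    /*
     * Overwrite one page of the bad mirror with the corresponding page of the
     * good mirror. The write goes to the bad page's physical location on the
     * bad device and is only issued if it is forced, if the block has
     * header/checksum errors, or if this page itself saw an I/O error.
     */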
1618static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1619                                            struct scrub_block *sblock_good,
1620                                            int page_num, int force_write)
1621{
1622        struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1623        struct scrub_page *page_good = sblock_good->pagev[page_num];
1624
1625        BUG_ON(page_bad->page == NULL);
1626        BUG_ON(page_good->page == NULL);
1627        if (force_write || sblock_bad->header_error ||
1628            sblock_bad->checksum_error || page_bad->io_error) {
1629                struct bio *bio;
1630                int ret;
1631
1632                if (!page_bad->dev->bdev) {
1633                        printk_ratelimited(KERN_WARNING "BTRFS: "
1634                                "scrub_repair_page_from_good_copy(bdev == NULL) "
1635                                "is unexpected!\n");
1636                        return -EIO;
1637                }
1638
1639                bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
1640                if (!bio)
1641                        return -EIO;
1642                bio->bi_bdev = page_bad->dev->bdev;
1643                bio->bi_iter.bi_sector = page_bad->physical >> 9;
1644
1645                ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1646                if (PAGE_SIZE != ret) {
1647                        bio_put(bio);
1648                        return -EIO;
1649                }
1650
1651                if (btrfsic_submit_bio_wait(WRITE, bio)) {
1652                        btrfs_dev_stat_inc_and_print(page_bad->dev,
1653                                BTRFS_DEV_STAT_WRITE_ERRS);
1654                        btrfs_dev_replace_stats_inc(
1655                                &sblock_bad->sctx->dev_root->fs_info->
1656                                dev_replace.num_write_errors);
1657                        bio_put(bio);
1658                        return -EIO;
1659                }
1660                bio_put(bio);
1661        }
1662
1663        return 0;
1664}
1665
1666static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1667{
1668        int page_num;
1669
1670        /*
1671         * This block is used to check the parity on the source device, so
1672         * the data needn't be written into the destination device.
1673         */
1674        if (sblock->sparity)
1675                return;
1676
1677        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1678                int ret;
1679
1680                ret = scrub_write_page_to_dev_replace(sblock, page_num);
1681                if (ret)
1682                        btrfs_dev_replace_stats_inc(
1683                                &sblock->sctx->dev_root->fs_info->dev_replace.
1684                                num_write_errors);
1685        }
1686}
1687
1688static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1689                                           int page_num)
1690{
1691        struct scrub_page *spage = sblock->pagev[page_num];
1692
1693        BUG_ON(spage->page == NULL);
1694        if (spage->io_error) {
1695                void *mapped_buffer = kmap_atomic(spage->page);
1696
1697                memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1698                flush_dcache_page(spage->page);
1699                kunmap_atomic(mapped_buffer);
1700        }
1701        return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1702}
1703
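    /*
     * Queue one page for writing to the dev-replace target. Pages are packed
     * into wr_curr_bio as long as they are physically and logically contiguous
     * with the pages already in it; otherwise the current bio is submitted and
     * a new one is started. A bio that reaches pages_per_wr_bio pages is
     * submitted immediately.
     */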
1704static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1705                                    struct scrub_page *spage)
1706{
1707        struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1708        struct scrub_bio *sbio;
1709        int ret;
1710
1711        mutex_lock(&wr_ctx->wr_lock);
1712again:
1713        if (!wr_ctx->wr_curr_bio) {
1714                wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1715                                              GFP_NOFS);
1716                if (!wr_ctx->wr_curr_bio) {
1717                        mutex_unlock(&wr_ctx->wr_lock);
1718                        return -ENOMEM;
1719                }
1720                wr_ctx->wr_curr_bio->sctx = sctx;
1721                wr_ctx->wr_curr_bio->page_count = 0;
1722        }
1723        sbio = wr_ctx->wr_curr_bio;
1724        if (sbio->page_count == 0) {
1725                struct bio *bio;
1726
1727                sbio->physical = spage->physical_for_dev_replace;
1728                sbio->logical = spage->logical;
1729                sbio->dev = wr_ctx->tgtdev;
1730                bio = sbio->bio;
1731                if (!bio) {
1732                        bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1733                        if (!bio) {
1734                                mutex_unlock(&wr_ctx->wr_lock);
1735                                return -ENOMEM;
1736                        }
1737                        sbio->bio = bio;
1738                }
1739
1740                bio->bi_private = sbio;
1741                bio->bi_end_io = scrub_wr_bio_end_io;
1742                bio->bi_bdev = sbio->dev->bdev;
1743                bio->bi_iter.bi_sector = sbio->physical >> 9;
1744                sbio->err = 0;
1745        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1746                   spage->physical_for_dev_replace ||
1747                   sbio->logical + sbio->page_count * PAGE_SIZE !=
1748                   spage->logical) {
1749                scrub_wr_submit(sctx);
1750                goto again;
1751        }
1752
1753        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1754        if (ret != PAGE_SIZE) {
1755                if (sbio->page_count < 1) {
1756                        bio_put(sbio->bio);
1757                        sbio->bio = NULL;
1758                        mutex_unlock(&wr_ctx->wr_lock);
1759                        return -EIO;
1760                }
1761                scrub_wr_submit(sctx);
1762                goto again;
1763        }
1764
1765        sbio->pagev[sbio->page_count] = spage;
1766        scrub_page_get(spage);
1767        sbio->page_count++;
1768        if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1769                scrub_wr_submit(sctx);
1770        mutex_unlock(&wr_ctx->wr_lock);
1771
1772        return 0;
1773}
1774
1775static void scrub_wr_submit(struct scrub_ctx *sctx)
1776{
1777        struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1778        struct scrub_bio *sbio;
1779
1780        if (!wr_ctx->wr_curr_bio)
1781                return;
1782
1783        sbio = wr_ctx->wr_curr_bio;
1784        wr_ctx->wr_curr_bio = NULL;
1785        WARN_ON(!sbio->bio->bi_bdev);
1786        scrub_pending_bio_inc(sctx);
1787        /* process all writes in a single worker thread. Then the block layer
1788         * orders the requests before sending them to the driver, which
1789         * doubled the write performance on spinning disks when measured
1790         * with Linux 3.5 */
1791        btrfsic_submit_bio(WRITE, sbio->bio);
1792}
1793
1794static void scrub_wr_bio_end_io(struct bio *bio, int err)
1795{
1796        struct scrub_bio *sbio = bio->bi_private;
1797        struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1798
1799        sbio->err = err;
1800        sbio->bio = bio;
1801
1802        btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
1803                         scrub_wr_bio_end_io_worker, NULL, NULL);
1804        btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1805}
1806
1807static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1808{
1809        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1810        struct scrub_ctx *sctx = sbio->sctx;
1811        int i;
1812
1813        WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1814        if (sbio->err) {
1815                struct btrfs_dev_replace *dev_replace =
1816                        &sbio->sctx->dev_root->fs_info->dev_replace;
1817
1818                for (i = 0; i < sbio->page_count; i++) {
1819                        struct scrub_page *spage = sbio->pagev[i];
1820
1821                        spage->io_error = 1;
1822                        btrfs_dev_replace_stats_inc(&dev_replace->
1823                                                    num_write_errors);
1824                }
1825        }
1826
1827        for (i = 0; i < sbio->page_count; i++)
1828                scrub_page_put(sbio->pagev[i]);
1829
1830        bio_put(sbio->bio);
1831        kfree(sbio);
1832        scrub_pending_bio_dec(sctx);
1833}
1834
1835static int scrub_checksum(struct scrub_block *sblock)
1836{
1837        u64 flags;
1838        int ret;
1839
1840        WARN_ON(sblock->page_count < 1);
1841        flags = sblock->pagev[0]->flags;
1842        ret = 0;
1843        if (flags & BTRFS_EXTENT_FLAG_DATA)
1844                ret = scrub_checksum_data(sblock);
1845        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1846                ret = scrub_checksum_tree_block(sblock);
1847        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1848                (void)scrub_checksum_super(sblock);
1849        else
1850                WARN_ON(1);
1851        if (ret)
1852                scrub_handle_errored_block(sblock);
1853
1854        return ret;
1855}
1856
1857static int scrub_checksum_data(struct scrub_block *sblock)
1858{
1859        struct scrub_ctx *sctx = sblock->sctx;
1860        u8 csum[BTRFS_CSUM_SIZE];
1861        u8 *on_disk_csum;
1862        struct page *page;
1863        void *buffer;
1864        u32 crc = ~(u32)0;
1865        int fail = 0;
1866        u64 len;
1867        int index;
1868
1869        BUG_ON(sblock->page_count < 1);
1870        if (!sblock->pagev[0]->have_csum)
1871                return 0;
1872
1873        on_disk_csum = sblock->pagev[0]->csum;
1874        page = sblock->pagev[0]->page;
1875        buffer = kmap_atomic(page);
1876
1877        len = sctx->sectorsize;
1878        index = 0;
1879        for (;;) {
1880                u64 l = min_t(u64, len, PAGE_SIZE);
1881
1882                crc = btrfs_csum_data(buffer, crc, l);
1883                kunmap_atomic(buffer);
1884                len -= l;
1885                if (len == 0)
1886                        break;
1887                index++;
1888                BUG_ON(index >= sblock->page_count);
1889                BUG_ON(!sblock->pagev[index]->page);
1890                page = sblock->pagev[index]->page;
1891                buffer = kmap_atomic(page);
1892        }
1893
1894        btrfs_csum_final(crc, csum);
1895        if (memcmp(csum, on_disk_csum, sctx->csum_size))
1896                fail = 1;
1897
1898        return fail;
1899}
1900
1901static int scrub_checksum_tree_block(struct scrub_block *sblock)
1902{
1903        struct scrub_ctx *sctx = sblock->sctx;
1904        struct btrfs_header *h;
1905        struct btrfs_root *root = sctx->dev_root;
1906        struct btrfs_fs_info *fs_info = root->fs_info;
1907        u8 calculated_csum[BTRFS_CSUM_SIZE];
1908        u8 on_disk_csum[BTRFS_CSUM_SIZE];
1909        struct page *page;
1910        void *mapped_buffer;
1911        u64 mapped_size;
1912        void *p;
1913        u32 crc = ~(u32)0;
1914        int fail = 0;
1915        int crc_fail = 0;
1916        u64 len;
1917        int index;
1918
1919        BUG_ON(sblock->page_count < 1);
1920        page = sblock->pagev[0]->page;
1921        mapped_buffer = kmap_atomic(page);
1922        h = (struct btrfs_header *)mapped_buffer;
1923        memcpy(on_disk_csum, h->csum, sctx->csum_size);
1924
1925        /*
1926         * we don't use the getter functions here, as we
1927         * a) don't have an extent buffer and
1928         * b) the page is already kmapped
1929         */
1930
1931        if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
1932                ++fail;
1933
1934        if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
1935                ++fail;
1936
1937        if (!scrub_check_fsid(h->fsid, sblock->pagev[0]))
1938                ++fail;
1939
1940        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1941                   BTRFS_UUID_SIZE))
1942                ++fail;
1943
1944        len = sctx->nodesize - BTRFS_CSUM_SIZE;
1945        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1946        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1947        index = 0;
1948        for (;;) {
1949                u64 l = min_t(u64, len, mapped_size);
1950
1951                crc = btrfs_csum_data(p, crc, l);
1952                kunmap_atomic(mapped_buffer);
1953                len -= l;
1954                if (len == 0)
1955                        break;
1956                index++;
1957                BUG_ON(index >= sblock->page_count);
1958                BUG_ON(!sblock->pagev[index]->page);
1959                page = sblock->pagev[index]->page;
1960                mapped_buffer = kmap_atomic(page);
1961                mapped_size = PAGE_SIZE;
1962                p = mapped_buffer;
1963        }
1964
1965        btrfs_csum_final(crc, calculated_csum);
1966        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1967                ++crc_fail;
1968
1969        return fail || crc_fail;
1970}
1971
1972static int scrub_checksum_super(struct scrub_block *sblock)
1973{
1974        struct btrfs_super_block *s;
1975        struct scrub_ctx *sctx = sblock->sctx;
1976        u8 calculated_csum[BTRFS_CSUM_SIZE];
1977        u8 on_disk_csum[BTRFS_CSUM_SIZE];
1978        struct page *page;
1979        void *mapped_buffer;
1980        u64 mapped_size;
1981        void *p;
1982        u32 crc = ~(u32)0;
1983        int fail_gen = 0;
1984        int fail_cor = 0;
1985        u64 len;
1986        int index;
1987
1988        BUG_ON(sblock->page_count < 1);
1989        page = sblock->pagev[0]->page;
1990        mapped_buffer = kmap_atomic(page);
1991        s = (struct btrfs_super_block *)mapped_buffer;
1992        memcpy(on_disk_csum, s->csum, sctx->csum_size);
1993
1994        if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
1995                ++fail_cor;
1996
1997        if (sblock->pagev[0]->generation != btrfs_super_generation(s))
1998                ++fail_gen;
1999
2000        if (!scrub_check_fsid(s->fsid, sblock->pagev[0]))
2001                ++fail_cor;
2002
2003        len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
2004        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
2005        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
2006        index = 0;
2007        for (;;) {
2008                u64 l = min_t(u64, len, mapped_size);
2009
2010                crc = btrfs_csum_data(p, crc, l);
2011                kunmap_atomic(mapped_buffer);
2012                len -= l;
2013                if (len == 0)
2014                        break;
2015                index++;
2016                BUG_ON(index >= sblock->page_count);
2017                BUG_ON(!sblock->pagev[index]->page);
2018                page = sblock->pagev[index]->page;
2019                mapped_buffer = kmap_atomic(page);
2020                mapped_size = PAGE_SIZE;
2021                p = mapped_buffer;
2022        }
2023
2024        btrfs_csum_final(crc, calculated_csum);
2025        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
2026                ++fail_cor;
2027
2028        if (fail_cor + fail_gen) {
2029                /*
2030                 * if we find an error in a super block, we just report it.
2031                 * Super blocks get rewritten with the next transaction
2032                 * commit anyway.
2033                 */
2034                spin_lock(&sctx->stat_lock);
2035                ++sctx->stat.super_errors;
2036                spin_unlock(&sctx->stat_lock);
2037                if (fail_cor)
2038                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2039                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
2040                else
2041                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
2042                                BTRFS_DEV_STAT_GENERATION_ERRS);
2043        }
2044
2045        return fail_cor + fail_gen;
2046}
2047
2048static void scrub_block_get(struct scrub_block *sblock)
2049{
2050        atomic_inc(&sblock->refs);
2051}
2052
2053static void scrub_block_put(struct scrub_block *sblock)
2054{
2055        if (atomic_dec_and_test(&sblock->refs)) {
2056                int i;
2057
2058                if (sblock->sparity)
2059                        scrub_parity_put(sblock->sparity);
2060
2061                for (i = 0; i < sblock->page_count; i++)
2062                        scrub_page_put(sblock->pagev[i]);
2063                kfree(sblock);
2064        }
2065}
2066
2067static void scrub_page_get(struct scrub_page *spage)
2068{
2069        atomic_inc(&spage->refs);
2070}
2071
2072static void scrub_page_put(struct scrub_page *spage)
2073{
2074        if (atomic_dec_and_test(&spage->refs)) {
2075                if (spage->page)
2076                        __free_page(spage->page);
2077                kfree(spage);
2078        }
2079}
2080
2081static void scrub_submit(struct scrub_ctx *sctx)
2082{
2083        struct scrub_bio *sbio;
2084
2085        if (sctx->curr == -1)
2086                return;
2087
2088        sbio = sctx->bios[sctx->curr];
2089        sctx->curr = -1;
2090        scrub_pending_bio_inc(sctx);
2091
2092        if (!sbio->bio->bi_bdev) {
2093                 /*
2094                  * This case should not happen. If btrfs_map_block() is
2095                  * wrong, it could happen for dev-replace operations on
2096                  * missing devices when no mirrors are available, but in
2097                  * that case the mount should already have failed.
2098                  * The case is still handled correctly here (but _very_ slowly).
2099                  */
2100                printk_ratelimited(KERN_WARNING
2101                        "BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
2102                bio_endio(sbio->bio, -EIO);
2103        } else {
2104                btrfsic_submit_bio(READ, sbio->bio);
2105        }
2106}
2107
2108static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
2109                                    struct scrub_page *spage)
2110{
2111        struct scrub_block *sblock = spage->sblock;
2112        struct scrub_bio *sbio;
2113        int ret;
2114
2115again:
2116        /*
2117         * grab a fresh bio or wait for one to become available
2118         */
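            /*
             * The sctx->bios[] array forms a free list linked through
             * next_free, with first_free as its head; curr is the bio that is
             * currently being filled. When no bio is free, sleep on list_wait
             * until a completed bio is returned to the list.
             */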
2119        while (sctx->curr == -1) {
2120                spin_lock(&sctx->list_lock);
2121                sctx->curr = sctx->first_free;
2122                if (sctx->curr != -1) {
2123                        sctx->first_free = sctx->bios[sctx->curr]->next_free;
2124                        sctx->bios[sctx->curr]->next_free = -1;
2125                        sctx->bios[sctx->curr]->page_count = 0;
2126                        spin_unlock(&sctx->list_lock);
2127                } else {
2128                        spin_unlock(&sctx->list_lock);
2129                        wait_event(sctx->list_wait, sctx->first_free != -1);
2130                }
2131        }
2132        sbio = sctx->bios[sctx->curr];
2133        if (sbio->page_count == 0) {
2134                struct bio *bio;
2135
2136                sbio->physical = spage->physical;
2137                sbio->logical = spage->logical;
2138                sbio->dev = spage->dev;
2139                bio = sbio->bio;
2140                if (!bio) {
2141                        bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
2142                        if (!bio)
2143                                return -ENOMEM;
2144                        sbio->bio = bio;
2145                }
2146
2147                bio->bi_private = sbio;
2148                bio->bi_end_io = scrub_bio_end_io;
2149                bio->bi_bdev = sbio->dev->bdev;
2150                bio->bi_iter.bi_sector = sbio->physical >> 9;
2151                sbio->err = 0;
2152        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
2153                   spage->physical ||
2154                   sbio->logical + sbio->page_count * PAGE_SIZE !=
2155                   spage->logical ||
2156                   sbio->dev != spage->dev) {
2157                scrub_submit(sctx);
2158                goto again;
2159        }
2160
2161        sbio->pagev[sbio->page_count] = spage;
2162        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
2163        if (ret != PAGE_SIZE) {
2164                if (sbio->page_count < 1) {
2165                        bio_put(sbio->bio);
2166                        sbio->bio = NULL;
2167                        return -EIO;
2168                }
2169                scrub_submit(sctx);
2170                goto again;
2171        }
2172
2173        scrub_block_get(sblock); /* one for the page added to the bio */
2174        atomic_inc(&sblock->outstanding_pages);
2175        sbio->page_count++;
2176        if (sbio->page_count == sctx->pages_per_rd_bio)
2177                scrub_submit(sctx);
2178
2179        return 0;
2180}
2181
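    /*
     * Split the range [logical, logical + len) into PAGE_SIZE pieces, wrap
     * them in a newly allocated scrub_block and queue every page into the read
     * bio. The csum, if provided, is stored in each page of the block. With
     * @force set, the bio is submitted right away instead of waiting until it
     * is full.
     */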
2182static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2183                       u64 physical, struct btrfs_device *dev, u64 flags,
2184                       u64 gen, int mirror_num, u8 *csum, int force,
2185                       u64 physical_for_dev_replace)
2186{
2187        struct scrub_block *sblock;
2188        int index;
2189
2190        sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2191        if (!sblock) {
2192                spin_lock(&sctx->stat_lock);
2193                sctx->stat.malloc_errors++;
2194                spin_unlock(&sctx->stat_lock);
2195                return -ENOMEM;
2196        }
2197
2198        /* one ref inside this function, plus one for each page added to
2199         * a bio later on */
2200        atomic_set(&sblock->refs, 1);
2201        sblock->sctx = sctx;
2202        sblock->no_io_error_seen = 1;
2203
2204        for (index = 0; len > 0; index++) {
2205                struct scrub_page *spage;
2206                u64 l = min_t(u64, len, PAGE_SIZE);
2207
2208                spage = kzalloc(sizeof(*spage), GFP_NOFS);
2209                if (!spage) {
2210leave_nomem:
2211                        spin_lock(&sctx->stat_lock);
2212                        sctx->stat.malloc_errors++;
2213                        spin_unlock(&sctx->stat_lock);
2214                        scrub_block_put(sblock);
2215                        return -ENOMEM;
2216                }
2217                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2218                scrub_page_get(spage);
2219                sblock->pagev[index] = spage;
2220                spage->sblock = sblock;
2221                spage->dev = dev;
2222                spage->flags = flags;
2223                spage->generation = gen;
2224                spage->logical = logical;
2225                spage->physical = physical;
2226                spage->physical_for_dev_replace = physical_for_dev_replace;
2227                spage->mirror_num = mirror_num;
2228                if (csum) {
2229                        spage->have_csum = 1;
2230                        memcpy(spage->csum, csum, sctx->csum_size);
2231                } else {
2232                        spage->have_csum = 0;
2233                }
2234                sblock->page_count++;
2235                spage->page = alloc_page(GFP_NOFS);
2236                if (!spage->page)
2237                        goto leave_nomem;
2238                len -= l;
2239                logical += l;
2240                physical += l;
2241                physical_for_dev_replace += l;
2242        }
2243
2244        WARN_ON(sblock->page_count == 0);
2245        for (index = 0; index < sblock->page_count; index++) {
2246                struct scrub_page *spage = sblock->pagev[index];
2247                int ret;
2248
2249                ret = scrub_add_page_to_rd_bio(sctx, spage);
2250                if (ret) {
2251                        scrub_block_put(sblock);
2252                        return ret;
2253                }
2254        }
2255
2256        if (force)
2257                scrub_submit(sctx);
2258
2259        /* last one frees, either here or in bio completion for last page */
2260        scrub_block_put(sblock);
2261        return 0;
2262}
2263
2264static void scrub_bio_end_io(struct bio *bio, int err)
2265{
2266        struct scrub_bio *sbio = bio->bi_private;
2267        struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2268
2269        sbio->err = err;
2270        sbio->bio = bio;
2271
2272        btrfs_queue_work(fs_info->scrub_workers, &sbio->work);
2273}
2274
2275static void scrub_bio_end_io_worker(struct btrfs_work *work)
2276{
2277        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2278        struct scrub_ctx *sctx = sbio->sctx;
2279        int i;
2280
2281        BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2282        if (sbio->err) {
2283                for (i = 0; i < sbio->page_count; i++) {
2284                        struct scrub_page *spage = sbio->pagev[i];
2285
2286                        spage->io_error = 1;
2287                        spage->sblock->no_io_error_seen = 0;
2288                }
2289        }
2290
2291        /* now complete the scrub_block items that have all pages completed */
2292        for (i = 0; i < sbio->page_count; i++) {
2293                struct scrub_page *spage = sbio->pagev[i];
2294                struct scrub_block *sblock = spage->sblock;
2295
2296                if (atomic_dec_and_test(&sblock->outstanding_pages))
2297                        scrub_block_complete(sblock);
2298                scrub_block_put(sblock);
2299        }
2300
2301        bio_put(sbio->bio);
2302        sbio->bio = NULL;
2303        spin_lock(&sctx->list_lock);
2304        sbio->next_free = sctx->first_free;
2305        sctx->first_free = sbio->index;
2306        spin_unlock(&sctx->list_lock);
2307
2308        if (sctx->is_dev_replace &&
2309            atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2310                mutex_lock(&sctx->wr_ctx.wr_lock);
2311                scrub_wr_submit(sctx);
2312                mutex_unlock(&sctx->wr_ctx.wr_lock);
2313        }
2314
2315        scrub_pending_bio_dec(sctx);
2316}
2317
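    /*
     * Mark the sectors covered by [start, start + len) in a per-stripe bitmap.
     * A length of at least a full stripe_len sets every bit; otherwise start
     * is reduced modulo stripe_len, and a range running past the end of the
     * stripe wraps around to sector 0. For example (illustrative values only):
     * with a 64K stripe_len and 4K sectorsize (16 sectors), an 8K range that
     * starts 60K into the stripe marks sectors 15 and 0.
     */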
2318static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2319                                       unsigned long *bitmap,
2320                                       u64 start, u64 len)
2321{
2322        int offset;
2323        int nsectors;
2324        int sectorsize = sparity->sctx->dev_root->sectorsize;
2325
2326        if (len >= sparity->stripe_len) {
2327                bitmap_set(bitmap, 0, sparity->nsectors);
2328                return;
2329        }
2330
2331        start -= sparity->logic_start;
2332        offset = (int)do_div(start, sparity->stripe_len);
2333        offset /= sectorsize;
2334        nsectors = (int)len / sectorsize;
2335
2336        if (offset + nsectors <= sparity->nsectors) {
2337                bitmap_set(bitmap, offset, nsectors);
2338                return;
2339        }
2340
2341        bitmap_set(bitmap, offset, sparity->nsectors - offset);
2342        bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
2343}
2344
2345static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2346                                                   u64 start, u64 len)
2347{
2348        __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2349}
2350
2351static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2352                                                  u64 start, u64 len)
2353{
2354        __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2355}
2356
2357static void scrub_block_complete(struct scrub_block *sblock)
2358{
2359        int corrupted = 0;
2360
2361        if (!sblock->no_io_error_seen) {
2362                corrupted = 1;
2363                scrub_handle_errored_block(sblock);
2364        } else {
2365                 /*
2366                  * If the block has a checksum error, it is written via the
2367                  * repair mechanism in the dev-replace case; otherwise, in
2368                  * the dev-replace case, it is written to the target here.
2369                  */
2370                corrupted = scrub_checksum(sblock);
2371                if (!corrupted && sblock->sctx->is_dev_replace)
2372                        scrub_write_block_to_dev_replace(sblock);
2373        }
2374
2375        if (sblock->sparity && corrupted && !sblock->data_corrected) {
2376                u64 start = sblock->pagev[0]->logical;
2377                u64 end = sblock->pagev[sblock->page_count - 1]->logical +
2378                          PAGE_SIZE;
2379
2380                scrub_parity_mark_sectors_error(sblock->sparity,
2381                                                start, end - start);
2382        }
2383}
2384
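    /*
     * Look up the checksum for the sector at @logical in sctx->csum_list.
     * Sums that end at or before @logical are dropped on the way; if no sum
     * covers @logical, 0 is returned. Otherwise the checksum is copied to
     * @csum, the sum is freed once its last sector has been consumed, and 1
     * is returned.
     */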
2385static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2386                           u8 *csum)
2387{
2388        struct btrfs_ordered_sum *sum = NULL;
2389        unsigned long index;
2390        unsigned long num_sectors;
2391
2392        while (!list_empty(&sctx->csum_list)) {
2393                sum = list_first_entry(&sctx->csum_list,
2394                                       struct btrfs_ordered_sum, list);
2395                if (sum->bytenr > logical)
2396                        return 0;
2397                if (sum->bytenr + sum->len > logical)
2398                        break;
2399
2400                ++sctx->stat.csum_discards;
2401                list_del(&sum->list);
2402                kfree(sum);
2403                sum = NULL;
2404        }
2405        if (!sum)
2406                return 0;
2407
2408        index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
2409        num_sectors = sum->len / sctx->sectorsize;
2410        memcpy(csum, sum->sums + index, sctx->csum_size);
2411        if (index == num_sectors - 1) {
2412                list_del(&sum->list);
2413                kfree(sum);
2414        }
2415        return 1;
2416}
2417
2418/* scrub_extent() tries to collect up to 64 kB for each bio */
2419static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2420                        u64 physical, struct btrfs_device *dev, u64 flags,
2421                        u64 gen, int mirror_num, u64 physical_for_dev_replace)
2422{
2423        int ret;
2424        u8 csum[BTRFS_CSUM_SIZE];
2425        u32 blocksize;
2426
2427        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2428                blocksize = sctx->sectorsize;
2429                spin_lock(&sctx->stat_lock);
2430                sctx->stat.data_extents_scrubbed++;
2431                sctx->stat.data_bytes_scrubbed += len;
2432                spin_unlock(&sctx->stat_lock);
2433        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2434                blocksize = sctx->nodesize;
2435                spin_lock(&sctx->stat_lock);
2436                sctx->stat.tree_extents_scrubbed++;
2437                sctx->stat.tree_bytes_scrubbed += len;
2438                spin_unlock(&sctx->stat_lock);
2439        } else {
2440                blocksize = sctx->sectorsize;
2441                WARN_ON(1);
2442        }
2443
2444        while (len) {
2445                u64 l = min_t(u64, len, blocksize);
2446                int have_csum = 0;
2447
2448                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2449                        /* push csums to sbio */
2450                        have_csum = scrub_find_csum(sctx, logical, l, csum);
2451                        if (have_csum == 0)
2452                                ++sctx->stat.no_csum;
2453                        if (sctx->is_dev_replace && !have_csum) {
2454                                ret = copy_nocow_pages(sctx, logical, l,
2455                                                       mirror_num,
2456                                                      physical_for_dev_replace);
2457                                goto behind_scrub_pages;
2458                        }
2459                }
2460                ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2461                                  mirror_num, have_csum ? csum : NULL, 0,
2462                                  physical_for_dev_replace);
2463behind_scrub_pages:
2464                if (ret)
2465                        return ret;
2466                len -= l;
2467                logical += l;
2468                physical += l;
2469                physical_for_dev_replace += l;
2470        }
2471        return 0;
2472}
2473
2474static int scrub_pages_for_parity(struct scrub_parity *sparity,
2475                                  u64 logical, u64 len,
2476                                  u64 physical, struct btrfs_device *dev,
2477                                  u64 flags, u64 gen, int mirror_num, u8 *csum)
2478{
2479        struct scrub_ctx *sctx = sparity->sctx;
2480        struct scrub_block *sblock;
2481        int index;
2482
2483        sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
2484        if (!sblock) {
2485                spin_lock(&sctx->stat_lock);
2486                sctx->stat.malloc_errors++;
2487                spin_unlock(&sctx->stat_lock);
2488                return -ENOMEM;
2489        }
2490
2491        /* one ref inside this function, plus one for each page added to
2492         * a bio later on */
2493        atomic_set(&sblock->refs, 1);
2494        sblock->sctx = sctx;
2495        sblock->no_io_error_seen = 1;
2496        sblock->sparity = sparity;
2497        scrub_parity_get(sparity);
2498
2499        for (index = 0; len > 0; index++) {
2500                struct scrub_page *spage;
2501                u64 l = min_t(u64, len, PAGE_SIZE);
2502
2503                spage = kzalloc(sizeof(*spage), GFP_NOFS);
2504                if (!spage) {
2505leave_nomem:
2506                        spin_lock(&sctx->stat_lock);
2507                        sctx->stat.malloc_errors++;
2508                        spin_unlock(&sctx->stat_lock);
2509                        scrub_block_put(sblock);
2510                        return -ENOMEM;
2511                }
2512                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2513                /* one page ref for the scrub block */
2514                scrub_page_get(spage);
2515                sblock->pagev[index] = spage;
2516                /* one page ref for the scrub parity's page list */
2517                scrub_page_get(spage);
2518                list_add_tail(&spage->list, &sparity->spages);
2519                spage->sblock = sblock;
2520                spage->dev = dev;
2521                spage->flags = flags;
2522                spage->generation = gen;
2523                spage->logical = logical;
2524                spage->physical = physical;
2525                spage->mirror_num = mirror_num;
2526                if (csum) {
2527                        spage->have_csum = 1;
2528                        memcpy(spage->csum, csum, sctx->csum_size);
2529                } else {
2530                        spage->have_csum = 0;
2531                }
2532                sblock->page_count++;
2533                spage->page = alloc_page(GFP_NOFS);
2534                if (!spage->page)
2535                        goto leave_nomem;
2536                len -= l;
2537                logical += l;
2538                physical += l;
2539        }
2540
2541        WARN_ON(sblock->page_count == 0);
2542        for (index = 0; index < sblock->page_count; index++) {
2543                struct scrub_page *spage = sblock->pagev[index];
2544                int ret;
2545
2546                ret = scrub_add_page_to_rd_bio(sctx, spage);
2547                if (ret) {
2548                        scrub_block_put(sblock);
2549                        return ret;
2550                }
2551        }
2552
2553        /* last one frees, either here or in bio completion for last page */
2554        scrub_block_put(sblock);
2555        return 0;
2556}
2557
2558static int scrub_extent_for_parity(struct scrub_parity *sparity,
2559                                   u64 logical, u64 len,
2560                                   u64 physical, struct btrfs_device *dev,
2561                                   u64 flags, u64 gen, int mirror_num)
2562{
2563        struct scrub_ctx *sctx = sparity->sctx;
2564        int ret;
2565        u8 csum[BTRFS_CSUM_SIZE];
2566        u32 blocksize;
2567
2568        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2569                blocksize = sctx->sectorsize;
2570        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2571                blocksize = sctx->nodesize;
2572        } else {
2573                blocksize = sctx->sectorsize;
2574                WARN_ON(1);
2575        }
2576
2577        while (len) {
2578                u64 l = min_t(u64, len, blocksize);
2579                int have_csum = 0;
2580
2581                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2582                        /* push csums to sbio */
2583                        have_csum = scrub_find_csum(sctx, logical, l, csum);
2584                        if (have_csum == 0)
2585                                goto skip;
2586                }
2587                ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2588                                             flags, gen, mirror_num,
2589                                             have_csum ? csum : NULL);
2590                if (ret)
2591                        return ret;
2592skip:
2593                len -= l;
2594                logical += l;
2595                physical += l;
2596        }
2597        return 0;
2598}
2599
2600/*
2601 * Given a physical address, this will calculate its
2602 * logical offset. If this is a parity stripe, it will return
2603 * the leftmost data stripe's logical offset.
2604 *
2605 * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
2606 */
2607static int get_raid56_logic_offset(u64 physical, int num,
2608                                   struct map_lookup *map, u64 *offset,
2609                                   u64 *stripe_start)
2610{
2611        int i;
2612        int j = 0;
2613        u64 stripe_nr;
2614        u64 last_offset;
2615        int stripe_index;
2616        int rot;
2617
2618        last_offset = (physical - map->stripes[num].physical) *
2619                      nr_data_stripes(map);
2620        if (stripe_start)
2621                *stripe_start = last_offset;
2622
2623        *offset = last_offset;
2624        for (i = 0; i < nr_data_stripes(map); i++) {
2625                *offset = last_offset + i * map->stripe_len;
2626
2627                stripe_nr = *offset;
2628                do_div(stripe_nr, map->stripe_len);
2629                do_div(stripe_nr, nr_data_stripes(map));
2630
2631                /* Work out the disk rotation on this stripe-set */
2632                rot = do_div(stripe_nr, map->num_stripes);
2633                /* calculate which stripe this data is located on */
2634                rot += i;
2635                stripe_index = rot % map->num_stripes;
2636                if (stripe_index == num)
2637                        return 0;
2638                if (stripe_index < num)
2639                        j++;
2640        }
2641        *offset = last_offset + j * map->stripe_len;
2642        return 1;
2643}
2644
2645static void scrub_free_parity(struct scrub_parity *sparity)
2646{
2647        struct scrub_ctx *sctx = sparity->sctx;
2648        struct scrub_page *curr, *next;
2649        int nbits;
2650
2651        nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2652        if (nbits) {
2653                spin_lock(&sctx->stat_lock);
2654                sctx->stat.read_errors += nbits;
2655                sctx->stat.uncorrectable_errors += nbits;
2656                spin_unlock(&sctx->stat_lock);
2657        }
2658
2659        list_for_each_entry_safe(curr, next, &sparity->spages, list) {
2660                list_del_init(&curr->list);
2661                scrub_page_put(curr);
2662        }
2663
2664        kfree(sparity);
2665}
2666
2667static void scrub_parity_bio_endio(struct bio *bio, int error)
2668{
2669        struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private;
2670        struct scrub_ctx *sctx = sparity->sctx;
2671
2672        if (error)
2673                bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2674                          sparity->nsectors);
2675
2676        scrub_free_parity(sparity);
2677        scrub_pending_bio_dec(sctx);
2678        bio_put(bio);
2679}
2680
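    /*
     * Kick off the actual parity check/repair for this stripe: build a scrub
     * rbio over the sectors still set in dbitmap (minus those already marked
     * bad in ebitmap) and hand it to the RAID5/6 layer. If setting this up
     * fails, all remaining sectors are marked as errors and a malloc error is
     * accounted.
     */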
2681static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2682{
2683        struct scrub_ctx *sctx = sparity->sctx;
2684        struct bio *bio;
2685        struct btrfs_raid_bio *rbio;
2686        struct scrub_page *spage;
2687        struct btrfs_bio *bbio = NULL;
2688        u64 length;
2689        int ret;
2690
2691        if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2692                           sparity->nsectors))
2693                goto out;
2694
2695        length = sparity->logic_end - sparity->logic_start + 1;
2696        ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2697                               sparity->logic_start,
2698                               &length, &bbio, 0, 1);
2699        if (ret || !bbio || !bbio->raid_map)
2700                goto bbio_out;
2701
2702        bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
2703        if (!bio)
2704                goto bbio_out;
2705
2706        bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2707        bio->bi_private = sparity;
2708        bio->bi_end_io = scrub_parity_bio_endio;
2709
2710        rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2711                                              length, sparity->scrub_dev,
2712                                              sparity->dbitmap,
2713                                              sparity->nsectors);
2714        if (!rbio)
2715                goto rbio_out;
2716
2717        list_for_each_entry(spage, &sparity->spages, list)
2718                raid56_parity_add_scrub_pages(rbio, spage->page,
2719                                              spage->logical);
2720
2721        scrub_pending_bio_inc(sctx);
2722        raid56_parity_submit_scrub_rbio(rbio);
2723        return;
2724
2725rbio_out:
2726        bio_put(bio);
2727bbio_out:
2728        btrfs_put_bbio(bbio);
2729        bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2730                  sparity->nsectors);
2731        spin_lock(&sctx->stat_lock);
2732        sctx->stat.malloc_errors++;
2733        spin_unlock(&sctx->stat_lock);
2734out:
2735        scrub_free_parity(sparity);
2736}
2737
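    /*
     * Size (in bytes) of one parity bitmap, rounded up to whole longs so that
     * two of them can be carved out of the tail of the scrub_parity
     * allocation. E.g. on a 64-bit machine, 16 sectors need
     * DIV_ROUND_UP(16, 64) = 1 long, i.e. 8 bytes per bitmap.
     */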
2738static inline int scrub_calc_parity_bitmap_len(int nsectors)
2739{
2740        return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
2741}
2742
2743static void scrub_parity_get(struct scrub_parity *sparity)
2744{
2745        atomic_inc(&sparity->refs);
2746}
2747
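/*
 * Drop one reference on the scrub_parity; the final put triggers the
 * actual parity check and repair of the full stripe via
 * scrub_parity_check_and_repair().
 */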
2748static void scrub_parity_put(struct scrub_parity *sparity)
2749{
2750        if (!atomic_dec_and_test(&sparity->refs))
2751                return;
2752
2753        scrub_parity_check_and_repair(sparity);
2754}
2755
2756static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2757                                                  struct map_lookup *map,
2758                                                  struct btrfs_device *sdev,
2759                                                  struct btrfs_path *path,
2760                                                  u64 logic_start,
2761                                                  u64 logic_end)
2762{
2763        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2764        struct btrfs_root *root = fs_info->extent_root;
2765        struct btrfs_root *csum_root = fs_info->csum_root;
2766        struct btrfs_extent_item *extent;
2767        u64 flags;
2768        int ret;
2769        int slot;
2770        struct extent_buffer *l;
2771        struct btrfs_key key;
2772        u64 generation;
2773        u64 extent_logical;
2774        u64 extent_physical;
2775        u64 extent_len;
2776        struct btrfs_device *extent_dev;
2777        struct scrub_parity *sparity;
2778        int nsectors;
2779        int bitmap_len;
2780        int extent_mirror_num;
2781        int stop_loop = 0;
2782
2783        nsectors = map->stripe_len / root->sectorsize;
2784        bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
2785        sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
2786                          GFP_NOFS);
2787        if (!sparity) {
2788                spin_lock(&sctx->stat_lock);
2789                sctx->stat.malloc_errors++;
2790                spin_unlock(&sctx->stat_lock);
2791                return -ENOMEM;
2792        }
2793
2794        sparity->stripe_len = map->stripe_len;
2795        sparity->nsectors = nsectors;
2796        sparity->sctx = sctx;
2797        sparity->scrub_dev = sdev;
2798        sparity->logic_start = logic_start;
2799        sparity->logic_end = logic_end;
2800        atomic_set(&sparity->refs, 1);
2801        INIT_LIST_HEAD(&sparity->spages);
2802        sparity->dbitmap = sparity->bitmap;
2803        sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
2804
2805        ret = 0;
2806        while (logic_start < logic_end) {
2807                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2808                        key.type = BTRFS_METADATA_ITEM_KEY;
2809                else
2810                        key.type = BTRFS_EXTENT_ITEM_KEY;
2811                key.objectid = logic_start;
2812                key.offset = (u64)-1;
2813
2814                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2815                if (ret < 0)
2816                        goto out;
2817
2818                if (ret > 0) {
2819                        ret = btrfs_previous_extent_item(root, path, 0);
2820                        if (ret < 0)
2821                                goto out;
2822                        if (ret > 0) {
2823                                btrfs_release_path(path);
2824                                ret = btrfs_search_slot(NULL, root, &key,
2825                                                        path, 0, 0);
2826                                if (ret < 0)
2827                                        goto out;
2828                        }
2829                }
2830
2831                stop_loop = 0;
2832                while (1) {
2833                        u64 bytes;
2834
2835                        l = path->nodes[0];
2836                        slot = path->slots[0];
2837                        if (slot >= btrfs_header_nritems(l)) {
2838                                ret = btrfs_next_leaf(root, path);
2839                                if (ret == 0)
2840                                        continue;
2841                                if (ret < 0)
2842                                        goto out;
2843
2844                                stop_loop = 1;
2845                                break;
2846                        }
2847                        btrfs_item_key_to_cpu(l, &key, slot);
2848
2849                        if (key.type == BTRFS_METADATA_ITEM_KEY)
2850                                bytes = root->nodesize;
2851                        else
2852                                bytes = key.offset;
2853
2854                        if (key.objectid + bytes <= logic_start)
2855                                goto next;
2856
2857                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
2858                            key.type != BTRFS_METADATA_ITEM_KEY)
2859                                goto next;
2860
2861                        if (key.objectid > logic_end) {
2862                                stop_loop = 1;
2863                                break;
2864                        }
2865
2866                        while (key.objectid >= logic_start + map->stripe_len)
2867                                logic_start += map->stripe_len;
2868
2869                        extent = btrfs_item_ptr(l, slot,
2870                                                struct btrfs_extent_item);
2871                        flags = btrfs_extent_flags(l, extent);
2872                        generation = btrfs_extent_generation(l, extent);
2873
2874                        if (key.objectid < logic_start &&
2875                            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2876                                btrfs_err(fs_info,
2877                                          "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
2878                                           key.objectid, logic_start);
2879                                goto next;
2880                        }
2881again:
2882                        extent_logical = key.objectid;
2883                        extent_len = bytes;
2884
2885                        if (extent_logical < logic_start) {
2886                                extent_len -= logic_start - extent_logical;
2887                                extent_logical = logic_start;
2888                        }
2889
2890                        if (extent_logical + extent_len >
2891                            logic_start + map->stripe_len)
2892                                extent_len = logic_start + map->stripe_len -
2893                                             extent_logical;
2894
2895                        scrub_parity_mark_sectors_data(sparity, extent_logical,
2896                                                       extent_len);
2897
2898                        scrub_remap_extent(fs_info, extent_logical,
2899                                           extent_len, &extent_physical,
2900                                           &extent_dev,
2901                                           &extent_mirror_num);
2902
2903                        ret = btrfs_lookup_csums_range(csum_root,
2904                                                extent_logical,
2905                                                extent_logical + extent_len - 1,
2906                                                &sctx->csum_list, 1);
2907                        if (ret)
2908                                goto out;
2909
2910                        ret = scrub_extent_for_parity(sparity, extent_logical,
2911                                                      extent_len,
2912                                                      extent_physical,
2913                                                      extent_dev, flags,
2914                                                      generation,
2915                                                      extent_mirror_num);
2916                        if (ret)
2917                                goto out;
2918
2919                        scrub_free_csums(sctx);
2920                        if (extent_logical + extent_len <
2921                            key.objectid + bytes) {
2922                                logic_start += map->stripe_len;
2923
2924                                if (logic_start >= logic_end) {
2925                                        stop_loop = 1;
2926                                        break;
2927                                }
2928
2929                                if (logic_start < key.objectid + bytes) {
2930                                        cond_resched();
2931                                        goto again;
2932                                }
2933                        }
2934next:
2935                        path->slots[0]++;
2936                }
2937
2938                btrfs_release_path(path);
2939
2940                if (stop_loop)
2941                        break;
2942
2943                logic_start += map->stripe_len;
2944        }
2945out:
2946        if (ret < 0)
2947                scrub_parity_mark_sectors_error(sparity, logic_start,
2948                                                logic_end - logic_start + 1);
2949        scrub_parity_put(sparity);
2950        scrub_submit(sctx);
2951        mutex_lock(&sctx->wr_ctx.wr_lock);
2952        scrub_wr_submit(sctx);
2953        mutex_unlock(&sctx->wr_ctx.wr_lock);
2954
2955        btrfs_release_path(path);
2956        return ret < 0 ? ret : 0;
2957}
2958
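/*
 * Scrub a single device stripe of a chunk: walk all extent items that
 * intersect the stripe, look up their data checksums and feed them to
 * scrub_extent().  For RAID5/6, ranges that map to parity are not scrubbed
 * here but handed to scrub_raid56_parity() as whole full stripes.
 */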
2959static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2960                                           struct map_lookup *map,
2961                                           struct btrfs_device *scrub_dev,
2962                                           int num, u64 base, u64 length,
2963                                           int is_dev_replace)
2964{
2965        struct btrfs_path *path, *ppath;
2966        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2967        struct btrfs_root *root = fs_info->extent_root;
2968        struct btrfs_root *csum_root = fs_info->csum_root;
2969        struct btrfs_extent_item *extent;
2970        struct blk_plug plug;
2971        u64 flags;
2972        int ret;
2973        int slot;
2974        u64 nstripes;
2975        struct extent_buffer *l;
2976        struct btrfs_key key;
2977        u64 physical;
2978        u64 logical;
2979        u64 logic_end;
2980        u64 physical_end;
2981        u64 generation;
2982        int mirror_num;
2983        struct reada_control *reada1;
2984        struct reada_control *reada2;
2985        struct btrfs_key key_start;
2986        struct btrfs_key key_end;
2987        u64 increment = map->stripe_len;
2988        u64 offset;
2989        u64 extent_logical;
2990        u64 extent_physical;
2991        u64 extent_len;
2992        u64 stripe_logical;
2993        u64 stripe_end;
2994        struct btrfs_device *extent_dev;
2995        int extent_mirror_num;
2996        int stop_loop = 0;
2997
2998        nstripes = length;
2999        physical = map->stripes[num].physical;
3000        offset = 0;
3001        do_div(nstripes, map->stripe_len);
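        /*
         * Work out, per RAID profile, at which logical offset this device
         * stripe starts, by how much the logical address advances per
         * device stripe (increment) and which mirror we are.  E.g. for
         * RAID10 with num_stripes = 4 and sub_stripes = 2, stripe num = 3
         * gives offset = stripe_len, increment = 2 * stripe_len and
         * mirror_num = 2.  For RAID5/6 the increment only covers the data
         * stripes, e.g. 2 * stripe_len on a 3-device RAID5.
         */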
3002        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
3003                offset = map->stripe_len * num;
3004                increment = map->stripe_len * map->num_stripes;
3005                mirror_num = 1;
3006        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3007                int factor = map->num_stripes / map->sub_stripes;
3008                offset = map->stripe_len * (num / map->sub_stripes);
3009                increment = map->stripe_len * factor;
3010                mirror_num = num % map->sub_stripes + 1;
3011        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
3012                increment = map->stripe_len;
3013                mirror_num = num % map->num_stripes + 1;
3014        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3015                increment = map->stripe_len;
3016                mirror_num = num % map->num_stripes + 1;
3017        } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3018                get_raid56_logic_offset(physical, num, map, &offset, NULL);
3019                increment = map->stripe_len * nr_data_stripes(map);
3020                mirror_num = 1;
3021        } else {
3022                increment = map->stripe_len;
3023                mirror_num = 1;
3024        }
3025
3026        path = btrfs_alloc_path();
3027        if (!path)
3028                return -ENOMEM;
3029
3030        ppath = btrfs_alloc_path();
3031        if (!ppath) {
3032                btrfs_free_path(path);
3033                return -ENOMEM;
3034        }
3035
3036        /*
3037         * work on the commit root. The related disk blocks are static as
3038         * long as COW is applied. This means it is safe to rewrite
3039         * them to repair disk errors without any race conditions.
3040         */
3041        path->search_commit_root = 1;
3042        path->skip_locking = 1;
3043
3044        ppath->search_commit_root = 1;
3045        ppath->skip_locking = 1;
3046        /*
3047         * trigger the readahead for the extent tree and csum tree and wait
3048         * for completion. During readahead, the scrub is officially paused
3049         * so as not to hold off transaction commits.
3050         */
3051        logical = base + offset;
3052        physical_end = physical + nstripes * map->stripe_len;
3053        if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3054                get_raid56_logic_offset(physical_end, num,
3055                                        map, &logic_end, NULL);
3056                logic_end += base;
3057        } else {
3058                logic_end = logical + increment * nstripes;
3059        }
3060        wait_event(sctx->list_wait,
3061                   atomic_read(&sctx->bios_in_flight) == 0);
3062        scrub_blocked_if_needed(fs_info);
3063
3064        /* FIXME it might be better to start readahead at commit root */
3065        key_start.objectid = logical;
3066        key_start.type = BTRFS_EXTENT_ITEM_KEY;
3067        key_start.offset = (u64)0;
3068        key_end.objectid = logic_end;
3069        key_end.type = BTRFS_METADATA_ITEM_KEY;
3070        key_end.offset = (u64)-1;
3071        reada1 = btrfs_reada_add(root, &key_start, &key_end);
3072
3073        key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3074        key_start.type = BTRFS_EXTENT_CSUM_KEY;
3075        key_start.offset = logical;
3076        key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
3077        key_end.type = BTRFS_EXTENT_CSUM_KEY;
3078        key_end.offset = logic_end;
3079        reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
3080
3081        if (!IS_ERR(reada1))
3082                btrfs_reada_wait(reada1);
3083        if (!IS_ERR(reada2))
3084                btrfs_reada_wait(reada2);
3085
3086
3087        /*
3088         * collect all data csums for the stripe to avoid seeking during
3089         * the scrub. This might currently (crc32c) end up being about 1MB
3090         */
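        /*
         * With crc32c that is 4 bytes of csum per sectorsize bytes of
         * data, i.e. roughly 1MB of checksums per 1GB of 4K-sector data.
         */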
3091        blk_start_plug(&plug);
3092
3093        /*
3094         * now find all extents for each stripe and scrub them
3095         */
3096        ret = 0;
3097        while (physical < physical_end) {
3098                /* for raid56, we skip parity stripe */
3099                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3100                        ret = get_raid56_logic_offset(physical, num,
3101                                        map, &logical, &stripe_logical);
3102                        logical += base;
3103                        if (ret) {
3104                                stripe_logical += base;
3105                                stripe_end = stripe_logical + increment - 1;
3106                                ret = scrub_raid56_parity(sctx, map, scrub_dev,
3107                                                ppath, stripe_logical,
3108                                                stripe_end);
3109                                if (ret)
3110                                        goto out;
3111                                goto skip;
3112                        }
3113                }
3114                /*
3115                 * canceled?
3116                 */
3117                if (atomic_read(&fs_info->scrub_cancel_req) ||
3118                    atomic_read(&sctx->cancel_req)) {
3119                        ret = -ECANCELED;
3120                        goto out;
3121                }
3122                /*
3123                 * check to see if we have to pause
3124                 */
3125                if (atomic_read(&fs_info->scrub_pause_req)) {
3126                        /* push queued extents */
3127                        atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3128                        scrub_submit(sctx);
3129                        mutex_lock(&sctx->wr_ctx.wr_lock);
3130                        scrub_wr_submit(sctx);
3131                        mutex_unlock(&sctx->wr_ctx.wr_lock);
3132                        wait_event(sctx->list_wait,
3133                                   atomic_read(&sctx->bios_in_flight) == 0);
3134                        atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3135                        scrub_blocked_if_needed(fs_info);
3136                }
3137
3138                if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
3139                        key.type = BTRFS_METADATA_ITEM_KEY;
3140                else
3141                        key.type = BTRFS_EXTENT_ITEM_KEY;
3142                key.objectid = logical;
3143                key.offset = (u64)-1;
3144
3145                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3146                if (ret < 0)
3147                        goto out;
3148
3149                if (ret > 0) {
3150                        ret = btrfs_previous_extent_item(root, path, 0);
3151                        if (ret < 0)
3152                                goto out;
3153                        if (ret > 0) {
3154                                /* there's no smaller item, so stick with the
3155                                 * larger one */
3156                                btrfs_release_path(path);
3157                                ret = btrfs_search_slot(NULL, root, &key,
3158                                                        path, 0, 0);
3159                                if (ret < 0)
3160                                        goto out;
3161                        }
3162                }
3163
3164                stop_loop = 0;
3165                while (1) {
3166                        u64 bytes;
3167
3168                        l = path->nodes[0];
3169                        slot = path->slots[0];
3170                        if (slot >= btrfs_header_nritems(l)) {
3171                                ret = btrfs_next_leaf(root, path);
3172                                if (ret == 0)
3173                                        continue;
3174                                if (ret < 0)
3175                                        goto out;
3176
3177                                stop_loop = 1;
3178                                break;
3179                        }
3180                        btrfs_item_key_to_cpu(l, &key, slot);
3181
3182                        if (key.type == BTRFS_METADATA_ITEM_KEY)
3183                                bytes = root->nodesize;
3184                        else
3185                                bytes = key.offset;
3186
3187                        if (key.objectid + bytes <= logical)
3188                                goto next;
3189
3190                        if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3191                            key.type != BTRFS_METADATA_ITEM_KEY)
3192                                goto next;
3193
3194                        if (key.objectid >= logical + map->stripe_len) {
3195                                /* out of this device extent */
3196                                if (key.objectid >= logic_end)
3197                                        stop_loop = 1;
3198                                break;
3199                        }
3200
3201                        extent = btrfs_item_ptr(l, slot,
3202                                                struct btrfs_extent_item);
3203                        flags = btrfs_extent_flags(l, extent);
3204                        generation = btrfs_extent_generation(l, extent);
3205
3206                        if (key.objectid < logical &&
3207                            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
3208                                btrfs_err(fs_info,
3209                                           "scrub: tree block %llu spanning "
3210                                           "stripes, ignored. logical=%llu",
3211                                       key.objectid, logical);
3212                                goto next;
3213                        }
3214
3215again:
3216                        extent_logical = key.objectid;
3217                        extent_len = bytes;
3218
3219                        /*
3220                         * trim extent to this stripe
3221                         */
3222                        if (extent_logical < logical) {
3223                                extent_len -= logical - extent_logical;
3224                                extent_logical = logical;
3225                        }
3226                        if (extent_logical + extent_len >
3227                            logical + map->stripe_len) {
3228                                extent_len = logical + map->stripe_len -
3229                                             extent_logical;
3230                        }
3231
3232                        extent_physical = extent_logical - logical + physical;
3233                        extent_dev = scrub_dev;
3234                        extent_mirror_num = mirror_num;
3235                        if (is_dev_replace)
3236                                scrub_remap_extent(fs_info, extent_logical,
3237                                                   extent_len, &extent_physical,
3238                                                   &extent_dev,
3239                                                   &extent_mirror_num);
3240
3241                        ret = btrfs_lookup_csums_range(csum_root, logical,
3242                                                logical + map->stripe_len - 1,
3243                                                &sctx->csum_list, 1);
3244                        if (ret)
3245                                goto out;
3246
3247                        ret = scrub_extent(sctx, extent_logical, extent_len,
3248                                           extent_physical, extent_dev, flags,
3249                                           generation, extent_mirror_num,
3250                                           extent_logical - logical + physical);
3251                        if (ret)
3252                                goto out;
3253
3254                        scrub_free_csums(sctx);
3255                        if (extent_logical + extent_len <
3256                            key.objectid + bytes) {
3257                                if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3258                                        /*
3259                                         * loop until we find the next data stripe
3260                                         * or we have finished all stripes.
3261                                         */
3262loop:
3263                                        physical += map->stripe_len;
3264                                        ret = get_raid56_logic_offset(physical,
3265                                                        num, map, &logical,
3266                                                        &stripe_logical);
3267                                        logical += base;
3268
3269                                        if (ret && physical < physical_end) {
3270                                                stripe_logical += base;
3271                                                stripe_end = stripe_logical +
3272                                                                increment - 1;
3273                                                ret = scrub_raid56_parity(sctx,
3274                                                        map, scrub_dev, ppath,
3275                                                        stripe_logical,
3276                                                        stripe_end);
3277                                                if (ret)
3278                                                        goto out;
3279                                                goto loop;
3280                                        }
3281                                } else {
3282                                        physical += map->stripe_len;
3283                                        logical += increment;
3284                                }
3285                                if (logical < key.objectid + bytes) {
3286                                        cond_resched();
3287                                        goto again;
3288                                }
3289
3290                                if (physical >= physical_end) {
3291                                        stop_loop = 1;
3292                                        break;
3293                                }
3294                        }
3295next:
3296                        path->slots[0]++;
3297                }
3298                btrfs_release_path(path);
3299skip:
3300                logical += increment;
3301                physical += map->stripe_len;
3302                spin_lock(&sctx->stat_lock);
3303                if (stop_loop)
3304                        sctx->stat.last_physical = map->stripes[num].physical +
3305                                                   length;
3306                else
3307                        sctx->stat.last_physical = physical;
3308                spin_unlock(&sctx->stat_lock);
3309                if (stop_loop)
3310                        break;
3311        }
3312out:
3313        /* push queued extents */
3314        scrub_submit(sctx);
3315        mutex_lock(&sctx->wr_ctx.wr_lock);
3316        scrub_wr_submit(sctx);
3317        mutex_unlock(&sctx->wr_ctx.wr_lock);
3318
3319        blk_finish_plug(&plug);
3320        btrfs_free_path(path);
3321        btrfs_free_path(ppath);
3322        return ret < 0 ? ret : 0;
3323}
3324
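/*
 * Look up the chunk mapping for the given chunk and scrub the stripe(s)
 * of it that live on scrub_dev at dev_offset.
 */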
3325static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3326                                          struct btrfs_device *scrub_dev,
3327                                          u64 chunk_tree, u64 chunk_objectid,
3328                                          u64 chunk_offset, u64 length,
3329                                          u64 dev_offset, int is_dev_replace)
3330{
3331        struct btrfs_mapping_tree *map_tree =
3332                &sctx->dev_root->fs_info->mapping_tree;
3333        struct map_lookup *map;
3334        struct extent_map *em;
3335        int i;
3336        int ret = 0;
3337
3338        read_lock(&map_tree->map_tree.lock);
3339        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
3340        read_unlock(&map_tree->map_tree.lock);
3341
3342        if (!em)
3343                return -EINVAL;
3344
3345        map = (struct map_lookup *)em->bdev;
3346        if (em->start != chunk_offset)
3347                goto out;
3348
3349        if (em->len < length)
3350                goto out;
3351
3352        for (i = 0; i < map->num_stripes; ++i) {
3353                if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3354                    map->stripes[i].physical == dev_offset) {
3355                        ret = scrub_stripe(sctx, map, scrub_dev, i,
3356                                           chunk_offset, length,
3357                                           is_dev_replace);
3358                        if (ret)
3359                                goto out;
3360                }
3361        }
3362out:
3363        free_extent_map(em);
3364
3365        return ret;
3366}
3367
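/*
 * Walk all DEV_EXTENT items of the scrubbed device in the range
 * [start, end) and scrub the chunk behind each of them.  After each chunk
 * all in-flight read and write bios are drained so that dev-replace can
 * safely advance its cursor.
 */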
3368static noinline_for_stack
3369int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3370                           struct btrfs_device *scrub_dev, u64 start, u64 end,
3371                           int is_dev_replace)
3372{
3373        struct btrfs_dev_extent *dev_extent = NULL;
3374        struct btrfs_path *path;
3375        struct btrfs_root *root = sctx->dev_root;
3376        struct btrfs_fs_info *fs_info = root->fs_info;
3377        u64 length;
3378        u64 chunk_tree;
3379        u64 chunk_objectid;
3380        u64 chunk_offset;
3381        int ret;
3382        int slot;
3383        struct extent_buffer *l;
3384        struct btrfs_key key;
3385        struct btrfs_key found_key;
3386        struct btrfs_block_group_cache *cache;
3387        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3388
3389        path = btrfs_alloc_path();
3390        if (!path)
3391                return -ENOMEM;
3392
3393        path->reada = 2;
3394        path->search_commit_root = 1;
3395        path->skip_locking = 1;
3396
3397        key.objectid = scrub_dev->devid;
3398        key.offset = 0ull;
3399        key.type = BTRFS_DEV_EXTENT_KEY;
3400
3401        while (1) {
3402                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3403                if (ret < 0)
3404                        break;
3405                if (ret > 0) {
3406                        if (path->slots[0] >=
3407                            btrfs_header_nritems(path->nodes[0])) {
3408                                ret = btrfs_next_leaf(root, path);
3409                                if (ret)
3410                                        break;
3411                        }
3412                }
3413
3414                l = path->nodes[0];
3415                slot = path->slots[0];
3416
3417                btrfs_item_key_to_cpu(l, &found_key, slot);
3418
3419                if (found_key.objectid != scrub_dev->devid)
3420                        break;
3421
3422                if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3423                        break;
3424
3425                if (found_key.offset >= end)
3426                        break;
3427
3428                if (found_key.offset < key.offset)
3429                        break;
3430
3431                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3432                length = btrfs_dev_extent_length(l, dev_extent);
3433
3434                if (found_key.offset + length <= start)
3435                        goto skip;
3436
3437                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
3438                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
3439                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3440
3441                /*
3442                 * get a reference on the corresponding block group to prevent
3443                 * the chunk from going away while we scrub it
3444                 */
3445                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3446
3447                /* some chunks are removed but not committed to disk yet,
3448                 * continue scrubbing */
3449                if (!cache)
3450                        goto skip;
3451
3452                dev_replace->cursor_right = found_key.offset + length;
3453                dev_replace->cursor_left = found_key.offset;
3454                dev_replace->item_needs_writeback = 1;
3455                ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
3456                                  chunk_offset, length, found_key.offset,
3457                                  is_dev_replace);
3458
3459                /*
3460                 * flush, submit all pending read and write bios, afterwards
3461                 * wait for them.
3462                 * Note that in the dev replace case, a read request causes
3463                 * write requests that are submitted in the read completion
3464                 * worker. Therefore in the current situation, it is required
3465                 * that all write requests are flushed, so that all read and
3466                 * write requests are really completed when bios_in_flight
3467                 * changes to 0.
3468                 */
3469                atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
3470                scrub_submit(sctx);
3471                mutex_lock(&sctx->wr_ctx.wr_lock);
3472                scrub_wr_submit(sctx);
3473                mutex_unlock(&sctx->wr_ctx.wr_lock);
3474
3475                wait_event(sctx->list_wait,
3476                           atomic_read(&sctx->bios_in_flight) == 0);
3477                atomic_inc(&fs_info->scrubs_paused);
3478                wake_up(&fs_info->scrub_pause_wait);
3479
3480                /*
3481                 * must be called before we decrease @scrubs_paused.
3482                 * make sure we don't block transaction commit while
3483                 * we are waiting for pending workers to finish.
3484                 */
3485                wait_event(sctx->list_wait,
3486                           atomic_read(&sctx->workers_pending) == 0);
3487                atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
3488
3489                mutex_lock(&fs_info->scrub_lock);
3490                __scrub_blocked_if_needed(fs_info);
3491                atomic_dec(&fs_info->scrubs_paused);
3492                mutex_unlock(&fs_info->scrub_lock);
3493                wake_up(&fs_info->scrub_pause_wait);
3494
3495                btrfs_put_block_group(cache);
3496                if (ret)
3497                        break;
3498                if (is_dev_replace &&
3499                    atomic64_read(&dev_replace->num_write_errors) > 0) {
3500                        ret = -EIO;
3501                        break;
3502                }
3503                if (sctx->stat.malloc_errors > 0) {
3504                        ret = -ENOMEM;
3505                        break;
3506                }
3507
3508                dev_replace->cursor_left = dev_replace->cursor_right;
3509                dev_replace->item_needs_writeback = 1;
3510skip:
3511                key.offset = found_key.offset + length;
3512                btrfs_release_path(path);
3513        }
3514
3515        btrfs_free_path(path);
3516
3517        /*
3518         * ret can still be 1 from search_slot or next_leaf,
3519         * that's not an error
3520         */
3521        return ret < 0 ? ret : 0;
3522}
3523
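/*
 * Scrub all superblock mirrors of the device that fit within its committed
 * size.  A seed device that does not belong to this filesystem is checked
 * against its own generation instead of last_trans_committed.
 */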
3524static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3525                                           struct btrfs_device *scrub_dev)
3526{
3527        int     i;
3528        u64     bytenr;
3529        u64     gen;
3530        int     ret;
3531        struct btrfs_root *root = sctx->dev_root;
3532
3533        if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
3534                return -EIO;
3535
3536        /* Seed devices of a new filesystem have their own generation. */
3537        if (scrub_dev->fs_devices != root->fs_info->fs_devices)
3538                gen = scrub_dev->generation;
3539        else
3540                gen = root->fs_info->last_trans_committed;
3541
3542        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3543                bytenr = btrfs_sb_offset(i);
3544                if (bytenr + BTRFS_SUPER_INFO_SIZE >
3545                    scrub_dev->commit_total_bytes)
3546                        break;
3547
3548                ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
3549                                  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
3550                                  NULL, 1, bytenr);
3551                if (ret)
3552                        return ret;
3553        }
3554        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3555
3556        return 0;
3557}
3558
3559/*
3560 * get a reference count on fs_info->scrub_workers. start the workers if necessary
3561 */
3562static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
3563                                                int is_dev_replace)
3564{
3565        int ret = 0;
3566        int flags = WQ_FREEZABLE | WQ_UNBOUND;
3567        int max_active = fs_info->thread_pool_size;
3568
3569        if (fs_info->scrub_workers_refcnt == 0) {
3570                if (is_dev_replace)
3571                        fs_info->scrub_workers =
3572                                btrfs_alloc_workqueue("btrfs-scrub", flags,
3573                                                      1, 4);
3574                else
3575                        fs_info->scrub_workers =
3576                                btrfs_alloc_workqueue("btrfs-scrub", flags,
3577                                                      max_active, 4);
3578                if (!fs_info->scrub_workers) {
3579                        ret = -ENOMEM;
3580                        goto out;
3581                }
3582                fs_info->scrub_wr_completion_workers =
3583                        btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
3584                                              max_active, 2);
3585                if (!fs_info->scrub_wr_completion_workers) {
3586                        ret = -ENOMEM;
3587                        goto out;
3588                }
3589                fs_info->scrub_nocow_workers =
3590                        btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
3591                if (!fs_info->scrub_nocow_workers) {
3592                        ret = -ENOMEM;
3593                        goto out;
3594                }
3595        }
3596        ++fs_info->scrub_workers_refcnt;
3597out:
3598        return ret;
3599}
3600
3601static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
3602{
3603        if (--fs_info->scrub_workers_refcnt == 0) {
3604                btrfs_destroy_workqueue(fs_info->scrub_workers);
3605                btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
3606                btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
3607        }
3608        WARN_ON(fs_info->scrub_workers_refcnt < 0);
3609}
3610
3611int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3612                    u64 end, struct btrfs_scrub_progress *progress,
3613                    int readonly, int is_dev_replace)
3614{
3615        struct scrub_ctx *sctx;
3616        int ret;
3617        struct btrfs_device *dev;
3618        struct rcu_string *name;
3619
3620        if (btrfs_fs_closing(fs_info))
3621                return -EINVAL;
3622
3623        if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
3624                /*
3625                 * in this case scrub is unable to calculate the checksums
3626                 * the way it is implemented. Do not handle this situation
3627                 * at all because it should never happen.
3628                 */
3629                btrfs_err(fs_info,
3630                           "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
3631                       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
3632                return -EINVAL;
3633        }
3634
3635        if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
3636                /* not supported for data w/o checksums */
3637                btrfs_err(fs_info,
3638                           "scrub: size assumption sectorsize != PAGE_SIZE "
3639                           "(%d != %lu) fails",
3640                       fs_info->chunk_root->sectorsize, PAGE_SIZE);
3641                return -EINVAL;
3642        }
3643
3644        if (fs_info->chunk_root->nodesize >
3645            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
3646            fs_info->chunk_root->sectorsize >
3647            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
3648                /*
3649                 * would exhaust the array bounds of the pagev member in
3650                 * struct scrub_block
3651                 */
3652                btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
3653                           "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
3654                       fs_info->chunk_root->nodesize,
3655                       SCRUB_MAX_PAGES_PER_BLOCK,
3656                       fs_info->chunk_root->sectorsize,
3657                       SCRUB_MAX_PAGES_PER_BLOCK);
3658                return -EINVAL;
3659        }
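        /*
         * With 4K pages the checks above limit nodesize and sectorsize to
         * SCRUB_MAX_PAGES_PER_BLOCK * 4K = 64K per scrub block.
         */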
3660
3661
3662        mutex_lock(&fs_info->fs_devices->device_list_mutex);
3663        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3664        if (!dev || (dev->missing && !is_dev_replace)) {
3665                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3666                return -ENODEV;
3667        }
3668
3669        if (!is_dev_replace && !readonly && !dev->writeable) {
3670                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3671                rcu_read_lock();
3672                name = rcu_dereference(dev->name);
3673                btrfs_err(fs_info, "scrub: device %s is not writable",
3674                          name->str);
3675                rcu_read_unlock();
3676                return -EROFS;
3677        }
3678
3679        mutex_lock(&fs_info->scrub_lock);
3680        if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
3681                mutex_unlock(&fs_info->scrub_lock);
3682                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3683                return -EIO;
3684        }
3685
3686        btrfs_dev_replace_lock(&fs_info->dev_replace);
3687        if (dev->scrub_device ||
3688            (!is_dev_replace &&
3689             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
3690                btrfs_dev_replace_unlock(&fs_info->dev_replace);
3691                mutex_unlock(&fs_info->scrub_lock);
3692                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3693                return -EINPROGRESS;
3694        }
3695        btrfs_dev_replace_unlock(&fs_info->dev_replace);
3696
3697        ret = scrub_workers_get(fs_info, is_dev_replace);
3698        if (ret) {
3699                mutex_unlock(&fs_info->scrub_lock);
3700                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3701                return ret;
3702        }
3703
3704        sctx = scrub_setup_ctx(dev, is_dev_replace);
3705        if (IS_ERR(sctx)) {
3706                mutex_unlock(&fs_info->scrub_lock);
3707                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3708                scrub_workers_put(fs_info);
3709                return PTR_ERR(sctx);
3710        }
3711        sctx->readonly = readonly;
3712        dev->scrub_device = sctx;
3713        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3714
3715        /*
3716         * by checking @scrub_pause_req here, we can avoid a race
3717         * between committing a transaction and scrubbing.
3718         */
3719        __scrub_blocked_if_needed(fs_info);
3720        atomic_inc(&fs_info->scrubs_running);
3721        mutex_unlock(&fs_info->scrub_lock);
3722
3723        if (!is_dev_replace) {
3724                /*
3725                 * by holding the device list mutex, we can
3726                 * kick off writing the supers in log tree sync.
3727                 */
3728                mutex_lock(&fs_info->fs_devices->device_list_mutex);
3729                ret = scrub_supers(sctx, dev);
3730                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3731        }
3732
3733        if (!ret)
3734                ret = scrub_enumerate_chunks(sctx, dev, start, end,
3735                                             is_dev_replace);
3736
3737        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3738        atomic_dec(&fs_info->scrubs_running);
3739        wake_up(&fs_info->scrub_pause_wait);
3740
3741        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
3742
3743        if (progress)
3744                memcpy(progress, &sctx->stat, sizeof(*progress));
3745
3746        mutex_lock(&fs_info->scrub_lock);
3747        dev->scrub_device = NULL;
3748        scrub_workers_put(fs_info);
3749        mutex_unlock(&fs_info->scrub_lock);
3750
3751        scrub_put_ctx(sctx);
3752
3753        return ret;
3754}
3755
3756void btrfs_scrub_pause(struct btrfs_root *root)
3757{
3758        struct btrfs_fs_info *fs_info = root->fs_info;
3759
3760        mutex_lock(&fs_info->scrub_lock);
3761        atomic_inc(&fs_info->scrub_pause_req);
3762        while (atomic_read(&fs_info->scrubs_paused) !=
3763               atomic_read(&fs_info->scrubs_running)) {
3764                mutex_unlock(&fs_info->scrub_lock);
3765                wait_event(fs_info->scrub_pause_wait,
3766                           atomic_read(&fs_info->scrubs_paused) ==
3767                           atomic_read(&fs_info->scrubs_running));
3768                mutex_lock(&fs_info->scrub_lock);
3769        }
3770        mutex_unlock(&fs_info->scrub_lock);
3771}
3772
3773void btrfs_scrub_continue(struct btrfs_root *root)
3774{
3775        struct btrfs_fs_info *fs_info = root->fs_info;
3776
3777        atomic_dec(&fs_info->scrub_pause_req);
3778        wake_up(&fs_info->scrub_pause_wait);
3779}
3780
3781int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
3782{
3783        mutex_lock(&fs_info->scrub_lock);
3784        if (!atomic_read(&fs_info->scrubs_running)) {
3785                mutex_unlock(&fs_info->scrub_lock);
3786                return -ENOTCONN;
3787        }
3788
3789        atomic_inc(&fs_info->scrub_cancel_req);
3790        while (atomic_read(&fs_info->scrubs_running)) {
3791                mutex_unlock(&fs_info->scrub_lock);
3792                wait_event(fs_info->scrub_pause_wait,
3793                           atomic_read(&fs_info->scrubs_running) == 0);
3794                mutex_lock(&fs_info->scrub_lock);
3795        }
3796        atomic_dec(&fs_info->scrub_cancel_req);
3797        mutex_unlock(&fs_info->scrub_lock);
3798
3799        return 0;
3800}
3801
3802int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
3803                           struct btrfs_device *dev)
3804{
3805        struct scrub_ctx *sctx;
3806
3807        mutex_lock(&fs_info->scrub_lock);
3808        sctx = dev->scrub_device;
3809        if (!sctx) {
3810                mutex_unlock(&fs_info->scrub_lock);
3811                return -ENOTCONN;
3812        }
3813        atomic_inc(&sctx->cancel_req);
3814        while (dev->scrub_device) {
3815                mutex_unlock(&fs_info->scrub_lock);
3816                wait_event(fs_info->scrub_pause_wait,
3817                           dev->scrub_device == NULL);
3818                mutex_lock(&fs_info->scrub_lock);
3819        }
3820        mutex_unlock(&fs_info->scrub_lock);
3821
3822        return 0;
3823}
3824
3825int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3826                         struct btrfs_scrub_progress *progress)
3827{
3828        struct btrfs_device *dev;
3829        struct scrub_ctx *sctx = NULL;
3830
3831        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3832        dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3833        if (dev)
3834                sctx = dev->scrub_device;
3835        if (sctx)
3836                memcpy(progress, &sctx->stat, sizeof(*progress));
3837        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3838
3839        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3840}
3841
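/*
 * Map a logical extent to the physical location, device and mirror number
 * that btrfs_map_block() returns for a READ of it, i.e. the first stripe
 * of the mapping.
 */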
3842static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3843                               u64 extent_logical, u64 extent_len,
3844                               u64 *extent_physical,
3845                               struct btrfs_device **extent_dev,
3846                               int *extent_mirror_num)
3847{
3848        u64 mapped_length;
3849        struct btrfs_bio *bbio = NULL;
3850        int ret;
3851
3852        mapped_length = extent_len;
3853        ret = btrfs_map_block(fs_info, READ, extent_logical,
3854                              &mapped_length, &bbio, 0);
3855        if (ret || !bbio || mapped_length < extent_len ||
3856            !bbio->stripes[0].dev->bdev) {
3857                btrfs_put_bbio(bbio);
3858                return;
3859        }
3860
3861        *extent_physical = bbio->stripes[0].physical;
3862        *extent_mirror_num = bbio->mirror_num;
3863        *extent_dev = bbio->stripes[0].dev;
3864        btrfs_put_bbio(bbio);
3865}
3866
3867static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3868                              struct scrub_wr_ctx *wr_ctx,
3869                              struct btrfs_fs_info *fs_info,
3870                              struct btrfs_device *dev,
3871                              int is_dev_replace)
3872{
3873        WARN_ON(wr_ctx->wr_curr_bio != NULL);
3874
3875        mutex_init(&wr_ctx->wr_lock);
3876        wr_ctx->wr_curr_bio = NULL;
3877        if (!is_dev_replace)
3878                return 0;
3879
3880        WARN_ON(!dev->bdev);
3881        wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3882                                         bio_get_nr_vecs(dev->bdev));
3883        wr_ctx->tgtdev = dev;
3884        atomic_set(&wr_ctx->flush_all_writes, 0);
3885        return 0;
3886}
3887
3888static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3889{
3890        mutex_lock(&wr_ctx->wr_lock);
3891        kfree(wr_ctx->wr_curr_bio);
3892        wr_ctx->wr_curr_bio = NULL;
3893        mutex_unlock(&wr_ctx->wr_lock);
3894}
3895
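/*
 * For dev-replace, nodatacow extents are copied through the page cache:
 * a worker looks up every inode that references the logical range and
 * copies the pages over to the replacement target.
 */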
3896static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3897                            int mirror_num, u64 physical_for_dev_replace)
3898{
3899        struct scrub_copy_nocow_ctx *nocow_ctx;
3900        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3901
3902        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3903        if (!nocow_ctx) {
3904                spin_lock(&sctx->stat_lock);
3905                sctx->stat.malloc_errors++;
3906                spin_unlock(&sctx->stat_lock);
3907                return -ENOMEM;
3908        }
3909
3910        scrub_pending_trans_workers_inc(sctx);
3911
3912        nocow_ctx->sctx = sctx;
3913        nocow_ctx->logical = logical;
3914        nocow_ctx->len = len;
3915        nocow_ctx->mirror_num = mirror_num;
3916        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3917        btrfs_init_work(&nocow_ctx->work, btrfs_scrubnc_helper,
3918                        copy_nocow_pages_worker, NULL, NULL);
3919        INIT_LIST_HEAD(&nocow_ctx->inodes);
3920        btrfs_queue_work(fs_info->scrub_nocow_workers,
3921                         &nocow_ctx->work);
3922
3923        return 0;
3924}
3925
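/*
 * Callback for iterate_inodes_from_logical(): remember each (root, inode,
 * offset) that references the block so the worker can copy it later via
 * copy_nocow_pages_for_inode().
 */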
3926static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
3927{
3928        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3929        struct scrub_nocow_inode *nocow_inode;
3930
3931        nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
3932        if (!nocow_inode)
3933                return -ENOMEM;
3934        nocow_inode->inum = inum;
3935        nocow_inode->offset = offset;
3936        nocow_inode->root = root;
3937        list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
3938        return 0;
3939}
3940
3941#define COPY_COMPLETE 1
3942
3943static void copy_nocow_pages_worker(struct btrfs_work *work)
3944{
3945        struct scrub_copy_nocow_ctx *nocow_ctx =
3946                container_of(work, struct scrub_copy_nocow_ctx, work);
3947        struct scrub_ctx *sctx = nocow_ctx->sctx;
3948        u64 logical = nocow_ctx->logical;
3949        u64 len = nocow_ctx->len;
3950        int mirror_num = nocow_ctx->mirror_num;
3951        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3952        int ret;
3953        struct btrfs_trans_handle *trans = NULL;
3954        struct btrfs_fs_info *fs_info;
3955        struct btrfs_path *path;
3956        struct btrfs_root *root;
3957        int not_written = 0;
3958
3959        fs_info = sctx->dev_root->fs_info;
3960        root = fs_info->extent_root;
3961
3962        path = btrfs_alloc_path();
3963        if (!path) {
3964                spin_lock(&sctx->stat_lock);
3965                sctx->stat.malloc_errors++;
3966                spin_unlock(&sctx->stat_lock);
3967                not_written = 1;
3968                goto out;
3969        }
3970
3971        trans = btrfs_join_transaction(root);
3972        if (IS_ERR(trans)) {
3973                not_written = 1;
3974                goto out;
3975        }
3976
3977        ret = iterate_inodes_from_logical(logical, fs_info, path,
3978                                          record_inode_for_nocow, nocow_ctx);
3979        if (ret != 0 && ret != -ENOENT) {
3980                btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
3981                        "phys %llu, len %llu, mir %u, ret %d",
3982                        logical, physical_for_dev_replace, len, mirror_num,
3983                        ret);
3984                not_written = 1;
3985                goto out;
3986        }
3987
3988        btrfs_end_transaction(trans, root);
3989        trans = NULL;
3990        while (!list_empty(&nocow_ctx->inodes)) {
3991                struct scrub_nocow_inode *entry;
3992                entry = list_first_entry(&nocow_ctx->inodes,
3993                                         struct scrub_nocow_inode,
3994                                         list);
3995                list_del_init(&entry->list);
3996                ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
3997                                                 entry->root, nocow_ctx);
3998                kfree(entry);
3999                if (ret == COPY_COMPLETE) {
4000                        ret = 0;
4001                        break;
4002                } else if (ret) {
4003                        break;
4004                }
4005        }
4006out:
4007        while (!list_empty(&nocow_ctx->inodes)) {
4008                struct scrub_nocow_inode *entry;
4009                entry = list_first_entry(&nocow_ctx->inodes,
4010                                         struct scrub_nocow_inode,
4011                                         list);
4012                list_del_init(&entry->list);
4013                kfree(entry);
4014        }
4015        if (trans && !IS_ERR(trans))
4016                btrfs_end_transaction(trans, root);
4017        if (not_written)
4018                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
4019                                            num_uncorrectable_read_errors);
4020
4021        btrfs_free_path(path);
4022        kfree(nocow_ctx);
4023
4024        scrub_pending_trans_workers_dec(sctx);
4025}
4026
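/*
 * Check that the file range still maps to the logical extent we are
 * copying.  Returns 0 if it does, 1 if there is a pending ordered extent
 * or the mapping has changed (the caller should skip this inode), or a
 * negative error.
 */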
4027static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
4028                                 u64 logical)
4029{
4030        struct extent_state *cached_state = NULL;
4031        struct btrfs_ordered_extent *ordered;
4032        struct extent_io_tree *io_tree;
4033        struct extent_map *em;
4034        u64 lockstart = start, lockend = start + len - 1;
4035        int ret = 0;
4036
4037        io_tree = &BTRFS_I(inode)->io_tree;
4038
4039        lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
4040        ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
4041        if (ordered) {
4042                btrfs_put_ordered_extent(ordered);
4043                ret = 1;
4044                goto out_unlock;
4045        }
4046
4047        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
4048        if (IS_ERR(em)) {
4049                ret = PTR_ERR(em);
4050                goto out_unlock;
4051        }
4052
4053        /*
4054         * This extent does not actually cover the logical extent anymore,
4055         * move on to the next inode.
4056         */
4057        if (em->block_start > logical ||
4058            em->block_start + em->block_len < logical + len) {
4059                free_extent_map(em);
4060                ret = 1;
4061                goto out_unlock;
4062        }
4063        free_extent_map(em);
4064
4065out_unlock:
4066        unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
4067                             GFP_NOFS);
4068        return ret;
4069}
4070
4071static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
4072                                      struct scrub_copy_nocow_ctx *nocow_ctx)
4073{
4074        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
4075        struct btrfs_key key;
4076        struct inode *inode;
4077        struct page *page;
4078        struct btrfs_root *local_root;
4079        struct extent_io_tree *io_tree;
4080        u64 physical_for_dev_replace;
4081        u64 nocow_ctx_logical;
4082        u64 len = nocow_ctx->len;
4083        unsigned long index;
4084        int srcu_index;
4085        int ret = 0;
4086        int err = 0;
4087
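        /* Look up the subvolume root and the inode under subvol_srcu */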
4088        key.objectid = root;
4089        key.type = BTRFS_ROOT_ITEM_KEY;
4090        key.offset = (u64)-1;
4091
4092        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
4093
4094        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
4095        if (IS_ERR(local_root)) {
4096                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4097                return PTR_ERR(local_root);
4098        }
4099
4100        key.type = BTRFS_INODE_ITEM_KEY;
4101        key.objectid = inum;
4102        key.offset = 0;
4103        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
4104        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
4105        if (IS_ERR(inode))
4106                return PTR_ERR(inode);
4107
4108        /* Avoid races with truncate, direct I/O and hole punching */
4109        mutex_lock(&inode->i_mutex);
4110        inode_dio_wait(inode);
4111
4112        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
4113        io_tree = &BTRFS_I(inode)->io_tree;
4114        nocow_ctx_logical = nocow_ctx->logical;
4115
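        /*
         * Bail out early if the range is busy or no longer maps to the
         * scrubbed extent; a positive return from the check is not an error.
         */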
4116        ret = check_extent_to_block(inode, offset, len, nocow_ctx_logical);
4117        if (ret) {
4118                ret = ret > 0 ? 0 : ret;
4119                goto out;
4120        }
4121
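        /*
         * Copy one page at a time: make the page uptodate, re-check the
         * mapping, then write the page to the target device.
         */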
4122        while (len >= PAGE_CACHE_SIZE) {
4123                index = offset >> PAGE_CACHE_SHIFT;
4124again:
4125                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
4126                if (!page) {
4127                        btrfs_err(fs_info, "find_or_create_page() failed");
4128                        ret = -ENOMEM;
4129                        goto out;
4130                }
4131
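                /*
                 * An uptodate, clean page can be copied as is.  Dirty pages
                 * are skipped; their new contents are expected to reach the
                 * target through the regular write path instead.
                 */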
4132                if (PageUptodate(page)) {
4133                        if (PageDirty(page))
4134                                goto next_page;
4135                } else {
4136                        ClearPageError(page);
4137                        err = extent_read_full_page(io_tree, page,
4138                                                           btrfs_get_extent,
4139                                                           nocow_ctx->mirror_num);
4140                        if (err) {
4141                                ret = err;
4142                                goto next_page;
4143                        }
4144
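                        /*
                         * The read path unlocks the page when the I/O
                         * completes, so locking it again waits for the read
                         * before the result is inspected.
                         */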
4145                        lock_page(page);
4146                        /*
4147                         * If the page has been removed from the page
4148                         * cache, its data is meaningless: it may be a
4149                         * stale copy, while the new data may have been
4150                         * written to a different page in the page cache.
4151                         */
4152                        if (page->mapping != inode->i_mapping) {
4153                                unlock_page(page);
4154                                page_cache_release(page);
4155                                goto again;
4156                        }
4157                        if (!PageUptodate(page)) {
4158                                ret = -EIO;
4159                                goto next_page;
4160                        }
4161                }
4162
4163                ret = check_extent_to_block(inode, offset, len,
4164                                            nocow_ctx_logical);
4165                if (ret) {
4166                        ret = ret > 0 ? 0 : ret;
4167                        goto next_page;
4168                }
4169
4170                err = write_page_nocow(nocow_ctx->sctx,
4171                                       physical_for_dev_replace, page);
4172                if (err)
4173                        ret = err;
4174next_page:
4175                unlock_page(page);
4176                page_cache_release(page);
4177
4178                if (ret)
4179                        break;
4180
4181                offset += PAGE_CACHE_SIZE;
4182                physical_for_dev_replace += PAGE_CACHE_SIZE;
4183                nocow_ctx_logical += PAGE_CACHE_SIZE;
4184                len -= PAGE_CACHE_SIZE;
4185        }
4186        ret = COPY_COMPLETE;
4187out:
4188        mutex_unlock(&inode->i_mutex);
4189        iput(inode);
4190        return ret;
4191}
4192
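/*
 * Synchronously write a single page to the dev-replace target device at
 * byte offset @physical_for_dev_replace.  Returns 0 on success, -ENOMEM
 * if no bio could be allocated, or -EIO on write failure (which is also
 * accounted in the device's write error statistics).
 */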
4193static int write_page_nocow(struct scrub_ctx *sctx,
4194                            u64 physical_for_dev_replace, struct page *page)
4195{
4196        struct bio *bio;
4197        struct btrfs_device *dev;
4198        int ret;
4199
4200        dev = sctx->wr_ctx.tgtdev;
4201        if (!dev)
4202                return -EIO;
4203        if (!dev->bdev) {
4204                printk_ratelimited(KERN_WARNING
4205                        "BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
4206                return -EIO;
4207        }
4208        bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
4209        if (!bio) {
4210                spin_lock(&sctx->stat_lock);
4211                sctx->stat.malloc_errors++;
4212                spin_unlock(&sctx->stat_lock);
4213                return -ENOMEM;
4214        }
4215        bio->bi_iter.bi_size = 0;
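        /* bi_sector is in 512 byte units, convert from the byte offset */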
4216        bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
4217        bio->bi_bdev = dev->bdev;
4218        ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
4219        if (ret != PAGE_CACHE_SIZE) {
4220leave_with_eio:
4221                bio_put(bio);
4222                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
4223                return -EIO;
4224        }
4225
4226        if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))
4227                goto leave_with_eio;
4228
4229        bio_put(bio);
4230        return 0;
4231}
4232