linux/fs/btrfs/scrub.c
   1/*
   2 * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
   3 *
   4 * This program is free software; you can redistribute it and/or
   5 * modify it under the terms of the GNU General Public
   6 * License v2 as published by the Free Software Foundation.
   7 *
   8 * This program is distributed in the hope that it will be useful,
   9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11 * General Public License for more details.
  12 *
  13 * You should have received a copy of the GNU General Public
  14 * License along with this program; if not, write to the
  15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  16 * Boston, MA 02111-1307, USA.
  17 */
  18
  19#include <linux/blkdev.h>
  20#include <linux/ratelimit.h>
  21#include "ctree.h"
  22#include "volumes.h"
  23#include "disk-io.h"
  24#include "ordered-data.h"
  25#include "transaction.h"
  26#include "backref.h"
  27#include "extent_io.h"
  28#include "dev-replace.h"
  29#include "check-integrity.h"
  30#include "rcu-string.h"
  31
  32/*
  33 * This is only the first step towards a full-featured scrub. It reads all
  34 * extents and super blocks and verifies the checksums. In case a bad checksum
  35 * is found or the extent cannot be read, good data will be written back if
  36 * any can be found.
  37 *
  38 * Future enhancements:
  39 *  - In case an unrepairable extent is encountered, track which files are
  40 *    affected and report them
  41 *  - track and record media errors, throw out bad devices
  42 *  - add a mode to also read unallocated space
  43 */
  44
  45struct scrub_block;
  46struct scrub_ctx;
  47
  48/*
  49 * the following three values only influence performance.
  50 * The last one configures the number of parallel and outstanding I/O
  51 * operations. The first two values configure an upper limit for the number
  52 * of (dynamically allocated) pages that are added to a bio.
  53 */
  54#define SCRUB_PAGES_PER_RD_BIO  32      /* 128k per bio */
  55#define SCRUB_PAGES_PER_WR_BIO  32      /* 128k per bio */
  56#define SCRUB_BIOS_PER_SCTX     64      /* 8MB per device in flight */
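    /*
     * Rough arithmetic behind the limits above, assuming the 4k PAGE_SIZE
     * that the "128k per bio" comments imply:
     *
     *   per read/write bio:  32 pages * 4k   = 128k
     *   per scrub context:   64 bios * 128k  = 8MB in flight per device
     */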
  57
  58/*
  59 * the following value times PAGE_SIZE needs to be large enough to match the
  60 * largest node/leaf/sector size that shall be supported.
  61 * Values larger than BTRFS_STRIPE_LEN are not supported.
  62 */
  63#define SCRUB_MAX_PAGES_PER_BLOCK       16      /* 64k per node/leaf/sector */
  64
  65struct scrub_page {
  66        struct scrub_block      *sblock;
  67        struct page             *page;
  68        struct btrfs_device     *dev;
  69        u64                     flags;  /* extent flags */
  70        u64                     generation;
  71        u64                     logical;
  72        u64                     physical;
  73        u64                     physical_for_dev_replace;
  74        atomic_t                ref_count;
  75        struct {
  76                unsigned int    mirror_num:8;
  77                unsigned int    have_csum:1;
  78                unsigned int    io_error:1;
  79        };
  80        u8                      csum[BTRFS_CSUM_SIZE];
  81};
  82
  83struct scrub_bio {
  84        int                     index;
  85        struct scrub_ctx        *sctx;
  86        struct btrfs_device     *dev;
  87        struct bio              *bio;
  88        int                     err;
  89        u64                     logical;
  90        u64                     physical;
  91#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
  92        struct scrub_page       *pagev[SCRUB_PAGES_PER_WR_BIO];
  93#else
  94        struct scrub_page       *pagev[SCRUB_PAGES_PER_RD_BIO];
  95#endif
  96        int                     page_count;
  97        int                     next_free;
  98        struct btrfs_work       work;
  99};
 100
 101struct scrub_block {
 102        struct scrub_page       *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 103        int                     page_count;
 104        atomic_t                outstanding_pages;
 105        atomic_t                ref_count; /* free mem on transition to zero */
 106        struct scrub_ctx        *sctx;
 107        struct {
 108                unsigned int    header_error:1;
 109                unsigned int    checksum_error:1;
 110                unsigned int    no_io_error_seen:1;
 111                unsigned int    generation_error:1; /* also sets header_error */
 112        };
 113};
 114
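    /*
     * Rough picture of how the structures above relate, going by the
     * definitions in this file:
     *
     *   scrub_ctx   - per-device scrub state, owns SCRUB_BIOS_PER_SCTX
     *                 scrub_bios plus the statistics
     *   scrub_bio   - one in-flight bio together with the scrub_pages
     *                 packed into it
     *   scrub_block - the scrub_pages that make up one checksummed unit
     *                 (a data sector or a tree node/leaf)
     *   scrub_page  - one PAGE_SIZE piece of data, reference counted
     *                 because it is shared between its scrub_block and a
     *                 scrub_bio
     */
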
 115struct scrub_wr_ctx {
 116        struct scrub_bio *wr_curr_bio;
 117        struct btrfs_device *tgtdev;
 118        int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 119        atomic_t flush_all_writes;
 120        struct mutex wr_lock;
 121};
 122
 123struct scrub_ctx {
 124        struct scrub_bio        *bios[SCRUB_BIOS_PER_SCTX];
 125        struct btrfs_root       *dev_root;
 126        int                     first_free;
 127        int                     curr;
 128        atomic_t                bios_in_flight;
 129        atomic_t                workers_pending;
 130        spinlock_t              list_lock;
 131        wait_queue_head_t       list_wait;
 132        u16                     csum_size;
 133        struct list_head        csum_list;
 134        atomic_t                cancel_req;
 135        int                     readonly;
 136        int                     pages_per_rd_bio;
 137        u32                     sectorsize;
 138        u32                     nodesize;
 139        u32                     leafsize;
 140
 141        int                     is_dev_replace;
 142        struct scrub_wr_ctx     wr_ctx;
 143
 144        /*
 145         * statistics
 146         */
 147        struct btrfs_scrub_progress stat;
 148        spinlock_t              stat_lock;
 149};
 150
 151struct scrub_fixup_nodatasum {
 152        struct scrub_ctx        *sctx;
 153        struct btrfs_device     *dev;
 154        u64                     logical;
 155        struct btrfs_root       *root;
 156        struct btrfs_work       work;
 157        int                     mirror_num;
 158};
 159
 160struct scrub_copy_nocow_ctx {
 161        struct scrub_ctx        *sctx;
 162        u64                     logical;
 163        u64                     len;
 164        int                     mirror_num;
 165        u64                     physical_for_dev_replace;
 166        struct btrfs_work       work;
 167};
 168
 169struct scrub_warning {
 170        struct btrfs_path       *path;
 171        u64                     extent_item_size;
 172        char                    *scratch_buf;
 173        char                    *msg_buf;
 174        const char              *errstr;
 175        sector_t                sector;
 176        u64                     logical;
 177        struct btrfs_device     *dev;
 178        int                     msg_bufsize;
 179        int                     scratch_bufsize;
 180};
 181
 182
 183static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 184static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 185static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 186static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 187static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 188static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 189                                     struct btrfs_fs_info *fs_info,
 190                                     struct scrub_block *original_sblock,
 191                                     u64 length, u64 logical,
 192                                     struct scrub_block *sblocks_for_recheck);
 193static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 194                                struct scrub_block *sblock, int is_metadata,
 195                                int have_csum, u8 *csum, u64 generation,
 196                                u16 csum_size);
 197static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 198                                         struct scrub_block *sblock,
 199                                         int is_metadata, int have_csum,
 200                                         const u8 *csum, u64 generation,
 201                                         u16 csum_size);
 202static void scrub_complete_bio_end_io(struct bio *bio, int err);
 203static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 204                                             struct scrub_block *sblock_good,
 205                                             int force_write);
 206static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 207                                            struct scrub_block *sblock_good,
 208                                            int page_num, int force_write);
 209static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 210static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 211                                           int page_num);
 212static int scrub_checksum_data(struct scrub_block *sblock);
 213static int scrub_checksum_tree_block(struct scrub_block *sblock);
 214static int scrub_checksum_super(struct scrub_block *sblock);
 215static void scrub_block_get(struct scrub_block *sblock);
 216static void scrub_block_put(struct scrub_block *sblock);
 217static void scrub_page_get(struct scrub_page *spage);
 218static void scrub_page_put(struct scrub_page *spage);
 219static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 220                                    struct scrub_page *spage);
 221static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 222                       u64 physical, struct btrfs_device *dev, u64 flags,
 223                       u64 gen, int mirror_num, u8 *csum, int force,
 224                       u64 physical_for_dev_replace);
 225static void scrub_bio_end_io(struct bio *bio, int err);
 226static void scrub_bio_end_io_worker(struct btrfs_work *work);
 227static void scrub_block_complete(struct scrub_block *sblock);
 228static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 229                               u64 extent_logical, u64 extent_len,
 230                               u64 *extent_physical,
 231                               struct btrfs_device **extent_dev,
 232                               int *extent_mirror_num);
 233static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 234                              struct scrub_wr_ctx *wr_ctx,
 235                              struct btrfs_fs_info *fs_info,
 236                              struct btrfs_device *dev,
 237                              int is_dev_replace);
 238static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 239static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 240                                    struct scrub_page *spage);
 241static void scrub_wr_submit(struct scrub_ctx *sctx);
 242static void scrub_wr_bio_end_io(struct bio *bio, int err);
 243static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 244static int write_page_nocow(struct scrub_ctx *sctx,
 245                            u64 physical_for_dev_replace, struct page *page);
 246static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 247                                      void *ctx);
 248static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 249                            int mirror_num, u64 physical_for_dev_replace);
 250static void copy_nocow_pages_worker(struct btrfs_work *work);
 251
 252
 253static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 254{
 255        atomic_inc(&sctx->bios_in_flight);
 256}
 257
 258static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 259{
 260        atomic_dec(&sctx->bios_in_flight);
 261        wake_up(&sctx->list_wait);
 262}
 263
 264/*
 265 * used for workers that require transaction commits (i.e., for the
 266 * NOCOW case)
 267 */
 268static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 269{
 270        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 271
 272        /*
 273         * increment scrubs_running to prevent cancel requests from
 274         * completing as long as a worker is running. we must also
 275         * increment scrubs_paused to prevent deadlocking on pause
 276         * requests used for transaction commits (as the worker uses a
 277         * transaction context). it is safe to regard the worker
 278         * as paused for all practical purposes. effectively, we only
 279         * prevent cancellation requests from completing.
 280         */
 281        mutex_lock(&fs_info->scrub_lock);
 282        atomic_inc(&fs_info->scrubs_running);
 283        atomic_inc(&fs_info->scrubs_paused);
 284        mutex_unlock(&fs_info->scrub_lock);
 285        atomic_inc(&sctx->workers_pending);
 286}
 287
 288/* used for workers that require transaction commits */
 289static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 290{
 291        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 292
 293        /*
 294         * see scrub_pending_trans_workers_inc() for why we're pretending
 295         * to be paused in the scrub counters
 296         */
 297        mutex_lock(&fs_info->scrub_lock);
 298        atomic_dec(&fs_info->scrubs_running);
 299        atomic_dec(&fs_info->scrubs_paused);
 300        mutex_unlock(&fs_info->scrub_lock);
 301        atomic_dec(&sctx->workers_pending);
 302        wake_up(&fs_info->scrub_pause_wait);
 303        wake_up(&sctx->list_wait);
 304}
 305
 306static void scrub_free_csums(struct scrub_ctx *sctx)
 307{
 308        while (!list_empty(&sctx->csum_list)) {
 309                struct btrfs_ordered_sum *sum;
 310                sum = list_first_entry(&sctx->csum_list,
 311                                       struct btrfs_ordered_sum, list);
 312                list_del(&sum->list);
 313                kfree(sum);
 314        }
 315}
 316
 317static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 318{
 319        int i;
 320
 321        if (!sctx)
 322                return;
 323
 324        scrub_free_wr_ctx(&sctx->wr_ctx);
 325
 326        /* this can happen when scrub is cancelled */
 327        if (sctx->curr != -1) {
 328                struct scrub_bio *sbio = sctx->bios[sctx->curr];
 329
 330                for (i = 0; i < sbio->page_count; i++) {
 331                        WARN_ON(!sbio->pagev[i]->page);
 332                        scrub_block_put(sbio->pagev[i]->sblock);
 333                }
 334                bio_put(sbio->bio);
 335        }
 336
 337        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 338                struct scrub_bio *sbio = sctx->bios[i];
 339
 340                if (!sbio)
 341                        break;
 342                kfree(sbio);
 343        }
 344
 345        scrub_free_csums(sctx);
 346        kfree(sctx);
 347}
 348
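    /*
     * Allocate and initialize the per-device scrub context. All
     * SCRUB_BIOS_PER_SCTX scrub_bios are allocated up front and chained
     * into a free list via sctx->first_free and sbio->next_free;
     * scrub_setup_wr_ctx() additionally prepares the write context that
     * dev-replace uses for the target device.
     */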
 349static noinline_for_stack
 350struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 351{
 352        struct scrub_ctx *sctx;
 353        int             i;
 354        struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 355        int pages_per_rd_bio;
 356        int ret;
 357
 358        /*
 359         * the setting of pages_per_rd_bio is correct for scrub but might
 360         * be wrong for the dev_replace code where we might read from
 361         * different devices in the initial huge bios. However, that
 362         * code is able to correctly handle the case when adding a page
 363         * to a bio fails.
 364         */
 365        if (dev->bdev)
 366                pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 367                                         bio_get_nr_vecs(dev->bdev));
 368        else
 369                pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 370        sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 371        if (!sctx)
 372                goto nomem;
 373        sctx->is_dev_replace = is_dev_replace;
 374        sctx->pages_per_rd_bio = pages_per_rd_bio;
 375        sctx->curr = -1;
 376        sctx->dev_root = dev->dev_root;
 377        for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 378                struct scrub_bio *sbio;
 379
 380                sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 381                if (!sbio)
 382                        goto nomem;
 383                sctx->bios[i] = sbio;
 384
 385                sbio->index = i;
 386                sbio->sctx = sctx;
 387                sbio->page_count = 0;
 388                sbio->work.func = scrub_bio_end_io_worker;
 389
 390                if (i != SCRUB_BIOS_PER_SCTX - 1)
 391                        sctx->bios[i]->next_free = i + 1;
 392                else
 393                        sctx->bios[i]->next_free = -1;
 394        }
 395        sctx->first_free = 0;
 396        sctx->nodesize = dev->dev_root->nodesize;
 397        sctx->leafsize = dev->dev_root->leafsize;
 398        sctx->sectorsize = dev->dev_root->sectorsize;
 399        atomic_set(&sctx->bios_in_flight, 0);
 400        atomic_set(&sctx->workers_pending, 0);
 401        atomic_set(&sctx->cancel_req, 0);
 402        sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 403        INIT_LIST_HEAD(&sctx->csum_list);
 404
 405        spin_lock_init(&sctx->list_lock);
 406        spin_lock_init(&sctx->stat_lock);
 407        init_waitqueue_head(&sctx->list_wait);
 408
 409        ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 410                                 fs_info->dev_replace.tgtdev, is_dev_replace);
 411        if (ret) {
 412                scrub_free_ctx(sctx);
 413                return ERR_PTR(ret);
 414        }
 415        return sctx;
 416
 417nomem:
 418        scrub_free_ctx(sctx);
 419        return ERR_PTR(-ENOMEM);
 420}
 421
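    /*
     * Callback for iterate_extent_inodes() in scrub_print_warning():
     * resolve all paths of one inode that references the errored data
     * extent and print a warning line for each of them.
     */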
 422static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 423                                     void *warn_ctx)
 424{
 425        u64 isize;
 426        u32 nlink;
 427        int ret;
 428        int i;
 429        struct extent_buffer *eb;
 430        struct btrfs_inode_item *inode_item;
 431        struct scrub_warning *swarn = warn_ctx;
 432        struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 433        struct inode_fs_paths *ipath = NULL;
 434        struct btrfs_root *local_root;
 435        struct btrfs_key root_key;
 436
 437        root_key.objectid = root;
 438        root_key.type = BTRFS_ROOT_ITEM_KEY;
 439        root_key.offset = (u64)-1;
 440        local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 441        if (IS_ERR(local_root)) {
 442                ret = PTR_ERR(local_root);
 443                goto err;
 444        }
 445
 446        ret = inode_item_info(inum, 0, local_root, swarn->path);
 447        if (ret) {
 448                btrfs_release_path(swarn->path);
 449                goto err;
 450        }
 451
 452        eb = swarn->path->nodes[0];
 453        inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
 454                                        struct btrfs_inode_item);
 455        isize = btrfs_inode_size(eb, inode_item);
 456        nlink = btrfs_inode_nlink(eb, inode_item);
 457        btrfs_release_path(swarn->path);
 458
 459        ipath = init_ipath(4096, local_root, swarn->path);
 460        if (IS_ERR(ipath)) {
 461                ret = PTR_ERR(ipath);
 462                ipath = NULL;
 463                goto err;
 464        }
 465        ret = paths_from_inode(inum, ipath);
 466
 467        if (ret < 0)
 468                goto err;
 469
 470        /*
 471         * we deliberately ignore the fact that ipath might have been too
 472         * small to hold all of the paths here
 473         */
 474        for (i = 0; i < ipath->fspath->elem_cnt; ++i)
 475                printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 476                        "%s, sector %llu, root %llu, inode %llu, offset %llu, "
 477                        "length %llu, links %u (path: %s)\n", swarn->errstr,
 478                        swarn->logical, rcu_str_deref(swarn->dev->name),
 479                        (unsigned long long)swarn->sector, root, inum, offset,
 480                        min(isize - offset, (u64)PAGE_SIZE), nlink,
 481                        (char *)(unsigned long)ipath->fspath->val[i]);
 482
 483        free_ipath(ipath);
 484        return 0;
 485
 486err:
 487        printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 488                "%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 489                "resolving failed with ret=%d\n", swarn->errstr,
 490                swarn->logical, rcu_str_deref(swarn->dev->name),
 491                (unsigned long long)swarn->sector, root, inum, offset, ret);
 492
 493        free_ipath(ipath);
 494        return 0;
 495}
 496
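    /*
     * Print a warning for an errored block: map the logical address back
     * to its extent item, then report tree blocks with their backref root
     * and level, and data extents per referencing inode via
     * scrub_print_warning_inode().
     */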
 497static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 498{
 499        struct btrfs_device *dev;
 500        struct btrfs_fs_info *fs_info;
 501        struct btrfs_path *path;
 502        struct btrfs_key found_key;
 503        struct extent_buffer *eb;
 504        struct btrfs_extent_item *ei;
 505        struct scrub_warning swarn;
 506        unsigned long ptr = 0;
 507        u64 extent_item_pos;
 508        u64 flags = 0;
 509        u64 ref_root;
 510        u32 item_size;
 511        u8 ref_level;
 512        const int bufsize = 4096;
 513        int ret;
 514
 515        WARN_ON(sblock->page_count < 1);
 516        dev = sblock->pagev[0]->dev;
 517        fs_info = sblock->sctx->dev_root->fs_info;
 518
 519        path = btrfs_alloc_path();
 520
 521        swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 522        swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 523        swarn.sector = (sblock->pagev[0]->physical) >> 9;
 524        swarn.logical = sblock->pagev[0]->logical;
 525        swarn.errstr = errstr;
 526        swarn.dev = NULL;
 527        swarn.msg_bufsize = bufsize;
 528        swarn.scratch_bufsize = bufsize;
 529
 530        if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 531                goto out;
 532
 533        ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 534                                  &flags);
 535        if (ret < 0)
 536                goto out;
 537
 538        extent_item_pos = swarn.logical - found_key.objectid;
 539        swarn.extent_item_size = found_key.offset;
 540
 541        eb = path->nodes[0];
 542        ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 543        item_size = btrfs_item_size_nr(eb, path->slots[0]);
 544        btrfs_release_path(path);
 545
 546        if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 547                do {
 548                        ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 549                                                        &ref_root, &ref_level);
 550                        printk_in_rcu(KERN_WARNING
 551                                "btrfs: %s at logical %llu on dev %s, "
 552                                "sector %llu: metadata %s (level %d) in tree "
 553                                "%llu\n", errstr, swarn.logical,
 554                                rcu_str_deref(dev->name),
 555                                (unsigned long long)swarn.sector,
 556                                ref_level ? "node" : "leaf",
 557                                ret < 0 ? -1 : ref_level,
 558                                ret < 0 ? -1 : ref_root);
 559                } while (ret != 1);
 560        } else {
 561                swarn.path = path;
 562                swarn.dev = dev;
 563                iterate_extent_inodes(fs_info, found_key.objectid,
 564                                        extent_item_pos, 1,
 565                                        scrub_print_warning_inode, &swarn);
 566        }
 567
 568out:
 569        btrfs_free_path(path);
 570        kfree(swarn.scratch_buf);
 571        kfree(swarn.msg_buf);
 572}
 573
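    /*
     * Callback used via iterate_inodes_from_logical() in
     * scrub_fixup_nodatasum(): try to repair the bad sector through the
     * page cache of one inode that references it. If the page is cached,
     * up to date and clean, repair_io_failure() is called directly; if it
     * is not up to date, the bad mirror is read through the regular
     * readpage path, which triggers the on-the-fly repair; a dirty page
     * cannot be used and the error remains uncorrectable.
     */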
 574static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 575{
 576        struct page *page = NULL;
 577        unsigned long index;
 578        struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 579        int ret;
 580        int corrected = 0;
 581        struct btrfs_key key;
 582        struct inode *inode = NULL;
 583        struct btrfs_fs_info *fs_info;
 584        u64 end = offset + PAGE_SIZE - 1;
 585        struct btrfs_root *local_root;
 586        int srcu_index;
 587
 588        key.objectid = root;
 589        key.type = BTRFS_ROOT_ITEM_KEY;
 590        key.offset = (u64)-1;
 591
 592        fs_info = fixup->root->fs_info;
 593        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 594
 595        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 596        if (IS_ERR(local_root)) {
 597                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 598                return PTR_ERR(local_root);
 599        }
 600
 601        key.type = BTRFS_INODE_ITEM_KEY;
 602        key.objectid = inum;
 603        key.offset = 0;
 604        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 605        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 606        if (IS_ERR(inode))
 607                return PTR_ERR(inode);
 608
 609        index = offset >> PAGE_CACHE_SHIFT;
 610
 611        page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 612        if (!page) {
 613                ret = -ENOMEM;
 614                goto out;
 615        }
 616
 617        if (PageUptodate(page)) {
 618                if (PageDirty(page)) {
 619                        /*
 620                         * we need to write the data to the defective sector. the
 621                         * data that was in that sector is not in memory,
 622                         * because the page was modified. we must not write the
 623                         * modified page to that sector.
 624                         *
 625                         * TODO: what could be done here: wait for the delalloc
 626                         *       runner to write out that page (might involve
 627                         *       COW) and see whether the sector is still
 628                         *       referenced afterwards.
 629                         *
 630                         * For the time being, we'll treat this error as
 631                         * uncorrectable, although there is a chance that a
 632                         * later scrub will find the bad sector again when
 633                         * there is no dirty page in memory.
 634                         */
 635                        ret = -EIO;
 636                        goto out;
 637                }
 638                fs_info = BTRFS_I(inode)->root->fs_info;
 639                ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 640                                        fixup->logical, page,
 641                                        fixup->mirror_num);
 642                unlock_page(page);
 643                corrected = !ret;
 644        } else {
 645                /*
 646                 * we need to get good data first. the general readpage path
 647                 * will call repair_io_failure for us; we just have to make
 648                 * sure we read the bad mirror.
 649                 */
 650                ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 651                                        EXTENT_DAMAGED, GFP_NOFS);
 652                if (ret) {
 653                        /* set_extent_bits should give proper error */
 654                        WARN_ON(ret > 0);
 655                        if (ret > 0)
 656                                ret = -EFAULT;
 657                        goto out;
 658                }
 659
 660                ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 661                                                btrfs_get_extent,
 662                                                fixup->mirror_num);
 663                wait_on_page_locked(page);
 664
 665                corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 666                                                end, EXTENT_DAMAGED, 0, NULL);
 667                if (!corrected)
 668                        clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 669                                                EXTENT_DAMAGED, GFP_NOFS);
 670        }
 671
 672out:
 673        if (page)
 674                put_page(page);
 675        if (inode)
 676                iput(inode);
 677
 678        if (ret < 0)
 679                return ret;
 680
 681        if (ret == 0 && corrected) {
 682                /*
 683                 * we only need to call readpage for one of the inodes belonging
 684                 * to this extent, so make iterate_extent_inodes stop
 685                 */
 686                return 1;
 687        }
 688
 689        return -EIO;
 690}
 691
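    /*
     * Worker queued by scrub_handle_errored_block() for the nodatasum
     * case: walk all inodes that reference the bad logical address and
     * try to repair it through the regular read path (see
     * scrub_fixup_readpage() above).
     */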
 692static void scrub_fixup_nodatasum(struct btrfs_work *work)
 693{
 694        int ret;
 695        struct scrub_fixup_nodatasum *fixup;
 696        struct scrub_ctx *sctx;
 697        struct btrfs_trans_handle *trans = NULL;
 698        struct btrfs_fs_info *fs_info;
 699        struct btrfs_path *path;
 700        int uncorrectable = 0;
 701
 702        fixup = container_of(work, struct scrub_fixup_nodatasum, work);
 703        sctx = fixup->sctx;
 704        fs_info = fixup->root->fs_info;
 705
 706        path = btrfs_alloc_path();
 707        if (!path) {
 708                spin_lock(&sctx->stat_lock);
 709                ++sctx->stat.malloc_errors;
 710                spin_unlock(&sctx->stat_lock);
 711                uncorrectable = 1;
 712                goto out;
 713        }
 714
 715        trans = btrfs_join_transaction(fixup->root);
 716        if (IS_ERR(trans)) {
 717                uncorrectable = 1;
 718                goto out;
 719        }
 720
 721        /*
 722         * the idea is to trigger a regular read through the standard path. we
 723         * read a page from the (failed) logical address by specifying the
 724         * corresponding copy number (mirror) of the failed sector. thus, that
 725         * readpage is expected to fail.
 726         * that is the point where on-the-fly error correction will kick in
 727         * (once the read has finished) and rewrite the failed sector if a good
 728         * copy can be found.
 729         */
 730        ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 731                                                path, scrub_fixup_readpage,
 732                                                fixup);
 733        if (ret < 0) {
 734                uncorrectable = 1;
 735                goto out;
 736        }
 737        WARN_ON(ret != 1);
 738
 739        spin_lock(&sctx->stat_lock);
 740        ++sctx->stat.corrected_errors;
 741        spin_unlock(&sctx->stat_lock);
 742
 743out:
 744        if (trans && !IS_ERR(trans))
 745                btrfs_end_transaction(trans, fixup->root);
 746        if (uncorrectable) {
 747                spin_lock(&sctx->stat_lock);
 748                ++sctx->stat.uncorrectable_errors;
 749                spin_unlock(&sctx->stat_lock);
 750                btrfs_dev_replace_stats_inc(
 751                        &sctx->dev_root->fs_info->dev_replace.
 752                        num_uncorrectable_read_errors);
 753                printk_ratelimited_in_rcu(KERN_ERR
 754                        "btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 755                        (unsigned long long)fixup->logical,
 756                        rcu_str_deref(fixup->dev->name));
 757        }
 758
 759        btrfs_free_path(path);
 760        kfree(fixup);
 761
 762        scrub_pending_trans_workers_dec(sctx);
 763}
 764
 765/*
 766 * scrub_handle_errored_block gets called when either verification of the
 767 * pages failed or the bio failed to read, e.g. with EIO. In the latter
 768 * case, this function handles all pages in the bio, even though only one
 769 * may be bad.
 770 * The goal of this function is to repair the errored block by using the
 771 * contents of one of the mirrors.
 772 */
 773static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 774{
 775        struct scrub_ctx *sctx = sblock_to_check->sctx;
 776        struct btrfs_device *dev;
 777        struct btrfs_fs_info *fs_info;
 778        u64 length;
 779        u64 logical;
 780        u64 generation;
 781        unsigned int failed_mirror_index;
 782        unsigned int is_metadata;
 783        unsigned int have_csum;
 784        u8 *csum;
 785        struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 786        struct scrub_block *sblock_bad;
 787        int ret;
 788        int mirror_index;
 789        int page_num;
 790        int success;
 791        static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 792                                      DEFAULT_RATELIMIT_BURST);
 793
 794        BUG_ON(sblock_to_check->page_count < 1);
 795        fs_info = sctx->dev_root->fs_info;
 796        if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 797                /*
 798                 * if we find an error in a super block, we just report it.
 799                 * Super blocks get rewritten with the next transaction
 800                 * commit anyway
 801                 */
 802                spin_lock(&sctx->stat_lock);
 803                ++sctx->stat.super_errors;
 804                spin_unlock(&sctx->stat_lock);
 805                return 0;
 806        }
 807        length = sblock_to_check->page_count * PAGE_SIZE;
 808        logical = sblock_to_check->pagev[0]->logical;
 809        generation = sblock_to_check->pagev[0]->generation;
 810        BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 811        failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 812        is_metadata = !(sblock_to_check->pagev[0]->flags &
 813                        BTRFS_EXTENT_FLAG_DATA);
 814        have_csum = sblock_to_check->pagev[0]->have_csum;
 815        csum = sblock_to_check->pagev[0]->csum;
 816        dev = sblock_to_check->pagev[0]->dev;
 817
 818        if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 819                sblocks_for_recheck = NULL;
 820                goto nodatasum_case;
 821        }
 822
 823        /*
 824         * read all mirrors one after the other. This includes re-reading
 825         * the extent or metadata block that failed (which is the reason
 826         * this fixup code was called), this time page by page in order
 827         * to know which pages caused I/O errors and which ones are good
 828         * (for all mirrors).
 829         * The goal is to handle the situation when more than one
 830         * mirror contains I/O errors, but the errors do not
 831         * overlap, i.e. the data can be repaired by selecting the
 832         * pages from those mirrors without I/O error on the
 833         * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 834         * would be that mirror #1 has an I/O error on the first page,
 835         * the second page is good, and mirror #2 has an I/O error on
 836         * the second page, but the first page is good.
 837         * Then the first page of the first mirror can be repaired by
 838         * taking the first page of the second mirror, and the
 839         * second page of the second mirror can be repaired by
 840         * copying the contents of the 2nd page of the 1st mirror.
 841         * One more note: if the pages of one mirror contain I/O
 842         * errors, the checksum cannot be verified. In order to get
 843         * the best data for repairing, the first attempt is to find
 844         * a mirror without I/O errors and with a validated checksum.
 845         * Only if this is not possible, the pages are picked from
 846         * mirrors with I/O errors without considering the checksum.
 847         * If the latter is the case, at the end, the checksum of the
 848         * repaired area is verified in order to correctly maintain
 849         * the statistics.
 850         */
 851
 852        sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 853                                     sizeof(*sblocks_for_recheck),
 854                                     GFP_NOFS);
 855        if (!sblocks_for_recheck) {
 856                spin_lock(&sctx->stat_lock);
 857                sctx->stat.malloc_errors++;
 858                sctx->stat.read_errors++;
 859                sctx->stat.uncorrectable_errors++;
 860                spin_unlock(&sctx->stat_lock);
 861                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 862                goto out;
 863        }
 864
 865        /* setup the context, map the logical blocks and alloc the pages */
 866        ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 867                                        logical, sblocks_for_recheck);
 868        if (ret) {
 869                spin_lock(&sctx->stat_lock);
 870                sctx->stat.read_errors++;
 871                sctx->stat.uncorrectable_errors++;
 872                spin_unlock(&sctx->stat_lock);
 873                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 874                goto out;
 875        }
 876        BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 877        sblock_bad = sblocks_for_recheck + failed_mirror_index;
 878
 879        /* build and submit the bios for the failed mirror, check checksums */
 880        scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 881                            csum, generation, sctx->csum_size);
 882
 883        if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 884            sblock_bad->no_io_error_seen) {
 885                /*
 886                 * the error disappeared after reading page by page, or
 887                 * the area was part of a huge bio and other parts of the
 888                 * bio caused I/O errors, or the block layer merged several
 889                 * read requests into one and the error is caused by a
 890                 * different bio (usually one of the latter two cases is
 891                 * the cause)
 892                 */
 893                spin_lock(&sctx->stat_lock);
 894                sctx->stat.unverified_errors++;
 895                spin_unlock(&sctx->stat_lock);
 896
 897                if (sctx->is_dev_replace)
 898                        scrub_write_block_to_dev_replace(sblock_bad);
 899                goto out;
 900        }
 901
 902        if (!sblock_bad->no_io_error_seen) {
 903                spin_lock(&sctx->stat_lock);
 904                sctx->stat.read_errors++;
 905                spin_unlock(&sctx->stat_lock);
 906                if (__ratelimit(&_rs))
 907                        scrub_print_warning("i/o error", sblock_to_check);
 908                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 909        } else if (sblock_bad->checksum_error) {
 910                spin_lock(&sctx->stat_lock);
 911                sctx->stat.csum_errors++;
 912                spin_unlock(&sctx->stat_lock);
 913                if (__ratelimit(&_rs))
 914                        scrub_print_warning("checksum error", sblock_to_check);
 915                btrfs_dev_stat_inc_and_print(dev,
 916                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
 917        } else if (sblock_bad->header_error) {
 918                spin_lock(&sctx->stat_lock);
 919                sctx->stat.verify_errors++;
 920                spin_unlock(&sctx->stat_lock);
 921                if (__ratelimit(&_rs))
 922                        scrub_print_warning("checksum/header error",
 923                                            sblock_to_check);
 924                if (sblock_bad->generation_error)
 925                        btrfs_dev_stat_inc_and_print(dev,
 926                                BTRFS_DEV_STAT_GENERATION_ERRS);
 927                else
 928                        btrfs_dev_stat_inc_and_print(dev,
 929                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
 930        }
 931
 932        if (sctx->readonly && !sctx->is_dev_replace)
 933                goto did_not_correct_error;
 934
 935        if (!is_metadata && !have_csum) {
 936                struct scrub_fixup_nodatasum *fixup_nodatasum;
 937
 938nodatasum_case:
 939                WARN_ON(sctx->is_dev_replace);
 940
 941                /*
 942                 * !is_metadata and !have_csum mean that the data
 943                 * might not be COW'ed and might be modified
 944                 * concurrently. The general strategy of working on the
 945                 * commit root does not help in the case when COW is not
 946                 * used.
 947                 */
 948                fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 949                if (!fixup_nodatasum)
 950                        goto did_not_correct_error;
 951                fixup_nodatasum->sctx = sctx;
 952                fixup_nodatasum->dev = dev;
 953                fixup_nodatasum->logical = logical;
 954                fixup_nodatasum->root = fs_info->extent_root;
 955                fixup_nodatasum->mirror_num = failed_mirror_index + 1;
 956                scrub_pending_trans_workers_inc(sctx);
 957                fixup_nodatasum->work.func = scrub_fixup_nodatasum;
 958                btrfs_queue_worker(&fs_info->scrub_workers,
 959                                   &fixup_nodatasum->work);
 960                goto out;
 961        }
 962
 963        /*
 964         * now build and submit the bios for the other mirrors, check
 965         * checksums.
 966         * First try to pick the mirror which is completely without I/O
 967         * errors and also does not have a checksum error.
 968         * If one is found, and if a checksum is present, the full block
 969         * that is known to contain an error is rewritten. Afterwards
 970         * the block is known to be corrected.
 971         * If a mirror is found which is completely correct, and no
 972         * checksum is present, only those pages are rewritten that had
 973         * an I/O error in the block to be repaired, since it cannot be
 974         * determined, which copy of the other pages is better (and it
 975         * could happen otherwise that a correct page would be
 976         * overwritten by a bad one).
 977         */
 978        for (mirror_index = 0;
 979             mirror_index < BTRFS_MAX_MIRRORS &&
 980             sblocks_for_recheck[mirror_index].page_count > 0;
 981             mirror_index++) {
 982                struct scrub_block *sblock_other;
 983
 984                if (mirror_index == failed_mirror_index)
 985                        continue;
 986                sblock_other = sblocks_for_recheck + mirror_index;
 987
 988                /* build and submit the bios, check checksums */
 989                scrub_recheck_block(fs_info, sblock_other, is_metadata,
 990                                    have_csum, csum, generation,
 991                                    sctx->csum_size);
 992
 993                if (!sblock_other->header_error &&
 994                    !sblock_other->checksum_error &&
 995                    sblock_other->no_io_error_seen) {
 996                        if (sctx->is_dev_replace) {
 997                                scrub_write_block_to_dev_replace(sblock_other);
 998                        } else {
 999                                int force_write = is_metadata || have_csum;
1000
1001                                ret = scrub_repair_block_from_good_copy(
1002                                                sblock_bad, sblock_other,
1003                                                force_write);
1004                        }
1005                        if (0 == ret)
1006                                goto corrected_error;
1007                }
1008        }
1009
1010        /*
1011         * for dev_replace, pick good pages and write to the target device.
1012         */
1013        if (sctx->is_dev_replace) {
1014                success = 1;
1015                for (page_num = 0; page_num < sblock_bad->page_count;
1016                     page_num++) {
1017                        int sub_success;
1018
1019                        sub_success = 0;
1020                        for (mirror_index = 0;
1021                             mirror_index < BTRFS_MAX_MIRRORS &&
1022                             sblocks_for_recheck[mirror_index].page_count > 0;
1023                             mirror_index++) {
1024                                struct scrub_block *sblock_other =
1025                                        sblocks_for_recheck + mirror_index;
1026                                struct scrub_page *page_other =
1027                                        sblock_other->pagev[page_num];
1028
1029                                if (!page_other->io_error) {
1030                                        ret = scrub_write_page_to_dev_replace(
1031                                                        sblock_other, page_num);
1032                                        if (ret == 0) {
1033                                                /* succeeded for this page */
1034                                                sub_success = 1;
1035                                                break;
1036                                        } else {
1037                                                btrfs_dev_replace_stats_inc(
1038                                                        &sctx->dev_root->
1039                                                        fs_info->dev_replace.
1040                                                        num_write_errors);
1041                                        }
1042                                }
1043                        }
1044
1045                        if (!sub_success) {
1046                                /*
1047                                 * did not find a mirror to fetch the page
1048                                 * from. scrub_write_page_to_dev_replace()
1049                                 * handles this case (page->io_error) by
1050                                 * filling the block with zeros before
1051                                 * submitting the write request
1052                                 */
1053                                success = 0;
1054                                ret = scrub_write_page_to_dev_replace(
1055                                                sblock_bad, page_num);
1056                                if (ret)
1057                                        btrfs_dev_replace_stats_inc(
1058                                                &sctx->dev_root->fs_info->
1059                                                dev_replace.num_write_errors);
1060                        }
1061                }
1062
1063                goto out;
1064        }
1065
1066        /*
1067         * for regular scrub, repair those pages that are errored.
1068         * In case of I/O errors in the area that is supposed to be
1069         * repaired, continue by picking good copies of those pages.
1070         * Select the good pages from mirrors to rewrite bad pages from
1071         * the area to fix. Afterwards verify the checksum of the block
1072         * that is supposed to be repaired. This verification step is
1073         * only done for the purpose of statistics counting and for the
1074         * final scrub report, to show whether errors remain.
1075         * A perfect algorithm could make use of the checksum and try
1076         * all possible combinations of pages from the different mirrors
1077         * until the checksum verification succeeds. For example, when
1078         * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
1079         * of mirror #2 is readable but the final checksum test fails,
1080         * then the 2nd page of mirror #3 could be tried to see whether
1081         * the final checksum now succeeds. But this would be a rare
1082         * exception and is therefore not implemented. At least the
1083         * good copy is never overwritten.
1084         * A more useful improvement would be to pick the sectors
1085         * without I/O error based on sector sizes (512 bytes on legacy
1086         * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
1087         * mirror could be repaired by taking 512 bytes of a different
1088         * mirror, even if other 512-byte sectors in the same PAGE_SIZE
1089         * area are unreadable.
1090         */
1091
1092        /* can only fix I/O errors from here on */
1093        if (sblock_bad->no_io_error_seen)
1094                goto did_not_correct_error;
1095
1096        success = 1;
1097        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1098                struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1099
1100                if (!page_bad->io_error)
1101                        continue;
1102
1103                for (mirror_index = 0;
1104                     mirror_index < BTRFS_MAX_MIRRORS &&
1105                     sblocks_for_recheck[mirror_index].page_count > 0;
1106                     mirror_index++) {
1107                        struct scrub_block *sblock_other = sblocks_for_recheck +
1108                                                           mirror_index;
1109                        struct scrub_page *page_other = sblock_other->pagev[
1110                                                        page_num];
1111
1112                        if (!page_other->io_error) {
1113                                ret = scrub_repair_page_from_good_copy(
1114                                        sblock_bad, sblock_other, page_num, 0);
1115                                if (0 == ret) {
1116                                        page_bad->io_error = 0;
1117                                        break; /* succeeded for this page */
1118                                }
1119                        }
1120                }
1121
1122                if (page_bad->io_error) {
1123                        /* did not find a mirror to copy the page from */
1124                        success = 0;
1125                }
1126        }
1127
1128        if (success) {
1129                if (is_metadata || have_csum) {
1130                        /*
1131                         * need to verify the checksum now that all
1132                         * sectors on disk are repaired (the write
1133                         * request for data to be repaired is on its way).
1134                         * Just be lazy and use scrub_recheck_block()
1135                         * which re-reads the data before the checksum
1136                         * is verified, but most likely the data comes out
1137                         * of the page cache.
1138                         */
1139                        scrub_recheck_block(fs_info, sblock_bad,
1140                                            is_metadata, have_csum, csum,
1141                                            generation, sctx->csum_size);
1142                        if (!sblock_bad->header_error &&
1143                            !sblock_bad->checksum_error &&
1144                            sblock_bad->no_io_error_seen)
1145                                goto corrected_error;
1146                        else
1147                                goto did_not_correct_error;
1148                } else {
1149corrected_error:
1150                        spin_lock(&sctx->stat_lock);
1151                        sctx->stat.corrected_errors++;
1152                        spin_unlock(&sctx->stat_lock);
1153                        printk_ratelimited_in_rcu(KERN_ERR
1154                                "btrfs: fixed up error at logical %llu on dev %s\n",
1155                                (unsigned long long)logical,
1156                                rcu_str_deref(dev->name));
1157                }
1158        } else {
1159did_not_correct_error:
1160                spin_lock(&sctx->stat_lock);
1161                sctx->stat.uncorrectable_errors++;
1162                spin_unlock(&sctx->stat_lock);
1163                printk_ratelimited_in_rcu(KERN_ERR
1164                        "btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
1165                        (unsigned long long)logical,
1166                        rcu_str_deref(dev->name));
1167        }
1168
1169out:
1170        if (sblocks_for_recheck) {
1171                for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1172                     mirror_index++) {
1173                        struct scrub_block *sblock = sblocks_for_recheck +
1174                                                     mirror_index;
1175                        int page_index;
1176
1177                        for (page_index = 0; page_index < sblock->page_count;
1178                             page_index++) {
1179                                sblock->pagev[page_index]->sblock = NULL;
1180                                scrub_page_put(sblock->pagev[page_index]);
1181                        }
1182                }
1183                kfree(sblocks_for_recheck);
1184        }
1185
1186        return 0;
1187}
1188
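    /*
     * Allocate one scrub_block per mirror of the errored area and fill in
     * the per-page mapping (logical, physical, device, mirror number) by
     * calling btrfs_map_block() for each PAGE_SIZE piece. The resulting
     * blocks are what scrub_recheck_block() reads and verifies.
     */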
1189static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
1190                                     struct btrfs_fs_info *fs_info,
1191                                     struct scrub_block *original_sblock,
1192                                     u64 length, u64 logical,
1193                                     struct scrub_block *sblocks_for_recheck)
1194{
1195        int page_index;
1196        int mirror_index;
1197        int ret;
1198
1199        /*
1200         * note: the two members ref_count and outstanding_pages
1201         * are not used (and not set) in the blocks that are used for
1202         * the recheck procedure
1203         */
1204
1205        page_index = 0;
1206        while (length > 0) {
1207                u64 sublen = min_t(u64, length, PAGE_SIZE);
1208                u64 mapped_length = sublen;
1209                struct btrfs_bio *bbio = NULL;
1210
1211                /*
1212                 * with a length of PAGE_SIZE, each returned stripe
1213                 * represents one mirror
1214                 */
1215                ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
1216                                      &mapped_length, &bbio, 0);
1217                if (ret || !bbio || mapped_length < sublen) {
1218                        kfree(bbio);
1219                        return -EIO;
1220                }
1221
1222                BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1223                for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
1224                     mirror_index++) {
1225                        struct scrub_block *sblock;
1226                        struct scrub_page *page;
1227
1228                        if (mirror_index >= BTRFS_MAX_MIRRORS)
1229                                continue;
1230
1231                        sblock = sblocks_for_recheck + mirror_index;
1232                        sblock->sctx = sctx;
1233                        page = kzalloc(sizeof(*page), GFP_NOFS);
1234                        if (!page) {
1235leave_nomem:
1236                                spin_lock(&sctx->stat_lock);
1237                                sctx->stat.malloc_errors++;
1238                                spin_unlock(&sctx->stat_lock);
1239                                kfree(bbio);
1240                                return -ENOMEM;
1241                        }
1242                        scrub_page_get(page);
1243                        sblock->pagev[page_index] = page;
1244                        page->logical = logical;
1245                        page->physical = bbio->stripes[mirror_index].physical;
1246                        BUG_ON(page_index >= original_sblock->page_count);
1247                        page->physical_for_dev_replace =
1248                                original_sblock->pagev[page_index]->
1249                                physical_for_dev_replace;
1250                        /* for missing devices, dev->bdev is NULL */
1251                        page->dev = bbio->stripes[mirror_index].dev;
1252                        page->mirror_num = mirror_index + 1;
1253                        sblock->page_count++;
1254                        page->page = alloc_page(GFP_NOFS);
1255                        if (!page->page)
1256                                goto leave_nomem;
1257                }
1258                kfree(bbio);
1259                length -= sublen;
1260                logical += sublen;
1261                page_index++;
1262        }
1263
1264        return 0;
1265}
1266
1267/*
1268 * this function checks the on disk data for checksum errors, header
1269 * errors and read I/O errors. If any I/O error happens, the exact pages
1270 * that failed are marked as bad. The goal is to enable scrub to take the
1271 * pages that are not errored from all the mirrors, so that the pages that
1272 * are errored in the mirror just handled can be repaired.
1273 */
1274static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1275                                struct scrub_block *sblock, int is_metadata,
1276                                int have_csum, u8 *csum, u64 generation,
1277                                u16 csum_size)
1278{
1279        int page_num;
1280
1281        sblock->no_io_error_seen = 1;
1282        sblock->header_error = 0;
1283        sblock->checksum_error = 0;
1284
1285        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1286                struct bio *bio;
1287                struct scrub_page *page = sblock->pagev[page_num];
1288                DECLARE_COMPLETION_ONSTACK(complete);
1289
1290                if (page->dev->bdev == NULL) {
1291                        page->io_error = 1;
1292                        sblock->no_io_error_seen = 0;
1293                        continue;
1294                }
1295
1296                WARN_ON(!page->page);
1297                bio = bio_alloc(GFP_NOFS, 1);
1298                if (!bio) {
1299                        page->io_error = 1;
1300                        sblock->no_io_error_seen = 0;
1301                        continue;
1302                }
1303                bio->bi_bdev = page->dev->bdev;
1304                bio->bi_sector = page->physical >> 9;
1305                bio->bi_end_io = scrub_complete_bio_end_io;
1306                bio->bi_private = &complete;
1307
1308                bio_add_page(bio, page->page, PAGE_SIZE, 0);
1309                btrfsic_submit_bio(READ, bio);
1310
1311                /* this will also unplug the queue */
1312                wait_for_completion(&complete);
1313
1314                page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
1315                if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
1316                        sblock->no_io_error_seen = 0;
1317                bio_put(bio);
1318        }
1319
1320        if (sblock->no_io_error_seen)
1321                scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
1322                                             have_csum, csum, generation,
1323                                             csum_size);
1324
1325        return;
1326}
1327
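/*
 * Recompute the checksum of the block from the pages in memory and compare
 * it to the expected value. For metadata, the expected csum is taken from
 * the header, and bytenr, fsid, chunk tree uuid and generation are verified
 * as well; for data, the csum passed in by the caller is used (if any).
 */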
1328static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1329                                         struct scrub_block *sblock,
1330                                         int is_metadata, int have_csum,
1331                                         const u8 *csum, u64 generation,
1332                                         u16 csum_size)
1333{
1334        int page_num;
1335        u8 calculated_csum[BTRFS_CSUM_SIZE];
1336        u32 crc = ~(u32)0;
1337        struct btrfs_root *root = fs_info->extent_root;
1338        void *mapped_buffer;
1339
1340        WARN_ON(!sblock->pagev[0]->page);
1341        if (is_metadata) {
1342                struct btrfs_header *h;
1343
1344                mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1345                h = (struct btrfs_header *)mapped_buffer;
1346
1347                if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
1348                    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
1349                    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1350                           BTRFS_UUID_SIZE)) {
1351                        sblock->header_error = 1;
1352                } else if (generation != le64_to_cpu(h->generation)) {
1353                        sblock->header_error = 1;
1354                        sblock->generation_error = 1;
1355                }
1356                csum = h->csum;
1357        } else {
1358                if (!have_csum)
1359                        return;
1360
1361                mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
1362        }
1363
1364        for (page_num = 0;;) {
1365                if (page_num == 0 && is_metadata)
1366                        crc = btrfs_csum_data(root,
1367                                ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
1368                                crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
1369                else
1370                        crc = btrfs_csum_data(root, mapped_buffer, crc,
1371                                              PAGE_SIZE);
1372
1373                kunmap_atomic(mapped_buffer);
1374                page_num++;
1375                if (page_num >= sblock->page_count)
1376                        break;
1377                WARN_ON(!sblock->pagev[page_num]->page);
1378
1379                mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
1380        }
1381
1382        btrfs_csum_final(crc, calculated_csum);
1383        if (memcmp(calculated_csum, csum, csum_size))
1384                sblock->checksum_error = 1;
1385}
1386
1387static void scrub_complete_bio_end_io(struct bio *bio, int err)
1388{
1389        complete((struct completion *)bio->bi_private);
1390}
1391
1392static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1393                                             struct scrub_block *sblock_good,
1394                                             int force_write)
1395{
1396        int page_num;
1397        int ret = 0;
1398
1399        for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
1400                int ret_sub;
1401
1402                ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1403                                                           sblock_good,
1404                                                           page_num,
1405                                                           force_write);
1406                if (ret_sub)
1407                        ret = ret_sub;
1408        }
1409
1410        return ret;
1411}
1412
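/*
 * Rewrite a single page of the bad mirror with the corresponding page of
 * the good mirror. The write is only issued if it is forced or if the bad
 * block shows a header, checksum or I/O error; it is submitted synchronously
 * and write failures are accounted in the device and dev-replace statistics.
 */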
1413static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
1414                                            struct scrub_block *sblock_good,
1415                                            int page_num, int force_write)
1416{
1417        struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1418        struct scrub_page *page_good = sblock_good->pagev[page_num];
1419
1420        BUG_ON(page_bad->page == NULL);
1421        BUG_ON(page_good->page == NULL);
1422        if (force_write || sblock_bad->header_error ||
1423            sblock_bad->checksum_error || page_bad->io_error) {
1424                struct bio *bio;
1425                int ret;
1426                DECLARE_COMPLETION_ONSTACK(complete);
1427
1428                if (!page_bad->dev->bdev) {
1429                        printk_ratelimited(KERN_WARNING
1430                                "btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
1431                        return -EIO;
1432                }
1433
1434                bio = bio_alloc(GFP_NOFS, 1);
1435                if (!bio)
1436                        return -EIO;
1437                bio->bi_bdev = page_bad->dev->bdev;
1438                bio->bi_sector = page_bad->physical >> 9;
1439                bio->bi_end_io = scrub_complete_bio_end_io;
1440                bio->bi_private = &complete;
1441
1442                ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
1443                if (PAGE_SIZE != ret) {
1444                        bio_put(bio);
1445                        return -EIO;
1446                }
1447                btrfsic_submit_bio(WRITE, bio);
1448
1449                /* this will also unplug the queue */
1450                wait_for_completion(&complete);
1451                if (!bio_flagged(bio, BIO_UPTODATE)) {
1452                        btrfs_dev_stat_inc_and_print(page_bad->dev,
1453                                BTRFS_DEV_STAT_WRITE_ERRS);
1454                        btrfs_dev_replace_stats_inc(
1455                                &sblock_bad->sctx->dev_root->fs_info->
1456                                dev_replace.num_write_errors);
1457                        bio_put(bio);
1458                        return -EIO;
1459                }
1460                bio_put(bio);
1461        }
1462
1463        return 0;
1464}
1465
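/*
 * Queue all pages of the block for writing to the dev-replace target device.
 * Pages that could not be read are zeroed out in
 * scrub_write_page_to_dev_replace() so that no stale data is copied; write
 * errors are counted in the dev-replace statistics.
 */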
1466static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1467{
1468        int page_num;
1469
1470        for (page_num = 0; page_num < sblock->page_count; page_num++) {
1471                int ret;
1472
1473                ret = scrub_write_page_to_dev_replace(sblock, page_num);
1474                if (ret)
1475                        btrfs_dev_replace_stats_inc(
1476                                &sblock->sctx->dev_root->fs_info->dev_replace.
1477                                num_write_errors);
1478        }
1479}
1480
1481static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
1482                                           int page_num)
1483{
1484        struct scrub_page *spage = sblock->pagev[page_num];
1485
1486        BUG_ON(spage->page == NULL);
1487        if (spage->io_error) {
1488                void *mapped_buffer = kmap_atomic(spage->page);
1489
1490                memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
1491                flush_dcache_page(spage->page);
1492                kunmap_atomic(mapped_buffer);
1493        }
1494        return scrub_add_page_to_wr_bio(sblock->sctx, spage);
1495}
1496
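/*
 * Add one page to the current write bio for the dev-replace target. Pages
 * are batched as long as they are physically and logically contiguous with
 * the pages already in the bio; otherwise the bio is submitted and a new one
 * is started. A full bio (pages_per_wr_bio pages) is submitted immediately.
 */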
1497static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
1498                                    struct scrub_page *spage)
1499{
1500        struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1501        struct scrub_bio *sbio;
1502        int ret;
1503
1504        mutex_lock(&wr_ctx->wr_lock);
1505again:
1506        if (!wr_ctx->wr_curr_bio) {
1507                wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
1508                                              GFP_NOFS);
1509                if (!wr_ctx->wr_curr_bio) {
1510                        mutex_unlock(&wr_ctx->wr_lock);
1511                        return -ENOMEM;
1512                }
1513                wr_ctx->wr_curr_bio->sctx = sctx;
1514                wr_ctx->wr_curr_bio->page_count = 0;
1515        }
1516        sbio = wr_ctx->wr_curr_bio;
1517        if (sbio->page_count == 0) {
1518                struct bio *bio;
1519
1520                sbio->physical = spage->physical_for_dev_replace;
1521                sbio->logical = spage->logical;
1522                sbio->dev = wr_ctx->tgtdev;
1523                bio = sbio->bio;
1524                if (!bio) {
1525                        bio = bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
1526                        if (!bio) {
1527                                mutex_unlock(&wr_ctx->wr_lock);
1528                                return -ENOMEM;
1529                        }
1530                        sbio->bio = bio;
1531                }
1532
1533                bio->bi_private = sbio;
1534                bio->bi_end_io = scrub_wr_bio_end_io;
1535                bio->bi_bdev = sbio->dev->bdev;
1536                bio->bi_sector = sbio->physical >> 9;
1537                sbio->err = 0;
1538        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1539                   spage->physical_for_dev_replace ||
1540                   sbio->logical + sbio->page_count * PAGE_SIZE !=
1541                   spage->logical) {
1542                scrub_wr_submit(sctx);
1543                goto again;
1544        }
1545
1546        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1547        if (ret != PAGE_SIZE) {
1548                if (sbio->page_count < 1) {
1549                        bio_put(sbio->bio);
1550                        sbio->bio = NULL;
1551                        mutex_unlock(&wr_ctx->wr_lock);
1552                        return -EIO;
1553                }
1554                scrub_wr_submit(sctx);
1555                goto again;
1556        }
1557
1558        sbio->pagev[sbio->page_count] = spage;
1559        scrub_page_get(spage);
1560        sbio->page_count++;
1561        if (sbio->page_count == wr_ctx->pages_per_wr_bio)
1562                scrub_wr_submit(sctx);
1563        mutex_unlock(&wr_ctx->wr_lock);
1564
1565        return 0;
1566}
1567
1568static void scrub_wr_submit(struct scrub_ctx *sctx)
1569{
1570        struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
1571        struct scrub_bio *sbio;
1572
1573        if (!wr_ctx->wr_curr_bio)
1574                return;
1575
1576        sbio = wr_ctx->wr_curr_bio;
1577        wr_ctx->wr_curr_bio = NULL;
1578        WARN_ON(!sbio->bio->bi_bdev);
1579        scrub_pending_bio_inc(sctx);
1580        /* process all writes in a single worker thread. Then the block layer
1581         * orders the requests before sending them to the driver, which
1582         * doubled the write performance on spinning disks when measured
1583         * with Linux 3.5 */
1584        btrfsic_submit_bio(WRITE, sbio->bio);
1585}
1586
1587static void scrub_wr_bio_end_io(struct bio *bio, int err)
1588{
1589        struct scrub_bio *sbio = bio->bi_private;
1590        struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
1591
1592        sbio->err = err;
1593        sbio->bio = bio;
1594
1595        sbio->work.func = scrub_wr_bio_end_io_worker;
1596        btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
1597}
1598
1599static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
1600{
1601        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1602        struct scrub_ctx *sctx = sbio->sctx;
1603        int i;
1604
1605        WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
1606        if (sbio->err) {
1607                struct btrfs_dev_replace *dev_replace =
1608                        &sbio->sctx->dev_root->fs_info->dev_replace;
1609
1610                for (i = 0; i < sbio->page_count; i++) {
1611                        struct scrub_page *spage = sbio->pagev[i];
1612
1613                        spage->io_error = 1;
1614                        btrfs_dev_replace_stats_inc(&dev_replace->
1615                                                    num_write_errors);
1616                }
1617        }
1618
1619        for (i = 0; i < sbio->page_count; i++)
1620                scrub_page_put(sbio->pagev[i]);
1621
1622        bio_put(sbio->bio);
1623        kfree(sbio);
1624        scrub_pending_bio_dec(sctx);
1625}
1626
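/*
 * Verify the checksum of a completely read block. Depending on the extent
 * flags this dispatches to the data, tree block or super block verifier.
 * A super block error is only counted, it does not trigger a repair; for
 * data and metadata a non-zero result starts scrub_handle_errored_block().
 */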
1627static int scrub_checksum(struct scrub_block *sblock)
1628{
1629        u64 flags;
1630        int ret;
1631
1632        WARN_ON(sblock->page_count < 1);
1633        flags = sblock->pagev[0]->flags;
1634        ret = 0;
1635        if (flags & BTRFS_EXTENT_FLAG_DATA)
1636                ret = scrub_checksum_data(sblock);
1637        else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1638                ret = scrub_checksum_tree_block(sblock);
1639        else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1640                (void)scrub_checksum_super(sblock);
1641        else
1642                WARN_ON(1);
1643        if (ret)
1644                scrub_handle_errored_block(sblock);
1645
1646        return ret;
1647}
1648
1649static int scrub_checksum_data(struct scrub_block *sblock)
1650{
1651        struct scrub_ctx *sctx = sblock->sctx;
1652        u8 csum[BTRFS_CSUM_SIZE];
1653        u8 *on_disk_csum;
1654        struct page *page;
1655        void *buffer;
1656        u32 crc = ~(u32)0;
1657        int fail = 0;
1658        struct btrfs_root *root = sctx->dev_root;
1659        u64 len;
1660        int index;
1661
1662        BUG_ON(sblock->page_count < 1);
1663        if (!sblock->pagev[0]->have_csum)
1664                return 0;
1665
1666        on_disk_csum = sblock->pagev[0]->csum;
1667        page = sblock->pagev[0]->page;
1668        buffer = kmap_atomic(page);
1669
1670        len = sctx->sectorsize;
1671        index = 0;
1672        for (;;) {
1673                u64 l = min_t(u64, len, PAGE_SIZE);
1674
1675                crc = btrfs_csum_data(root, buffer, crc, l);
1676                kunmap_atomic(buffer);
1677                len -= l;
1678                if (len == 0)
1679                        break;
1680                index++;
1681                BUG_ON(index >= sblock->page_count);
1682                BUG_ON(!sblock->pagev[index]->page);
1683                page = sblock->pagev[index]->page;
1684                buffer = kmap_atomic(page);
1685        }
1686
1687        btrfs_csum_final(crc, csum);
1688        if (memcmp(csum, on_disk_csum, sctx->csum_size))
1689                fail = 1;
1690
1691        return fail;
1692}
1693
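/*
 * Verify a tree block: check bytenr, generation, fsid and chunk tree uuid
 * in the header against the expected values, then recompute the crc32c over
 * the node (excluding the csum area) page by page and compare it with the
 * csum stored in the header.
 */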
1694static int scrub_checksum_tree_block(struct scrub_block *sblock)
1695{
1696        struct scrub_ctx *sctx = sblock->sctx;
1697        struct btrfs_header *h;
1698        struct btrfs_root *root = sctx->dev_root;
1699        struct btrfs_fs_info *fs_info = root->fs_info;
1700        u8 calculated_csum[BTRFS_CSUM_SIZE];
1701        u8 on_disk_csum[BTRFS_CSUM_SIZE];
1702        struct page *page;
1703        void *mapped_buffer;
1704        u64 mapped_size;
1705        void *p;
1706        u32 crc = ~(u32)0;
1707        int fail = 0;
1708        int crc_fail = 0;
1709        u64 len;
1710        int index;
1711
1712        BUG_ON(sblock->page_count < 1);
1713        page = sblock->pagev[0]->page;
1714        mapped_buffer = kmap_atomic(page);
1715        h = (struct btrfs_header *)mapped_buffer;
1716        memcpy(on_disk_csum, h->csum, sctx->csum_size);
1717
1718        /*
1719         * we don't use the getter functions here, as we
1720         * a) don't have an extent buffer and
1721         * b) the page is already kmapped
1722         */
1723
1724        if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
1725                ++fail;
1726
1727        if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
1728                ++fail;
1729
1730        if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1731                ++fail;
1732
1733        if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1734                   BTRFS_UUID_SIZE))
1735                ++fail;
1736
1737        WARN_ON(sctx->nodesize != sctx->leafsize);
1738        len = sctx->nodesize - BTRFS_CSUM_SIZE;
1739        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1740        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1741        index = 0;
1742        for (;;) {
1743                u64 l = min_t(u64, len, mapped_size);
1744
1745                crc = btrfs_csum_data(root, p, crc, l);
1746                kunmap_atomic(mapped_buffer);
1747                len -= l;
1748                if (len == 0)
1749                        break;
1750                index++;
1751                BUG_ON(index >= sblock->page_count);
1752                BUG_ON(!sblock->pagev[index]->page);
1753                page = sblock->pagev[index]->page;
1754                mapped_buffer = kmap_atomic(page);
1755                mapped_size = PAGE_SIZE;
1756                p = mapped_buffer;
1757        }
1758
1759        btrfs_csum_final(crc, calculated_csum);
1760        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1761                ++crc_fail;
1762
1763        return fail || crc_fail;
1764}
1765
1766static int scrub_checksum_super(struct scrub_block *sblock)
1767{
1768        struct btrfs_super_block *s;
1769        struct scrub_ctx *sctx = sblock->sctx;
1770        struct btrfs_root *root = sctx->dev_root;
1771        struct btrfs_fs_info *fs_info = root->fs_info;
1772        u8 calculated_csum[BTRFS_CSUM_SIZE];
1773        u8 on_disk_csum[BTRFS_CSUM_SIZE];
1774        struct page *page;
1775        void *mapped_buffer;
1776        u64 mapped_size;
1777        void *p;
1778        u32 crc = ~(u32)0;
1779        int fail_gen = 0;
1780        int fail_cor = 0;
1781        u64 len;
1782        int index;
1783
1784        BUG_ON(sblock->page_count < 1);
1785        page = sblock->pagev[0]->page;
1786        mapped_buffer = kmap_atomic(page);
1787        s = (struct btrfs_super_block *)mapped_buffer;
1788        memcpy(on_disk_csum, s->csum, sctx->csum_size);
1789
1790        if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
1791                ++fail_cor;
1792
1793        if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
1794                ++fail_gen;
1795
1796        if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
1797                ++fail_cor;
1798
1799        len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
1800        mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
1801        p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
1802        index = 0;
1803        for (;;) {
1804                u64 l = min_t(u64, len, mapped_size);
1805
1806                crc = btrfs_csum_data(root, p, crc, l);
1807                kunmap_atomic(mapped_buffer);
1808                len -= l;
1809                if (len == 0)
1810                        break;
1811                index++;
1812                BUG_ON(index >= sblock->page_count);
1813                BUG_ON(!sblock->pagev[index]->page);
1814                page = sblock->pagev[index]->page;
1815                mapped_buffer = kmap_atomic(page);
1816                mapped_size = PAGE_SIZE;
1817                p = mapped_buffer;
1818        }
1819
1820        btrfs_csum_final(crc, calculated_csum);
1821        if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
1822                ++fail_cor;
1823
1824        if (fail_cor + fail_gen) {
1825                /*
1826                 * if we find an error in a super block, we just report it.
1827                 * They will get written with the next transaction commit
1828                 * anyway
1829                 */
1830                spin_lock(&sctx->stat_lock);
1831                ++sctx->stat.super_errors;
1832                spin_unlock(&sctx->stat_lock);
1833                if (fail_cor)
1834                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1835                                BTRFS_DEV_STAT_CORRUPTION_ERRS);
1836                else
1837                        btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
1838                                BTRFS_DEV_STAT_GENERATION_ERRS);
1839        }
1840
1841        return fail_cor + fail_gen;
1842}
1843
1844static void scrub_block_get(struct scrub_block *sblock)
1845{
1846        atomic_inc(&sblock->ref_count);
1847}
1848
1849static void scrub_block_put(struct scrub_block *sblock)
1850{
1851        if (atomic_dec_and_test(&sblock->ref_count)) {
1852                int i;
1853
1854                for (i = 0; i < sblock->page_count; i++)
1855                        scrub_page_put(sblock->pagev[i]);
1856                kfree(sblock);
1857        }
1858}
1859
1860static void scrub_page_get(struct scrub_page *spage)
1861{
1862        atomic_inc(&spage->ref_count);
1863}
1864
1865static void scrub_page_put(struct scrub_page *spage)
1866{
1867        if (atomic_dec_and_test(&spage->ref_count)) {
1868                if (spage->page)
1869                        __free_page(spage->page);
1870                kfree(spage);
1871        }
1872}
1873
1874static void scrub_submit(struct scrub_ctx *sctx)
1875{
1876        struct scrub_bio *sbio;
1877
1878        if (sctx->curr == -1)
1879                return;
1880
1881        sbio = sctx->bios[sctx->curr];
1882        sctx->curr = -1;
1883        scrub_pending_bio_inc(sctx);
1884
1885        if (!sbio->bio->bi_bdev) {
1886                /*
1887                 * this case should not happen. If btrfs_map_block() is
1888                 * wrong, it could happen for dev-replace operations on
1889                 * missing devices when no mirrors are available, but in
1890                 * this case the mount should already have failed.
1891                 * This case is handled correctly here (but _very_ slowly).
1892                 */
1893                printk_ratelimited(KERN_WARNING
1894                        "btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
1895                bio_endio(sbio->bio, -EIO);
1896        } else {
1897                btrfsic_submit_bio(READ, sbio->bio);
1898        }
1899}
1900
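/*
 * Add one page to the current read bio. Like on the write side, pages are
 * batched only while they stay physically and logically contiguous and
 * belong to the same device; otherwise the bio is submitted and the page is
 * retried with a fresh bio. If no scrub_bio is free, wait until one of the
 * in-flight bios completes.
 */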
1901static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
1902                                    struct scrub_page *spage)
1903{
1904        struct scrub_block *sblock = spage->sblock;
1905        struct scrub_bio *sbio;
1906        int ret;
1907
1908again:
1909        /*
1910         * grab a fresh bio or wait for one to become available
1911         */
1912        while (sctx->curr == -1) {
1913                spin_lock(&sctx->list_lock);
1914                sctx->curr = sctx->first_free;
1915                if (sctx->curr != -1) {
1916                        sctx->first_free = sctx->bios[sctx->curr]->next_free;
1917                        sctx->bios[sctx->curr]->next_free = -1;
1918                        sctx->bios[sctx->curr]->page_count = 0;
1919                        spin_unlock(&sctx->list_lock);
1920                } else {
1921                        spin_unlock(&sctx->list_lock);
1922                        wait_event(sctx->list_wait, sctx->first_free != -1);
1923                }
1924        }
1925        sbio = sctx->bios[sctx->curr];
1926        if (sbio->page_count == 0) {
1927                struct bio *bio;
1928
1929                sbio->physical = spage->physical;
1930                sbio->logical = spage->logical;
1931                sbio->dev = spage->dev;
1932                bio = sbio->bio;
1933                if (!bio) {
1934                        bio = bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
1935                        if (!bio)
1936                                return -ENOMEM;
1937                        sbio->bio = bio;
1938                }
1939
1940                bio->bi_private = sbio;
1941                bio->bi_end_io = scrub_bio_end_io;
1942                bio->bi_bdev = sbio->dev->bdev;
1943                bio->bi_sector = sbio->physical >> 9;
1944                sbio->err = 0;
1945        } else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
1946                   spage->physical ||
1947                   sbio->logical + sbio->page_count * PAGE_SIZE !=
1948                   spage->logical ||
1949                   sbio->dev != spage->dev) {
1950                scrub_submit(sctx);
1951                goto again;
1952        }
1953
1954        sbio->pagev[sbio->page_count] = spage;
1955        ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
1956        if (ret != PAGE_SIZE) {
1957                if (sbio->page_count < 1) {
1958                        bio_put(sbio->bio);
1959                        sbio->bio = NULL;
1960                        return -EIO;
1961                }
1962                scrub_submit(sctx);
1963                goto again;
1964        }
1965
1966        scrub_block_get(sblock); /* one for the page added to the bio */
1967        atomic_inc(&sblock->outstanding_pages);
1968        sbio->page_count++;
1969        if (sbio->page_count == sctx->pages_per_rd_bio)
1970                scrub_submit(sctx);
1971
1972        return 0;
1973}
1974
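/*
 * Create a scrub_block that covers [logical, logical + len), allocate one
 * scrub_page (and one page) per PAGE_SIZE chunk and queue every page into a
 * read bio. The csum, if any, is attached to each page. With force set, the
 * current read bio is submitted immediately instead of waiting for it to
 * fill up.
 */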
1975static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
1976                       u64 physical, struct btrfs_device *dev, u64 flags,
1977                       u64 gen, int mirror_num, u8 *csum, int force,
1978                       u64 physical_for_dev_replace)
1979{
1980        struct scrub_block *sblock;
1981        int index;
1982
1983        sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
1984        if (!sblock) {
1985                spin_lock(&sctx->stat_lock);
1986                sctx->stat.malloc_errors++;
1987                spin_unlock(&sctx->stat_lock);
1988                return -ENOMEM;
1989        }
1990
1991        /* one ref inside this function, plus one for each page added to
1992         * a bio later on */
1993        atomic_set(&sblock->ref_count, 1);
1994        sblock->sctx = sctx;
1995        sblock->no_io_error_seen = 1;
1996
1997        for (index = 0; len > 0; index++) {
1998                struct scrub_page *spage;
1999                u64 l = min_t(u64, len, PAGE_SIZE);
2000
2001                spage = kzalloc(sizeof(*spage), GFP_NOFS);
2002                if (!spage) {
2003leave_nomem:
2004                        spin_lock(&sctx->stat_lock);
2005                        sctx->stat.malloc_errors++;
2006                        spin_unlock(&sctx->stat_lock);
2007                        scrub_block_put(sblock);
2008                        return -ENOMEM;
2009                }
2010                BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
2011                scrub_page_get(spage);
2012                sblock->pagev[index] = spage;
2013                spage->sblock = sblock;
2014                spage->dev = dev;
2015                spage->flags = flags;
2016                spage->generation = gen;
2017                spage->logical = logical;
2018                spage->physical = physical;
2019                spage->physical_for_dev_replace = physical_for_dev_replace;
2020                spage->mirror_num = mirror_num;
2021                if (csum) {
2022                        spage->have_csum = 1;
2023                        memcpy(spage->csum, csum, sctx->csum_size);
2024                } else {
2025                        spage->have_csum = 0;
2026                }
2027                sblock->page_count++;
2028                spage->page = alloc_page(GFP_NOFS);
2029                if (!spage->page)
2030                        goto leave_nomem;
2031                len -= l;
2032                logical += l;
2033                physical += l;
2034                physical_for_dev_replace += l;
2035        }
2036
2037        WARN_ON(sblock->page_count == 0);
2038        for (index = 0; index < sblock->page_count; index++) {
2039                struct scrub_page *spage = sblock->pagev[index];
2040                int ret;
2041
2042                ret = scrub_add_page_to_rd_bio(sctx, spage);
2043                if (ret) {
2044                        scrub_block_put(sblock);
2045                        return ret;
2046                }
2047        }
2048
2049        if (force)
2050                scrub_submit(sctx);
2051
2052        /* last one frees, either here or in bio completion for last page */
2053        scrub_block_put(sblock);
2054        return 0;
2055}
2056
2057static void scrub_bio_end_io(struct bio *bio, int err)
2058{
2059        struct scrub_bio *sbio = bio->bi_private;
2060        struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
2061
2062        sbio->err = err;
2063        sbio->bio = bio;
2064
2065        btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
2066}
2067
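/*
 * Read bio completion, run in a worker thread. On error, all pages of the
 * bio are marked as failed. Every scrub_block whose last outstanding page
 * completed here is finished via scrub_block_complete(), the scrub_bio is
 * put back on the free list, and in the dev-replace case pending writes are
 * flushed when flush_all_writes is set.
 */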
2068static void scrub_bio_end_io_worker(struct btrfs_work *work)
2069{
2070        struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2071        struct scrub_ctx *sctx = sbio->sctx;
2072        int i;
2073
2074        BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
2075        if (sbio->err) {
2076                for (i = 0; i < sbio->page_count; i++) {
2077                        struct scrub_page *spage = sbio->pagev[i];
2078
2079                        spage->io_error = 1;
2080                        spage->sblock->no_io_error_seen = 0;
2081                }
2082        }
2083
2084        /* now complete the scrub_block items that have all pages completed */
2085        for (i = 0; i < sbio->page_count; i++) {
2086                struct scrub_page *spage = sbio->pagev[i];
2087                struct scrub_block *sblock = spage->sblock;
2088
2089                if (atomic_dec_and_test(&sblock->outstanding_pages))
2090                        scrub_block_complete(sblock);
2091                scrub_block_put(sblock);
2092        }
2093
2094        bio_put(sbio->bio);
2095        sbio->bio = NULL;
2096        spin_lock(&sctx->list_lock);
2097        sbio->next_free = sctx->first_free;
2098        sctx->first_free = sbio->index;
2099        spin_unlock(&sctx->list_lock);
2100
2101        if (sctx->is_dev_replace &&
2102            atomic_read(&sctx->wr_ctx.flush_all_writes)) {
2103                mutex_lock(&sctx->wr_ctx.wr_lock);
2104                scrub_wr_submit(sctx);
2105                mutex_unlock(&sctx->wr_ctx.wr_lock);
2106        }
2107
2108        scrub_pending_bio_dec(sctx);
2109}
2110
2111static void scrub_block_complete(struct scrub_block *sblock)
2112{
2113        if (!sblock->no_io_error_seen) {
2114                scrub_handle_errored_block(sblock);
2115        } else {
2116                /*
2117                 * in the dev replace case: if the block has a checksum
2118                 * error, it is written via the repair mechanism, otherwise
2119                 * it is written to the target device here.
2120                 */
2121                if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
2122                        scrub_write_block_to_dev_replace(sblock);
2123        }
2124}
2125
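/*
 * Look up the checksum for one data sector in the csum_list that was filled
 * by btrfs_lookup_csums_range(). Sums that end at or before the requested
 * logical address are dropped and counted as csum_discards; returns 1 and
 * copies the csum if it was found, 0 otherwise.
 */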
2126static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
2127                           u8 *csum)
2128{
2129        struct btrfs_ordered_sum *sum = NULL;
2130        int ret = 0;
2131        unsigned long i;
2132        unsigned long num_sectors;
2133
2134        while (!list_empty(&sctx->csum_list)) {
2135                sum = list_first_entry(&sctx->csum_list,
2136                                       struct btrfs_ordered_sum, list);
2137                if (sum->bytenr > logical)
2138                        return 0;
2139                if (sum->bytenr + sum->len > logical)
2140                        break;
2141
2142                ++sctx->stat.csum_discards;
2143                list_del(&sum->list);
2144                kfree(sum);
2145                sum = NULL;
2146        }
2147        if (!sum)
2148                return 0;
2149
2150        num_sectors = sum->len / sctx->sectorsize;
2151        for (i = 0; i < num_sectors; ++i) {
2152                if (sum->sums[i].bytenr == logical) {
2153                        memcpy(csum, &sum->sums[i].sum, sctx->csum_size);
2154                        ret = 1;
2155                        break;
2156                }
2157        }
2158        if (ret && i == num_sectors - 1) {
2159                list_del(&sum->list);
2160                kfree(sum);
2161        }
2162        return ret;
2163}
2164
2165/* scrub_extent() tries to collect up to 64 kB for each bio */
2166static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
2167                        u64 physical, struct btrfs_device *dev, u64 flags,
2168                        u64 gen, int mirror_num, u64 physical_for_dev_replace)
2169{
2170        int ret;
2171        u8 csum[BTRFS_CSUM_SIZE];
2172        u32 blocksize;
2173
2174        if (flags & BTRFS_EXTENT_FLAG_DATA) {
2175                blocksize = sctx->sectorsize;
2176                spin_lock(&sctx->stat_lock);
2177                sctx->stat.data_extents_scrubbed++;
2178                sctx->stat.data_bytes_scrubbed += len;
2179                spin_unlock(&sctx->stat_lock);
2180        } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2181                WARN_ON(sctx->nodesize != sctx->leafsize);
2182                blocksize = sctx->nodesize;
2183                spin_lock(&sctx->stat_lock);
2184                sctx->stat.tree_extents_scrubbed++;
2185                sctx->stat.tree_bytes_scrubbed += len;
2186                spin_unlock(&sctx->stat_lock);
2187        } else {
2188                blocksize = sctx->sectorsize;
2189                WARN_ON(1);
2190        }
2191
2192        while (len) {
2193                u64 l = min_t(u64, len, blocksize);
2194                int have_csum = 0;
2195
2196                if (flags & BTRFS_EXTENT_FLAG_DATA) {
2197                        /* push csums to sbio */
2198                        have_csum = scrub_find_csum(sctx, logical, l, csum);
2199                        if (have_csum == 0)
2200                                ++sctx->stat.no_csum;
2201                        if (sctx->is_dev_replace && !have_csum) {
2202                                ret = copy_nocow_pages(sctx, logical, l,
2203                                                       mirror_num,
2204                                                      physical_for_dev_replace);
2205                                goto behind_scrub_pages;
2206                        }
2207                }
2208                ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
2209                                  mirror_num, have_csum ? csum : NULL, 0,
2210                                  physical_for_dev_replace);
2211behind_scrub_pages:
2212                if (ret)
2213                        return ret;
2214                len -= l;
2215                logical += l;
2216                physical += l;
2217                physical_for_dev_replace += l;
2218        }
2219        return 0;
2220}
2221
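/*
 * Scrub all extents of one stripe of a chunk on the given device. The
 * offset, increment and mirror number are derived from the RAID profile
 * (see the illustrative example below), the extent and csum trees are read
 * ahead, and then every extent item that overlaps a stripe is trimmed to
 * the stripe and handed to scrub_extent().
 */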
2222static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
2223                                           struct map_lookup *map,
2224                                           struct btrfs_device *scrub_dev,
2225                                           int num, u64 base, u64 length,
2226                                           int is_dev_replace)
2227{
2228        struct btrfs_path *path;
2229        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
2230        struct btrfs_root *root = fs_info->extent_root;
2231        struct btrfs_root *csum_root = fs_info->csum_root;
2232        struct btrfs_extent_item *extent;
2233        struct blk_plug plug;
2234        u64 flags;
2235        int ret;
2236        int slot;
2237        int i;
2238        u64 nstripes;
2239        struct extent_buffer *l;
2240        struct btrfs_key key;
2241        u64 physical;
2242        u64 logical;
2243        u64 generation;
2244        int mirror_num;
2245        struct reada_control *reada1;
2246        struct reada_control *reada2;
2247        struct btrfs_key key_start;
2248        struct btrfs_key key_end;
2249        u64 increment = map->stripe_len;
2250        u64 offset;
2251        u64 extent_logical;
2252        u64 extent_physical;
2253        u64 extent_len;
2254        struct btrfs_device *extent_dev;
2255        int extent_mirror_num;
2256
2257        nstripes = length;
2258        offset = 0;
2259        do_div(nstripes, map->stripe_len);
2260        if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2261                offset = map->stripe_len * num;
2262                increment = map->stripe_len * map->num_stripes;
2263                mirror_num = 1;
2264        } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
2265                int factor = map->num_stripes / map->sub_stripes;
2266                offset = map->stripe_len * (num / map->sub_stripes);
2267                increment = map->stripe_len * factor;
2268                mirror_num = num % map->sub_stripes + 1;
2269        } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2270                increment = map->stripe_len;
2271                mirror_num = num % map->num_stripes + 1;
2272        } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2273                increment = map->stripe_len;
2274                mirror_num = num % map->num_stripes + 1;
2275        } else {
2276                increment = map->stripe_len;
2277                mirror_num = 1;
2278        }
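	/*
	 * Illustrative example with assumed values: for a 4-device RAID10
	 * chunk with a 64K stripe_len (num_stripes = 4, sub_stripes = 2),
	 * scrubbing device stripe num = 3 yields factor = 2, offset = 64K,
	 * increment = 128K and mirror_num = 2, i.e. this device is checked
	 * in 128K steps starting 64K into the chunk and holds the second
	 * copy of that data.
	 */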
2279
2280        path = btrfs_alloc_path();
2281        if (!path)
2282                return -ENOMEM;
2283
2284        /*
2285         * work on commit root. The related disk blocks are static as
2286         * long as COW is applied. This means it is safe to rewrite
2287         * them to repair disk errors without any race conditions
2288         */
2289        path->search_commit_root = 1;
2290        path->skip_locking = 1;
2291
2292        /*
2293         * trigger the readahead for the extent tree and csum tree and wait for
2294         * completion. During readahead, the scrub is officially paused
2295         * to not hold off transaction commits
2296         */
2297        logical = base + offset;
2298
2299        wait_event(sctx->list_wait,
2300                   atomic_read(&sctx->bios_in_flight) == 0);
2301        atomic_inc(&fs_info->scrubs_paused);
2302        wake_up(&fs_info->scrub_pause_wait);
2303
2304        /* FIXME it might be better to start readahead at commit root */
2305        key_start.objectid = logical;
2306        key_start.type = BTRFS_EXTENT_ITEM_KEY;
2307        key_start.offset = (u64)0;
2308        key_end.objectid = base + offset + nstripes * increment;
2309        key_end.type = BTRFS_EXTENT_ITEM_KEY;
2310        key_end.offset = (u64)0;
2311        reada1 = btrfs_reada_add(root, &key_start, &key_end);
2312
2313        key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2314        key_start.type = BTRFS_EXTENT_CSUM_KEY;
2315        key_start.offset = logical;
2316        key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
2317        key_end.type = BTRFS_EXTENT_CSUM_KEY;
2318        key_end.offset = base + offset + nstripes * increment;
2319        reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
2320
2321        if (!IS_ERR(reada1))
2322                btrfs_reada_wait(reada1);
2323        if (!IS_ERR(reada2))
2324                btrfs_reada_wait(reada2);
2325
2326        mutex_lock(&fs_info->scrub_lock);
2327        while (atomic_read(&fs_info->scrub_pause_req)) {
2328                mutex_unlock(&fs_info->scrub_lock);
2329                wait_event(fs_info->scrub_pause_wait,
2330                   atomic_read(&fs_info->scrub_pause_req) == 0);
2331                mutex_lock(&fs_info->scrub_lock);
2332        }
2333        atomic_dec(&fs_info->scrubs_paused);
2334        mutex_unlock(&fs_info->scrub_lock);
2335        wake_up(&fs_info->scrub_pause_wait);
2336
2337        /*
2338         * collect all data csums for the stripe to avoid seeking during
2339         * the scrub. This might currently (crc32) end up being about 1MB
2340         */
2341        blk_start_plug(&plug);
2342
2343        /*
2344         * now find all extents for each stripe and scrub them
2345         */
2346        logical = base + offset;
2347        physical = map->stripes[num].physical;
2348        ret = 0;
2349        for (i = 0; i < nstripes; ++i) {
2350                /*
2351                 * canceled?
2352                 */
2353                if (atomic_read(&fs_info->scrub_cancel_req) ||
2354                    atomic_read(&sctx->cancel_req)) {
2355                        ret = -ECANCELED;
2356                        goto out;
2357                }
2358                /*
2359                 * check to see if we have to pause
2360                 */
2361                if (atomic_read(&fs_info->scrub_pause_req)) {
2362                        /* push queued extents */
2363                        atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2364                        scrub_submit(sctx);
2365                        mutex_lock(&sctx->wr_ctx.wr_lock);
2366                        scrub_wr_submit(sctx);
2367                        mutex_unlock(&sctx->wr_ctx.wr_lock);
2368                        wait_event(sctx->list_wait,
2369                                   atomic_read(&sctx->bios_in_flight) == 0);
2370                        atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2371                        atomic_inc(&fs_info->scrubs_paused);
2372                        wake_up(&fs_info->scrub_pause_wait);
2373                        mutex_lock(&fs_info->scrub_lock);
2374                        while (atomic_read(&fs_info->scrub_pause_req)) {
2375                                mutex_unlock(&fs_info->scrub_lock);
2376                                wait_event(fs_info->scrub_pause_wait,
2377                                   atomic_read(&fs_info->scrub_pause_req) == 0);
2378                                mutex_lock(&fs_info->scrub_lock);
2379                        }
2380                        atomic_dec(&fs_info->scrubs_paused);
2381                        mutex_unlock(&fs_info->scrub_lock);
2382                        wake_up(&fs_info->scrub_pause_wait);
2383                }
2384
2385                ret = btrfs_lookup_csums_range(csum_root, logical,
2386                                               logical + map->stripe_len - 1,
2387                                               &sctx->csum_list, 1);
2388                if (ret)
2389                        goto out;
2390
2391                key.objectid = logical;
2392                key.type = BTRFS_EXTENT_ITEM_KEY;
2393                key.offset = (u64)0;
2394
2395                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2396                if (ret < 0)
2397                        goto out;
2398                if (ret > 0) {
2399                        ret = btrfs_previous_item(root, path, 0,
2400                                                  BTRFS_EXTENT_ITEM_KEY);
2401                        if (ret < 0)
2402                                goto out;
2403                        if (ret > 0) {
2404                                /* there's no smaller item, so stick with the
2405                                 * larger one */
2406                                btrfs_release_path(path);
2407                                ret = btrfs_search_slot(NULL, root, &key,
2408                                                        path, 0, 0);
2409                                if (ret < 0)
2410                                        goto out;
2411                        }
2412                }
2413
2414                while (1) {
2415                        l = path->nodes[0];
2416                        slot = path->slots[0];
2417                        if (slot >= btrfs_header_nritems(l)) {
2418                                ret = btrfs_next_leaf(root, path);
2419                                if (ret == 0)
2420                                        continue;
2421                                if (ret < 0)
2422                                        goto out;
2423
2424                                break;
2425                        }
2426                        btrfs_item_key_to_cpu(l, &key, slot);
2427
2428                        if (key.objectid + key.offset <= logical)
2429                                goto next;
2430
2431                        if (key.objectid >= logical + map->stripe_len)
2432                                break;
2433
2434                        if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
2435                                goto next;
2436
2437                        extent = btrfs_item_ptr(l, slot,
2438                                                struct btrfs_extent_item);
2439                        flags = btrfs_extent_flags(l, extent);
2440                        generation = btrfs_extent_generation(l, extent);
2441
2442                        if (key.objectid < logical &&
2443                            (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
2444                                printk(KERN_ERR
2445                                       "btrfs scrub: tree block %llu spanning "
2446                                       "stripes, ignored. logical=%llu\n",
2447                                       (unsigned long long)key.objectid,
2448                                       (unsigned long long)logical);
2449                                goto next;
2450                        }
2451
2452                        /*
2453                         * trim extent to this stripe
2454                         */
2455                        if (key.objectid < logical) {
2456                                key.offset -= logical - key.objectid;
2457                                key.objectid = logical;
2458                        }
2459                        if (key.objectid + key.offset >
2460                            logical + map->stripe_len) {
2461                                key.offset = logical + map->stripe_len -
2462                                             key.objectid;
2463                        }
2464
2465                        extent_logical = key.objectid;
2466                        extent_physical = key.objectid - logical + physical;
2467                        extent_len = key.offset;
2468                        extent_dev = scrub_dev;
2469                        extent_mirror_num = mirror_num;
2470                        if (is_dev_replace)
2471                                scrub_remap_extent(fs_info, extent_logical,
2472                                                   extent_len, &extent_physical,
2473                                                   &extent_dev,
2474                                                   &extent_mirror_num);
2475                        ret = scrub_extent(sctx, extent_logical, extent_len,
2476                                           extent_physical, extent_dev, flags,
2477                                           generation, extent_mirror_num,
2478                                           key.objectid - logical + physical);
2479                        if (ret)
2480                                goto out;
2481
2482next:
2483                        path->slots[0]++;
2484                }
2485                btrfs_release_path(path);
2486                logical += increment;
2487                physical += map->stripe_len;
2488                spin_lock(&sctx->stat_lock);
2489                sctx->stat.last_physical = physical;
2490                spin_unlock(&sctx->stat_lock);
2491        }
2492out:
2493        /* push queued extents */
2494        scrub_submit(sctx);
2495        mutex_lock(&sctx->wr_ctx.wr_lock);
2496        scrub_wr_submit(sctx);
2497        mutex_unlock(&sctx->wr_ctx.wr_lock);
2498
2499        blk_finish_plug(&plug);
2500        btrfs_free_path(path);
2501        return ret < 0 ? ret : 0;
2502}
2503
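/*
 * Scrub the part of a chunk that lives on scrub_dev. The chunk's map_lookup
 * is taken from the mapping tree and every stripe whose device and physical
 * offset match the dev extent is scrubbed with scrub_stripe().
 */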
2504static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
2505                                          struct btrfs_device *scrub_dev,
2506                                          u64 chunk_tree, u64 chunk_objectid,
2507                                          u64 chunk_offset, u64 length,
2508                                          u64 dev_offset, int is_dev_replace)
2509{
2510        struct btrfs_mapping_tree *map_tree =
2511                &sctx->dev_root->fs_info->mapping_tree;
2512        struct map_lookup *map;
2513        struct extent_map *em;
2514        int i;
2515        int ret = 0;
2516
2517        read_lock(&map_tree->map_tree.lock);
2518        em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
2519        read_unlock(&map_tree->map_tree.lock);
2520
2521        if (!em)
2522                return -EINVAL;
2523
2524        map = (struct map_lookup *)em->bdev;
2525        if (em->start != chunk_offset)
2526                goto out;
2527
2528        if (em->len < length)
2529                goto out;
2530
2531        for (i = 0; i < map->num_stripes; ++i) {
2532                if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
2533                    map->stripes[i].physical == dev_offset) {
2534                        ret = scrub_stripe(sctx, map, scrub_dev, i,
2535                                           chunk_offset, length,
2536                                           is_dev_replace);
2537                        if (ret)
2538                                goto out;
2539                }
2540        }
2541out:
2542        free_extent_map(em);
2543
2544        return ret;
2545}
2546
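/*
 * Walk all dev extents of scrub_dev in the range [start, end) and scrub the
 * corresponding chunks. A reference on the block group is held while a chunk
 * is scrubbed, the dev-replace cursor window is updated, and all pending
 * read and write bios are flushed and waited for between chunks.
 */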
2547static noinline_for_stack
2548int scrub_enumerate_chunks(struct scrub_ctx *sctx,
2549                           struct btrfs_device *scrub_dev, u64 start, u64 end,
2550                           int is_dev_replace)
2551{
2552        struct btrfs_dev_extent *dev_extent = NULL;
2553        struct btrfs_path *path;
2554        struct btrfs_root *root = sctx->dev_root;
2555        struct btrfs_fs_info *fs_info = root->fs_info;
2556        u64 length;
2557        u64 chunk_tree;
2558        u64 chunk_objectid;
2559        u64 chunk_offset;
2560        int ret;
2561        int slot;
2562        struct extent_buffer *l;
2563        struct btrfs_key key;
2564        struct btrfs_key found_key;
2565        struct btrfs_block_group_cache *cache;
2566        struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
2567
2568        path = btrfs_alloc_path();
2569        if (!path)
2570                return -ENOMEM;
2571
2572        path->reada = 2;
2573        path->search_commit_root = 1;
2574        path->skip_locking = 1;
2575
2576        key.objectid = scrub_dev->devid;
2577        key.offset = 0ull;
2578        key.type = BTRFS_DEV_EXTENT_KEY;
2579
2580        while (1) {
2581                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2582                if (ret < 0)
2583                        break;
2584                if (ret > 0) {
2585                        if (path->slots[0] >=
2586                            btrfs_header_nritems(path->nodes[0])) {
2587                                ret = btrfs_next_leaf(root, path);
2588                                if (ret)
2589                                        break;
2590                        }
2591                }
2592
2593                l = path->nodes[0];
2594                slot = path->slots[0];
2595
2596                btrfs_item_key_to_cpu(l, &found_key, slot);
2597
2598                if (found_key.objectid != scrub_dev->devid)
2599                        break;
2600
2601                if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
2602                        break;
2603
2604                if (found_key.offset >= end)
2605                        break;
2606
2607                if (found_key.offset < key.offset)
2608                        break;
2609
2610                dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
2611                length = btrfs_dev_extent_length(l, dev_extent);
2612
2613                if (found_key.offset + length <= start) {
2614                        key.offset = found_key.offset + length;
2615                        btrfs_release_path(path);
2616                        continue;
2617                }
2618
2619                chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2620                chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2621                chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2622
2623                /*
2624                 * get a reference on the corresponding block group to prevent
2625                 * the chunk from going away while we scrub it
2626                 */
2627                cache = btrfs_lookup_block_group(fs_info, chunk_offset);
2628                if (!cache) {
2629                        ret = -ENOENT;
2630                        break;
2631                }
2632                dev_replace->cursor_right = found_key.offset + length;
2633                dev_replace->cursor_left = found_key.offset;
2634                dev_replace->item_needs_writeback = 1;
2635                ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
2636                                  chunk_offset, length, found_key.offset,
2637                                  is_dev_replace);
2638
2639                /*
2640                 * flush, submit all pending read and write bios, afterwards
2641                 * wait for them.
2642                 * Note that in the dev replace case, a read request causes
2643                 * write requests that are submitted in the read completion
2644                 * worker. Therefore in the current situation, it is required
2645                 * that all write requests are flushed, so that all read and
2646                 * write requests are really completed when bios_in_flight
2647                 * changes to 0.
2648                 */
2649                atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
2650                scrub_submit(sctx);
2651                mutex_lock(&sctx->wr_ctx.wr_lock);
2652                scrub_wr_submit(sctx);
2653                mutex_unlock(&sctx->wr_ctx.wr_lock);
2654
2655                wait_event(sctx->list_wait,
2656                           atomic_read(&sctx->bios_in_flight) == 0);
2657                atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
2658                atomic_inc(&fs_info->scrubs_paused);
2659                wake_up(&fs_info->scrub_pause_wait);
2660                wait_event(sctx->list_wait,
2661                           atomic_read(&sctx->workers_pending) == 0);
2662
2663                mutex_lock(&fs_info->scrub_lock);
2664                while (atomic_read(&fs_info->scrub_pause_req)) {
2665                        mutex_unlock(&fs_info->scrub_lock);
2666                        wait_event(fs_info->scrub_pause_wait,
2667                           atomic_read(&fs_info->scrub_pause_req) == 0);
2668                        mutex_lock(&fs_info->scrub_lock);
2669                }
2670                atomic_dec(&fs_info->scrubs_paused);
2671                mutex_unlock(&fs_info->scrub_lock);
2672                wake_up(&fs_info->scrub_pause_wait);
2673
2674                dev_replace->cursor_left = dev_replace->cursor_right;
2675                dev_replace->item_needs_writeback = 1;
2676                btrfs_put_block_group(cache);
2677                if (ret)
2678                        break;
2679                if (is_dev_replace &&
2680                    atomic64_read(&dev_replace->num_write_errors) > 0) {
2681                        ret = -EIO;
2682                        break;
2683                }
2684                if (sctx->stat.malloc_errors > 0) {
2685                        ret = -ENOMEM;
2686                        break;
2687                }
2688
2689                key.offset = found_key.offset + length;
2690                btrfs_release_path(path);
2691        }
2692
2693        btrfs_free_path(path);
2694
2695        /*
2696         * ret can still be 1 from search_slot or next_leaf;
2697         * that is not an error
2698         */
2699        return ret < 0 ? ret : 0;
2700}
2701
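    /*
     * scrub all copies of the super block that fit on the device, then wait
     * for the issued read bios to complete
     */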
2702static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
2703                                           struct btrfs_device *scrub_dev)
2704{
2705        int     i;
2706        u64     bytenr;
2707        u64     gen;
2708        int     ret;
2709        struct btrfs_root *root = sctx->dev_root;
2710
2711        if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2712                return -EIO;
2713
2714        gen = root->fs_info->last_trans_committed;
2715
2716        for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
2717                bytenr = btrfs_sb_offset(i);
2718                if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
2719                        break;
2720
2721                ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
2722                                  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
2723                                  NULL, 1, bytenr);
2724                if (ret)
2725                        return ret;
2726        }
2727        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2728
2729        return 0;
2730}
2731
2732/*
2733 * get a reference count on fs_info->scrub_workers; start the workers if necessary
2734 */
2735static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
2736                                                int is_dev_replace)
2737{
2738        int ret = 0;
2739
2740        mutex_lock(&fs_info->scrub_lock);
2741        if (fs_info->scrub_workers_refcnt == 0) {
2742                if (is_dev_replace)
2743                        btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
2744                                        &fs_info->generic_worker);
2745                else
2746                        btrfs_init_workers(&fs_info->scrub_workers, "scrub",
2747                                        fs_info->thread_pool_size,
2748                                        &fs_info->generic_worker);
2749                fs_info->scrub_workers.idle_thresh = 4;
2750                ret = btrfs_start_workers(&fs_info->scrub_workers);
2751                if (ret)
2752                        goto out;
2753                btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
2754                                   "scrubwrc",
2755                                   fs_info->thread_pool_size,
2756                                   &fs_info->generic_worker);
2757                fs_info->scrub_wr_completion_workers.idle_thresh = 2;
2758                ret = btrfs_start_workers(
2759                                &fs_info->scrub_wr_completion_workers);
2760                if (ret)
2761                        goto out;
2762                btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
2763                                   &fs_info->generic_worker);
2764                ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
2765                if (ret)
2766                        goto out;
2767        }
2768        ++fs_info->scrub_workers_refcnt;
2769out:
2770        mutex_unlock(&fs_info->scrub_lock);
2771
2772        return ret;
2773}
2774
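    /*
     * drop a reference on fs_info->scrub_workers; the worker threads are
     * stopped when the last reference is dropped
     */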
2775static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
2776{
2777        mutex_lock(&fs_info->scrub_lock);
2778        if (--fs_info->scrub_workers_refcnt == 0) {
2779                btrfs_stop_workers(&fs_info->scrub_workers);
2780                btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
2781                btrfs_stop_workers(&fs_info->scrub_nocow_workers);
2782        }
2783        WARN_ON(fs_info->scrub_workers_refcnt < 0);
2784        mutex_unlock(&fs_info->scrub_lock);
2785}
2786
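    /*
     * entry point for scrub and dev-replace: check the size assumptions, set
     * up a scrub context for the device, scrub the super blocks and all
     * allocated chunks, and copy the accumulated statistics to @progress
     */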
2787int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
2788                    u64 end, struct btrfs_scrub_progress *progress,
2789                    int readonly, int is_dev_replace)
2790{
2791        struct scrub_ctx *sctx;
2792        int ret;
2793        struct btrfs_device *dev;
2794
2795        if (btrfs_fs_closing(fs_info))
2796                return -EINVAL;
2797
2798        /*
2799         * check some assumptions
2800         */
2801        if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
2802                printk(KERN_ERR
2803                       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
2804                       fs_info->chunk_root->nodesize,
2805                       fs_info->chunk_root->leafsize);
2806                return -EINVAL;
2807        }
2808
2809        if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
2810                /*
2811                 * in this case scrub is unable to calculate the checksum
2812                 * the way it is implemented. Do not handle this
2813                 * situation at all because it should never happen.
2814                 */
2815                printk(KERN_ERR
2816                       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
2817                       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
2818                return -EINVAL;
2819        }
2820
2821        if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
2822                /* not supported for data w/o checksums */
2823                printk(KERN_ERR
2824                       "btrfs_scrub: size assumption sectorsize == PAGE_SIZE (%d == %llu) fails\n",
2825                       fs_info->chunk_root->sectorsize,
2826                       (unsigned long long)PAGE_SIZE);
2827                return -EINVAL;
2828        }
2829
2830        if (fs_info->chunk_root->nodesize >
2831            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
2832            fs_info->chunk_root->sectorsize >
2833            PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
2834                /*
2835                 * would exhaust the array bounds of pagev member in
2836                 * struct scrub_block
2837                 */
2838                pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
2839                       fs_info->chunk_root->nodesize,
2840                       SCRUB_MAX_PAGES_PER_BLOCK,
2841                       fs_info->chunk_root->sectorsize,
2842                       SCRUB_MAX_PAGES_PER_BLOCK);
2843                return -EINVAL;
2844        }
2845
2846        ret = scrub_workers_get(fs_info, is_dev_replace);
2847        if (ret)
2848                return ret;
2849
2850        mutex_lock(&fs_info->fs_devices->device_list_mutex);
2851        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
2852        if (!dev || (dev->missing && !is_dev_replace)) {
2853                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2854                scrub_workers_put(fs_info);
2855                return -ENODEV;
2856        }
2857        mutex_lock(&fs_info->scrub_lock);
2858
2859        if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
2860                mutex_unlock(&fs_info->scrub_lock);
2861                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2862                scrub_workers_put(fs_info);
2863                return -EIO;
2864        }
2865
2866        btrfs_dev_replace_lock(&fs_info->dev_replace);
2867        if (dev->scrub_device ||
2868            (!is_dev_replace &&
2869             btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
2870                btrfs_dev_replace_unlock(&fs_info->dev_replace);
2871                mutex_unlock(&fs_info->scrub_lock);
2872                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2873                scrub_workers_put(fs_info);
2874                return -EINPROGRESS;
2875        }
2876        btrfs_dev_replace_unlock(&fs_info->dev_replace);
2877        sctx = scrub_setup_ctx(dev, is_dev_replace);
2878        if (IS_ERR(sctx)) {
2879                mutex_unlock(&fs_info->scrub_lock);
2880                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2881                scrub_workers_put(fs_info);
2882                return PTR_ERR(sctx);
2883        }
2884        sctx->readonly = readonly;
2885        dev->scrub_device = sctx;
2886
2887        atomic_inc(&fs_info->scrubs_running);
2888        mutex_unlock(&fs_info->scrub_lock);
2889        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2890
2891        if (!is_dev_replace) {
2892                down_read(&fs_info->scrub_super_lock);
2893                ret = scrub_supers(sctx, dev);
2894                up_read(&fs_info->scrub_super_lock);
2895        }
2896
2897        if (!ret)
2898                ret = scrub_enumerate_chunks(sctx, dev, start, end,
2899                                             is_dev_replace);
2900
2901        wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
2902        atomic_dec(&fs_info->scrubs_running);
2903        wake_up(&fs_info->scrub_pause_wait);
2904
2905        wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
2906
2907        if (progress)
2908                memcpy(progress, &sctx->stat, sizeof(*progress));
2909
2910        mutex_lock(&fs_info->scrub_lock);
2911        dev->scrub_device = NULL;
2912        mutex_unlock(&fs_info->scrub_lock);
2913
2914        scrub_free_ctx(sctx);
2915        scrub_workers_put(fs_info);
2916
2917        return ret;
2918}
2919
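    /*
     * request all running scrubs to pause and wait until every one of them
     * has reached the paused state
     */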
2920void btrfs_scrub_pause(struct btrfs_root *root)
2921{
2922        struct btrfs_fs_info *fs_info = root->fs_info;
2923
2924        mutex_lock(&fs_info->scrub_lock);
2925        atomic_inc(&fs_info->scrub_pause_req);
2926        while (atomic_read(&fs_info->scrubs_paused) !=
2927               atomic_read(&fs_info->scrubs_running)) {
2928                mutex_unlock(&fs_info->scrub_lock);
2929                wait_event(fs_info->scrub_pause_wait,
2930                           atomic_read(&fs_info->scrubs_paused) ==
2931                           atomic_read(&fs_info->scrubs_running));
2932                mutex_lock(&fs_info->scrub_lock);
2933        }
2934        mutex_unlock(&fs_info->scrub_lock);
2935}
2936
2937void btrfs_scrub_continue(struct btrfs_root *root)
2938{
2939        struct btrfs_fs_info *fs_info = root->fs_info;
2940
2941        atomic_dec(&fs_info->scrub_pause_req);
2942        wake_up(&fs_info->scrub_pause_wait);
2943}
2944
2945void btrfs_scrub_pause_super(struct btrfs_root *root)
2946{
2947        down_write(&root->fs_info->scrub_super_lock);
2948}
2949
2950void btrfs_scrub_continue_super(struct btrfs_root *root)
2951{
2952        up_write(&root->fs_info->scrub_super_lock);
2953}
2954
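    /*
     * cancel all running scrubs and wait until they have finished
     */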
2955int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
2956{
2957        mutex_lock(&fs_info->scrub_lock);
2958        if (!atomic_read(&fs_info->scrubs_running)) {
2959                mutex_unlock(&fs_info->scrub_lock);
2960                return -ENOTCONN;
2961        }
2962
2963        atomic_inc(&fs_info->scrub_cancel_req);
2964        while (atomic_read(&fs_info->scrubs_running)) {
2965                mutex_unlock(&fs_info->scrub_lock);
2966                wait_event(fs_info->scrub_pause_wait,
2967                           atomic_read(&fs_info->scrubs_running) == 0);
2968                mutex_lock(&fs_info->scrub_lock);
2969        }
2970        atomic_dec(&fs_info->scrub_cancel_req);
2971        mutex_unlock(&fs_info->scrub_lock);
2972
2973        return 0;
2974}
2975
2976int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
2977                           struct btrfs_device *dev)
2978{
2979        struct scrub_ctx *sctx;
2980
2981        mutex_lock(&fs_info->scrub_lock);
2982        sctx = dev->scrub_device;
2983        if (!sctx) {
2984                mutex_unlock(&fs_info->scrub_lock);
2985                return -ENOTCONN;
2986        }
2987        atomic_inc(&sctx->cancel_req);
2988        while (dev->scrub_device) {
2989                mutex_unlock(&fs_info->scrub_lock);
2990                wait_event(fs_info->scrub_pause_wait,
2991                           dev->scrub_device == NULL);
2992                mutex_lock(&fs_info->scrub_lock);
2993        }
2994        mutex_unlock(&fs_info->scrub_lock);
2995
2996        return 0;
2997}
2998
2999int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
3000{
3001        struct btrfs_fs_info *fs_info = root->fs_info;
3002        struct btrfs_device *dev;
3003        int ret;
3004
3005        /*
3006         * we have to hold the device_list_mutex here so the device
3007         * does not go away in cancel_dev. FIXME: find a better solution
3008         */
3009        mutex_lock(&fs_info->fs_devices->device_list_mutex);
3010        dev = btrfs_find_device(fs_info, devid, NULL, NULL);
3011        if (!dev) {
3012                mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3013                return -ENODEV;
3014        }
3015        ret = btrfs_scrub_cancel_dev(fs_info, dev);
3016        mutex_unlock(&fs_info->fs_devices->device_list_mutex);
3017
3018        return ret;
3019}
3020
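    /*
     * copy the current scrub statistics of the device with the given devid
     * into @progress
     */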
3021int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
3022                         struct btrfs_scrub_progress *progress)
3023{
3024        struct btrfs_device *dev;
3025        struct scrub_ctx *sctx = NULL;
3026
3027        mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
3028        dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
3029        if (dev)
3030                sctx = dev->scrub_device;
3031        if (sctx)
3032                memcpy(progress, &sctx->stat, sizeof(*progress));
3033        mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
3034
3035        return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
3036}
3037
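    /*
     * map a logical extent to the physical address, device and mirror number
     * of its first stripe; the output parameters are left untouched if the
     * mapping fails or the stripe's device has no bdev
     */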
3038static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3039                               u64 extent_logical, u64 extent_len,
3040                               u64 *extent_physical,
3041                               struct btrfs_device **extent_dev,
3042                               int *extent_mirror_num)
3043{
3044        u64 mapped_length;
3045        struct btrfs_bio *bbio = NULL;
3046        int ret;
3047
3048        mapped_length = extent_len;
3049        ret = btrfs_map_block(fs_info, READ, extent_logical,
3050                              &mapped_length, &bbio, 0);
3051        if (ret || !bbio || mapped_length < extent_len ||
3052            !bbio->stripes[0].dev->bdev) {
3053                kfree(bbio);
3054                return;
3055        }
3056
3057        *extent_physical = bbio->stripes[0].physical;
3058        *extent_mirror_num = bbio->mirror_num;
3059        *extent_dev = bbio->stripes[0].dev;
3060        kfree(bbio);
3061}
3062
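    /*
     * set up the write context; only the dev-replace case needs a write
     * target, otherwise nothing is done beyond initializing the lock
     */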
3063static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
3064                              struct scrub_wr_ctx *wr_ctx,
3065                              struct btrfs_fs_info *fs_info,
3066                              struct btrfs_device *dev,
3067                              int is_dev_replace)
3068{
3069        WARN_ON(wr_ctx->wr_curr_bio != NULL);
3070
3071        mutex_init(&wr_ctx->wr_lock);
3072        wr_ctx->wr_curr_bio = NULL;
3073        if (!is_dev_replace)
3074                return 0;
3075
3076        WARN_ON(!dev->bdev);
3077        wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
3078                                         bio_get_nr_vecs(dev->bdev));
3079        wr_ctx->tgtdev = dev;
3080        atomic_set(&wr_ctx->flush_all_writes, 0);
3081        return 0;
3082}
3083
3084static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
3085{
3086        mutex_lock(&wr_ctx->wr_lock);
3087        kfree(wr_ctx->wr_curr_bio);
3088        wr_ctx->wr_curr_bio = NULL;
3089        mutex_unlock(&wr_ctx->wr_lock);
3090}
3091
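    /*
     * queue a worker that copies the pages of a nocow extent to the
     * dev-replace target; the actual work is done in copy_nocow_pages_worker()
     */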
3092static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
3093                            int mirror_num, u64 physical_for_dev_replace)
3094{
3095        struct scrub_copy_nocow_ctx *nocow_ctx;
3096        struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
3097
3098        nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
3099        if (!nocow_ctx) {
3100                spin_lock(&sctx->stat_lock);
3101                sctx->stat.malloc_errors++;
3102                spin_unlock(&sctx->stat_lock);
3103                return -ENOMEM;
3104        }
3105
3106        scrub_pending_trans_workers_inc(sctx);
3107
3108        nocow_ctx->sctx = sctx;
3109        nocow_ctx->logical = logical;
3110        nocow_ctx->len = len;
3111        nocow_ctx->mirror_num = mirror_num;
3112        nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
3113        nocow_ctx->work.func = copy_nocow_pages_worker;
3114        btrfs_queue_worker(&fs_info->scrub_nocow_workers,
3115                           &nocow_ctx->work);
3116
3117        return 0;
3118}
3119
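    /*
     * worker context: join a transaction and call copy_nocow_pages_for_inode()
     * for every inode that references the logical range
     */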
3120static void copy_nocow_pages_worker(struct btrfs_work *work)
3121{
3122        struct scrub_copy_nocow_ctx *nocow_ctx =
3123                container_of(work, struct scrub_copy_nocow_ctx, work);
3124        struct scrub_ctx *sctx = nocow_ctx->sctx;
3125        u64 logical = nocow_ctx->logical;
3126        u64 len = nocow_ctx->len;
3127        int mirror_num = nocow_ctx->mirror_num;
3128        u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3129        int ret;
3130        struct btrfs_trans_handle *trans = NULL;
3131        struct btrfs_fs_info *fs_info;
3132        struct btrfs_path *path;
3133        struct btrfs_root *root;
3134        int not_written = 0;
3135
3136        fs_info = sctx->dev_root->fs_info;
3137        root = fs_info->extent_root;
3138
3139        path = btrfs_alloc_path();
3140        if (!path) {
3141                spin_lock(&sctx->stat_lock);
3142                sctx->stat.malloc_errors++;
3143                spin_unlock(&sctx->stat_lock);
3144                not_written = 1;
3145                goto out;
3146        }
3147
3148        trans = btrfs_join_transaction(root);
3149        if (IS_ERR(trans)) {
3150                not_written = 1;
3151                goto out;
3152        }
3153
3154        ret = iterate_inodes_from_logical(logical, fs_info, path,
3155                                          copy_nocow_pages_for_inode,
3156                                          nocow_ctx);
3157        if (ret != 0 && ret != -ENOENT) {
3158                pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
3159                        (unsigned long long)logical,
3160                        (unsigned long long)physical_for_dev_replace,
3161                        (unsigned long long)len,
3162                        (unsigned long long)mirror_num, ret);
3163                not_written = 1;
3164                goto out;
3165        }
3166
3167out:
3168        if (trans && !IS_ERR(trans))
3169                btrfs_end_transaction(trans, root);
3170        if (not_written)
3171                btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
3172                                            num_uncorrectable_read_errors);
3173
3174        btrfs_free_path(path);
3175        kfree(nocow_ctx);
3176
3177        scrub_pending_trans_workers_dec(sctx);
3178}
3179
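    /*
     * called for each inode that references the nocow range: read the pages
     * through the page cache and write them to the dev-replace target device
     */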
3180static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
3181{
3182        unsigned long index;
3183        struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
3184        int ret = 0;
3185        struct btrfs_key key;
3186        struct inode *inode = NULL;
3187        struct btrfs_root *local_root;
3188        u64 physical_for_dev_replace;
3189        u64 len;
3190        struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
3191        int srcu_index;
3192
3193        key.objectid = root;
3194        key.type = BTRFS_ROOT_ITEM_KEY;
3195        key.offset = (u64)-1;
3196
3197        srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
3198
3199        local_root = btrfs_read_fs_root_no_name(fs_info, &key);
3200        if (IS_ERR(local_root)) {
3201                srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3202                return PTR_ERR(local_root);
3203        }
3204
3205        key.type = BTRFS_INODE_ITEM_KEY;
3206        key.objectid = inum;
3207        key.offset = 0;
3208        inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
3209        srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
3210        if (IS_ERR(inode))
3211                return PTR_ERR(inode);
3212
3213        physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
3214        len = nocow_ctx->len;
3215        while (len >= PAGE_CACHE_SIZE) {
3216                struct page *page = NULL;
3217                int ret_sub;
3218
3219                index = offset >> PAGE_CACHE_SHIFT;
3220
3221                page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
3222                if (!page) {
3223                        pr_err("find_or_create_page() failed\n");
3224                        ret = -ENOMEM;
3225                        goto next_page;
3226                }
3227
3228                if (PageUptodate(page)) {
3229                        if (PageDirty(page))
3230                                goto next_page;
3231                } else {
3232                        ClearPageError(page);
3233                        ret_sub = extent_read_full_page(&BTRFS_I(inode)->
3234                                                         io_tree,
3235                                                        page, btrfs_get_extent,
3236                                                        nocow_ctx->mirror_num);
3237                        if (ret_sub) {
3238                                ret = ret_sub;
3239                                goto next_page;
3240                        }
3241                        wait_on_page_locked(page);
3242                        if (!PageUptodate(page)) {
3243                                ret = -EIO;
3244                                goto next_page;
3245                        }
3246                }
3247                ret_sub = write_page_nocow(nocow_ctx->sctx,
3248                                           physical_for_dev_replace, page);
3249                if (ret_sub) {
3250                        ret = ret_sub;
3251                        goto next_page;
3252                }
3253
3254next_page:
3255                if (page) {
3256                        unlock_page(page);
3257                        put_page(page);
3258                }
3259                offset += PAGE_CACHE_SIZE;
3260                physical_for_dev_replace += PAGE_CACHE_SIZE;
3261                len -= PAGE_CACHE_SIZE;
3262        }
3263
3264        if (inode)
3265                iput(inode);
3266        return ret;
3267}
3268
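    /*
     * synchronously write one page to the dev-replace target device at the
     * given physical offset
     */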
3269static int write_page_nocow(struct scrub_ctx *sctx,
3270                            u64 physical_for_dev_replace, struct page *page)
3271{
3272        struct bio *bio;
3273        struct btrfs_device *dev;
3274        int ret;
3275        DECLARE_COMPLETION_ONSTACK(compl);
3276
3277        dev = sctx->wr_ctx.tgtdev;
3278        if (!dev)
3279                return -EIO;
3280        if (!dev->bdev) {
3281                printk_ratelimited(KERN_WARNING
3282                        "btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
3283                return -EIO;
3284        }
3285        bio = bio_alloc(GFP_NOFS, 1);
3286        if (!bio) {
3287                spin_lock(&sctx->stat_lock);
3288                sctx->stat.malloc_errors++;
3289                spin_unlock(&sctx->stat_lock);
3290                return -ENOMEM;
3291        }
3292        bio->bi_private = &compl;
3293        bio->bi_end_io = scrub_complete_bio_end_io;
3294        bio->bi_size = 0;
3295        bio->bi_sector = physical_for_dev_replace >> 9;
3296        bio->bi_bdev = dev->bdev;
3297        ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
3298        if (ret != PAGE_CACHE_SIZE) {
3299leave_with_eio:
3300                bio_put(bio);
3301                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
3302                return -EIO;
3303        }
3304        btrfsic_submit_bio(WRITE_SYNC, bio);
3305        wait_for_completion(&compl);
3306
3307        if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
3308                goto leave_with_eio;
3309
3310        bio_put(bio);
3311        return 0;
3312}
3313