/*
 * Copyright (C) 2012 Fusion-io  All rights reserved.
 * Copyright (C) 2012 Intel Corp. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/iocontext.h>
#include <linux/capability.h>
#include <linux/ratelimit.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/hash.h>
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/vmalloc.h>
#include <asm/div64.h>
#include "ctree.h"
#include "extent_map.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"

/* set when additional merges to this rbio are not allowed */
#define RBIO_RMW_LOCKED_BIT	1

/*
 * set when this rbio is sitting in the hash, but it is just a cache
 * of past RMW
 */
#define RBIO_CACHE_BIT		2

/*
 * set when it is safe to trust the stripe_pages for caching
 */
#define RBIO_CACHE_READY_BIT	3

/*
 * bbio and raid_map are managed by the caller, so we shouldn't free
 * them here.  Rbios with this flag set must not be cached either,
 * because we need raid_map to compare stripes, and the caller is
 * likely to have freed it already.
 */
#define RBIO_HOLD_BBIO_MAP_BIT	4

#define RBIO_CACHE_SIZE 1024

enum btrfs_rbio_ops {
	BTRFS_RBIO_WRITE = 0,
	BTRFS_RBIO_READ_REBUILD = 1,
	BTRFS_RBIO_PARITY_SCRUB = 2,
};

struct btrfs_raid_bio {
	struct btrfs_fs_info *fs_info;
	struct btrfs_bio *bbio;

	/*
	 * logical block numbers for the start of each stripe.  The
	 * last one or two are the p/q stripes.  These are sorted,
	 * so raid_map[0] is the start of our full stripe.
	 */
	u64 *raid_map;

	/*
	 * while we're doing rmw on a stripe we put it into a hash
	 * table so we can lock the stripe and merge more rbios
	 * into it.
	 */
	struct list_head hash_list;

	/*
	 * LRU list for the stripe cache
	 */
	struct list_head stripe_cache;

	/*
	 * for scheduling work in the helper threads
	 */
	struct btrfs_work work;

	/*
	 * bio_list and bio_list_lock are used to add more bios into
	 * the stripe in hopes of avoiding the full rmw
	 */
	struct bio_list bio_list;
	spinlock_t bio_list_lock;

	/*
	 * also protected by the bio_list_lock, the plug list is used
	 * by the plugging code to collect partial bios while plugged.
	 * The stripe locking code also uses it to hand off the stripe
	 * lock to the next pending IO.
	 */
	struct list_head plug_list;

	/*
	 * flags that tell us if it is safe to merge with this bio
	 */
	unsigned long flags;

	/* size of each individual stripe on disk */
	int stripe_len;

	/* number of data stripes (no p/q) */
	int nr_data;

	/* number of stripes, counting any replace target */
	int real_stripes;

	/* number of pages per stripe */
	int stripe_npages;

	/*
	 * set if we're doing a parity rebuild for a read from higher
	 * up, which is handled differently from a parity rebuild as
	 * part of rmw
	 */
	enum btrfs_rbio_ops operation;

	/* first bad stripe */
	int faila;

	/* second bad stripe (for raid6 use) */
	int failb;

	/* stripe number that we're scrubbing */
	int scrubp;

	/*
	 * number of pages needed to represent the full stripe
	 */
	int nr_pages;

	/*
	 * size of all the bios in the bio_list.  This helps us decide
	 * if the rbio maps to a full stripe or not.
	 */
	int bio_list_bytes;

	int generic_bio_cnt;

	atomic_t refs;

	atomic_t stripes_pending;

	atomic_t error;

	/*
	 * these are two arrays of pointers.  We allocate the rbio big
	 * enough to hold them both and set up their locations when the
	 * rbio is allocated.
	 */

	/*
	 * pointers to pages that we allocated for reading/writing
	 * stripes directly from the disk (including p/q)
	 */
	struct page **stripe_pages;

	/*
	 * pointers to the pages in the bio_list.  Stored here for
	 * faster lookup.
	 */
	struct page **bio_pages;

	/*
	 * bitmap to record which horizontal stripe has data
	 */
	unsigned long *dbitmap;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct btrfs_work *work);
static void read_rebuild_work(struct btrfs_work *work);
static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
static void async_read_rebuild(struct btrfs_raid_bio *rbio);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void __free_raid_bio(struct btrfs_raid_bio *rbio);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check);
static void async_scrub_parity(struct btrfs_raid_bio *rbio);

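/*
 * the stripe hash table is used for locking, and to collect
 * bios in hopes of making a full stripe
 */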
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash_table *x;
	struct btrfs_stripe_hash *cur;
	struct btrfs_stripe_hash *h;
	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	int i;
	int table_size;

	if (info->stripe_hash_table)
		return 0;

	/*
	 * The table is large, starting with order 4 and can go as high as
	 * order 7 in case lock debugging is turned off.
	 *
	 * Try harder to allocate and fallback to vmalloc to lower the chance
	 * of a failing mount.
	 */
	table_size = sizeof(*table) + sizeof(*h) * num_entries;
	table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
	if (!table) {
		table = vzalloc(table_size);
		if (!table)
			return -ENOMEM;
	}

	spin_lock_init(&table->cache_lock);
	INIT_LIST_HEAD(&table->stripe_cache);

	h = table->table;

	for (i = 0; i < num_entries; i++) {
		cur = h + i;
		INIT_LIST_HEAD(&cur->hash_list);
		spin_lock_init(&cur->lock);
		init_waitqueue_head(&cur->wait);
	}

	x = cmpxchg(&info->stripe_hash_table, NULL, table);
	if (x) {
		if (is_vmalloc_addr(x))
			vfree(x);
		else
			kfree(x);
	}
	return 0;
}

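/*
 * caching an rbio means to copy anything from the bio_list into
 * the stripe_pages array.  We use the page uptodate bit in the
 * stripe cache array to indicate if it has valid data.
 *
 * once the caching is done, we set the cache ready bit.
 */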
static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	char *s;
	char *d;
	int ret;

	ret = alloc_rbio_pages(rbio);
	if (ret)
		return;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (!rbio->bio_pages[i])
			continue;

		s = kmap(rbio->bio_pages[i]);
		d = kmap(rbio->stripe_pages[i]);

		memcpy(d, s, PAGE_CACHE_SIZE);

		kunmap(rbio->bio_pages[i]);
		kunmap(rbio->stripe_pages[i]);
		SetPageUptodate(rbio->stripe_pages[i]);
	}
	set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
}

/*
 * we hash on the first logical address of the stripe
 */
static int rbio_bucket(struct btrfs_raid_bio *rbio)
{
	u64 num = rbio->raid_map[0];

	/*
	 * we shift down quite a bit.  We're using byte addressing, and
	 * most of the lower bits are zeros.  This tends to upset hash_64,
	 * and it consistently returned just one or two different values.
	 *
	 * shifting off the lower bits fixes things.
	 */
	return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
}

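/*
 * stealing an rbio means taking all the uptodate pages from the stripe
 * array in the source rbio and putting them into the destination rbio
 */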
static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
{
	int i;
	struct page *s;
	struct page *d;

	if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
		return;

	for (i = 0; i < dest->nr_pages; i++) {
		s = src->stripe_pages[i];
		if (!s || !PageUptodate(s))
			continue;

		d = dest->stripe_pages[i];
		if (d)
			__free_page(d);

		dest->stripe_pages[i] = s;
		src->stripe_pages[i] = NULL;
	}
}

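/*
 * merging means we take the bio_list from the victim and splice it
 * into the destination.  The victim should be discarded afterwards.
 *
 * normally called with the destination's bio_list_lock held
 */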
static void merge_rbio(struct btrfs_raid_bio *dest,
		       struct btrfs_raid_bio *victim)
{
	bio_list_merge(&dest->bio_list, &victim->bio_list);
	dest->bio_list_bytes += victim->bio_list_bytes;
	dest->generic_bio_cnt += victim->generic_bio_cnt;
	bio_list_init(&victim->bio_list);
}

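/*
 * used to prune items that are in the cache.  The caller
 * must hold the hash table lock.
 */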
static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash_table *table;
	struct btrfs_stripe_hash *h;
	int freeit = 0;

	/*
	 * check the bit again under the hash table lock.
	 */
	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;
	h = table->table + bucket;

	/* hold the lock for the bucket because we may be
	 * removing it from the hash table
	 */
	spin_lock(&h->lock);

	/*
	 * hold the lock for the bio list because we need
	 * to make sure the bio list is empty
	 */
	spin_lock(&rbio->bio_list_lock);

	if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
		list_del_init(&rbio->stripe_cache);
		table->cache_size -= 1;
		freeit = 1;

		/* if the bio list isn't empty, this rbio is
		 * still involved in an IO.  We take it out
		 * of the cache list, and drop the ref that
		 * was held for the list.
		 *
		 * If the bio_list was empty, we also remove
		 * the rbio from the hash_list, and drop
		 * the corresponding ref
		 */
		if (bio_list_empty(&rbio->bio_list)) {
			if (!list_empty(&rbio->hash_list)) {
				list_del_init(&rbio->hash_list);
				atomic_dec(&rbio->refs);
				BUG_ON(!list_empty(&rbio->plug_list));
			}
		}
	}

	spin_unlock(&rbio->bio_list_lock);
	spin_unlock(&h->lock);

	if (freeit)
		__free_raid_bio(rbio);
}

/*
 * prune a given rbio from the cache
 */
static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	__remove_rbio_from_cache(rbio);
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove everything in the cache
 */
static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;
	struct btrfs_raid_bio *rbio;

	table = info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	while (!list_empty(&table->stripe_cache)) {
		rbio = list_entry(table->stripe_cache.next,
				  struct btrfs_raid_bio,
				  stripe_cache);
		__remove_rbio_from_cache(rbio);
	}
	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * remove all cached entries and free the hash table
 * used by unmount
 */
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
{
	if (!info->stripe_hash_table)
		return;
	btrfs_clear_rbio_cache(info);
	if (is_vmalloc_addr(info->stripe_hash_table))
		vfree(info->stripe_hash_table);
	else
		kfree(info->stripe_hash_table);
	info->stripe_hash_table = NULL;
}

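/*
 * insert an rbio into the stripe cache.  It
 * must have already been prepared by calling
 * cache_rbio_pages
 *
 * If this rbio was already cached, it gets
 * moved to the front of the LRU.
 *
 * If the size of the rbio cache is too big, we
 * prune an item.
 */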
static void cache_rbio(struct btrfs_raid_bio *rbio)
{
	struct btrfs_stripe_hash_table *table;
	unsigned long flags;

	if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
		return;

	table = rbio->fs_info->stripe_hash_table;

	spin_lock_irqsave(&table->cache_lock, flags);
	spin_lock(&rbio->bio_list_lock);

	/* bump our ref if we were not in the list before */
	if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
		atomic_inc(&rbio->refs);

	if (!list_empty(&rbio->stripe_cache)) {
		list_move(&rbio->stripe_cache, &table->stripe_cache);
	} else {
		list_add(&rbio->stripe_cache, &table->stripe_cache);
		table->cache_size += 1;
	}

	spin_unlock(&rbio->bio_list_lock);

	if (table->cache_size > RBIO_CACHE_SIZE) {
		struct btrfs_raid_bio *found;

		found = list_entry(table->stripe_cache.prev,
				   struct btrfs_raid_bio,
				   stripe_cache);

		if (found != rbio)
			__remove_rbio_from_cache(found);
	}

	spin_unlock_irqrestore(&table->cache_lock, flags);
}

/*
 * helper function to run the xor_blocks api.  It is only
 * able to do MAX_XOR_BLOCKS at a time, so we need to
 * loop through.
 */
static void run_xor(void **pages, int src_cnt, ssize_t len)
{
	int src_off = 0;
	int xor_src_cnt = 0;
	void *dest = pages[src_cnt];

	while (src_cnt > 0) {
		xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
		xor_blocks(xor_src_cnt, len, dest, pages + src_off);

		src_cnt -= xor_src_cnt;
		src_off += xor_src_cnt;
	}
}

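/*
 * returns true if the bio list inside this rbio covers an entire
 * stripe (no rmw required).  Must be called with the bio list lock
 * held, or at a time when you know it is impossible to add new bios
 * into the list.
 */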
static int __rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long size = rbio->bio_list_bytes;
	int ret = 1;

	if (size != rbio->nr_data * rbio->stripe_len)
		ret = 0;

	BUG_ON(size > rbio->nr_data * rbio->stripe_len);
	return ret;
}

static int rbio_is_full(struct btrfs_raid_bio *rbio)
{
	unsigned long flags;
	int ret;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);
	ret = __rbio_is_full(rbio);
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
	return ret;
}

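/*
 * returns 1 if it is safe to merge two rbios together.  The
 * merging is safe if the two rbios correspond to the same stripe
 * and if neither one is locked for final IO.
 *
 * The caller is responsible for locking such that rmw_locked is
 * safe to test.
 */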
static int rbio_can_merge(struct btrfs_raid_bio *last,
			  struct btrfs_raid_bio *cur)
{
	if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
	    test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
		return 0;

	/*
	 * we can't merge with cached rbios, since the
	 * idea is that when we merge the destination
	 * rbio is going to run our IO for us.  We can
	 * steal from cached rbios though, other functions
	 * handle that.
	 */
	if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
	    test_bit(RBIO_CACHE_BIT, &cur->flags))
		return 0;

	if (last->raid_map[0] != cur->raid_map[0])
		return 0;

	/* we can't merge with different operations */
	if (last->operation != cur->operation)
		return 0;

	/*
	 * parity scrubs work on a full stripe that was read from
	 * disk so they can check and repair the parity.  We're not
	 * allowed to add any new bios to the bio list there; anyone
	 * else that wants to change this stripe needs to do their
	 * own rmw.
	 */
	if (last->operation == BTRFS_RBIO_PARITY_SCRUB ||
	    cur->operation == BTRFS_RBIO_PARITY_SCRUB)
		return 0;

	return 1;
}

/*
 * helper to index into the pstripe
 */
static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

/*
 * helper to index into the qstripe, returns null
 * if there is no qstripe
 */
static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
{
	if (rbio->nr_data + 1 == rbio->real_stripes)
		return NULL;

	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
		 PAGE_CACHE_SHIFT;
	return rbio->stripe_pages[index];
}

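/*
 * The first stripe in the hash bucket for a logical address
 * has the lock.  rbios are added in one of three ways:
 *
 * 1) Nobody has the stripe locked yet.  The rbio is given
 * the lock and 0 is returned.  The caller must start the IO
 * themselves.
 *
 * 2) Someone has the stripe locked, but we're able to merge
 * our bio with the existing one.  1 is returned and the caller
 * must assume the rbio has been freed.
 *
 * 3) Someone has the stripe locked, but we're not able to merge.
 * The rbio is added to the lock owner's plug list, or merged into
 * an rbio already on the plug list.  When the lock owner unlocks,
 * the next rbio on the list is run and the IO is started
 * automatically.  1 is returned.
 */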
static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
{
	int bucket = rbio_bucket(rbio);
	struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *pending;
	unsigned long flags;
	DEFINE_WAIT(wait);
	struct btrfs_raid_bio *freeit = NULL;
	struct btrfs_raid_bio *cache_drop = NULL;
	int ret = 0;
	int walk = 0;

	spin_lock_irqsave(&h->lock, flags);
	list_for_each_entry(cur, &h->hash_list, hash_list) {
		walk++;
		if (cur->raid_map[0] == rbio->raid_map[0]) {
			spin_lock(&cur->bio_list_lock);

			/* can we steal this cached rbio's pages? */
			if (bio_list_empty(&cur->bio_list) &&
			    list_empty(&cur->plug_list) &&
			    test_bit(RBIO_CACHE_BIT, &cur->flags) &&
			    !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
				list_del_init(&cur->hash_list);
				atomic_dec(&cur->refs);

				steal_rbio(cur, rbio);
				cache_drop = cur;
				spin_unlock(&cur->bio_list_lock);

				goto lockit;
			}

			/* can we merge into the lock owner? */
			if (rbio_can_merge(cur, rbio)) {
				merge_rbio(cur, rbio);
				spin_unlock(&cur->bio_list_lock);
				freeit = rbio;
				ret = 1;
				goto out;
			}

			/*
			 * we couldn't merge with the running
			 * rbio, see if we can merge with the
			 * pending ones.  We don't have to
			 * check for rmw_locked because there
			 * is no way they are inside finish_rmw
			 * right now
			 */
			list_for_each_entry(pending, &cur->plug_list,
					    plug_list) {
				if (rbio_can_merge(pending, rbio)) {
					merge_rbio(pending, rbio);
					spin_unlock(&cur->bio_list_lock);
					freeit = rbio;
					ret = 1;
					goto out;
				}
			}

			/* no merging, put us on the tail of the plug list,
			 * our rbio will be started when the currently
			 * running rbio unlocks
			 */
			list_add_tail(&rbio->plug_list, &cur->plug_list);
			spin_unlock(&cur->bio_list_lock);
			ret = 1;
			goto out;
		}
	}
lockit:
	atomic_inc(&rbio->refs);
	list_add(&rbio->hash_list, &h->hash_list);
out:
	spin_unlock_irqrestore(&h->lock, flags);
	if (cache_drop)
		remove_rbio_from_cache(cache_drop);
	if (freeit)
		__free_raid_bio(freeit);
	return ret;
}

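/*
 * called as rmw or parity rebuild is completed.  If the plug list has more
 * rbios waiting for this stripe, the next one on the list will be started.
 */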
static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
{
	int bucket;
	struct btrfs_stripe_hash *h;
	unsigned long flags;
	int keep_cache = 0;

	bucket = rbio_bucket(rbio);
	h = rbio->fs_info->stripe_hash_table->table + bucket;

	if (list_empty(&rbio->plug_list))
		cache_rbio(rbio);

	spin_lock_irqsave(&h->lock, flags);
	spin_lock(&rbio->bio_list_lock);

	if (!list_empty(&rbio->hash_list)) {
		/*
		 * if we're still cached and there is no other IO
		 * to perform, just leave this rbio here for others
		 * to steal from later
		 */
		if (list_empty(&rbio->plug_list) &&
		    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
			keep_cache = 1;
			clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
			BUG_ON(!bio_list_empty(&rbio->bio_list));
			goto done;
		}

		list_del_init(&rbio->hash_list);
		atomic_dec(&rbio->refs);

		/*
		 * we use the plug list to hold all the rbios
		 * waiting for the chance to lock this stripe.
		 * hand the lock over to one of them.
		 */
		if (!list_empty(&rbio->plug_list)) {
			struct btrfs_raid_bio *next;
			struct list_head *head = rbio->plug_list.next;

			next = list_entry(head, struct btrfs_raid_bio,
					  plug_list);

			list_del_init(&rbio->plug_list);

			list_add(&next->hash_list, &h->hash_list);
			atomic_inc(&next->refs);
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);

			if (next->operation == BTRFS_RBIO_READ_REBUILD)
				async_read_rebuild(next);
			else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				async_rmw_stripe(next);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				async_scrub_parity(next);
			}

			goto done_nolock;
		} else if (waitqueue_active(&h->wait)) {
			spin_unlock(&rbio->bio_list_lock);
			spin_unlock_irqrestore(&h->lock, flags);
			wake_up(&h->wait);
			goto done_nolock;
		}
	}
done:
	spin_unlock(&rbio->bio_list_lock);
	spin_unlock_irqrestore(&h->lock, flags);

done_nolock:
	if (!keep_cache)
		remove_rbio_from_cache(rbio);
}

static inline void
__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
{
	if (need) {
		kfree(raid_map);
		kfree(bbio);
	}
}

static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
{
	__free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
			!test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
}

static void __free_raid_bio(struct btrfs_raid_bio *rbio)
{
	int i;

	WARN_ON(atomic_read(&rbio->refs) < 0);
	if (!atomic_dec_and_test(&rbio->refs))
		return;

	WARN_ON(!list_empty(&rbio->stripe_cache));
	WARN_ON(!list_empty(&rbio->hash_list));
	WARN_ON(!bio_list_empty(&rbio->bio_list));

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i]) {
			__free_page(rbio->stripe_pages[i]);
			rbio->stripe_pages[i] = NULL;
		}
	}

	free_bbio_and_raid_map(rbio);

	kfree(rbio);
}

static void free_raid_bio(struct btrfs_raid_bio *rbio)
{
	unlock_stripe(rbio);
	__free_raid_bio(rbio);
}

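/*
 * this frees the rbio and runs through all the bios in the
 * bio_list and calls end_io on them
 */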
static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
{
	struct bio *cur = bio_list_get(&rbio->bio_list);
	struct bio *next;

	if (rbio->generic_bio_cnt)
		btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);

	free_raid_bio(rbio);

	while (cur) {
		next = cur->bi_next;
		cur->bi_next = NULL;
		if (uptodate)
			set_bit(BIO_UPTODATE, &cur->bi_flags);
		bio_endio(cur, err);
		cur = next;
	}
}

/*
 * end io function used by finish_rmw.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	/* OK, we have all the writes done, check for errors */
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
}

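/*
 * the read/modify/write code wants to use the original bio for
 * any pages it included, and then use the rbio for everything
 * else.  This function decides if a given index (stripe number)
 * and page number in that stripe fall inside the original bio
 * or the rbio.
 *
 * if you set bio_list_only, you'll get a NULL back for any ranges
 * that are outside the bio_list
 *
 * This doesn't take any refs on anything, you get a bare page pointer
 * and the caller must bump refs as required.
 *
 * You must call index_rbio_pages once before you can trust
 * the answers from this function.
 */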
static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
				 int index, int pagenr, int bio_list_only)
{
	int chunk_page;
	struct page *p = NULL;

	chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;

	spin_lock_irq(&rbio->bio_list_lock);
	p = rbio->bio_pages[chunk_page];
	spin_unlock_irq(&rbio->bio_list_lock);

	if (p || bio_list_only)
		return p;

	return rbio->stripe_pages[chunk_page];
}

/*
 * number of pages we need for the entire stripe across all the
 * drives
 */
static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
{
	unsigned long nr = stripe_len * nr_stripes;
	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
}

/*
 * allocation and initial setup for the btrfs_raid_bio.  Note that
 * this does not allocate any pages for rbio->stripe_pages.
 */
static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
					 struct btrfs_bio *bbio, u64 *raid_map,
					 u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	int nr_data = 0;
	int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
	int num_pages = rbio_nr_pages(stripe_len, real_stripes);
	int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
	void *p;

	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
		       GFP_NOFS);
	if (!rbio)
		return ERR_PTR(-ENOMEM);

	bio_list_init(&rbio->bio_list);
	INIT_LIST_HEAD(&rbio->plug_list);
	spin_lock_init(&rbio->bio_list_lock);
	INIT_LIST_HEAD(&rbio->stripe_cache);
	INIT_LIST_HEAD(&rbio->hash_list);
	rbio->bbio = bbio;
	rbio->raid_map = raid_map;
	rbio->fs_info = root->fs_info;
	rbio->stripe_len = stripe_len;
	rbio->nr_pages = num_pages;
	rbio->real_stripes = real_stripes;
	rbio->stripe_npages = stripe_npages;
	rbio->faila = -1;
	rbio->failb = -1;
	atomic_set(&rbio->refs, 1);
	atomic_set(&rbio->error, 0);
	atomic_set(&rbio->stripes_pending, 0);

	/*
	 * the stripe_pages and bio_pages array point to the extra
	 * memory we allocated past the end of the rbio
	 */
	p = rbio + 1;
	rbio->stripe_pages = p;
	rbio->bio_pages = p + sizeof(struct page *) * num_pages;
	rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;

	if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE)
		nr_data = real_stripes - 2;
	else
		nr_data = real_stripes - 1;

	rbio->nr_data = nr_data;
	return rbio;
}

/* allocate pages for all the stripes in the bio, including parity */
static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	for (i = 0; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
		ClearPageUptodate(page);
	}
	return 0;
}

/* only allocate pages for p/q stripes */
static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	struct page *page;

	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;

	for (; i < rbio->nr_pages; i++) {
		if (rbio->stripe_pages[i])
			continue;
		page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!page)
			return -ENOMEM;
		rbio->stripe_pages[i] = page;
	}
	return 0;
}

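/*
 * add a single page from a specific stripe into our list of bios for IO
 * this will try to merge into existing bios if possible, and returns
 * zero if all went well.
 */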
static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
			    struct bio_list *bio_list,
			    struct page *page,
			    int stripe_nr,
			    unsigned long page_index,
			    unsigned long bio_max_len)
{
	struct bio *last = bio_list->tail;
	u64 last_end = 0;
	int ret;
	struct bio *bio;
	struct btrfs_bio_stripe *stripe;
	u64 disk_start;

	stripe = &rbio->bbio->stripes[stripe_nr];
	disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);

	/* if the device is missing, just fail this stripe */
	if (!stripe->dev->bdev)
		return fail_rbio_index(rbio, stripe_nr);

	/* see if we can add this page onto our existing bio */
	if (last) {
		last_end = (u64)last->bi_iter.bi_sector << 9;
		last_end += last->bi_iter.bi_size;

		/*
		 * we can't merge these if they are from different
		 * devices or if they are not contiguous
		 */
		if (last_end == disk_start && stripe->dev->bdev &&
		    test_bit(BIO_UPTODATE, &last->bi_flags) &&
		    last->bi_bdev == stripe->dev->bdev) {
			ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
			if (ret == PAGE_CACHE_SIZE)
				return 0;
		}
	}

	/* put a new bio on the list */
	bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1);
	if (!bio)
		return -ENOMEM;

	bio->bi_iter.bi_size = 0;
	bio->bi_bdev = stripe->dev->bdev;
	bio->bi_iter.bi_sector = disk_start >> 9;
	set_bit(BIO_UPTODATE, &bio->bi_flags);

	bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
	bio_list_add(bio_list, bio);
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes.
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

/*
 * helper to find the page for a given stripe number and
 * page number inside that stripe
 */
static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
{
	int index;
	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
	index += page;
	return rbio->stripe_pages[index];
}

/*
 * helper function to walk our bio list and populate the bio_pages array
 * with the result.  This seems expensive, but it is faster than constantly
 * searching through the bio list as we setup the IO in finish_rmw or
 * steal_rbio.
 *
 * This must be called before you trust the answers from page_in_rbio.
 */
static void index_rbio_pages(struct btrfs_raid_bio *rbio)
{
	struct bio *bio;
	u64 start;
	unsigned long stripe_offset;
	unsigned long page_index;
	struct page *p;
	int i;

	spin_lock_irq(&rbio->bio_list_lock);
	bio_list_for_each(bio, &rbio->bio_list) {
		start = (u64)bio->bi_iter.bi_sector << 9;
		stripe_offset = start - rbio->raid_map[0];
		page_index = stripe_offset >> PAGE_CACHE_SHIFT;

		for (i = 0; i < bio->bi_vcnt; i++) {
			p = bio->bi_io_vec[i].bv_page;
			rbio->bio_pages[page_index + i] = p;
		}
	}
	spin_unlock_irq(&rbio->bio_list_lock);
}

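/*
 * this is called from one of two situations.  We either
 * have a full stripe from the higher layers, or we've read all
 * the missing bits off disk.
 *
 * This will calculate the parity and then send down any
 * changed blocks.
 */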
static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	int stripe_len = rbio->stripe_len;
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct bio_list bio_list;
	struct bio *bio;
	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	/* at this point we either have a full stripe,
	 * or we've read the full stripe from the drive.
	 * recalculate the parity and write the new results.
	 *
	 * We're not allowed to add any new bios to the
	 * bio list here, anyone else that wants to
	 * change this stripe needs to do their own rmw.
	 */
	spin_lock_irq(&rbio->bio_list_lock);
	set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
	spin_unlock_irq(&rbio->bio_list_lock);

	atomic_set(&rbio->error, 0);

	/*
	 * now that we've set rmw_locked, run through the
	 * bio list one last time and map the page pointers
	 *
	 * We don't cache full rbios because we're assuming
	 * the higher layers are unlikely to use this area of
	 * the disk again soon.  If they do use it again,
	 * hopefully they will send another full bio.
	 */
	index_rbio_pages(rbio);
	if (!rbio_is_full(rbio))
		cache_rbio_pages(rbio);
	else
		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
		struct page *p;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		p = rbio_pstripe_page(rbio, pagenr);
		SetPageUptodate(p);
		pointers[stripe++] = kmap(p);

		if (q_stripe != -1) {
			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			p = rbio_qstripe_page(rbio, pagenr);
			SetPageUptodate(p);
			pointers[stripe++] = kmap(p);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list,
				       page, stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	if (likely(!bbio->num_tgtdevs))
		goto write_data;

	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (!bbio->tgtdev_map[stripe])
			continue;

		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
			struct page *page;
			if (stripe < rbio->nr_data) {
				page = page_in_rbio(rbio, stripe, pagenr, 1);
				if (!page)
					continue;
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}

			ret = rbio_add_io_page(rbio, &bio_list, page,
					       rbio->bbio->tgtdev_map[stripe],
					       pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

write_data:
	atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
	BUG_ON(atomic_read(&rbio->stripes_pending) == 0);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

/*
 * helper to find the stripe number for a given bio.  Used to figure out which
 * stripe has failed.  This expects the bio to correspond to a physical disk,
 * so it looks up based on physical sector numbers.
 */
static int find_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	u64 physical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;
	struct btrfs_bio_stripe *stripe;

	physical <<= 9;

	for (i = 0; i < rbio->bbio->num_stripes; i++) {
		stripe = &rbio->bbio->stripes[i];
		stripe_start = stripe->physical;
		if (physical >= stripe_start &&
		    physical < stripe_start + rbio->stripe_len &&
		    bio->bi_bdev == stripe->dev->bdev) {
			return i;
		}
	}
	return -1;
}

/*
 * helper to find the stripe number for a given
 * bio (before mapping).  Used to figure out which stripe has
 * failed.  This looks up based on logical block numbers.
 */
static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
				   struct bio *bio)
{
	u64 logical = bio->bi_iter.bi_sector;
	u64 stripe_start;
	int i;

	logical <<= 9;

	for (i = 0; i < rbio->nr_data; i++) {
		stripe_start = rbio->raid_map[i];
		if (logical >= stripe_start &&
		    logical < stripe_start + rbio->stripe_len) {
			return i;
		}
	}
	return -1;
}

/*
 * returns -EIO if we had too many failures
 */
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
{
	unsigned long flags;
	int ret = 0;

	spin_lock_irqsave(&rbio->bio_list_lock, flags);

	/* we already know this stripe is bad, move on */
	if (rbio->faila == failed || rbio->failb == failed)
		goto out;

	if (rbio->faila == -1) {
		/* first failure on this rbio */
		rbio->faila = failed;
		atomic_inc(&rbio->error);
	} else if (rbio->failb == -1) {
		/* second failure on this rbio */
		rbio->failb = failed;
		atomic_inc(&rbio->error);
	} else {
		ret = -EIO;
	}
out:
	spin_unlock_irqrestore(&rbio->bio_list_lock, flags);

	return ret;
}

/*
 * helper to fail a stripe based on a physical disk
 * bio.
 */
static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
			   struct bio *bio)
{
	int failed = find_bio_stripe(rbio, bio);

	if (failed < 0)
		return -EIO;

	return fail_rbio_index(rbio, failed);
}

/*
 * this sets each page in the bio uptodate.  It should only be used on
 * private rbio pages, nothing that comes in from the higher layers
 */
static void set_bio_pages_uptodate(struct bio *bio)
{
	int i;
	struct page *p;

	for (i = 0; i < bio->bi_vcnt; i++) {
		p = bio->bi_io_vec[i].bv_page;
		SetPageUptodate(p);
	}
}

/*
 * end io for the read phase of the rmw cycle.  All the bios here are physical
 * stripe bios we've read from the disk so we can recalculate the parity of the
 * stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid_rmw_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	/*
	 * this will normally call finish_rmw to start our write
	 * but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_rmw(rbio);
	return;

cleanup:

	rbio_orig_end_io(rbio, -EIO, 0);
}

static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			rmw_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

static void async_read_rebuild(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			read_rebuild_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

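/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */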
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);

	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *page;

			/*
			 * we want to write everything, but pages
			 * already in the bio_list were handed to us
			 * by the higher layers, so we don't need to
			 * read those
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_rmw_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret) {
		__free_raid_bio(rbio);
		return ret;
	}

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		async_rmw_stripe(rbio);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come into create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
 * we collect it into a list.  When the unplug comes down,
 * we sort the list by logical block number and merge
 * everything possible into the same rbios
 */
struct btrfs_plug_cb {
	struct blk_plug_cb cb;
	struct btrfs_fs_info *info;
	struct list_head rbio_list;
	struct btrfs_work work;
};

/*
 * rbios on the plug list are sorted for easier merging.
 */
static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
						 plug_list);
	struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
						 plug_list);
	u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
	u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;

	if (a_sector < b_sector)
		return -1;
	if (a_sector > b_sector)
		return 1;
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
{
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);
	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			/* we have a full stripe, send it down */
			full_stripe_write(cur);
			continue;
		}
		if (last) {
			if (rbio_can_merge(last, cur)) {
				merge_rbio(last, cur);
				__free_raid_bio(cur);
				continue;
			}
			__raid56_parity_write(last);
		}
		last = cur;
	}
	if (last)
		__raid56_parity_write(last);
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct btrfs_work *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		btrfs_init_work(&plug->work, btrfs_rmw_helper,
				unplug_work, NULL, NULL);
		btrfs_queue_work(plug->info->rmw_workers,
				 &plug->work);
		return;
	}
	run_plug(plug);
}

/*
 * our main entry point for writes from the rest of the FS.
 */
int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
			struct btrfs_bio *bbio, u64 *raid_map,
			u64 stripe_len)
{
	struct btrfs_raid_bio *rbio;
	struct btrfs_plug_cb *plug = NULL;
	struct blk_plug_cb *cb;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		__free_bbio_and_raid_map(bbio, raid_map, 1);
		return PTR_ERR(rbio);
	}
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;
	rbio->operation = BTRFS_RBIO_WRITE;

	btrfs_bio_counter_inc_noblocked(root->fs_info);
	rbio->generic_bio_cnt = 1;

	/*
	 * don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(root->fs_info);
		return ret;
	}

	cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
			       sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
			plug->info = root->fs_info;
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
		ret = 0;
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret)
			btrfs_bio_counter_dec(root->fs_info);
	}
	return ret;
}

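/*
 * all parity reconstruction happens here.  We've read in everything
 * we can find from the drives and this does the heavy lifting of
 * sorting the good from the bad.
 */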
static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
	int pagenr, stripe;
	void **pointers;
	int faila = -1, failb = -1;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	struct page *page;
	int err;
	int i;

	pointers = kzalloc(rbio->real_stripes * sizeof(void *),
			   GFP_NOFS);
	if (!pointers) {
		err = -ENOMEM;
		goto cleanup_io;
	}

	faila = rbio->faila;
	failb = rbio->failb;

	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		spin_lock_irq(&rbio->bio_list_lock);
		set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
		spin_unlock_irq(&rbio->bio_list_lock);
	}

	index_rbio_pages(rbio);

	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
		/*
		 * Now we just use bitmap to mark the horizontal stripes in
		 * which we have data when doing parity scrub.
		 */
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		    !test_bit(pagenr, rbio->dbitmap))
			continue;

		/*
		 * setup our array of pointers with pages from each stripe
		 */
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			pointers[stripe] = kmap(page);
		}

		/* all raid6 handling here */
		if (rbio->raid_map[rbio->real_stripes - 1] ==
		    RAID6_Q_STRIPE) {
			/*
			 * single failure, rebuild from parity raid5
			 * style
			 */
			if (failb < 0) {
				if (faila == rbio->nr_data) {
					/*
					 * the single failure is the P stripe
					 * itself; this path only rebuilds
					 * data, so error out
					 */
					err = -EIO;
					goto cleanup;
				}
				/*
				 * a single failure in raid6 is rebuilt
				 * in the pstripe code below
				 */
				goto pstripe;
			}

			/* make sure our ps and qs are in order */
			if (faila > failb) {
				int tmp = failb;
				failb = faila;
				faila = tmp;
			}

			/* if the q stripe is failed, do a pstripe reconstruction
			 * from the xors.
			 * If both the q stripe and the P stripe are failed, we're
			 * here due to a crc mismatch and we can't give them the
			 * data they want
			 */
			if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
				if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
					err = -EIO;
					goto cleanup;
				}
				/*
				 * otherwise we have one bad data stripe and
				 * a good P stripe.  raid5!
				 */
				goto pstripe;
			}

			if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
				raid6_datap_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, pointers);
			} else {
				raid6_2data_recov(rbio->real_stripes,
						  PAGE_SIZE, faila, failb,
						  pointers);
			}
		} else {
			void *p;

			/* rebuild from P stripe here (raid5 or raid6) */
			BUG_ON(failb != -1);
pstripe:
			/* Copy parity block into failed block to start with */
			memcpy(pointers[faila],
			       pointers[rbio->nr_data],
			       PAGE_CACHE_SIZE);

			/* rearrange the pointer array */
			p = pointers[faila];
			for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
				pointers[stripe] = pointers[stripe + 1];
			pointers[rbio->nr_data - 1] = p;

			/* xor in the rest */
			run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* if we're doing this rebuild as part of an rmw, go through
		 * and set all of our private rbio pages in the
		 * failed stripes as uptodate.  This way finish_rmw will
		 * know they can be trusted.  If this was a read reconstruction,
		 * other endio functions will fiddle the uptodate bits
		 */
		if (rbio->operation == BTRFS_RBIO_WRITE) {
			for (i = 0; i < nr_pages; i++) {
				if (faila != -1) {
					page = rbio_stripe_page(rbio, faila, i);
					SetPageUptodate(page);
				}
				if (failb != -1) {
					page = rbio_stripe_page(rbio, failb, i);
					SetPageUptodate(page);
				}
			}
		}
		for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
			/*
			 * if we're rebuilding a read, we have to use
			 * pages from the bio list
			 */
			if (rbio->operation == BTRFS_RBIO_READ_REBUILD &&
			    (stripe == faila || stripe == failb)) {
				page = page_in_rbio(rbio, stripe, pagenr, 0);
			} else {
				page = rbio_stripe_page(rbio, stripe, pagenr);
			}
			kunmap(page);
		}
	}

	err = 0;
cleanup:
	kfree(pointers);

cleanup_io:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
		if (err == 0 &&
		    !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
			cache_rbio_pages(rbio);
		else
			clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

		rbio_orig_end_io(rbio, err, err == 0);
	} else if (err == 0) {
		rbio->faila = -1;
		rbio->failb = -1;

		if (rbio->operation == BTRFS_RBIO_WRITE)
			finish_rmw(rbio);
		else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
			finish_parity_scrub(rbio, 0);
		else
			BUG();
	} else {
		rbio_orig_end_io(rbio, err, 0);
	}
}

/*
 * This is called only for stripes we've read from disk to
 * reconstruct the parity.
 */
static void raid_recover_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	/*
	 * we only read stripe pages off the disk, set them
	 * up to date if there were no errors
	 */
	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);
	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		rbio_orig_end_io(rbio, -EIO, 0);
	else
		__raid_recover_end_io(rbio);
}

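/*
 * reads everything we need off the disk to reconstruct
 * the parity.  endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */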
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
	int pagenr;
	int stripe;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	/*
	 * read everything that hasn't failed.  Thanks to the
	 * stripe cache, it is possible that some or all of these
	 * pages are going to be uptodate.
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		if (rbio->faila == stripe || rbio->failb == stripe) {
			atomic_inc(&rbio->error);
			continue;
		}

		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
			struct page *p;

			/*
			 * the rmw code may have already read this
			 * page in
			 */
			p = rbio_stripe_page(rbio, stripe, pagenr);
			if (PageUptodate(p))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list,
				       rbio_stripe_page(rbio, stripe, pagenr),
				       stripe, pagenr, rbio->stripe_len);
			if (ret < 0)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read
		 * because the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
			__raid_recover_end_io(rbio);
			goto out;
		} else {
			goto cleanup;
		}
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_recover_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
out:
	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD)
		rbio_orig_end_io(rbio, -EIO, 0);
	return -EIO;
}

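/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
 * so we assume the bio they send down corresponds to a failed part
 * of the drive.
 */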
int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
			  struct btrfs_bio *bbio, u64 *raid_map,
			  u64 stripe_len, int mirror_num, int generic_io)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio)) {
		__free_bbio_and_raid_map(bbio, raid_map, generic_io);
		return PTR_ERR(rbio);
	}

	rbio->operation = BTRFS_RBIO_READ_REBUILD;
	bio_list_add(&rbio->bio_list, bio);
	rbio->bio_list_bytes = bio->bi_iter.bi_size;

	rbio->faila = find_logical_bio_stripe(rbio, bio);
	if (rbio->faila == -1) {
		BUG();
		__free_bbio_and_raid_map(bbio, raid_map, generic_io);
		kfree(rbio);
		return -EIO;
	}

	if (generic_io) {
		btrfs_bio_counter_inc_noblocked(root->fs_info);
		rbio->generic_bio_cnt = 1;
	} else {
		set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags);
	}

	/*
	 * reconstruct from the q stripe if they ask
	 * for mirror 3
	 */
	if (mirror_num == 3)
		rbio->failb = rbio->real_stripes - 2;

	ret = lock_stripe_add(rbio);

	/*
	 * __raid56_parity_recover will end the bio with
	 * any errors it hits.  We don't want to return
	 * its return value in this case because the restore
	 * code cleaned up the rbio already.  If we return
	 * an error here, the bio would be freed twice
	 * by the callers.
	 */
	if (ret == 0)
		__raid56_parity_recover(rbio);

	/*
	 * our rbio has been added to the list of
	 * rbios that will be handled after the
	 * current lock owner is done
	 */
	return 0;
}

static void rmw_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);
}

static void read_rebuild_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	__raid56_parity_recover(rbio);
}

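/*
 * The following code is used to scrub/replace the parity stripe.
 *
 * Note: we need to make sure all the pages that are added into the
 * scrub/replace raid bio are correct and will not be changed during
 * the scrub/replace, i.e. pages that just hold metadata or file data
 * with checksum.
 */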
struct btrfs_raid_bio *
raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
			       struct btrfs_bio *bbio, u64 *raid_map,
			       u64 stripe_len, struct btrfs_device *scrub_dev,
			       unsigned long *dbitmap, int stripe_nsectors)
{
	struct btrfs_raid_bio *rbio;
	int i;

	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
	if (IS_ERR(rbio))
		return NULL;
	bio_list_add(&rbio->bio_list, bio);
	/*
	 * This is a special bio which is used to hold the completion
	 * handler and make the scrub rbio similar to the other types
	 */
	ASSERT(!bio->bi_iter.bi_size);
	rbio->operation = BTRFS_RBIO_PARITY_SCRUB;

	for (i = 0; i < rbio->real_stripes; i++) {
		if (bbio->stripes[i].dev == scrub_dev) {
			rbio->scrubp = i;
			break;
		}
	}

	/* now we just support the sectorsize equals to page size */
	ASSERT(root->sectorsize == PAGE_SIZE);
	ASSERT(rbio->stripe_npages == stripe_nsectors);
	bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);

	return rbio;
}

void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
				   struct page *page, u64 logical)
{
	int stripe_offset;
	int index;

	ASSERT(logical >= rbio->raid_map[0]);
	ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] +
	       rbio->stripe_len * rbio->nr_data);
	stripe_offset = (int)(logical - rbio->raid_map[0]);
	index = stripe_offset >> PAGE_CACHE_SHIFT;
	rbio->bio_pages[index] = page;
}

/*
 * We just scrub the parity that we have correct data on the same horizontal,
 * so we needn't allocate all pages for all the stripes.
 */
static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
{
	int i;
	int bit;
	int index;
	struct page *page;

	for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
		for (i = 0; i < rbio->real_stripes; i++) {
			index = i * rbio->stripe_npages + bit;
			if (rbio->stripe_pages[index])
				continue;

			page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
			if (!page)
				return -ENOMEM;
			rbio->stripe_pages[index] = page;
			ClearPageUptodate(page);
		}
	}
	return 0;
}

/*
 * end io function used by finish_parity_scrub.  When we finally
 * get here, we've written a full stripe
 */
static void raid_write_parity_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	err = 0;

	if (atomic_read(&rbio->error))
		err = -EIO;

	rbio_orig_end_io(rbio, err, 0);
}

static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
					 int need_check)
{
	struct btrfs_bio *bbio = rbio->bbio;
	void *pointers[rbio->real_stripes];
	DECLARE_BITMAP(pbitmap, rbio->stripe_npages);
	int nr_data = rbio->nr_data;
	int stripe;
	int pagenr;
	int p_stripe = -1;
	int q_stripe = -1;
	struct page *p_page = NULL;
	struct page *q_page = NULL;
	struct bio_list bio_list;
	struct bio *bio;
	int is_replace = 0;
	int ret;

	bio_list_init(&bio_list);

	if (rbio->real_stripes - rbio->nr_data == 1) {
		p_stripe = rbio->real_stripes - 1;
	} else if (rbio->real_stripes - rbio->nr_data == 2) {
		p_stripe = rbio->real_stripes - 2;
		q_stripe = rbio->real_stripes - 1;
	} else {
		BUG();
	}

	if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
		is_replace = 1;
		bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
	}

	/*
	 * Because the higher layers (scrubber) are unlikely to
	 * use this area of the disk again soon, don't cache
	 * it.
	 */
	clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);

	if (!need_check)
		goto writeback;

	p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
	if (!p_page)
		goto cleanup;
	SetPageUptodate(p_page);

	if (q_stripe != -1) {
		q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
		if (!q_page) {
			__free_page(p_page);
			goto cleanup;
		}
		SetPageUptodate(q_page);
	}

	atomic_set(&rbio->error, 0);

	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *p;
		void *parity;

		/* first collect one page from each data stripe */
		for (stripe = 0; stripe < nr_data; stripe++) {
			p = page_in_rbio(rbio, stripe, pagenr, 0);
			pointers[stripe] = kmap(p);
		}

		/* then add the parity stripe */
		pointers[stripe++] = kmap(p_page);

		if (q_stripe != -1) {
			/*
			 * raid6, add the qstripe and call the
			 * library function to fill in our p/q
			 */
			pointers[stripe++] = kmap(q_page);

			raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
						pointers);
		} else {
			/* raid5 */
			memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
			run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
		}

		/* check the scrubbing parity and repair it */
		p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		parity = kmap(p);
		if (memcmp(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE))
			memcpy(parity, pointers[rbio->scrubp], PAGE_CACHE_SIZE);
		else
			/* parity is right, no need to write it back */
			bitmap_clear(rbio->dbitmap, pagenr, 1);
		kunmap(p);

		for (stripe = 0; stripe < rbio->real_stripes; stripe++)
			kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
	}

	__free_page(p_page);
	if (q_page)
		__free_page(q_page);

writeback:
	/*
	 * time to start writing.  Make bios for everything from the
	 * higher layers (the bio_list in our rbio) and our p/q.  Ignore
	 * everything else.
	 */
	for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list,
			       page, rbio->scrubp, pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

	if (!is_replace)
		goto submit_write;

	for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
		struct page *page;

		page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
		ret = rbio_add_io_page(rbio, &bio_list, page,
				       bbio->tgtdev_map[rbio->scrubp],
				       pagenr, rbio->stripe_len);
		if (ret)
			goto cleanup;
	}

submit_write:
	nr_data = bio_list_size(&bio_list);
	if (!nr_data) {
		/* every parity is right */
		rbio_orig_end_io(rbio, 0, 0);
		return;
	}

	atomic_set(&rbio->stripes_pending, nr_data);

	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid_write_parity_end_io;
		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(WRITE, bio);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
{
	if (stripe >= 0 && stripe < rbio->nr_data)
		return 1;
	return 0;
}

/*
 * while we're doing the parity check and repair, we could have errors
 * in reading pages off the disk.  This checks for errors and if we're
 * not able to read the page it'll trigger parity reconstruction.  The
 * parity scrub will be finished after we've reconstructed the failed
 * stripes.
 */
static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
{
	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
		goto cleanup;

	if (rbio->faila >= 0 || rbio->failb >= 0) {
		int dfail = 0, failp = -1;

		if (is_data_stripe(rbio, rbio->faila))
			dfail++;
		else if (is_parity_stripe(rbio->faila))
			failp = rbio->faila;

		if (is_data_stripe(rbio, rbio->failb))
			dfail++;
		else if (is_parity_stripe(rbio->failb))
			failp = rbio->failb;

		/*
		 * Because we can not use a scrubbing parity to repair
		 * the data, our tolerance for data failures is reduced
		 * by one here.
		 */
		if (dfail > rbio->bbio->max_errors - 1)
			goto cleanup;

		/*
		 * If all the data reads succeeded, only the parity has
		 * problems, so just rewrite the parity.
		 */
		if (dfail == 0) {
			finish_parity_scrub(rbio, 0);
			return;
		}

		/*
		 * Here we have one corrupted data stripe and one corrupted
		 * parity on raid6.  If the corrupted parity is the one
		 * being scrubbed, we can use the other parity to repair
		 * the data; otherwise the data stripe can not be repaired.
		 */
		if (failp != rbio->scrubp)
			goto cleanup;

		__raid_recover_end_io(rbio);
	} else {
		finish_parity_scrub(rbio, 1);
	}
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
}

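/*
 * end io for the read phase of the parity scrub.  All the bios here
 * are physical stripe bios we've read from the disk so we can
 * recalculate the parity of the stripe.
 *
 * This will usually kick off finish_parity_scrub once all the bios
 * are read in, but it may trigger parity reconstruction if we had
 * any errors along the way.
 */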
static void raid56_parity_scrub_end_io(struct bio *bio, int err)
{
	struct btrfs_raid_bio *rbio = bio->bi_private;

	if (err)
		fail_bio_stripe(rbio, bio);
	else
		set_bio_pages_uptodate(bio);

	bio_put(bio);

	if (!atomic_dec_and_test(&rbio->stripes_pending))
		return;

	/*
	 * this will normally call finish_parity_scrub to start our
	 * write, but if there are any failed stripes we'll reconstruct
	 * from parity first
	 */
	validate_rbio_for_parity_scrub(rbio);
}

static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	int pagenr;
	int stripe;
	struct bio *bio;

	ret = alloc_rbio_essential_pages(rbio);
	if (ret)
		goto cleanup;

	bio_list_init(&bio_list);

	atomic_set(&rbio->error, 0);
	/*
	 * build a list of bios to read all the missing parts of this
	 * stripe
	 */
	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
		for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
			struct page *page;
			/*
			 * we want to write everything, but pages
			 * already in the bio_list were handed to us
			 * by the higher layers, so we don't need to
			 * read those
			 */
			page = page_in_rbio(rbio, stripe, pagenr, 1);
			if (page)
				continue;

			page = rbio_stripe_page(rbio, stripe, pagenr);
			/*
			 * the bio cache may have handed us an uptodate
			 * page.  If so, be happy and use it
			 */
			if (PageUptodate(page))
				continue;

			ret = rbio_add_io_page(rbio, &bio_list, page,
				       stripe, pagenr, rbio->stripe_len);
			if (ret)
				goto cleanup;
		}
	}

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * the bbio may be freed once we submit the last bio.  Make sure
	 * not to touch it after that
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	while (1) {
		bio = bio_list_pop(&bio_list);
		if (!bio)
			break;

		bio->bi_private = rbio;
		bio->bi_end_io = raid56_parity_scrub_end_io;

		btrfs_bio_wq_end_io(rbio->fs_info, bio,
				    BTRFS_WQ_ENDIO_RAID56);

		BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
		submit_bio(READ, bio);
	}
	/* the actual write will happen once the reads are done */
	return;

cleanup:
	rbio_orig_end_io(rbio, -EIO, 0);
	return;

finish:
	validate_rbio_for_parity_scrub(rbio);
}

static void scrub_parity_work(struct btrfs_work *work)
{
	struct btrfs_raid_bio *rbio;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_parity_scrub_stripe(rbio);
}

static void async_scrub_parity(struct btrfs_raid_bio *rbio)
{
	btrfs_init_work(&rbio->work, btrfs_rmw_helper,
			scrub_parity_work, NULL, NULL);

	btrfs_queue_work(rbio->fs_info->rmw_workers,
			 &rbio->work);
}

void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
{
	if (!lock_stripe_add(rbio))
		async_scrub_parity(rbio);
}