linux/fs/btrfs/raid56.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2012 Fusion-io  All rights reserved.
   4 * Copyright (C) 2012 Intel Corp. All rights reserved.
   5 */
   6
   7#include <linux/sched.h>
   8#include <linux/bio.h>
   9#include <linux/slab.h>
  10#include <linux/blkdev.h>
  11#include <linux/raid/pq.h>
  12#include <linux/hash.h>
  13#include <linux/list_sort.h>
  14#include <linux/raid/xor.h>
  15#include <linux/mm.h>
  16#include "ctree.h"
  17#include "disk-io.h"
  18#include "volumes.h"
  19#include "raid56.h"
  20#include "async-thread.h"
  21
  22/* set when additional merges to this rbio are not allowed */
  23#define RBIO_RMW_LOCKED_BIT     1
  24
  25/*
  26 * set when this rbio is sitting in the hash, but it is just a cache
  27 * of past RMW
  28 */
  29#define RBIO_CACHE_BIT          2
  30
  31/*
  32 * set when it is safe to trust the stripe_pages for caching
  33 */
  34#define RBIO_CACHE_READY_BIT    3
  35
  36#define RBIO_CACHE_SIZE 1024
  37
  38enum btrfs_rbio_ops {
  39        BTRFS_RBIO_WRITE,
  40        BTRFS_RBIO_READ_REBUILD,
  41        BTRFS_RBIO_PARITY_SCRUB,
  42        BTRFS_RBIO_REBUILD_MISSING,
  43};
  44
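/*
 * Rough overview of how an rbio is linked and locked (summarizing the code
 * below):
 *
 *  - hash_list puts the rbio into one bucket of the stripe hash table and is
 *    protected by that bucket's lock.
 *  - stripe_cache puts it onto the table-wide LRU and is protected by the
 *    table's cache_lock.
 *  - bio_list, bio_list_bytes and plug_list are protected by bio_list_lock.
 */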
  45struct btrfs_raid_bio {
  46        struct btrfs_fs_info *fs_info;
  47        struct btrfs_bio *bbio;
  48
  49        /* while we're doing rmw on a stripe
  50         * we put it into a hash table so we can
  51         * lock the stripe and merge more rbios
  52         * into it.
  53         */
  54        struct list_head hash_list;
  55
  56        /*
  57         * LRU list for the stripe cache
  58         */
  59        struct list_head stripe_cache;
  60
  61        /*
  62         * for scheduling work in the helper threads
  63         */
  64        struct btrfs_work work;
  65
  66        /*
  67         * bio list and bio_list_lock are used
  68         * to add more bios into the stripe
  69         * in hopes of avoiding the full rmw
  70         */
  71        struct bio_list bio_list;
  72        spinlock_t bio_list_lock;
  73
  74        /* also protected by the bio_list_lock, the
  75         * plug list is used by the plugging code
  76         * to collect partial bios while plugged.  The
  77         * stripe locking code also uses it to hand off
  78         * the stripe lock to the next pending IO
  79         */
  80        struct list_head plug_list;
  81
  82        /*
  83         * flags that tell us if it is safe to
  84         * merge with this bio
  85         */
  86        unsigned long flags;
  87
  88        /* size of each individual stripe on disk */
  89        int stripe_len;
  90
  91        /* number of data stripes (no p/q) */
  92        int nr_data;
  93
  94        int real_stripes;
  95
  96        int stripe_npages;
  97        /*
  98         * set if we're doing a parity rebuild
  99         * for a read from higher up, which is handled
 100         * differently from a parity rebuild as part of
 101         * rmw
 102         */
 103        enum btrfs_rbio_ops operation;
 104
 105        /* first bad stripe */
 106        int faila;
 107
 108        /* second bad stripe (for raid6 use) */
 109        int failb;
 110
 111        int scrubp;
 112        /*
 113         * number of pages needed to represent the full
 114         * stripe
 115         */
 116        int nr_pages;
 117
 118        /*
 119         * size of all the bios in the bio_list.  This
 120         * helps us decide if the rbio maps to a full
 121         * stripe or not
 122         */
 123        int bio_list_bytes;
 124
 125        int generic_bio_cnt;
 126
 127        refcount_t refs;
 128
 129        atomic_t stripes_pending;
 130
 131        atomic_t error;
 132        /*
 133         * these are two arrays of pointers.  We allocate the
 134         * rbio big enough to hold them both and setup their
 135         * locations when the rbio is allocated
 136         */
 137
 138        /* pointers to pages that we allocated for
 139         * reading/writing stripes directly from the disk (including P/Q)
 140         */
 141        struct page **stripe_pages;
 142
 143        /*
 144         * pointers to the pages in the bio_list.  Stored
 145         * here for faster lookup
 146         */
 147        struct page **bio_pages;
 148
 149        /*
 150         * bitmap to record which horizontal stripe has data
 151         */
 152        unsigned long *dbitmap;
 153
 154        /* allocated with real_stripes-many pointers for finish_*() calls */
 155        void **finish_pointers;
 156
 157        /* allocated with stripe_npages-many bits for finish_*() calls */
 158        unsigned long *finish_pbitmap;
 159};
 160
 161static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 162static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 163static void rmw_work(struct btrfs_work *work);
 164static void read_rebuild_work(struct btrfs_work *work);
 165static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 166static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 167static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 168static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 169static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 170
 171static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 172                                         int need_check);
 173static void scrub_parity_work(struct btrfs_work *work);
 174
 175static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
 176{
 177        btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
 178        btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
 179}
 180
 181/*
 182 * the stripe hash table is used for locking, and to collect
 183 * bios in hopes of making a full stripe
 184 */
 185int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 186{
 187        struct btrfs_stripe_hash_table *table;
 188        struct btrfs_stripe_hash_table *x;
 189        struct btrfs_stripe_hash *cur;
 190        struct btrfs_stripe_hash *h;
 191        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 192        int i;
 193        int table_size;
 194
 195        if (info->stripe_hash_table)
 196                return 0;
 197
 198        /*
 199         * The table is large, starting with order 4 and can go as high as
 200         * order 7 in case lock debugging is turned on.
 201         *
  202         * Try harder to allocate and fall back to vmalloc to lower the chance
 203         * of a failing mount.
 204         */
 205        table_size = sizeof(*table) + sizeof(*h) * num_entries;
 206        table = kvzalloc(table_size, GFP_KERNEL);
 207        if (!table)
 208                return -ENOMEM;
 209
 210        spin_lock_init(&table->cache_lock);
 211        INIT_LIST_HEAD(&table->stripe_cache);
 212
 213        h = table->table;
 214
 215        for (i = 0; i < num_entries; i++) {
 216                cur = h + i;
 217                INIT_LIST_HEAD(&cur->hash_list);
 218                spin_lock_init(&cur->lock);
 219        }
 220
 221        x = cmpxchg(&info->stripe_hash_table, NULL, table);
 222        if (x)
 223                kvfree(x);
 224        return 0;
 225}
 226
 227/*
 228 * caching an rbio means to copy anything from the
 229 * bio_pages array into the stripe_pages array.  We
 230 * use the page uptodate bit in the stripe cache array
 231 * to indicate if it has valid data
 232 *
 233 * once the caching is done, we set the cache ready
 234 * bit.
 235 */
 236static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 237{
 238        int i;
 239        char *s;
 240        char *d;
 241        int ret;
 242
 243        ret = alloc_rbio_pages(rbio);
 244        if (ret)
 245                return;
 246
 247        for (i = 0; i < rbio->nr_pages; i++) {
 248                if (!rbio->bio_pages[i])
 249                        continue;
 250
 251                s = kmap(rbio->bio_pages[i]);
 252                d = kmap(rbio->stripe_pages[i]);
 253
 254                copy_page(d, s);
 255
 256                kunmap(rbio->bio_pages[i]);
 257                kunmap(rbio->stripe_pages[i]);
 258                SetPageUptodate(rbio->stripe_pages[i]);
 259        }
 260        set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 261}
 262
 263/*
 264 * we hash on the first logical address of the stripe
 265 */
 266static int rbio_bucket(struct btrfs_raid_bio *rbio)
 267{
 268        u64 num = rbio->bbio->raid_map[0];
 269
 270        /*
 271         * we shift down quite a bit.  We're using byte
 272         * addressing, and most of the lower bits are zeros.
 273         * This tends to upset hash_64, and it consistently
 274         * returns just one or two different values.
 275         *
 276         * shifting off the lower bits fixes things.
 277         */
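        /*
         * Illustrative example, assuming full stripes start on 64K
         * boundaries: a full stripe at logical 0x40000000 is hashed as
         * hash_64(0x40000000 >> 16, BTRFS_STRIPE_HASH_TABLE_BITS), i.e.
         * hash_64(0x4000, ...), so the low zero bits no longer dominate.
         */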
 278        return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 279}
 280
 281/*
 282 * stealing an rbio means taking all the uptodate pages from the stripe
 283 * array in the source rbio and putting them into the destination rbio
 284 */
 285static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 286{
 287        int i;
 288        struct page *s;
 289        struct page *d;
 290
 291        if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 292                return;
 293
 294        for (i = 0; i < dest->nr_pages; i++) {
 295                s = src->stripe_pages[i];
 296                if (!s || !PageUptodate(s)) {
 297                        continue;
 298                }
 299
 300                d = dest->stripe_pages[i];
 301                if (d)
 302                        __free_page(d);
 303
 304                dest->stripe_pages[i] = s;
 305                src->stripe_pages[i] = NULL;
 306        }
 307}
 308
 309/*
 310 * merging means we take the bio_list from the victim and
 311 * splice it into the destination.  The victim should
 312 * be discarded afterwards.
 313 *
  314 * must be called with dest->bio_list_lock held
 315 */
 316static void merge_rbio(struct btrfs_raid_bio *dest,
 317                       struct btrfs_raid_bio *victim)
 318{
 319        bio_list_merge(&dest->bio_list, &victim->bio_list);
 320        dest->bio_list_bytes += victim->bio_list_bytes;
 321        dest->generic_bio_cnt += victim->generic_bio_cnt;
 322        bio_list_init(&victim->bio_list);
 323}
 324
 325/*
 326 * used to prune items that are in the cache.  The caller
 327 * must hold the hash table lock.
 328 */
 329static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 330{
 331        int bucket = rbio_bucket(rbio);
 332        struct btrfs_stripe_hash_table *table;
 333        struct btrfs_stripe_hash *h;
 334        int freeit = 0;
 335
 336        /*
 337         * check the bit again under the hash table lock.
 338         */
 339        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 340                return;
 341
 342        table = rbio->fs_info->stripe_hash_table;
 343        h = table->table + bucket;
 344
 345        /* hold the lock for the bucket because we may be
 346         * removing it from the hash table
 347         */
 348        spin_lock(&h->lock);
 349
 350        /*
 351         * hold the lock for the bio list because we need
 352         * to make sure the bio list is empty
 353         */
 354        spin_lock(&rbio->bio_list_lock);
 355
 356        if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 357                list_del_init(&rbio->stripe_cache);
 358                table->cache_size -= 1;
 359                freeit = 1;
 360
 361                /* if the bio list isn't empty, this rbio is
 362                 * still involved in an IO.  We take it out
 363                 * of the cache list, and drop the ref that
 364                 * was held for the list.
 365                 *
 366                 * If the bio_list was empty, we also remove
 367                 * the rbio from the hash_table, and drop
 368                 * the corresponding ref
 369                 */
 370                if (bio_list_empty(&rbio->bio_list)) {
 371                        if (!list_empty(&rbio->hash_list)) {
 372                                list_del_init(&rbio->hash_list);
 373                                refcount_dec(&rbio->refs);
 374                                BUG_ON(!list_empty(&rbio->plug_list));
 375                        }
 376                }
 377        }
 378
 379        spin_unlock(&rbio->bio_list_lock);
 380        spin_unlock(&h->lock);
 381
 382        if (freeit)
 383                __free_raid_bio(rbio);
 384}
 385
 386/*
 387 * prune a given rbio from the cache
 388 */
 389static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 390{
 391        struct btrfs_stripe_hash_table *table;
 392        unsigned long flags;
 393
 394        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 395                return;
 396
 397        table = rbio->fs_info->stripe_hash_table;
 398
 399        spin_lock_irqsave(&table->cache_lock, flags);
 400        __remove_rbio_from_cache(rbio);
 401        spin_unlock_irqrestore(&table->cache_lock, flags);
 402}
 403
 404/*
 405 * remove everything in the cache
 406 */
 407static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 408{
 409        struct btrfs_stripe_hash_table *table;
 410        unsigned long flags;
 411        struct btrfs_raid_bio *rbio;
 412
 413        table = info->stripe_hash_table;
 414
 415        spin_lock_irqsave(&table->cache_lock, flags);
 416        while (!list_empty(&table->stripe_cache)) {
 417                rbio = list_entry(table->stripe_cache.next,
 418                                  struct btrfs_raid_bio,
 419                                  stripe_cache);
 420                __remove_rbio_from_cache(rbio);
 421        }
 422        spin_unlock_irqrestore(&table->cache_lock, flags);
 423}
 424
 425/*
 426 * remove all cached entries and free the hash table
 427 * used by unmount
 428 */
 429void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 430{
 431        if (!info->stripe_hash_table)
 432                return;
 433        btrfs_clear_rbio_cache(info);
 434        kvfree(info->stripe_hash_table);
 435        info->stripe_hash_table = NULL;
 436}
 437
 438/*
 439 * insert an rbio into the stripe cache.  It
 440 * must have already been prepared by calling
 441 * cache_rbio_pages
 442 *
 443 * If this rbio was already cached, it gets
 444 * moved to the front of the lru.
 445 *
 446 * If the size of the rbio cache is too big, we
 447 * prune an item.
 448 */
 449static void cache_rbio(struct btrfs_raid_bio *rbio)
 450{
 451        struct btrfs_stripe_hash_table *table;
 452        unsigned long flags;
 453
 454        if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 455                return;
 456
 457        table = rbio->fs_info->stripe_hash_table;
 458
 459        spin_lock_irqsave(&table->cache_lock, flags);
 460        spin_lock(&rbio->bio_list_lock);
 461
 462        /* bump our ref if we were not in the list before */
 463        if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 464                refcount_inc(&rbio->refs);
 465
  466        if (!list_empty(&rbio->stripe_cache)) {
 467                list_move(&rbio->stripe_cache, &table->stripe_cache);
 468        } else {
 469                list_add(&rbio->stripe_cache, &table->stripe_cache);
 470                table->cache_size += 1;
 471        }
 472
 473        spin_unlock(&rbio->bio_list_lock);
 474
 475        if (table->cache_size > RBIO_CACHE_SIZE) {
 476                struct btrfs_raid_bio *found;
 477
 478                found = list_entry(table->stripe_cache.prev,
 479                                  struct btrfs_raid_bio,
 480                                  stripe_cache);
 481
 482                if (found != rbio)
 483                        __remove_rbio_from_cache(found);
 484        }
 485
 486        spin_unlock_irqrestore(&table->cache_lock, flags);
 487}
 488
 489/*
 490 * helper function to run the xor_blocks api.  It is only
 491 * able to do MAX_XOR_BLOCKS at a time, so we need to
 492 * loop through.
 493 */
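/*
 * Illustrative: xor_blocks() folds at most MAX_XOR_BLOCKS source pages into
 * dest per call, so with (say) six sources and MAX_XOR_BLOCKS == 4 the loop
 * makes one call with four sources and one with two, both accumulating into
 * the same dest page.  finish_rmw() seeds dest with a copy of the first data
 * page and passes the remaining nr_data - 1 pages here.
 */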
 494static void run_xor(void **pages, int src_cnt, ssize_t len)
 495{
 496        int src_off = 0;
 497        int xor_src_cnt = 0;
 498        void *dest = pages[src_cnt];
 499
  500        while (src_cnt > 0) {
 501                xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 502                xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 503
 504                src_cnt -= xor_src_cnt;
 505                src_off += xor_src_cnt;
 506        }
 507}
 508
 509/*
 510 * Returns true if the bio list inside this rbio covers an entire stripe (no
 511 * rmw required).
 512 */
 513static int rbio_is_full(struct btrfs_raid_bio *rbio)
 514{
 515        unsigned long flags;
 516        unsigned long size = rbio->bio_list_bytes;
 517        int ret = 1;
 518
 519        spin_lock_irqsave(&rbio->bio_list_lock, flags);
 520        if (size != rbio->nr_data * rbio->stripe_len)
 521                ret = 0;
 522        BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 523        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 524
 525        return ret;
 526}
 527
 528/*
 529 * returns 1 if it is safe to merge two rbios together.
 530 * The merging is safe if the two rbios correspond to
 531 * the same stripe and if they are both going in the same
 532 * direction (read vs write), and if neither one is
 533 * locked for final IO
 534 *
 535 * The caller is responsible for locking such that
 536 * rmw_locked is safe to test
 537 */
 538static int rbio_can_merge(struct btrfs_raid_bio *last,
 539                          struct btrfs_raid_bio *cur)
 540{
 541        if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 542            test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 543                return 0;
 544
 545        /*
 546         * we can't merge with cached rbios, since the
 547         * idea is that when we merge the destination
 548         * rbio is going to run our IO for us.  We can
 549         * steal from cached rbios though, other functions
 550         * handle that.
 551         */
 552        if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 553            test_bit(RBIO_CACHE_BIT, &cur->flags))
 554                return 0;
 555
 556        if (last->bbio->raid_map[0] !=
 557            cur->bbio->raid_map[0])
 558                return 0;
 559
 560        /* we can't merge with different operations */
 561        if (last->operation != cur->operation)
 562                return 0;
 563        /*
  564         * A parity scrub has to read the full stripe from the drive,
  565         * then check and repair the parity and write out the new results.
 566         *
 567         * We're not allowed to add any new bios to the
 568         * bio list here, anyone else that wants to
 569         * change this stripe needs to do their own rmw.
 570         */
 571        if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
 572                return 0;
 573
 574        if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
 575                return 0;
 576
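        /*
         * Read-rebuild rbios only merge when they are rebuilding the same
         * stripes; the swaps below just order each (faila, failb) pair so
         * they compare equal regardless of which failure was recorded first.
         */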
 577        if (last->operation == BTRFS_RBIO_READ_REBUILD) {
 578                int fa = last->faila;
 579                int fb = last->failb;
 580                int cur_fa = cur->faila;
 581                int cur_fb = cur->failb;
 582
 583                if (last->faila >= last->failb) {
 584                        fa = last->failb;
 585                        fb = last->faila;
 586                }
 587
 588                if (cur->faila >= cur->failb) {
 589                        cur_fa = cur->failb;
 590                        cur_fb = cur->faila;
 591                }
 592
 593                if (fa != cur_fa || fb != cur_fb)
 594                        return 0;
 595        }
 596        return 1;
 597}
 598
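/*
 * Illustrative indexing example, assuming 64K stripes and 4K pages (so
 * stripe_npages == 16): page 3 of stripe 2 lives at index 2 * 16 + 3 == 35
 * in the stripe_pages array.
 */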
 599static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
 600                                  int index)
 601{
 602        return stripe * rbio->stripe_npages + index;
 603}
 604
 605/*
 606 * these are just the pages from the rbio array, not from anything
 607 * the FS sent down to us
 608 */
 609static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
 610                                     int index)
 611{
 612        return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
 613}
 614
 615/*
 616 * helper to index into the pstripe
 617 */
 618static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 619{
 620        return rbio_stripe_page(rbio, rbio->nr_data, index);
 621}
 622
 623/*
  624 * helper to index into the qstripe, returns NULL
 625 * if there is no qstripe
 626 */
 627static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 628{
 629        if (rbio->nr_data + 1 == rbio->real_stripes)
 630                return NULL;
 631        return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
 632}
 633
 634/*
 635 * The first stripe in the table for a logical address
 636 * has the lock.  rbios are added in one of three ways:
 637 *
 638 * 1) Nobody has the stripe locked yet.  The rbio is given
 639 * the lock and 0 is returned.  The caller must start the IO
 640 * themselves.
 641 *
 642 * 2) Someone has the stripe locked, but we're able to merge
 643 * with the lock owner.  The rbio is freed and the IO will
 644 * start automatically along with the existing rbio.  1 is returned.
 645 *
 646 * 3) Someone has the stripe locked, but we're not able to merge.
 647 * The rbio is added to the lock owner's plug list, or merged into
 648 * an rbio already on the plug list.  When the lock owner unlocks,
 649 * the next rbio on the list is run and the IO is started automatically.
 650 * 1 is returned
 651 *
 652 * If we return 0, the caller still owns the rbio and must continue with
 653 * IO submission.  If we return 1, the caller must assume the rbio has
 654 * already been freed.
 655 */
 656static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 657{
 658        int bucket = rbio_bucket(rbio);
 659        struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 660        struct btrfs_raid_bio *cur;
 661        struct btrfs_raid_bio *pending;
 662        unsigned long flags;
 663        struct btrfs_raid_bio *freeit = NULL;
 664        struct btrfs_raid_bio *cache_drop = NULL;
 665        int ret = 0;
 666
 667        spin_lock_irqsave(&h->lock, flags);
 668        list_for_each_entry(cur, &h->hash_list, hash_list) {
 669                if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
 670                        spin_lock(&cur->bio_list_lock);
 671
 672                        /* can we steal this cached rbio's pages? */
 673                        if (bio_list_empty(&cur->bio_list) &&
 674                            list_empty(&cur->plug_list) &&
 675                            test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 676                            !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 677                                list_del_init(&cur->hash_list);
 678                                refcount_dec(&cur->refs);
 679
 680                                steal_rbio(cur, rbio);
 681                                cache_drop = cur;
 682                                spin_unlock(&cur->bio_list_lock);
 683
 684                                goto lockit;
 685                        }
 686
 687                        /* can we merge into the lock owner? */
 688                        if (rbio_can_merge(cur, rbio)) {
 689                                merge_rbio(cur, rbio);
 690                                spin_unlock(&cur->bio_list_lock);
 691                                freeit = rbio;
 692                                ret = 1;
 693                                goto out;
 694                        }
 695
 696
 697                        /*
 698                         * we couldn't merge with the running
 699                         * rbio, see if we can merge with the
 700                         * pending ones.  We don't have to
 701                         * check for rmw_locked because there
 702                         * is no way they are inside finish_rmw
 703                         * right now
 704                         */
 705                        list_for_each_entry(pending, &cur->plug_list,
 706                                            plug_list) {
 707                                if (rbio_can_merge(pending, rbio)) {
 708                                        merge_rbio(pending, rbio);
 709                                        spin_unlock(&cur->bio_list_lock);
 710                                        freeit = rbio;
 711                                        ret = 1;
 712                                        goto out;
 713                                }
 714                        }
 715
 716                        /* no merging, put us on the tail of the plug list,
  717                         * our rbio will be started when the currently
 718                         * running rbio unlocks
 719                         */
 720                        list_add_tail(&rbio->plug_list, &cur->plug_list);
 721                        spin_unlock(&cur->bio_list_lock);
 722                        ret = 1;
 723                        goto out;
 724                }
 725        }
 726lockit:
 727        refcount_inc(&rbio->refs);
 728        list_add(&rbio->hash_list, &h->hash_list);
 729out:
 730        spin_unlock_irqrestore(&h->lock, flags);
 731        if (cache_drop)
 732                remove_rbio_from_cache(cache_drop);
 733        if (freeit)
 734                __free_raid_bio(freeit);
 735        return ret;
 736}
 737
 738/*
 739 * called as rmw or parity rebuild is completed.  If the plug list has more
 740 * rbios waiting for this stripe, the next one on the list will be started
 741 */
 742static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 743{
 744        int bucket;
 745        struct btrfs_stripe_hash *h;
 746        unsigned long flags;
 747        int keep_cache = 0;
 748
 749        bucket = rbio_bucket(rbio);
 750        h = rbio->fs_info->stripe_hash_table->table + bucket;
 751
 752        if (list_empty(&rbio->plug_list))
 753                cache_rbio(rbio);
 754
 755        spin_lock_irqsave(&h->lock, flags);
 756        spin_lock(&rbio->bio_list_lock);
 757
 758        if (!list_empty(&rbio->hash_list)) {
 759                /*
 760                 * if we're still cached and there is no other IO
 761                 * to perform, just leave this rbio here for others
 762                 * to steal from later
 763                 */
 764                if (list_empty(&rbio->plug_list) &&
 765                    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 766                        keep_cache = 1;
 767                        clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 768                        BUG_ON(!bio_list_empty(&rbio->bio_list));
 769                        goto done;
 770                }
 771
 772                list_del_init(&rbio->hash_list);
 773                refcount_dec(&rbio->refs);
 774
 775                /*
 776                 * we use the plug list to hold all the rbios
 777                 * waiting for the chance to lock this stripe.
 778                 * hand the lock over to one of them.
 779                 */
 780                if (!list_empty(&rbio->plug_list)) {
 781                        struct btrfs_raid_bio *next;
 782                        struct list_head *head = rbio->plug_list.next;
 783
 784                        next = list_entry(head, struct btrfs_raid_bio,
 785                                          plug_list);
 786
 787                        list_del_init(&rbio->plug_list);
 788
 789                        list_add(&next->hash_list, &h->hash_list);
 790                        refcount_inc(&next->refs);
 791                        spin_unlock(&rbio->bio_list_lock);
 792                        spin_unlock_irqrestore(&h->lock, flags);
 793
 794                        if (next->operation == BTRFS_RBIO_READ_REBUILD)
 795                                start_async_work(next, read_rebuild_work);
 796                        else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
 797                                steal_rbio(rbio, next);
 798                                start_async_work(next, read_rebuild_work);
 799                        } else if (next->operation == BTRFS_RBIO_WRITE) {
 800                                steal_rbio(rbio, next);
 801                                start_async_work(next, rmw_work);
 802                        } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 803                                steal_rbio(rbio, next);
 804                                start_async_work(next, scrub_parity_work);
 805                        }
 806
 807                        goto done_nolock;
 808                }
 809        }
 810done:
 811        spin_unlock(&rbio->bio_list_lock);
 812        spin_unlock_irqrestore(&h->lock, flags);
 813
 814done_nolock:
 815        if (!keep_cache)
 816                remove_rbio_from_cache(rbio);
 817}
 818
 819static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 820{
 821        int i;
 822
 823        if (!refcount_dec_and_test(&rbio->refs))
 824                return;
 825
 826        WARN_ON(!list_empty(&rbio->stripe_cache));
 827        WARN_ON(!list_empty(&rbio->hash_list));
 828        WARN_ON(!bio_list_empty(&rbio->bio_list));
 829
 830        for (i = 0; i < rbio->nr_pages; i++) {
 831                if (rbio->stripe_pages[i]) {
 832                        __free_page(rbio->stripe_pages[i]);
 833                        rbio->stripe_pages[i] = NULL;
 834                }
 835        }
 836
 837        btrfs_put_bbio(rbio->bbio);
 838        kfree(rbio);
 839}
 840
 841static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 842{
 843        struct bio *next;
 844
 845        while (cur) {
 846                next = cur->bi_next;
 847                cur->bi_next = NULL;
 848                cur->bi_status = err;
 849                bio_endio(cur);
 850                cur = next;
 851        }
 852}
 853
 854/*
 855 * this frees the rbio and runs through all the bios in the
 856 * bio_list and calls end_io on them
 857 */
 858static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 859{
 860        struct bio *cur = bio_list_get(&rbio->bio_list);
 861        struct bio *extra;
 862
 863        if (rbio->generic_bio_cnt)
 864                btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
 865
 866        /*
 867         * At this moment, rbio->bio_list is empty, however since rbio does not
 868         * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
 869         * hash list, rbio may be merged with others so that rbio->bio_list
 870         * becomes non-empty.
 871         * Once unlock_stripe() is done, rbio->bio_list will not be updated any
 872         * more and we can call bio_endio() on all queued bios.
 873         */
 874        unlock_stripe(rbio);
 875        extra = bio_list_get(&rbio->bio_list);
 876        __free_raid_bio(rbio);
 877
 878        rbio_endio_bio_list(cur, err);
 879        if (extra)
 880                rbio_endio_bio_list(extra, err);
 881}
 882
 883/*
 884 * end io function used by finish_rmw.  When we finally
 885 * get here, we've written a full stripe
 886 */
 887static void raid_write_end_io(struct bio *bio)
 888{
 889        struct btrfs_raid_bio *rbio = bio->bi_private;
 890        blk_status_t err = bio->bi_status;
 891        int max_errors;
 892
 893        if (err)
 894                fail_bio_stripe(rbio, bio);
 895
 896        bio_put(bio);
 897
 898        if (!atomic_dec_and_test(&rbio->stripes_pending))
 899                return;
 900
 901        err = BLK_STS_OK;
 902
  903        /* OK, we have written all the stripes we need to. */
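        /*
         * Tolerated failures differ by operation: a parity scrub write must
         * see zero failed stripes, while a regular write may lose up to
         * bbio->max_errors stripes (the degraded case) and still succeed.
         */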
 904        max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
 905                     0 : rbio->bbio->max_errors;
 906        if (atomic_read(&rbio->error) > max_errors)
 907                err = BLK_STS_IOERR;
 908
 909        rbio_orig_end_io(rbio, err);
 910}
 911
 912/*
 913 * the read/modify/write code wants to use the original bio for
 914 * any pages it included, and then use the rbio for everything
 915 * else.  This function decides if a given index (stripe number)
 916 * and page number in that stripe fall inside the original bio
 917 * or the rbio.
 918 *
 919 * if you set bio_list_only, you'll get a NULL back for any ranges
 920 * that are outside the bio_list
 921 *
 922 * This doesn't take any refs on anything, you get a bare page pointer
 923 * and the caller must bump refs as required.
 924 *
 925 * You must call index_rbio_pages once before you can trust
 926 * the answers from this function.
 927 */
 928static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 929                                 int index, int pagenr, int bio_list_only)
 930{
 931        int chunk_page;
 932        struct page *p = NULL;
 933
 934        chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 935
 936        spin_lock_irq(&rbio->bio_list_lock);
 937        p = rbio->bio_pages[chunk_page];
 938        spin_unlock_irq(&rbio->bio_list_lock);
 939
 940        if (p || bio_list_only)
 941                return p;
 942
 943        return rbio->stripe_pages[chunk_page];
 944}
 945
 946/*
 947 * number of pages we need for the entire stripe across all the
 948 * drives
 949 */
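/*
 * E.g. (illustrative) 64K stripes with 4K pages and four stripes in total
 * (raid6: two data plus P and Q) need DIV_ROUND_UP(65536, 4096) * 4 == 64
 * pages.
 */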
 950static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 951{
 952        return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
 953}
 954
 955/*
  956 * allocation and initial setup for the btrfs_raid_bio.  Note that
  957 * this does not allocate any pages for rbio->stripe_pages.
 958 */
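/*
 * The single allocation below lays everything out back to back:
 *
 *   [struct btrfs_raid_bio][stripe_pages][bio_pages][finish_pointers]
 *   [dbitmap][finish_pbitmap]
 *
 * CONSUME_ALLOC() just walks a cursor across that region.  Rough sizes,
 * assuming 64K stripes, 4K pages and four real stripes: num_pages == 64, so
 * the two page arrays hold 64 pointers each, finish_pointers holds four,
 * and each bitmap takes BITS_TO_LONGS(16) longs.
 */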
 959static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 960                                         struct btrfs_bio *bbio,
 961                                         u64 stripe_len)
 962{
 963        struct btrfs_raid_bio *rbio;
 964        int nr_data = 0;
 965        int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 966        int num_pages = rbio_nr_pages(stripe_len, real_stripes);
 967        int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
 968        void *p;
 969
 970        rbio = kzalloc(sizeof(*rbio) +
 971                       sizeof(*rbio->stripe_pages) * num_pages +
 972                       sizeof(*rbio->bio_pages) * num_pages +
 973                       sizeof(*rbio->finish_pointers) * real_stripes +
 974                       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
 975                       sizeof(*rbio->finish_pbitmap) *
 976                                BITS_TO_LONGS(stripe_npages),
 977                       GFP_NOFS);
 978        if (!rbio)
 979                return ERR_PTR(-ENOMEM);
 980
 981        bio_list_init(&rbio->bio_list);
 982        INIT_LIST_HEAD(&rbio->plug_list);
 983        spin_lock_init(&rbio->bio_list_lock);
 984        INIT_LIST_HEAD(&rbio->stripe_cache);
 985        INIT_LIST_HEAD(&rbio->hash_list);
 986        rbio->bbio = bbio;
 987        rbio->fs_info = fs_info;
 988        rbio->stripe_len = stripe_len;
 989        rbio->nr_pages = num_pages;
 990        rbio->real_stripes = real_stripes;
 991        rbio->stripe_npages = stripe_npages;
 992        rbio->faila = -1;
 993        rbio->failb = -1;
 994        refcount_set(&rbio->refs, 1);
 995        atomic_set(&rbio->error, 0);
 996        atomic_set(&rbio->stripes_pending, 0);
 997
 998        /*
 999         * the stripe_pages, bio_pages, etc arrays point to the extra
1000         * memory we allocated past the end of the rbio
1001         */
1002        p = rbio + 1;
1003#define CONSUME_ALLOC(ptr, count)       do {                            \
1004                ptr = p;                                                \
1005                p = (unsigned char *)p + sizeof(*(ptr)) * (count);      \
1006        } while (0)
1007        CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1008        CONSUME_ALLOC(rbio->bio_pages, num_pages);
1009        CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1010        CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1011        CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1012#undef  CONSUME_ALLOC
1013
1014        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1015                nr_data = real_stripes - 1;
1016        else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1017                nr_data = real_stripes - 2;
1018        else
1019                BUG();
1020
1021        rbio->nr_data = nr_data;
1022        return rbio;
1023}
1024
1025/* allocate pages for all the stripes in the bio, including parity */
1026static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1027{
1028        int i;
1029        struct page *page;
1030
1031        for (i = 0; i < rbio->nr_pages; i++) {
1032                if (rbio->stripe_pages[i])
1033                        continue;
1034                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1035                if (!page)
1036                        return -ENOMEM;
1037                rbio->stripe_pages[i] = page;
1038        }
1039        return 0;
1040}
1041
1042/* only allocate pages for p/q stripes */
1043static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1044{
1045        int i;
1046        struct page *page;
1047
1048        i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1049
1050        for (; i < rbio->nr_pages; i++) {
1051                if (rbio->stripe_pages[i])
1052                        continue;
1053                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1054                if (!page)
1055                        return -ENOMEM;
1056                rbio->stripe_pages[i] = page;
1057        }
1058        return 0;
1059}
1060
1061/*
1062 * add a single page from a specific stripe into our list of bios for IO
1063 * this will try to merge into existing bios if possible, and returns
1064 * zero if all went well.
1065 */
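/*
 * Roughly: the page is appended to the tail bio only when disk_start
 * (stripe->physical + page_index * PAGE_SIZE) lines up exactly with the end
 * of that bio, the tail bio has not already failed, and both target the same
 * disk and partition; otherwise a new bio sized for bio_max_len >> PAGE_SHIFT
 * pages (at least one) is started.
 */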
1066static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1067                            struct bio_list *bio_list,
1068                            struct page *page,
1069                            int stripe_nr,
1070                            unsigned long page_index,
1071                            unsigned long bio_max_len)
1072{
1073        struct bio *last = bio_list->tail;
1074        u64 last_end = 0;
1075        int ret;
1076        struct bio *bio;
1077        struct btrfs_bio_stripe *stripe;
1078        u64 disk_start;
1079
1080        stripe = &rbio->bbio->stripes[stripe_nr];
1081        disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1082
1083        /* if the device is missing, just fail this stripe */
1084        if (!stripe->dev->bdev)
1085                return fail_rbio_index(rbio, stripe_nr);
1086
1087        /* see if we can add this page onto our existing bio */
1088        if (last) {
1089                last_end = (u64)last->bi_iter.bi_sector << 9;
1090                last_end += last->bi_iter.bi_size;
1091
1092                /*
1093                 * we can't merge these if they are from different
1094                 * devices or if they are not contiguous
1095                 */
1096                if (last_end == disk_start && stripe->dev->bdev &&
1097                    !last->bi_status &&
1098                    last->bi_disk == stripe->dev->bdev->bd_disk &&
1099                    last->bi_partno == stripe->dev->bdev->bd_partno) {
1100                        ret = bio_add_page(last, page, PAGE_SIZE, 0);
1101                        if (ret == PAGE_SIZE)
1102                                return 0;
1103                }
1104        }
1105
1106        /* put a new bio on the list */
1107        bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1108        bio->bi_iter.bi_size = 0;
1109        bio_set_dev(bio, stripe->dev->bdev);
1110        bio->bi_iter.bi_sector = disk_start >> 9;
1111
1112        bio_add_page(bio, page, PAGE_SIZE, 0);
1113        bio_list_add(bio_list, bio);
1114        return 0;
1115}
1116
1117/*
1118 * while we're doing the read/modify/write cycle, we could
1119 * have errors in reading pages off the disk.  This checks
1120 * for errors and if we're not able to read the page it'll
1121 * trigger parity reconstruction.  The rmw will be finished
1122 * after we've reconstructed the failed stripes
1123 */
1124static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1125{
1126        if (rbio->faila >= 0 || rbio->failb >= 0) {
1127                BUG_ON(rbio->faila == rbio->real_stripes - 1);
1128                __raid56_parity_recover(rbio);
1129        } else {
1130                finish_rmw(rbio);
1131        }
1132}
1133
1134/*
1135 * helper function to walk our bio list and populate the bio_pages array with
1136 * the result.  This seems expensive, but it is faster than constantly
1137 * searching through the bio list as we set up the IO in finish_rmw or stripe
1138 * reconstruction.
1139 *
1140 * This must be called before you trust the answers from page_in_rbio
1141 */
1142static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1143{
1144        struct bio *bio;
1145        u64 start;
1146        unsigned long stripe_offset;
1147        unsigned long page_index;
1148
1149        spin_lock_irq(&rbio->bio_list_lock);
1150        bio_list_for_each(bio, &rbio->bio_list) {
1151                struct bio_vec bvec;
1152                struct bvec_iter iter;
1153                int i = 0;
1154
1155                start = (u64)bio->bi_iter.bi_sector << 9;
1156                stripe_offset = start - rbio->bbio->raid_map[0];
1157                page_index = stripe_offset >> PAGE_SHIFT;
1158
1159                if (bio_flagged(bio, BIO_CLONED))
1160                        bio->bi_iter = btrfs_io_bio(bio)->iter;
1161
1162                bio_for_each_segment(bvec, bio, iter) {
1163                        rbio->bio_pages[page_index + i] = bvec.bv_page;
1164                        i++;
1165                }
1166        }
1167        spin_unlock_irq(&rbio->bio_list_lock);
1168}
1169
1170/*
1171 * this is called from one of two situations.  We either
1172 * have a full stripe from the higher layers, or we've read all
1173 * the missing bits off disk.
1174 *
1175 * This will calculate the parity and then send down any
1176 * changed blocks.
1177 */
1178static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1179{
1180        struct btrfs_bio *bbio = rbio->bbio;
1181        void **pointers = rbio->finish_pointers;
1182        int nr_data = rbio->nr_data;
1183        int stripe;
1184        int pagenr;
1185        int p_stripe = -1;
1186        int q_stripe = -1;
1187        struct bio_list bio_list;
1188        struct bio *bio;
1189        int ret;
1190
1191        bio_list_init(&bio_list);
1192
1193        if (rbio->real_stripes - rbio->nr_data == 1) {
1194                p_stripe = rbio->real_stripes - 1;
1195        } else if (rbio->real_stripes - rbio->nr_data == 2) {
1196                p_stripe = rbio->real_stripes - 2;
1197                q_stripe = rbio->real_stripes - 1;
1198        } else {
1199                BUG();
1200        }
1201
1202        /* at this point we either have a full stripe,
1203         * or we've read the full stripe from the drive, so we can
1204         * recalculate the parity and write the new results.
1205         *
1206         * We're not allowed to add any new bios to the
1207         * bio list here, anyone else that wants to
1208         * change this stripe needs to do their own rmw.
1209         */
1210        spin_lock_irq(&rbio->bio_list_lock);
1211        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1212        spin_unlock_irq(&rbio->bio_list_lock);
1213
1214        atomic_set(&rbio->error, 0);
1215
1216        /*
1217         * now that we've set rmw_locked, run through the
1218         * bio list one last time and map the page pointers
1219         *
1220         * We don't cache full rbios because we're assuming
1221         * the higher layers are unlikely to use this area of
1222         * the disk again soon.  If they do use it again,
1223         * hopefully they will send another full bio.
1224         */
1225        index_rbio_pages(rbio);
1226        if (!rbio_is_full(rbio))
1227                cache_rbio_pages(rbio);
1228        else
1229                clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1230
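        /*
         * For each page position below: raid5 parity is
         * P = D0 ^ D1 ^ ... ^ Dn-1, seeded via copy_page() and accumulated
         * with run_xor(), while raid6 hands all the data pointers to
         * raid6_call.gen_syndrome(), which fills in both P and Q.
         */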
1231        for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1232                struct page *p;
1233                /* first collect one page from each data stripe */
1234                for (stripe = 0; stripe < nr_data; stripe++) {
1235                        p = page_in_rbio(rbio, stripe, pagenr, 0);
1236                        pointers[stripe] = kmap(p);
1237                }
1238
1239                /* then add the parity stripe */
1240                p = rbio_pstripe_page(rbio, pagenr);
1241                SetPageUptodate(p);
1242                pointers[stripe++] = kmap(p);
1243
1244                if (q_stripe != -1) {
1245
1246                        /*
1247                         * raid6, add the qstripe and call the
1248                         * library function to fill in our p/q
1249                         */
1250                        p = rbio_qstripe_page(rbio, pagenr);
1251                        SetPageUptodate(p);
1252                        pointers[stripe++] = kmap(p);
1253
1254                        raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1255                                                pointers);
1256                } else {
1257                        /* raid5 */
1258                        copy_page(pointers[nr_data], pointers[0]);
1259                        run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1260                }
1261
1262
1263                for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1264                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1265        }
1266
1267        /*
1268         * time to start writing.  Make bios for everything from the
1269         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1270         * everything else.
1271         */
1272        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1273                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1274                        struct page *page;
1275                        if (stripe < rbio->nr_data) {
1276                                page = page_in_rbio(rbio, stripe, pagenr, 1);
1277                                if (!page)
1278                                        continue;
1279                        } else {
1280                               page = rbio_stripe_page(rbio, stripe, pagenr);
1281                        }
1282
1283                        ret = rbio_add_io_page(rbio, &bio_list,
1284                                       page, stripe, pagenr, rbio->stripe_len);
1285                        if (ret)
1286                                goto cleanup;
1287                }
1288        }
1289
1290        if (likely(!bbio->num_tgtdevs))
1291                goto write_data;
1292
1293        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1294                if (!bbio->tgtdev_map[stripe])
1295                        continue;
1296
1297                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1298                        struct page *page;
1299                        if (stripe < rbio->nr_data) {
1300                                page = page_in_rbio(rbio, stripe, pagenr, 1);
1301                                if (!page)
1302                                        continue;
1303                        } else {
1304                               page = rbio_stripe_page(rbio, stripe, pagenr);
1305                        }
1306
1307                        ret = rbio_add_io_page(rbio, &bio_list, page,
1308                                               rbio->bbio->tgtdev_map[stripe],
1309                                               pagenr, rbio->stripe_len);
1310                        if (ret)
1311                                goto cleanup;
1312                }
1313        }
1314
1315write_data:
1316        atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1317        BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1318
1319        while (1) {
1320                bio = bio_list_pop(&bio_list);
1321                if (!bio)
1322                        break;
1323
1324                bio->bi_private = rbio;
1325                bio->bi_end_io = raid_write_end_io;
1326                bio->bi_opf = REQ_OP_WRITE;
1327
1328                submit_bio(bio);
1329        }
1330        return;
1331
1332cleanup:
1333        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1334
1335        while ((bio = bio_list_pop(&bio_list)))
1336                bio_put(bio);
1337}
1338
1339/*
1340 * helper to find the stripe number for a given bio.  Used to figure out which
1341 * stripe has failed.  This expects the bio to correspond to a physical disk,
1342 * so it looks up based on physical sector numbers.
1343 */
1344static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1345                           struct bio *bio)
1346{
1347        u64 physical = bio->bi_iter.bi_sector;
1348        u64 stripe_start;
1349        int i;
1350        struct btrfs_bio_stripe *stripe;
1351
1352        physical <<= 9;
1353
1354        for (i = 0; i < rbio->bbio->num_stripes; i++) {
1355                stripe = &rbio->bbio->stripes[i];
1356                stripe_start = stripe->physical;
1357                if (physical >= stripe_start &&
1358                    physical < stripe_start + rbio->stripe_len &&
1359                    stripe->dev->bdev &&
1360                    bio->bi_disk == stripe->dev->bdev->bd_disk &&
1361                    bio->bi_partno == stripe->dev->bdev->bd_partno) {
1362                        return i;
1363                }
1364        }
1365        return -1;
1366}
1367
1368/*
1369 * helper to find the stripe number for a given
1370 * bio (before mapping).  Used to figure out which stripe has
1371 * failed.  This looks up based on logical block numbers.
1372 */
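/*
 * Like find_bio_stripe() above, this works in bytes: bi_sector << 9 is
 * compared against [raid_map[i], raid_map[i] + stripe_len) for each data
 * stripe, and -1 means the bio does not map to any of them.
 */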
1373static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1374                                   struct bio *bio)
1375{
1376        u64 logical = bio->bi_iter.bi_sector;
1377        u64 stripe_start;
1378        int i;
1379
1380        logical <<= 9;
1381
1382        for (i = 0; i < rbio->nr_data; i++) {
1383                stripe_start = rbio->bbio->raid_map[i];
1384                if (logical >= stripe_start &&
1385                    logical < stripe_start + rbio->stripe_len) {
1386                        return i;
1387                }
1388        }
1389        return -1;
1390}
1391
1392/*
1393 * returns -EIO if we had too many failures
1394 */
1395static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1396{
1397        unsigned long flags;
1398        int ret = 0;
1399
1400        spin_lock_irqsave(&rbio->bio_list_lock, flags);
1401
1402        /* we already know this stripe is bad, move on */
1403        if (rbio->faila == failed || rbio->failb == failed)
1404                goto out;
1405
1406        if (rbio->faila == -1) {
1407                /* first failure on this rbio */
1408                rbio->faila = failed;
1409                atomic_inc(&rbio->error);
1410        } else if (rbio->failb == -1) {
1411                /* second failure on this rbio */
1412                rbio->failb = failed;
1413                atomic_inc(&rbio->error);
1414        } else {
1415                ret = -EIO;
1416        }
1417out:
1418        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1419
1420        return ret;
1421}
1422
1423/*
1424 * helper to fail a stripe based on a physical disk
1425 * bio.
1426 */
1427static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1428                           struct bio *bio)
1429{
1430        int failed = find_bio_stripe(rbio, bio);
1431
1432        if (failed < 0)
1433                return -EIO;
1434
1435        return fail_rbio_index(rbio, failed);
1436}
1437
1438/*
1439 * this sets each page in the bio uptodate.  It should only be used on private
1440 * rbio pages, nothing that comes in from the higher layers
1441 */
1442static void set_bio_pages_uptodate(struct bio *bio)
1443{
1444        struct bio_vec *bvec;
1445        int i;
1446        struct bvec_iter_all iter_all;
1447
1448        ASSERT(!bio_flagged(bio, BIO_CLONED));
1449
1450        bio_for_each_segment_all(bvec, bio, i, iter_all)
1451                SetPageUptodate(bvec->bv_page);
1452}
1453
1454/*
1455 * end io for the read phase of the rmw cycle.  All the bios here are physical
1456 * stripe bios we've read from the disk so we can recalculate the parity of the
1457 * stripe.
1458 *
1459 * This will usually kick off finish_rmw once all the bios are read in, but it
1460 * may trigger parity reconstruction if we had any errors along the way
1461 */
1462static void raid_rmw_end_io(struct bio *bio)
1463{
1464        struct btrfs_raid_bio *rbio = bio->bi_private;
1465
1466        if (bio->bi_status)
1467                fail_bio_stripe(rbio, bio);
1468        else
1469                set_bio_pages_uptodate(bio);
1470
1471        bio_put(bio);
1472
1473        if (!atomic_dec_and_test(&rbio->stripes_pending))
1474                return;
1475
1476        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1477                goto cleanup;
1478
1479        /*
1480         * this will normally call finish_rmw to start our write
1481         * but if there are any failed stripes we'll reconstruct
1482         * from parity first
1483         */
1484        validate_rbio_for_rmw(rbio);
1485        return;
1486
1487cleanup:
1488
1489        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1490}
1491
1492/*
1493 * the stripe must be locked by the caller.  It will
1494 * unlock after all the writes are done
1495 */
1496static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1497{
1498        int bios_to_read = 0;
1499        struct bio_list bio_list;
1500        int ret;
1501        int pagenr;
1502        int stripe;
1503        struct bio *bio;
1504
1505        bio_list_init(&bio_list);
1506
1507        ret = alloc_rbio_pages(rbio);
1508        if (ret)
1509                goto cleanup;
1510
1511        index_rbio_pages(rbio);
1512
1513        atomic_set(&rbio->error, 0);
1514        /*
1515         * build a list of bios to read all the missing parts of this
1516         * stripe
1517         */
1518        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1519                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1520                        struct page *page;
1521                        /*
1522                         * we want to find all the pages missing from
1523                         * the rbio and read them from the disk.  If
1524                         * page_in_rbio finds a page in the bio list
1525                         * we don't need to read it off the stripe.
1526                         */
1527                        page = page_in_rbio(rbio, stripe, pagenr, 1);
1528                        if (page)
1529                                continue;
1530
1531                        page = rbio_stripe_page(rbio, stripe, pagenr);
1532                        /*
1533                         * the bio cache may have handed us an uptodate
1534                         * page.  If so, be happy and use it
1535                         */
1536                        if (PageUptodate(page))
1537                                continue;
1538
1539                        ret = rbio_add_io_page(rbio, &bio_list, page,
1540                                       stripe, pagenr, rbio->stripe_len);
1541                        if (ret)
1542                                goto cleanup;
1543                }
1544        }
1545
1546        bios_to_read = bio_list_size(&bio_list);
1547        if (!bios_to_read) {
1548                /*
1549                 * this can happen if others have merged with
1550                 * us; it means there is nothing left to read.
1551                 * But if there are missing devices it may not be
1552                 * safe to do the full stripe write yet.
1553                 */
1554                goto finish;
1555        }
1556
1557        /*
1558         * the bbio may be freed once we submit the last bio.  Make sure
1559         * not to touch it after that
1560         */
1561        atomic_set(&rbio->stripes_pending, bios_to_read);
1562        while (1) {
1563                bio = bio_list_pop(&bio_list);
1564                if (!bio)
1565                        break;
1566
1567                bio->bi_private = rbio;
1568                bio->bi_end_io = raid_rmw_end_io;
1569                bio->bi_opf = REQ_OP_READ;
1570
1571                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1572
1573                submit_bio(bio);
1574        }
1575        /* the actual write will happen once the reads are done */
1576        return 0;
1577
1578cleanup:
1579        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1580
1581        while ((bio = bio_list_pop(&bio_list)))
1582                bio_put(bio);
1583
1584        return -EIO;
1585
1586finish:
1587        validate_rbio_for_rmw(rbio);
1588        return 0;
1589}
1590
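/*
 * Worked example for the read loop above (illustrative numbers, not taken
 * from any particular filesystem): with stripe_npages == 16 and the bio
 * list only carrying new contents for pages 0-3 of data stripe 0, the loop
 * issues reads for pages 4-15 of stripe 0 and pages 0-15 of every other
 * data stripe, minus anything the stripe cache already marked uptodate,
 * because the new parity has to be computed from every data page of the
 * horizontal stripe.
 */
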
1591/*
1592 * if the upper layers pass in a full stripe, we thank them by only allocating
1593 * enough pages to hold the parity, and sending it all down quickly.
1594 */
1595static int full_stripe_write(struct btrfs_raid_bio *rbio)
1596{
1597        int ret;
1598
1599        ret = alloc_rbio_parity_pages(rbio);
1600        if (ret) {
1601                __free_raid_bio(rbio);
1602                return ret;
1603        }
1604
1605        ret = lock_stripe_add(rbio);
1606        if (ret == 0)
1607                finish_rmw(rbio);
1608        return 0;
1609}
1610
1611/*
1612 * partial stripe writes get handed over to async helpers.
1613 * We're really hoping to merge a few more writes into this
1614 * rbio before calculating new parity
1615 */
1616static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1617{
1618        int ret;
1619
1620        ret = lock_stripe_add(rbio);
1621        if (ret == 0)
1622                start_async_work(rbio, rmw_work);
1623        return 0;
1624}
1625
1626/*
1627 * sometimes while we were reading from the drive to
1628 * recalculate parity, enough new bios come in to create
1629 * a full stripe.  So we do a check here to see if we can
1630 * go directly to finish_rmw
1631 */
1632static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1633{
1634        /* head off into rmw land if we don't have a full stripe */
1635        if (!rbio_is_full(rbio))
1636                return partial_stripe_write(rbio);
1637        return full_stripe_write(rbio);
1638}
1639
1640/*
1641 * We use plugging callbacks to collect full stripes.
1642 * Any time we get a partial stripe write while plugged
1643 * we collect it into a list.  When the unplug comes down,
1644 * we sort the list by logical block number and merge
1645 * everything we can into the same rbios
1646 */
1647struct btrfs_plug_cb {
1648        struct blk_plug_cb cb;
1649        struct btrfs_fs_info *info;
1650        struct list_head rbio_list;
1651        struct btrfs_work work;
1652};
1653
1654/*
1655 * rbios on the plug list are sorted for easier merging.
1656 */
1657static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1658{
1659        struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1660                                                 plug_list);
1661        struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1662                                                 plug_list);
1663        u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1664        u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1665
1666        if (a_sector < b_sector)
1667                return -1;
1668        if (a_sector > b_sector)
1669                return 1;
1670        return 0;
1671}
1672
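/*
 * Example with illustrative sector numbers: partial rbios whose first bios
 * start at sectors 8192, 0 and 4096 are reordered by list_sort()/plug_cmp()
 * to 0, 4096, 8192, so rbios that belong to the same full stripe end up
 * adjacent and run_plug() below can merge them in a single pass.
 */
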
1673static void run_plug(struct btrfs_plug_cb *plug)
1674{
1675        struct btrfs_raid_bio *cur;
1676        struct btrfs_raid_bio *last = NULL;
1677
1678        /*
1679         * sort our plug list then try to merge
1680         * everything we can in hopes of creating full
1681         * stripes.
1682         */
1683        list_sort(NULL, &plug->rbio_list, plug_cmp);
1684        while (!list_empty(&plug->rbio_list)) {
1685                cur = list_entry(plug->rbio_list.next,
1686                                 struct btrfs_raid_bio, plug_list);
1687                list_del_init(&cur->plug_list);
1688
1689                if (rbio_is_full(cur)) {
1690                        int ret;
1691
1692                        /* we have a full stripe, send it down */
1693                        ret = full_stripe_write(cur);
1694                        BUG_ON(ret);
1695                        continue;
1696                }
1697                if (last) {
1698                        if (rbio_can_merge(last, cur)) {
1699                                merge_rbio(last, cur);
1700                                __free_raid_bio(cur);
1701                                continue;
1702
1703                        }
1704                        __raid56_parity_write(last);
1705                }
1706                last = cur;
1707        }
1708        if (last) {
1709                __raid56_parity_write(last);
1710        }
1711        kfree(plug);
1712}
1713
1714/*
1715 * if the unplug comes from schedule, we have to push the
1716 * work off to a helper thread
1717 */
1718static void unplug_work(struct btrfs_work *work)
1719{
1720        struct btrfs_plug_cb *plug;
1721        plug = container_of(work, struct btrfs_plug_cb, work);
1722        run_plug(plug);
1723}
1724
1725static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1726{
1727        struct btrfs_plug_cb *plug;
1728        plug = container_of(cb, struct btrfs_plug_cb, cb);
1729
1730        if (from_schedule) {
1731                btrfs_init_work(&plug->work, btrfs_rmw_helper,
1732                                unplug_work, NULL, NULL);
1733                btrfs_queue_work(plug->info->rmw_workers,
1734                                 &plug->work);
1735                return;
1736        }
1737        run_plug(plug);
1738}
1739
1740/*
1741 * our main entry point for writes from the rest of the FS.
1742 */
1743int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
1744                        struct btrfs_bio *bbio, u64 stripe_len)
1745{
1746        struct btrfs_raid_bio *rbio;
1747        struct btrfs_plug_cb *plug = NULL;
1748        struct blk_plug_cb *cb;
1749        int ret;
1750
1751        rbio = alloc_rbio(fs_info, bbio, stripe_len);
1752        if (IS_ERR(rbio)) {
1753                btrfs_put_bbio(bbio);
1754                return PTR_ERR(rbio);
1755        }
1756        bio_list_add(&rbio->bio_list, bio);
1757        rbio->bio_list_bytes = bio->bi_iter.bi_size;
1758        rbio->operation = BTRFS_RBIO_WRITE;
1759
1760        btrfs_bio_counter_inc_noblocked(fs_info);
1761        rbio->generic_bio_cnt = 1;
1762
1763        /*
1764         * don't plug on full rbios, just get them out the door
1765         * as quickly as we can
1766         */
1767        if (rbio_is_full(rbio)) {
1768                ret = full_stripe_write(rbio);
1769                if (ret)
1770                        btrfs_bio_counter_dec(fs_info);
1771                return ret;
1772        }
1773
1774        cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1775        if (cb) {
1776                plug = container_of(cb, struct btrfs_plug_cb, cb);
1777                if (!plug->info) {
1778                        plug->info = fs_info;
1779                        INIT_LIST_HEAD(&plug->rbio_list);
1780                }
1781                list_add_tail(&rbio->plug_list, &plug->rbio_list);
1782                ret = 0;
1783        } else {
1784                ret = __raid56_parity_write(rbio);
1785                if (ret)
1786                        btrfs_bio_counter_dec(fs_info);
1787        }
1788        return ret;
1789}
1790
1791/*
1792 * all parity reconstruction happens here.  We've read in everything
1793 * we can find from the drives and this does the heavy lifting of
1794 * sorting the good from the bad.
1795 */
1796static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1797{
1798        int pagenr, stripe;
1799        void **pointers;
1800        int faila = -1, failb = -1;
1801        struct page *page;
1802        blk_status_t err;
1803        int i;
1804
1805        pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1806        if (!pointers) {
1807                err = BLK_STS_RESOURCE;
1808                goto cleanup_io;
1809        }
1810
1811        faila = rbio->faila;
1812        failb = rbio->failb;
1813
1814        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1815            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1816                spin_lock_irq(&rbio->bio_list_lock);
1817                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1818                spin_unlock_irq(&rbio->bio_list_lock);
1819        }
1820
1821        index_rbio_pages(rbio);
1822
1823        for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1824                /*
1825                 * When doing a parity scrub, we just use a bitmap to mark
1826                 * the horizontal stripes in which we have data.
1827                 */
1828                if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1829                    !test_bit(pagenr, rbio->dbitmap))
1830                        continue;
1831
1832                /* setup our array of pointers with pages
1833                 * from each stripe
1834                 */
1835                for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1836                        /*
1837                         * if we're rebuilding a read, we have to use
1838                         * pages from the bio list
1839                         */
1840                        if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1841                             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1842                            (stripe == faila || stripe == failb)) {
1843                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1844                        } else {
1845                                page = rbio_stripe_page(rbio, stripe, pagenr);
1846                        }
1847                        pointers[stripe] = kmap(page);
1848                }
1849
1850                /* all raid6 handling here */
1851                if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1852                        /*
1853                         * single failure, rebuild from parity raid5
1854                         * style
1855                         */
1856                        if (failb < 0) {
1857                                if (faila == rbio->nr_data) {
1858                                        /*
1859                                         * Just the P stripe has failed, without
1860                                         * a bad data or Q stripe.
1861                                         * TODO, we should redo the xor here.
1862                                         */
1863                                        err = BLK_STS_IOERR;
1864                                        goto cleanup;
1865                                }
1866                                /*
1867                                 * a single failure in raid6 is rebuilt
1868                                 * in the pstripe code below
1869                                 */
1870                                goto pstripe;
1871                        }
1872
1873                        /* make sure our ps and qs are in order */
1874                        if (faila > failb) {
1875                                int tmp = failb;
1876                                failb = faila;
1877                                faila = tmp;
1878                        }
1879
1880                        /* if the q stripe has failed, do a pstripe reconstruction
1881                         * from the xors.
1882                         * If both the q stripe and the P stripe have failed, we're
1883                         * here due to a crc mismatch and we can't give them the
1884                         * data they want
1885                         */
1886                        if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1887                                if (rbio->bbio->raid_map[faila] ==
1888                                    RAID5_P_STRIPE) {
1889                                        err = BLK_STS_IOERR;
1890                                        goto cleanup;
1891                                }
1892                                /*
1893                                 * otherwise we have one bad data stripe and
1894                                 * a good P stripe.  raid5!
1895                                 */
1896                                goto pstripe;
1897                        }
1898
1899                        if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1900                                raid6_datap_recov(rbio->real_stripes,
1901                                                  PAGE_SIZE, faila, pointers);
1902                        } else {
1903                                raid6_2data_recov(rbio->real_stripes,
1904                                                  PAGE_SIZE, faila, failb,
1905                                                  pointers);
1906                        }
1907                } else {
1908                        void *p;
1909
1910                        /* rebuild from P stripe here (raid5 or raid6) */
1911                        BUG_ON(failb != -1);
1912pstripe:
1913                        /* Copy parity block into failed block to start with */
1914                        copy_page(pointers[faila], pointers[rbio->nr_data]);
1915
1916                        /* rearrange the pointer array */
1917                        p = pointers[faila];
1918                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1919                                pointers[stripe] = pointers[stripe + 1];
1920                        pointers[rbio->nr_data - 1] = p;
1921
1922                        /* xor in the rest */
1923                        run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1924                }
1925                /* if we're doing this rebuild as part of an rmw, go through
1926                 * and set all of our private rbio pages in the
1927                 * failed stripes as uptodate.  This way finish_rmw will
1928                 * know they can be trusted.  If this was a read reconstruction,
1929                 * other endio functions will fiddle the uptodate bits
1930                 */
1931                if (rbio->operation == BTRFS_RBIO_WRITE) {
1932                        for (i = 0;  i < rbio->stripe_npages; i++) {
1933                                if (faila != -1) {
1934                                        page = rbio_stripe_page(rbio, faila, i);
1935                                        SetPageUptodate(page);
1936                                }
1937                                if (failb != -1) {
1938                                        page = rbio_stripe_page(rbio, failb, i);
1939                                        SetPageUptodate(page);
1940                                }
1941                        }
1942                }
1943                for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1944                        /*
1945                         * if we're rebuilding a read, we have to use
1946                         * pages from the bio list
1947                         */
1948                        if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1949                             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1950                            (stripe == faila || stripe == failb)) {
1951                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1952                        } else {
1953                                page = rbio_stripe_page(rbio, stripe, pagenr);
1954                        }
1955                        kunmap(page);
1956                }
1957        }
1958
1959        err = BLK_STS_OK;
1960cleanup:
1961        kfree(pointers);
1962
1963cleanup_io:
1964        /*
1965         * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1966         * valid rbio which is consistent with ondisk content, thus such a
1967         * valid rbio can be cached to avoid further disk reads.
1968         */
1969        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1970            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1971                /*
1972                 * - In case of two failures, where rbio->failb != -1:
1973                 *
1974                 *   Do not cache this rbio since the above read reconstruction
1975                 *   (raid6_datap_recov() or raid6_2data_recov()) may have
1976                 *   changed some content of stripes which are not identical to
1977                 *   on-disk content any more, otherwise, a later write/recover
1978                 *   may steal stripe_pages from this rbio and end up with
1979                 *   corruptions or rebuild failures.
1980                 *
1981                 * - In case of single failure, where rbio->failb == -1:
1982                 *
1983                 *   Cache this rbio iff the above read reconstruction is
1984                 *   executed without problems.
1985                 */
1986                if (err == BLK_STS_OK && rbio->failb < 0)
1987                        cache_rbio_pages(rbio);
1988                else
1989                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1990
1991                rbio_orig_end_io(rbio, err);
1992        } else if (err == BLK_STS_OK) {
1993                rbio->faila = -1;
1994                rbio->failb = -1;
1995
1996                if (rbio->operation == BTRFS_RBIO_WRITE)
1997                        finish_rmw(rbio);
1998                else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1999                        finish_parity_scrub(rbio, 0);
2000                else
2001                        BUG();
2002        } else {
2003                rbio_orig_end_io(rbio, err);
2004        }
2005}
2006
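/*
 * Illustrative sketch only, not called by the driver: the pstripe path in
 * __raid_recover_end_io() boils down to rebuilding a single failed block as
 * the XOR of the parity block and every surviving data block.  The names
 * below (recover_one_block_sketch, bufs) are hypothetical, and the real code
 * additionally handles the raid6 P/Q cases via the raid6 library calls.
 */
static void __maybe_unused recover_one_block_sketch(void **bufs, int nr_data,
                                                    const void *parity,
                                                    int failed, size_t len)
{
        u8 *out = bufs[failed];
        const u8 *src;
        size_t i;
        int d;

        /* start from the parity block ... */
        memcpy(out, parity, len);
        /* ... and xor in every data block that is still readable */
        for (d = 0; d < nr_data; d++) {
                if (d == failed)
                        continue;
                src = bufs[d];
                for (i = 0; i < len; i++)
                        out[i] ^= src[i];
        }
}
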
2007/*
2008 * This is called only for stripes we've read from disk to
2009 * reconstruct the parity.
2010 */
2011static void raid_recover_end_io(struct bio *bio)
2012{
2013        struct btrfs_raid_bio *rbio = bio->bi_private;
2014
2015        /*
2016         * we only read stripe pages off the disk, set them
2017         * up to date if there were no errors
2018         */
2019        if (bio->bi_status)
2020                fail_bio_stripe(rbio, bio);
2021        else
2022                set_bio_pages_uptodate(bio);
2023        bio_put(bio);
2024
2025        if (!atomic_dec_and_test(&rbio->stripes_pending))
2026                return;
2027
2028        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2029                rbio_orig_end_io(rbio, BLK_STS_IOERR);
2030        else
2031                __raid_recover_end_io(rbio);
2032}
2033
2034/*
2035 * reads everything we need off the disk to reconstruct
2036 * the parity. endio handlers trigger final reconstruction
2037 * when the IO is done.
2038 *
2039 * This is used both for reads from the higher layers and for
2040 * parity construction required to finish an rmw cycle.
2041 */
2042static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2043{
2044        int bios_to_read = 0;
2045        struct bio_list bio_list;
2046        int ret;
2047        int pagenr;
2048        int stripe;
2049        struct bio *bio;
2050
2051        bio_list_init(&bio_list);
2052
2053        ret = alloc_rbio_pages(rbio);
2054        if (ret)
2055                goto cleanup;
2056
2057        atomic_set(&rbio->error, 0);
2058
2059        /*
2060         * read everything that hasn't failed.  Thanks to the
2061         * stripe cache, it is possible that some or all of these
2062         * pages are going to be uptodate.
2063         */
2064        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2065                if (rbio->faila == stripe || rbio->failb == stripe) {
2066                        atomic_inc(&rbio->error);
2067                        continue;
2068                }
2069
2070                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2071                        struct page *p;
2072
2073                        /*
2074                         * the rmw code may have already read this
2075                         * page in
2076                         */
2077                        p = rbio_stripe_page(rbio, stripe, pagenr);
2078                        if (PageUptodate(p))
2079                                continue;
2080
2081                        ret = rbio_add_io_page(rbio, &bio_list,
2082                                       rbio_stripe_page(rbio, stripe, pagenr),
2083                                       stripe, pagenr, rbio->stripe_len);
2084                        if (ret < 0)
2085                                goto cleanup;
2086                }
2087        }
2088
2089        bios_to_read = bio_list_size(&bio_list);
2090        if (!bios_to_read) {
2091                /*
2092                 * we might have no bios to read just because the pages
2093                 * were up to date, or we might have no bios to read because
2094                 * the devices were gone.
2095                 */
2096                if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2097                        __raid_recover_end_io(rbio);
2098                        goto out;
2099                } else {
2100                        goto cleanup;
2101                }
2102        }
2103
2104        /*
2105         * the bbio may be freed once we submit the last bio.  Make sure
2106         * not to touch it after that
2107         */
2108        atomic_set(&rbio->stripes_pending, bios_to_read);
2109        while (1) {
2110                bio = bio_list_pop(&bio_list);
2111                if (!bio)
2112                        break;
2113
2114                bio->bi_private = rbio;
2115                bio->bi_end_io = raid_recover_end_io;
2116                bio->bi_opf = REQ_OP_READ;
2117
2118                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2119
2120                submit_bio(bio);
2121        }
2122out:
2123        return 0;
2124
2125cleanup:
2126        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2127            rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2128                rbio_orig_end_io(rbio, BLK_STS_IOERR);
2129
2130        while ((bio = bio_list_pop(&bio_list)))
2131                bio_put(bio);
2132
2133        return -EIO;
2134}
2135
2136/*
2137 * the main entry point for reads from the higher layers.  This
2138 * is really only called when the normal read path had a failure,
2139 * so we assume the bio they send down corresponds to a failed part
2140 * of the drive.
2141 */
2142int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
2143                          struct btrfs_bio *bbio, u64 stripe_len,
2144                          int mirror_num, int generic_io)
2145{
2146        struct btrfs_raid_bio *rbio;
2147        int ret;
2148
2149        if (generic_io) {
2150                ASSERT(bbio->mirror_num == mirror_num);
2151                btrfs_io_bio(bio)->mirror_num = mirror_num;
2152        }
2153
2154        rbio = alloc_rbio(fs_info, bbio, stripe_len);
2155        if (IS_ERR(rbio)) {
2156                if (generic_io)
2157                        btrfs_put_bbio(bbio);
2158                return PTR_ERR(rbio);
2159        }
2160
2161        rbio->operation = BTRFS_RBIO_READ_REBUILD;
2162        bio_list_add(&rbio->bio_list, bio);
2163        rbio->bio_list_bytes = bio->bi_iter.bi_size;
2164
2165        rbio->faila = find_logical_bio_stripe(rbio, bio);
2166        if (rbio->faila == -1) {
2167                btrfs_warn(fs_info,
2168        "%s could not find the bad stripe in raid56, so we cannot recover it (bio has logical %llu len %llu, bbio has map_type %llu)",
2169                           __func__, (u64)bio->bi_iter.bi_sector << 9,
2170                           (u64)bio->bi_iter.bi_size, bbio->map_type);
2171                if (generic_io)
2172                        btrfs_put_bbio(bbio);
2173                kfree(rbio);
2174                return -EIO;
2175        }
2176
2177        if (generic_io) {
2178                btrfs_bio_counter_inc_noblocked(fs_info);
2179                rbio->generic_bio_cnt = 1;
2180        } else {
2181                btrfs_get_bbio(bbio);
2182        }
2183
2184        /*
2185         * Loop retry:
2186         * for 'mirror_num == 2', reconstruct from all other stripes.
2187         * for 'mirror_num > 2', select a stripe to fail on every retry.
2188         */
2189        if (mirror_num > 2) {
2190                /*
2191                 * 'mirror_num == 3' is to fail the p stripe and
2192                 * reconstruct from the q stripe.  'mirror_num > 3' is to
2193                 * fail a data stripe and reconstruct from the p+q stripes.
2194                 */
2195                rbio->failb = rbio->real_stripes - (mirror_num - 1);
2196                ASSERT(rbio->failb > 0);
2197                if (rbio->failb <= rbio->faila)
2198                        rbio->failb--;
2199        }
2200
2201        ret = lock_stripe_add(rbio);
2202
2203        /*
2204         * __raid56_parity_recover will end the bio with
2205         * any errors it hits.  We don't want to return
2206         * its error value up the stack because our caller
2207         * will end up calling bio_endio with any nonzero
2208         * return
2209         */
2210        if (ret == 0)
2211                __raid56_parity_recover(rbio);
2212        /*
2213         * our rbio has been added to the list of
2214         * rbios that will be handled after the
2215         * current lock owner is done
2216         */
2217        return 0;
2218
2219}
2220
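/*
 * Worked example for the mirror_num mapping above (illustrative numbers):
 * on a 4-device RAID6 (nr_data == 2, real_stripes == 4) with faila == 0,
 * mirror_num == 3 gives failb = 4 - 2 = 2, the P stripe, and
 * mirror_num == 4 gives failb = 4 - 3 = 1, the other data stripe.  If
 * faila had been 1 instead, failb <= faila would trigger failb-- and
 * stripe 0 would be failed for that retry.
 */
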
2221static void rmw_work(struct btrfs_work *work)
2222{
2223        struct btrfs_raid_bio *rbio;
2224
2225        rbio = container_of(work, struct btrfs_raid_bio, work);
2226        raid56_rmw_stripe(rbio);
2227}
2228
2229static void read_rebuild_work(struct btrfs_work *work)
2230{
2231        struct btrfs_raid_bio *rbio;
2232
2233        rbio = container_of(work, struct btrfs_raid_bio, work);
2234        __raid56_parity_recover(rbio);
2235}
2236
2237/*
2238 * The following code is used to scrub/replace the parity stripe
2239 *
2240 * Caller must have already increased bio_counter for getting @bbio.
2241 *
2242 * Note: We need to make sure that all the pages added into the scrub/replace
2243 * raid bio are correct and will not be changed during the scrub/replace, i.e.
2244 * those pages only hold metadata or file data with checksum.
2245 */
2246
2247struct btrfs_raid_bio *
2248raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2249                               struct btrfs_bio *bbio, u64 stripe_len,
2250                               struct btrfs_device *scrub_dev,
2251                               unsigned long *dbitmap, int stripe_nsectors)
2252{
2253        struct btrfs_raid_bio *rbio;
2254        int i;
2255
2256        rbio = alloc_rbio(fs_info, bbio, stripe_len);
2257        if (IS_ERR(rbio))
2258                return NULL;
2259        bio_list_add(&rbio->bio_list, bio);
2260        /*
2261         * This is a special bio which is used to hold the completion handler
2262         * and make the scrub rbio similar to the other types
2263         */
2264        ASSERT(!bio->bi_iter.bi_size);
2265        rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2266
2267        /*
2268         * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2269         * to the end position, so this search can start from the first parity
2270         * stripe.
2271         */
2272        for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2273                if (bbio->stripes[i].dev == scrub_dev) {
2274                        rbio->scrubp = i;
2275                        break;
2276                }
2277        }
2278        ASSERT(i < rbio->real_stripes);
2279
2280        /* For now we only support a sectorsize equal to PAGE_SIZE */
2281        ASSERT(fs_info->sectorsize == PAGE_SIZE);
2282        ASSERT(rbio->stripe_npages == stripe_nsectors);
2283        bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2284
2285        /*
2286         * We have already increased bio_counter when getting bbio, record it
2287         * so we can free it at rbio_orig_end_io().
2288         */
2289        rbio->generic_bio_cnt = 1;
2290
2291        return rbio;
2292}
2293
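/*
 * Example with illustrative values: for a 64K stripe_len and 4K pages,
 * stripe_nsectors == stripe_npages == 16.  A caller that only has verified
 * data for the first four horizontal sectors would pass a dbitmap with
 * bits 0-3 set, and only those rows of the parity stripe are later checked
 * and, if needed, rewritten by finish_parity_scrub().
 */
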
2294/* Used for both parity scrub and missing device rebuild. */
2295void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2296                            u64 logical)
2297{
2298        int stripe_offset;
2299        int index;
2300
2301        ASSERT(logical >= rbio->bbio->raid_map[0]);
2302        ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2303                                rbio->stripe_len * rbio->nr_data);
2304        stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2305        index = stripe_offset >> PAGE_SHIFT;
2306        rbio->bio_pages[index] = page;
2307}
2308
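/*
 * Worked example for the index math above (illustrative numbers): with
 * raid_map[0] == 1G, 4K pages and a page at logical 1G + 20K,
 * stripe_offset == 20K and index == 20K >> PAGE_SHIFT == 5, so the page
 * lands in bio_pages[5] of the data area of the full stripe.
 */
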
2309/*
2310 * We only scrub the parity for the horizontal stripes where we have correct
2311 * data, so we don't need to allocate all the pages for every stripe.
2312 */
2313static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2314{
2315        int i;
2316        int bit;
2317        int index;
2318        struct page *page;
2319
2320        for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2321                for (i = 0; i < rbio->real_stripes; i++) {
2322                        index = i * rbio->stripe_npages + bit;
2323                        if (rbio->stripe_pages[index])
2324                                continue;
2325
2326                        page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2327                        if (!page)
2328                                return -ENOMEM;
2329                        rbio->stripe_pages[index] = page;
2330                }
2331        }
2332        return 0;
2333}
2334
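/*
 * Example for the indexing above (illustrative numbers): with
 * stripe_npages == 16 and only dbitmap bit 5 set, stripe i == 2 needs
 * stripe_pages[2 * 16 + 5] == stripe_pages[37]; only those per-row pages
 * are allocated instead of the full real_stripes * stripe_npages array.
 */
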
2335static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2336                                         int need_check)
2337{
2338        struct btrfs_bio *bbio = rbio->bbio;
2339        void **pointers = rbio->finish_pointers;
2340        unsigned long *pbitmap = rbio->finish_pbitmap;
2341        int nr_data = rbio->nr_data;
2342        int stripe;
2343        int pagenr;
2344        int p_stripe = -1;
2345        int q_stripe = -1;
2346        struct page *p_page = NULL;
2347        struct page *q_page = NULL;
2348        struct bio_list bio_list;
2349        struct bio *bio;
2350        int is_replace = 0;
2351        int ret;
2352
2353        bio_list_init(&bio_list);
2354
2355        if (rbio->real_stripes - rbio->nr_data == 1) {
2356                p_stripe = rbio->real_stripes - 1;
2357        } else if (rbio->real_stripes - rbio->nr_data == 2) {
2358                p_stripe = rbio->real_stripes - 2;
2359                q_stripe = rbio->real_stripes - 1;
2360        } else {
2361                BUG();
2362        }
2363
2364        if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2365                is_replace = 1;
2366                bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2367        }
2368
2369        /*
2370         * The higher layers (the scrubber) are unlikely to
2371         * use this area of the disk again soon, so don't
2372         * cache it.
2373         */
2374        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2375
2376        if (!need_check)
2377                goto writeback;
2378
2379        p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2380        if (!p_page)
2381                goto cleanup;
2382        SetPageUptodate(p_page);
2383
2384        if (q_stripe != -1) {
2385                q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2386                if (!q_page) {
2387                        __free_page(p_page);
2388                        goto cleanup;
2389                }
2390                SetPageUptodate(q_page);
2391        }
2392
2393        atomic_set(&rbio->error, 0);
2394
2395        for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2396                struct page *p;
2397                void *parity;
2398                /* first collect one page from each data stripe */
2399                for (stripe = 0; stripe < nr_data; stripe++) {
2400                        p = page_in_rbio(rbio, stripe, pagenr, 0);
2401                        pointers[stripe] = kmap(p);
2402                }
2403
2404                /* then add the parity stripe */
2405                pointers[stripe++] = kmap(p_page);
2406
2407                if (q_stripe != -1) {
2408
2409                        /*
2410                         * raid6, add the qstripe and call the
2411                         * library function to fill in our p/q
2412                         */
2413                        pointers[stripe++] = kmap(q_page);
2414
2415                        raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2416                                                pointers);
2417                } else {
2418                        /* raid5 */
2419                        copy_page(pointers[nr_data], pointers[0]);
2420                        run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2421                }
2422
2423                /* Check scrubbing parity and repair it */
2424                p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2425                parity = kmap(p);
2426                if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2427                        copy_page(parity, pointers[rbio->scrubp]);
2428                else
2429                        /* Parity is correct, no need to write it back */
2430                        bitmap_clear(rbio->dbitmap, pagenr, 1);
2431                kunmap(p);
2432
2433                for (stripe = 0; stripe < nr_data; stripe++)
2434                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2435                kunmap(p_page);
2436        }
2437
2438        __free_page(p_page);
2439        if (q_page)
2440                __free_page(q_page);
2441
2442writeback:
2443        /*
2444         * time to start writing.  Make bios for the pages of the parity
2445         * stripe we are scrubbing (and, for dev-replace, copies for the
2446         * target device below).  Ignore everything else.
2447         */
2448        for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2449                struct page *page;
2450
2451                page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2452                ret = rbio_add_io_page(rbio, &bio_list,
2453                               page, rbio->scrubp, pagenr, rbio->stripe_len);
2454                if (ret)
2455                        goto cleanup;
2456        }
2457
2458        if (!is_replace)
2459                goto submit_write;
2460
2461        for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2462                struct page *page;
2463
2464                page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2465                ret = rbio_add_io_page(rbio, &bio_list, page,
2466                                       bbio->tgtdev_map[rbio->scrubp],
2467                                       pagenr, rbio->stripe_len);
2468                if (ret)
2469                        goto cleanup;
2470        }
2471
2472submit_write:
2473        nr_data = bio_list_size(&bio_list);
2474        if (!nr_data) {
2475                /* All the parity is correct */
2476                rbio_orig_end_io(rbio, BLK_STS_OK);
2477                return;
2478        }
2479
2480        atomic_set(&rbio->stripes_pending, nr_data);
2481
2482        while (1) {
2483                bio = bio_list_pop(&bio_list);
2484                if (!bio)
2485                        break;
2486
2487                bio->bi_private = rbio;
2488                bio->bi_end_io = raid_write_end_io;
2489                bio->bi_opf = REQ_OP_WRITE;
2490
2491                submit_bio(bio);
2492        }
2493        return;
2494
2495cleanup:
2496        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2497
2498        while ((bio = bio_list_pop(&bio_list)))
2499                bio_put(bio);
2500}
2501
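/*
 * Illustrative sketch only, not used by the driver: the raid5 branch of the
 * check loop in finish_parity_scrub() amounts to "recompute P from the data
 * pages and rewrite it only when it differs from what is on disk".  The
 * names below (parity_needs_repair_sketch, data, scratch) are hypothetical.
 */
static bool __maybe_unused parity_needs_repair_sketch(void **data, int nr_data,
                                                      const void *ondisk_parity,
                                                      void *scratch, size_t len)
{
        u8 *p = scratch;
        const u8 *src;
        size_t i;
        int d;

        /* recompute the expected parity from the data blocks */
        memset(p, 0, len);
        for (d = 0; d < nr_data; d++) {
                src = data[d];
                for (i = 0; i < len; i++)
                        p[i] ^= src[i];
        }
        /* repair is only needed when the on-disk parity does not match */
        return memcmp(p, ondisk_parity, len) != 0;
}
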
2502static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2503{
2504        if (stripe >= 0 && stripe < rbio->nr_data)
2505                return 1;
2506        return 0;
2507}
2508
2509/*
2510 * While we're doing the parity check and repair, we could have errors
2511 * in reading pages off the disk.  This checks for errors and if we're
2512 * not able to read the page it'll trigger parity reconstruction.  The
2513 * parity scrub will be finished after we've reconstructed the failed
2514 * stripes
2515 */
2516static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2517{
2518        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2519                goto cleanup;
2520
2521        if (rbio->faila >= 0 || rbio->failb >= 0) {
2522                int dfail = 0, failp = -1;
2523
2524                if (is_data_stripe(rbio, rbio->faila))
2525                        dfail++;
2526                else if (is_parity_stripe(rbio->faila))
2527                        failp = rbio->faila;
2528
2529                if (is_data_stripe(rbio, rbio->failb))
2530                        dfail++;
2531                else if (is_parity_stripe(rbio->failb))
2532                        failp = rbio->failb;
2533
2534                /*
2535                 * Because we cannot use the parity being scrubbed to
2536                 * repair the data, our repair capability is reduced
2537                 * (in the case of RAID5, we cannot repair anything).
2538                 */
2539                if (dfail > rbio->bbio->max_errors - 1)
2540                        goto cleanup;
2541
2542                /*
2543                 * If all the data is good and only the parity is bad,
2544                 * just repair the parity.
2545                 */
2546                if (dfail == 0) {
2547                        finish_parity_scrub(rbio, 0);
2548                        return;
2549                }
2550
2551                /*
2552                 * Here we have one corrupted data stripe and one corrupted
2553                 * parity on RAID6.  If the corrupted parity is the one being
2554                 * scrubbed, we can luckily use the other parity to repair the
2555                 * data; otherwise we cannot repair the data stripe.
2556                 */
2557                if (failp != rbio->scrubp)
2558                        goto cleanup;
2559
2560                __raid_recover_end_io(rbio);
2561        } else {
2562                finish_parity_scrub(rbio, 1);
2563        }
2564        return;
2565
2566cleanup:
2567        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2568}
2569
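/*
 * Example for the decision above (illustrative RAID6 case, max_errors == 2):
 * suppose one data stripe and one parity stripe failed to read, so dfail == 1
 * and failp is set.  dfail <= max_errors - 1, so repair is still possible;
 * if the failed parity happens to be the one being scrubbed
 * (failp == rbio->scrubp) the data is rebuilt from the other parity,
 * otherwise the only surviving parity is the untrusted one being scrubbed
 * and the rbio is completed with an I/O error instead.
 */
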
2570/*
2571 * end io for the read phase of the parity scrub.  All the bios here are
2572 * physical stripe bios we've read from the disk so we can recalculate the
2573 * parity of the stripe.
2574 *
2575 * This will usually kick off finish_parity_scrub once all the bios are read
2576 * in, but it may trigger parity reconstruction if we had any errors along the way
2577 */
2578static void raid56_parity_scrub_end_io(struct bio *bio)
2579{
2580        struct btrfs_raid_bio *rbio = bio->bi_private;
2581
2582        if (bio->bi_status)
2583                fail_bio_stripe(rbio, bio);
2584        else
2585                set_bio_pages_uptodate(bio);
2586
2587        bio_put(bio);
2588
2589        if (!atomic_dec_and_test(&rbio->stripes_pending))
2590                return;
2591
2592        /*
2593         * this will normally call finish_parity_scrub to start our write
2594         * but if there are any failed stripes we'll reconstruct
2595         * from parity first
2596         */
2597        validate_rbio_for_parity_scrub(rbio);
2598}
2599
2600static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2601{
2602        int bios_to_read = 0;
2603        struct bio_list bio_list;
2604        int ret;
2605        int pagenr;
2606        int stripe;
2607        struct bio *bio;
2608
2609        bio_list_init(&bio_list);
2610
2611        ret = alloc_rbio_essential_pages(rbio);
2612        if (ret)
2613                goto cleanup;
2614
2615        atomic_set(&rbio->error, 0);
2616        /*
2617         * build a list of bios to read all the missing parts of this
2618         * stripe
2619         */
2620        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2621                for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2622                        struct page *page;
2623                        /*
2624                         * we want to find all the pages missing from
2625                         * the rbio and read them from the disk.  If
2626                         * page_in_rbio finds a page in the bio list
2627                         * we don't need to read it off the stripe.
2628                         */
2629                        page = page_in_rbio(rbio, stripe, pagenr, 1);
2630                        if (page)
2631                                continue;
2632
2633                        page = rbio_stripe_page(rbio, stripe, pagenr);
2634                        /*
2635                         * the bio cache may have handed us an uptodate
2636                         * page.  If so, be happy and use it
2637                         */
2638                        if (PageUptodate(page))
2639                                continue;
2640
2641                        ret = rbio_add_io_page(rbio, &bio_list, page,
2642                                       stripe, pagenr, rbio->stripe_len);
2643                        if (ret)
2644                                goto cleanup;
2645                }
2646        }
2647
2648        bios_to_read = bio_list_size(&bio_list);
2649        if (!bios_to_read) {
2650                /*
2651                 * this can happen if others have merged with
2652                 * us; it means there is nothing left to read.
2653                 * But if there are missing devices it may not be
2654                 * safe to do the full stripe write yet.
2655                 */
2656                goto finish;
2657        }
2658
2659        /*
2660         * the bbio may be freed once we submit the last bio.  Make sure
2661         * not to touch it after that
2662         */
2663        atomic_set(&rbio->stripes_pending, bios_to_read);
2664        while (1) {
2665                bio = bio_list_pop(&bio_list);
2666                if (!bio)
2667                        break;
2668
2669                bio->bi_private = rbio;
2670                bio->bi_end_io = raid56_parity_scrub_end_io;
2671                bio->bi_opf = REQ_OP_READ;
2672
2673                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2674
2675                submit_bio(bio);
2676        }
2677        /* the actual write will happen once the reads are done */
2678        return;
2679
2680cleanup:
2681        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2682
2683        while ((bio = bio_list_pop(&bio_list)))
2684                bio_put(bio);
2685
2686        return;
2687
2688finish:
2689        validate_rbio_for_parity_scrub(rbio);
2690}
2691
2692static void scrub_parity_work(struct btrfs_work *work)
2693{
2694        struct btrfs_raid_bio *rbio;
2695
2696        rbio = container_of(work, struct btrfs_raid_bio, work);
2697        raid56_parity_scrub_stripe(rbio);
2698}
2699
2700void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2701{
2702        if (!lock_stripe_add(rbio))
2703                start_async_work(rbio, scrub_parity_work);
2704}
2705
2706/* The following code is used for dev replace of a missing RAID 5/6 device. */
2707
2708struct btrfs_raid_bio *
2709raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2710                          struct btrfs_bio *bbio, u64 length)
2711{
2712        struct btrfs_raid_bio *rbio;
2713
2714        rbio = alloc_rbio(fs_info, bbio, length);
2715        if (IS_ERR(rbio))
2716                return NULL;
2717
2718        rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2719        bio_list_add(&rbio->bio_list, bio);
2720        /*
2721         * This is a special bio which is used to hold the completion handler
2722         * and make this rbio similar to the other types
2723         */
2724        ASSERT(!bio->bi_iter.bi_size);
2725
2726        rbio->faila = find_logical_bio_stripe(rbio, bio);
2727        if (rbio->faila == -1) {
2728                BUG();
2729                kfree(rbio);
2730                return NULL;
2731        }
2732
2733        /*
2734         * When we get bbio, we have already increased bio_counter, record it
2735         * so we can free it at rbio_orig_end_io()
2736         */
2737        rbio->generic_bio_cnt = 1;
2738
2739        return rbio;
2740}
2741
2742void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2743{
2744        if (!lock_stripe_add(rbio))
2745                start_async_work(rbio, read_rebuild_work);
2746}
2747