linux/fs/btrfs/raid56.c
   1// SPDX-License-Identifier: GPL-2.0
   2/*
   3 * Copyright (C) 2012 Fusion-io  All rights reserved.
   4 * Copyright (C) 2012 Intel Corp. All rights reserved.
   5 */
   6
   7#include <linux/sched.h>
   8#include <linux/bio.h>
   9#include <linux/slab.h>
  10#include <linux/blkdev.h>
  11#include <linux/raid/pq.h>
  12#include <linux/hash.h>
  13#include <linux/list_sort.h>
  14#include <linux/raid/xor.h>
  15#include <linux/mm.h>
  16#include "ctree.h"
  17#include "disk-io.h"
  18#include "volumes.h"
  19#include "raid56.h"
  20#include "async-thread.h"
  21
  22/* set when additional merges to this rbio are not allowed */
  23#define RBIO_RMW_LOCKED_BIT     1
  24
  25/*
  26 * set when this rbio is sitting in the hash, but it is just a cache
  27 * of past RMW
  28 */
  29#define RBIO_CACHE_BIT          2
  30
  31/*
  32 * set when it is safe to trust the stripe_pages for caching
  33 */
  34#define RBIO_CACHE_READY_BIT    3
  35
  36#define RBIO_CACHE_SIZE 1024
  37
  38enum btrfs_rbio_ops {
  39        BTRFS_RBIO_WRITE,
  40        BTRFS_RBIO_READ_REBUILD,
  41        BTRFS_RBIO_PARITY_SCRUB,
  42        BTRFS_RBIO_REBUILD_MISSING,
  43};
  44
  45struct btrfs_raid_bio {
  46        struct btrfs_fs_info *fs_info;
  47        struct btrfs_bio *bbio;
  48
  49        /* while we're doing rmw on a stripe
  50         * we put it into a hash table so we can
  51         * lock the stripe and merge more rbios
  52         * into it.
  53         */
  54        struct list_head hash_list;
  55
  56        /*
  57         * LRU list for the stripe cache
  58         */
  59        struct list_head stripe_cache;
  60
  61        /*
  62         * for scheduling work in the helper threads
  63         */
  64        struct btrfs_work work;
  65
  66        /*
  67         * bio list and bio_list_lock are used
  68         * to add more bios into the stripe
  69         * in hopes of avoiding the full rmw
  70         */
  71        struct bio_list bio_list;
  72        spinlock_t bio_list_lock;
  73
  74        /* also protected by the bio_list_lock, the
  75         * plug list is used by the plugging code
  76         * to collect partial bios while plugged.  The
  77         * stripe locking code also uses it to hand off
  78         * the stripe lock to the next pending IO
  79         */
  80        struct list_head plug_list;
  81
  82        /*
  83         * flags that tell us if it is safe to
  84         * merge with this bio
  85         */
  86        unsigned long flags;
  87
  88        /* size of each individual stripe on disk */
  89        int stripe_len;
  90
  91        /* number of data stripes (no p/q) */
  92        int nr_data;
  93
  94        int real_stripes;
  95
  96        int stripe_npages;
  97        /*
  98         * set if we're doing a parity rebuild
  99         * for a read from higher up, which is handled
 100         * differently from a parity rebuild as part of
 101         * rmw
 102         */
 103        enum btrfs_rbio_ops operation;
 104
 105        /* first bad stripe */
 106        int faila;
 107
 108        /* second bad stripe (for raid6 use) */
 109        int failb;
 110
 111        int scrubp;
 112        /*
 113         * number of pages needed to represent the full
 114         * stripe
 115         */
 116        int nr_pages;
 117
 118        /*
 119         * size of all the bios in the bio_list.  This
 120         * helps us decide if the rbio maps to a full
 121         * stripe or not
 122         */
 123        int bio_list_bytes;
 124
 125        int generic_bio_cnt;
 126
 127        refcount_t refs;
 128
 129        atomic_t stripes_pending;
 130
 131        atomic_t error;
 132        /*
 133         * these are two arrays of pointers.  We allocate the
 134         * rbio big enough to hold them both and setup their
 135         * locations when the rbio is allocated
 136         */
 137
 138        /* pointers to pages that we allocated for
 139         * reading/writing stripes directly from the disk (including P/Q)
 140         */
 141        struct page **stripe_pages;
 142
 143        /*
 144         * pointers to the pages in the bio_list.  Stored
 145         * here for faster lookup
 146         */
 147        struct page **bio_pages;
 148
 149        /*
 150         * bitmap to record which horizontal stripe has data
 151         */
 152        unsigned long *dbitmap;
 153
 154        /* allocated with real_stripes-many pointers for finish_*() calls */
 155        void **finish_pointers;
 156
 157        /* allocated with stripe_npages-many bits for finish_*() calls */
 158        unsigned long *finish_pbitmap;
 159};
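
    /*
     * Layout of the allocation behind an rbio (see alloc_rbio() below): the
     * struct and all of the trailing arrays live in a single chunk,
     *
     *   [struct btrfs_raid_bio][stripe_pages][bio_pages]
     *   [finish_pointers][dbitmap][finish_pbitmap]
     *
     * and the pointers above are aimed into that chunk when the rbio is
     * allocated.
     */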
 160
 161static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 162static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 163static void rmw_work(struct btrfs_work *work);
 164static void read_rebuild_work(struct btrfs_work *work);
 165static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 166static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 167static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 168static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 169static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 170
 171static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 172                                         int need_check);
 173static void scrub_parity_work(struct btrfs_work *work);
 174
 175static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
 176{
 177        btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
 178        btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
 179}
 180
 181/*
 182 * the stripe hash table is used for locking, and to collect
 183 * bios in hopes of making a full stripe
 184 */
 185int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 186{
 187        struct btrfs_stripe_hash_table *table;
 188        struct btrfs_stripe_hash_table *x;
 189        struct btrfs_stripe_hash *cur;
 190        struct btrfs_stripe_hash *h;
 191        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 192        int i;
 193        int table_size;
 194
 195        if (info->stripe_hash_table)
 196                return 0;
 197
 198        /*
 199         * The table is large, starting with order 4 and can go as high as
 200         * order 7 in case lock debugging is turned on.
 201         *
 202         * Try harder to allocate and fall back to vmalloc to lower the chance
 203         * of a failing mount.
 204         */
 205        table_size = sizeof(*table) + sizeof(*h) * num_entries;
 206        table = kvzalloc(table_size, GFP_KERNEL);
 207        if (!table)
 208                return -ENOMEM;
 209
 210        spin_lock_init(&table->cache_lock);
 211        INIT_LIST_HEAD(&table->stripe_cache);
 212
 213        h = table->table;
 214
 215        for (i = 0; i < num_entries; i++) {
 216                cur = h + i;
 217                INIT_LIST_HEAD(&cur->hash_list);
 218                spin_lock_init(&cur->lock);
 219        }
 220
 221        x = cmpxchg(&info->stripe_hash_table, NULL, table);
 222        if (x)
 223                kvfree(x);
 224        return 0;
 225}
 226
 227/*
 228 * caching an rbio means copying anything from the
 229 * bio_pages array into the stripe_pages array.  We
 230 * use the page uptodate bit in the stripe cache array
 231 * to indicate if it has valid data.
 232 *
 233 * Once the caching is done, we set the cache ready
 234 * bit.
 235 */
 236static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 237{
 238        int i;
 239        char *s;
 240        char *d;
 241        int ret;
 242
 243        ret = alloc_rbio_pages(rbio);
 244        if (ret)
 245                return;
 246
 247        for (i = 0; i < rbio->nr_pages; i++) {
 248                if (!rbio->bio_pages[i])
 249                        continue;
 250
 251                s = kmap(rbio->bio_pages[i]);
 252                d = kmap(rbio->stripe_pages[i]);
 253
 254                copy_page(d, s);
 255
 256                kunmap(rbio->bio_pages[i]);
 257                kunmap(rbio->stripe_pages[i]);
 258                SetPageUptodate(rbio->stripe_pages[i]);
 259        }
 260        set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 261}
 262
 263/*
 264 * we hash on the first logical address of the stripe
 265 */
 266static int rbio_bucket(struct btrfs_raid_bio *rbio)
 267{
 268        u64 num = rbio->bbio->raid_map[0];
 269
 270        /*
 271         * we shift down quite a bit.  We're using byte
 272         * addressing, and most of the lower bits are zeros.
 273         * This tends to upset hash_64, and it consistently
 274         * returns just one or two different values.
 275         *
 276         * shifting off the lower bits fixes things.
 277         */
 278        return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 279}
 280
 281/*
 282 * stealing an rbio means taking all the uptodate pages from the stripe
 283 * array in the source rbio and putting them into the destination rbio
 284 */
 285static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 286{
 287        int i;
 288        struct page *s;
 289        struct page *d;
 290
 291        if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 292                return;
 293
 294        for (i = 0; i < dest->nr_pages; i++) {
 295                s = src->stripe_pages[i];
 296                if (!s || !PageUptodate(s)) {
 297                        continue;
 298                }
 299
 300                d = dest->stripe_pages[i];
 301                if (d)
 302                        __free_page(d);
 303
 304                dest->stripe_pages[i] = s;
 305                src->stripe_pages[i] = NULL;
 306        }
 307}
 308
 309/*
 310 * merging means we take the bio_list from the victim and
 311 * splice it into the destination.  The victim should
 312 * be discarded afterwards.
 313 *
 314 * must be called with dest->bio_list_lock held
 315 */
 316static void merge_rbio(struct btrfs_raid_bio *dest,
 317                       struct btrfs_raid_bio *victim)
 318{
 319        bio_list_merge(&dest->bio_list, &victim->bio_list);
 320        dest->bio_list_bytes += victim->bio_list_bytes;
 321        dest->generic_bio_cnt += victim->generic_bio_cnt;
 322        bio_list_init(&victim->bio_list);
 323}
 324
 325/*
 326 * used to prune items that are in the cache.  The caller
 327 * must hold the hash table lock.
 328 */
 329static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 330{
 331        int bucket = rbio_bucket(rbio);
 332        struct btrfs_stripe_hash_table *table;
 333        struct btrfs_stripe_hash *h;
 334        int freeit = 0;
 335
 336        /*
 337         * check the bit again under the hash table lock.
 338         */
 339        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 340                return;
 341
 342        table = rbio->fs_info->stripe_hash_table;
 343        h = table->table + bucket;
 344
 345        /* hold the lock for the bucket because we may be
 346         * removing it from the hash table
 347         */
 348        spin_lock(&h->lock);
 349
 350        /*
 351         * hold the lock for the bio list because we need
 352         * to make sure the bio list is empty
 353         */
 354        spin_lock(&rbio->bio_list_lock);
 355
 356        if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 357                list_del_init(&rbio->stripe_cache);
 358                table->cache_size -= 1;
 359                freeit = 1;
 360
 361                /* if the bio list isn't empty, this rbio is
 362                 * still involved in an IO.  We take it out
 363                 * of the cache list, and drop the ref that
 364                 * was held for the list.
 365                 *
 366                 * If the bio_list was empty, we also remove
 367                 * the rbio from the hash_table, and drop
 368                 * the corresponding ref
 369                 */
 370                if (bio_list_empty(&rbio->bio_list)) {
 371                        if (!list_empty(&rbio->hash_list)) {
 372                                list_del_init(&rbio->hash_list);
 373                                refcount_dec(&rbio->refs);
 374                                BUG_ON(!list_empty(&rbio->plug_list));
 375                        }
 376                }
 377        }
 378
 379        spin_unlock(&rbio->bio_list_lock);
 380        spin_unlock(&h->lock);
 381
 382        if (freeit)
 383                __free_raid_bio(rbio);
 384}
 385
 386/*
 387 * prune a given rbio from the cache
 388 */
 389static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 390{
 391        struct btrfs_stripe_hash_table *table;
 392        unsigned long flags;
 393
 394        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 395                return;
 396
 397        table = rbio->fs_info->stripe_hash_table;
 398
 399        spin_lock_irqsave(&table->cache_lock, flags);
 400        __remove_rbio_from_cache(rbio);
 401        spin_unlock_irqrestore(&table->cache_lock, flags);
 402}
 403
 404/*
 405 * remove everything in the cache
 406 */
 407static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 408{
 409        struct btrfs_stripe_hash_table *table;
 410        unsigned long flags;
 411        struct btrfs_raid_bio *rbio;
 412
 413        table = info->stripe_hash_table;
 414
 415        spin_lock_irqsave(&table->cache_lock, flags);
 416        while (!list_empty(&table->stripe_cache)) {
 417                rbio = list_entry(table->stripe_cache.next,
 418                                  struct btrfs_raid_bio,
 419                                  stripe_cache);
 420                __remove_rbio_from_cache(rbio);
 421        }
 422        spin_unlock_irqrestore(&table->cache_lock, flags);
 423}
 424
 425/*
 426 * remove all cached entries and free the hash table
 427 * used by unmount
 428 */
 429void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 430{
 431        if (!info->stripe_hash_table)
 432                return;
 433        btrfs_clear_rbio_cache(info);
 434        kvfree(info->stripe_hash_table);
 435        info->stripe_hash_table = NULL;
 436}
 437
 438/*
 439 * insert an rbio into the stripe cache.  It
 440 * must have already been prepared by calling
 441 * cache_rbio_pages
 442 *
 443 * If this rbio was already cached, it gets
 444 * moved to the front of the lru.
 445 *
 446 * If the size of the rbio cache is too big, we
 447 * prune an item.
 448 */
 449static void cache_rbio(struct btrfs_raid_bio *rbio)
 450{
 451        struct btrfs_stripe_hash_table *table;
 452        unsigned long flags;
 453
 454        if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 455                return;
 456
 457        table = rbio->fs_info->stripe_hash_table;
 458
 459        spin_lock_irqsave(&table->cache_lock, flags);
 460        spin_lock(&rbio->bio_list_lock);
 461
 462        /* bump our ref if we were not in the list before */
 463        if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 464                refcount_inc(&rbio->refs);
 465
 466        if (!list_empty(&rbio->stripe_cache)) {
 467                list_move(&rbio->stripe_cache, &table->stripe_cache);
 468        } else {
 469                list_add(&rbio->stripe_cache, &table->stripe_cache);
 470                table->cache_size += 1;
 471        }
 472
 473        spin_unlock(&rbio->bio_list_lock);
 474
 475        if (table->cache_size > RBIO_CACHE_SIZE) {
 476                struct btrfs_raid_bio *found;
 477
 478                found = list_entry(table->stripe_cache.prev,
 479                                  struct btrfs_raid_bio,
 480                                  stripe_cache);
 481
 482                if (found != rbio)
 483                        __remove_rbio_from_cache(found);
 484        }
 485
 486        spin_unlock_irqrestore(&table->cache_lock, flags);
 487}
 488
 489/*
 490 * helper function to run the xor_blocks api.  It is only
 491 * able to do MAX_XOR_BLOCKS at a time, so we need to
 492 * loop through.
 493 */
 494static void run_xor(void **pages, int src_cnt, ssize_t len)
 495{
 496        int src_off = 0;
 497        int xor_src_cnt = 0;
 498        void *dest = pages[src_cnt];
 499
 500        while (src_cnt > 0) {
 501                xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 502                xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 503
 504                src_cnt -= xor_src_cnt;
 505                src_off += xor_src_cnt;
 506        }
 507}
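
    /*
     * Typical RAID5 use (see finish_rmw() below): with pointers[0] through
     * pointers[nr_data - 1] mapped to the data pages and pointers[nr_data]
     * to the parity page, the parity is built by
     *
     *   copy_page(pointers[nr_data], pointers[0]);
     *   run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
     *
     * i.e. P = D0 ^ D1 ^ ... ^ D(nr_data - 1), fed to xor_blocks() at most
     * MAX_XOR_BLOCKS sources at a time.
     */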
 508
 509/*
 510 * Returns true if the bio list inside this rbio covers an entire stripe (no
 511 * rmw required).
 512 */
 513static int rbio_is_full(struct btrfs_raid_bio *rbio)
 514{
 515        unsigned long flags;
 516        unsigned long size = rbio->bio_list_bytes;
 517        int ret = 1;
 518
 519        spin_lock_irqsave(&rbio->bio_list_lock, flags);
 520        if (size != rbio->nr_data * rbio->stripe_len)
 521                ret = 0;
 522        BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 523        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 524
 525        return ret;
 526}
 527
 528/*
 529 * returns 1 if it is safe to merge two rbios together.
 530 * The merging is safe if the two rbios correspond to
 531 * the same stripe and if they are both going in the same
 532 * direction (read vs write), and if neither one is
 533 * locked for final IO
 534 *
 535 * The caller is responsible for locking such that
 536 * rmw_locked is safe to test
 537 */
 538static int rbio_can_merge(struct btrfs_raid_bio *last,
 539                          struct btrfs_raid_bio *cur)
 540{
 541        if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 542            test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 543                return 0;
 544
 545        /*
 546         * we can't merge with cached rbios, since the
 547         * idea is that when we merge the destination
 548         * rbio is going to run our IO for us.  We can
 549         * steal from cached rbios though, other functions
 550         * handle that.
 551         */
 552        if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 553            test_bit(RBIO_CACHE_BIT, &cur->flags))
 554                return 0;
 555
 556        if (last->bbio->raid_map[0] !=
 557            cur->bbio->raid_map[0])
 558                return 0;
 559
 560        /* we can't merge with different operations */
 561        if (last->operation != cur->operation)
 562                return 0;
 563        /*
 564         * We've read the full stripe from the drive to check and
 565         * repair the parity and write the new results.
 566         *
 567         * We're not allowed to add any new bios to the
 568         * bio list here, anyone else that wants to
 569         * change this stripe needs to do their own rmw.
 570         */
 571        if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
 572                return 0;
 573
 574        if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
 575                return 0;
 576
 577        if (last->operation == BTRFS_RBIO_READ_REBUILD) {
 578                int fa = last->faila;
 579                int fb = last->failb;
 580                int cur_fa = cur->faila;
 581                int cur_fb = cur->failb;
 582
 583                if (last->faila >= last->failb) {
 584                        fa = last->failb;
 585                        fb = last->faila;
 586                }
 587
 588                if (cur->faila >= cur->failb) {
 589                        cur_fa = cur->failb;
 590                        cur_fb = cur->faila;
 591                }
 592
 593                if (fa != cur_fa || fb != cur_fb)
 594                        return 0;
 595        }
 596        return 1;
 597}
 598
 599static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
 600                                  int index)
 601{
 602        return stripe * rbio->stripe_npages + index;
 603}
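
    /*
     * stripe_pages[] is laid out stripe by stripe, each stripe contributing
     * stripe_npages consecutive entries:
     *
     *   [data 0][data 1] ... [data nr_data - 1][P][Q (raid6 only)]
     *
     * so page 'index' of stripe 'stripe' sits at
     * stripe * stripe_npages + index.
     */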
 604
 605/*
 606 * these are just the pages from the rbio array, not from anything
 607 * the FS sent down to us
 608 */
 609static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
 610                                     int index)
 611{
 612        return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
 613}
 614
 615/*
 616 * helper to index into the pstripe
 617 */
 618static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 619{
 620        return rbio_stripe_page(rbio, rbio->nr_data, index);
 621}
 622
 623/*
 624 * helper to index into the qstripe, returns NULL
 625 * if there is no qstripe
 626 */
 627static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 628{
 629        if (rbio->nr_data + 1 == rbio->real_stripes)
 630                return NULL;
 631        return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
 632}
 633
 634/*
 635 * The first stripe in the table for a logical address
 636 * has the lock.  rbios are added in one of three ways:
 637 *
 638 * 1) Nobody has the stripe locked yet.  The rbio is given
 639 * the lock and 0 is returned.  The caller must start the IO
 640 * themselves.
 641 *
 642 * 2) Someone has the stripe locked, but we're able to merge
 643 * with the lock owner.  The rbio is freed and the IO will
 644 * start automatically along with the existing rbio.  1 is returned.
 645 *
 646 * 3) Someone has the stripe locked, but we're not able to merge.
 647 * The rbio is added to the lock owner's plug list, or merged into
 648 * an rbio already on the plug list.  When the lock owner unlocks,
 649 * the next rbio on the list is run and the IO is started automatically.
 650 * 1 is returned
 651 *
 652 * If we return 0, the caller still owns the rbio and must continue with
 653 * IO submission.  If we return 1, the caller must assume the rbio has
 654 * already been freed.
 655 */
 656static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 657{
 658        int bucket = rbio_bucket(rbio);
 659        struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 660        struct btrfs_raid_bio *cur;
 661        struct btrfs_raid_bio *pending;
 662        unsigned long flags;
 663        struct btrfs_raid_bio *freeit = NULL;
 664        struct btrfs_raid_bio *cache_drop = NULL;
 665        int ret = 0;
 666
 667        spin_lock_irqsave(&h->lock, flags);
 668        list_for_each_entry(cur, &h->hash_list, hash_list) {
 669                if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
 670                        spin_lock(&cur->bio_list_lock);
 671
 672                        /* can we steal this cached rbio's pages? */
 673                        if (bio_list_empty(&cur->bio_list) &&
 674                            list_empty(&cur->plug_list) &&
 675                            test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 676                            !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 677                                list_del_init(&cur->hash_list);
 678                                refcount_dec(&cur->refs);
 679
 680                                steal_rbio(cur, rbio);
 681                                cache_drop = cur;
 682                                spin_unlock(&cur->bio_list_lock);
 683
 684                                goto lockit;
 685                        }
 686
 687                        /* can we merge into the lock owner? */
 688                        if (rbio_can_merge(cur, rbio)) {
 689                                merge_rbio(cur, rbio);
 690                                spin_unlock(&cur->bio_list_lock);
 691                                freeit = rbio;
 692                                ret = 1;
 693                                goto out;
 694                        }
 695
 696
 697                        /*
 698                         * we couldn't merge with the running
 699                         * rbio, see if we can merge with the
 700                         * pending ones.  We don't have to
 701                         * check for rmw_locked because there
 702                         * is no way they are inside finish_rmw
 703                         * right now
 704                         */
 705                        list_for_each_entry(pending, &cur->plug_list,
 706                                            plug_list) {
 707                                if (rbio_can_merge(pending, rbio)) {
 708                                        merge_rbio(pending, rbio);
 709                                        spin_unlock(&cur->bio_list_lock);
 710                                        freeit = rbio;
 711                                        ret = 1;
 712                                        goto out;
 713                                }
 714                        }
 715
 716                        /* no merging, put us on the tail of the plug list,
 717                         * our rbio will be started when the currently
 718                         * running rbio unlocks
 719                         */
 720                        list_add_tail(&rbio->plug_list, &cur->plug_list);
 721                        spin_unlock(&cur->bio_list_lock);
 722                        ret = 1;
 723                        goto out;
 724                }
 725        }
 726lockit:
 727        refcount_inc(&rbio->refs);
 728        list_add(&rbio->hash_list, &h->hash_list);
 729out:
 730        spin_unlock_irqrestore(&h->lock, flags);
 731        if (cache_drop)
 732                remove_rbio_from_cache(cache_drop);
 733        if (freeit)
 734                __free_raid_bio(freeit);
 735        return ret;
 736}
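
    /*
     * Caller pattern for the return value (see full_stripe_write() and
     * partial_stripe_write() below):
     *
     *   ret = lock_stripe_add(rbio);
     *   if (ret == 0)
     *           finish_rmw(rbio);
     *
     * A zero return means we own the stripe lock and must start the IO
     * ourselves; a return of 1 means the rbio was merged or queued on a
     * plug list and must not be touched again.
     */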
 737
 738/*
 739 * called as rmw or parity rebuild is completed.  If the plug list has more
 740 * rbios waiting for this stripe, the next one on the list will be started
 741 */
 742static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 743{
 744        int bucket;
 745        struct btrfs_stripe_hash *h;
 746        unsigned long flags;
 747        int keep_cache = 0;
 748
 749        bucket = rbio_bucket(rbio);
 750        h = rbio->fs_info->stripe_hash_table->table + bucket;
 751
 752        if (list_empty(&rbio->plug_list))
 753                cache_rbio(rbio);
 754
 755        spin_lock_irqsave(&h->lock, flags);
 756        spin_lock(&rbio->bio_list_lock);
 757
 758        if (!list_empty(&rbio->hash_list)) {
 759                /*
 760                 * if we're still cached and there is no other IO
 761                 * to perform, just leave this rbio here for others
 762                 * to steal from later
 763                 */
 764                if (list_empty(&rbio->plug_list) &&
 765                    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 766                        keep_cache = 1;
 767                        clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 768                        BUG_ON(!bio_list_empty(&rbio->bio_list));
 769                        goto done;
 770                }
 771
 772                list_del_init(&rbio->hash_list);
 773                refcount_dec(&rbio->refs);
 774
 775                /*
 776                 * we use the plug list to hold all the rbios
 777                 * waiting for the chance to lock this stripe.
 778                 * hand the lock over to one of them.
 779                 */
 780                if (!list_empty(&rbio->plug_list)) {
 781                        struct btrfs_raid_bio *next;
 782                        struct list_head *head = rbio->plug_list.next;
 783
 784                        next = list_entry(head, struct btrfs_raid_bio,
 785                                          plug_list);
 786
 787                        list_del_init(&rbio->plug_list);
 788
 789                        list_add(&next->hash_list, &h->hash_list);
 790                        refcount_inc(&next->refs);
 791                        spin_unlock(&rbio->bio_list_lock);
 792                        spin_unlock_irqrestore(&h->lock, flags);
 793
 794                        if (next->operation == BTRFS_RBIO_READ_REBUILD)
 795                                start_async_work(next, read_rebuild_work);
 796                        else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
 797                                steal_rbio(rbio, next);
 798                                start_async_work(next, read_rebuild_work);
 799                        } else if (next->operation == BTRFS_RBIO_WRITE) {
 800                                steal_rbio(rbio, next);
 801                                start_async_work(next, rmw_work);
 802                        } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 803                                steal_rbio(rbio, next);
 804                                start_async_work(next, scrub_parity_work);
 805                        }
 806
 807                        goto done_nolock;
 808                }
 809        }
 810done:
 811        spin_unlock(&rbio->bio_list_lock);
 812        spin_unlock_irqrestore(&h->lock, flags);
 813
 814done_nolock:
 815        if (!keep_cache)
 816                remove_rbio_from_cache(rbio);
 817}
 818
 819static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 820{
 821        int i;
 822
 823        if (!refcount_dec_and_test(&rbio->refs))
 824                return;
 825
 826        WARN_ON(!list_empty(&rbio->stripe_cache));
 827        WARN_ON(!list_empty(&rbio->hash_list));
 828        WARN_ON(!bio_list_empty(&rbio->bio_list));
 829
 830        for (i = 0; i < rbio->nr_pages; i++) {
 831                if (rbio->stripe_pages[i]) {
 832                        __free_page(rbio->stripe_pages[i]);
 833                        rbio->stripe_pages[i] = NULL;
 834                }
 835        }
 836
 837        btrfs_put_bbio(rbio->bbio);
 838        kfree(rbio);
 839}
 840
 841static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 842{
 843        struct bio *next;
 844
 845        while (cur) {
 846                next = cur->bi_next;
 847                cur->bi_next = NULL;
 848                cur->bi_status = err;
 849                bio_endio(cur);
 850                cur = next;
 851        }
 852}
 853
 854/*
 855 * this frees the rbio and runs through all the bios in the
 856 * bio_list and calls end_io on them
 857 */
 858static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 859{
 860        struct bio *cur = bio_list_get(&rbio->bio_list);
 861        struct bio *extra;
 862
 863        if (rbio->generic_bio_cnt)
 864                btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
 865
 866        /*
 867         * At this moment, rbio->bio_list is empty, however since rbio does not
 868         * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
 869         * hash list, rbio may be merged with others so that rbio->bio_list
 870         * becomes non-empty.
 871         * Once unlock_stripe() is done, rbio->bio_list will not be updated any
 872         * more and we can call bio_endio() on all queued bios.
 873         */
 874        unlock_stripe(rbio);
 875        extra = bio_list_get(&rbio->bio_list);
 876        __free_raid_bio(rbio);
 877
 878        rbio_endio_bio_list(cur, err);
 879        if (extra)
 880                rbio_endio_bio_list(extra, err);
 881}
 882
 883/*
 884 * end io function used by finish_rmw.  When we finally
 885 * get here, we've written a full stripe
 886 */
 887static void raid_write_end_io(struct bio *bio)
 888{
 889        struct btrfs_raid_bio *rbio = bio->bi_private;
 890        blk_status_t err = bio->bi_status;
 891        int max_errors;
 892
 893        if (err)
 894                fail_bio_stripe(rbio, bio);
 895
 896        bio_put(bio);
 897
 898        if (!atomic_dec_and_test(&rbio->stripes_pending))
 899                return;
 900
 901        err = BLK_STS_OK;
 902
 903        /* OK, we have written all the stripes we need to. */
 904        max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
 905                     0 : rbio->bbio->max_errors;
 906        if (atomic_read(&rbio->error) > max_errors)
 907                err = BLK_STS_IOERR;
 908
 909        rbio_orig_end_io(rbio, err);
 910}
 911
 912/*
 913 * the read/modify/write code wants to use the original bio for
 914 * any pages it included, and then use the rbio for everything
 915 * else.  This function decides if a given index (stripe number)
 916 * and page number in that stripe fall inside the original bio
 917 * or the rbio.
 918 *
 919 * if you set bio_list_only, you'll get a NULL back for any ranges
 920 * that are outside the bio_list
 921 *
 922 * This doesn't take any refs on anything, you get a bare page pointer
 923 * and the caller must bump refs as required.
 924 *
 925 * You must call index_rbio_pages once before you can trust
 926 * the answers from this function.
 927 */
 928static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 929                                 int index, int pagenr, int bio_list_only)
 930{
 931        int chunk_page;
 932        struct page *p = NULL;
 933
 934        chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 935
 936        spin_lock_irq(&rbio->bio_list_lock);
 937        p = rbio->bio_pages[chunk_page];
 938        spin_unlock_irq(&rbio->bio_list_lock);
 939
 940        if (p || bio_list_only)
 941                return p;
 942
 943        return rbio->stripe_pages[chunk_page];
 944}
 945
 946/*
 947 * number of pages we need for the entire stripe across all the
 948 * drives
 949 */
 950static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 951{
 952        return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
 953}
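
    /*
     * e.g. a 64KiB stripe_len with 4KiB pages is 16 pages per stripe, so a
     * three device RAID5 full stripe (two data stripes plus P) needs 48
     * pages in total.
     */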
 954
 955/*
 956 * allocation and initial setup for the btrfs_raid_bio.  Note that
 957 * this does not allocate any pages for rbio->stripe_pages.
 958 */
 959static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 960                                         struct btrfs_bio *bbio,
 961                                         u64 stripe_len)
 962{
 963        struct btrfs_raid_bio *rbio;
 964        int nr_data = 0;
 965        int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
 966        int num_pages = rbio_nr_pages(stripe_len, real_stripes);
 967        int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
 968        void *p;
 969
 970        rbio = kzalloc(sizeof(*rbio) +
 971                       sizeof(*rbio->stripe_pages) * num_pages +
 972                       sizeof(*rbio->bio_pages) * num_pages +
 973                       sizeof(*rbio->finish_pointers) * real_stripes +
 974                       sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
 975                       sizeof(*rbio->finish_pbitmap) *
 976                                BITS_TO_LONGS(stripe_npages),
 977                       GFP_NOFS);
 978        if (!rbio)
 979                return ERR_PTR(-ENOMEM);
 980
 981        bio_list_init(&rbio->bio_list);
 982        INIT_LIST_HEAD(&rbio->plug_list);
 983        spin_lock_init(&rbio->bio_list_lock);
 984        INIT_LIST_HEAD(&rbio->stripe_cache);
 985        INIT_LIST_HEAD(&rbio->hash_list);
 986        rbio->bbio = bbio;
 987        rbio->fs_info = fs_info;
 988        rbio->stripe_len = stripe_len;
 989        rbio->nr_pages = num_pages;
 990        rbio->real_stripes = real_stripes;
 991        rbio->stripe_npages = stripe_npages;
 992        rbio->faila = -1;
 993        rbio->failb = -1;
 994        refcount_set(&rbio->refs, 1);
 995        atomic_set(&rbio->error, 0);
 996        atomic_set(&rbio->stripes_pending, 0);
 997
 998        /*
 999         * the stripe_pages, bio_pages, etc arrays point to the extra
1000         * memory we allocated past the end of the rbio
1001         */
1002        p = rbio + 1;
1003#define CONSUME_ALLOC(ptr, count)       do {                            \
1004                ptr = p;                                                \
1005                p = (unsigned char *)p + sizeof(*(ptr)) * (count);      \
1006        } while (0)
1007        CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1008        CONSUME_ALLOC(rbio->bio_pages, num_pages);
1009        CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1010        CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1011        CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1012#undef  CONSUME_ALLOC
1013
1014        if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1015                nr_data = real_stripes - 1;
1016        else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1017                nr_data = real_stripes - 2;
1018        else
1019                BUG();
1020
1021        rbio->nr_data = nr_data;
1022        return rbio;
1023}
1024
1025/* allocate pages for all the stripes in the bio, including parity */
1026static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1027{
1028        int i;
1029        struct page *page;
1030
1031        for (i = 0; i < rbio->nr_pages; i++) {
1032                if (rbio->stripe_pages[i])
1033                        continue;
1034                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1035                if (!page)
1036                        return -ENOMEM;
1037                rbio->stripe_pages[i] = page;
1038        }
1039        return 0;
1040}
1041
1042/* only allocate pages for p/q stripes */
1043static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1044{
1045        int i;
1046        struct page *page;
1047
1048        i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1049
1050        for (; i < rbio->nr_pages; i++) {
1051                if (rbio->stripe_pages[i])
1052                        continue;
1053                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1054                if (!page)
1055                        return -ENOMEM;
1056                rbio->stripe_pages[i] = page;
1057        }
1058        return 0;
1059}
1060
1061/*
1062 * add a single page from a specific stripe into our list of bios for IO.
1063 * This will try to merge into existing bios if possible, and returns
1064 * zero if all went well.
1065 */
1066static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1067                            struct bio_list *bio_list,
1068                            struct page *page,
1069                            int stripe_nr,
1070                            unsigned long page_index,
1071                            unsigned long bio_max_len)
1072{
1073        struct bio *last = bio_list->tail;
1074        u64 last_end = 0;
1075        int ret;
1076        struct bio *bio;
1077        struct btrfs_bio_stripe *stripe;
1078        u64 disk_start;
1079
1080        stripe = &rbio->bbio->stripes[stripe_nr];
1081        disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1082
1083        /* if the device is missing, just fail this stripe */
1084        if (!stripe->dev->bdev)
1085                return fail_rbio_index(rbio, stripe_nr);
1086
1087        /* see if we can add this page onto our existing bio */
1088        if (last) {
1089                last_end = (u64)last->bi_iter.bi_sector << 9;
1090                last_end += last->bi_iter.bi_size;
1091
1092                /*
1093                 * we can't merge these if they are from different
1094                 * devices or if they are not contiguous
1095                 */
1096                if (last_end == disk_start && stripe->dev->bdev &&
1097                    !last->bi_status &&
1098                    last->bi_disk == stripe->dev->bdev->bd_disk &&
1099                    last->bi_partno == stripe->dev->bdev->bd_partno) {
1100                        ret = bio_add_page(last, page, PAGE_SIZE, 0);
1101                        if (ret == PAGE_SIZE)
1102                                return 0;
1103                }
1104        }
1105
1106        /* put a new bio on the list */
1107        bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1108        bio->bi_iter.bi_size = 0;
1109        bio_set_dev(bio, stripe->dev->bdev);
1110        bio->bi_iter.bi_sector = disk_start >> 9;
1111
1112        bio_add_page(bio, page, PAGE_SIZE, 0);
1113        bio_list_add(bio_list, bio);
1114        return 0;
1115}
1116
1117/*
1118 * while we're doing the read/modify/write cycle, we could
1119 * have errors in reading pages off the disk.  This checks
1120 * for errors and if we're not able to read the page it'll
1121 * trigger parity reconstruction.  The rmw will be finished
1122 * after we've reconstructed the failed stripes
1123 */
1124static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1125{
1126        if (rbio->faila >= 0 || rbio->failb >= 0) {
1127                BUG_ON(rbio->faila == rbio->real_stripes - 1);
1128                __raid56_parity_recover(rbio);
1129        } else {
1130                finish_rmw(rbio);
1131        }
1132}
1133
1134/*
1135 * helper function to walk our bio list and populate the bio_pages array with
1136 * the result.  This seems expensive, but it is faster than constantly
1137 * searching through the bio list as we set up the IO in finish_rmw or stripe
1138 * reconstruction.
1139 *
1140 * This must be called before you trust the answers from page_in_rbio
1141 */
1142static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1143{
1144        struct bio *bio;
1145        u64 start;
1146        unsigned long stripe_offset;
1147        unsigned long page_index;
1148
1149        spin_lock_irq(&rbio->bio_list_lock);
1150        bio_list_for_each(bio, &rbio->bio_list) {
1151                struct bio_vec bvec;
1152                struct bvec_iter iter;
1153                int i = 0;
1154
1155                start = (u64)bio->bi_iter.bi_sector << 9;
1156                stripe_offset = start - rbio->bbio->raid_map[0];
1157                page_index = stripe_offset >> PAGE_SHIFT;
1158
1159                if (bio_flagged(bio, BIO_CLONED))
1160                        bio->bi_iter = btrfs_io_bio(bio)->iter;
1161
1162                bio_for_each_segment(bvec, bio, iter) {
1163                        rbio->bio_pages[page_index + i] = bvec.bv_page;
1164                        i++;
1165                }
1166        }
1167        spin_unlock_irq(&rbio->bio_list_lock);
1168}
1169
1170/*
1171 * this is called in one of two situations.  We either
1172 * have a full stripe from the higher layers, or we've read all
1173 * the missing bits off disk.
1174 *
1175 * This will calculate the parity and then send down any
1176 * changed blocks.
1177 */
1178static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1179{
1180        struct btrfs_bio *bbio = rbio->bbio;
1181        void **pointers = rbio->finish_pointers;
1182        int nr_data = rbio->nr_data;
1183        int stripe;
1184        int pagenr;
1185        int p_stripe = -1;
1186        int q_stripe = -1;
1187        struct bio_list bio_list;
1188        struct bio *bio;
1189        int ret;
1190
1191        bio_list_init(&bio_list);
1192
1193        if (rbio->real_stripes - rbio->nr_data == 1) {
1194                p_stripe = rbio->real_stripes - 1;
1195        } else if (rbio->real_stripes - rbio->nr_data == 2) {
1196                p_stripe = rbio->real_stripes - 2;
1197                q_stripe = rbio->real_stripes - 1;
1198        } else {
1199                BUG();
1200        }
1201
1202        /* at this point we either have a full stripe,
1203         * or we've read the full stripe from the drive.
1204         * recalculate the parity and write the new results.
1205         *
1206         * We're not allowed to add any new bios to the
1207         * bio list here, anyone else that wants to
1208         * change this stripe needs to do their own rmw.
1209         */
1210        spin_lock_irq(&rbio->bio_list_lock);
1211        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1212        spin_unlock_irq(&rbio->bio_list_lock);
1213
1214        atomic_set(&rbio->error, 0);
1215
1216        /*
1217         * now that we've set rmw_locked, run through the
1218         * bio list one last time and map the page pointers
1219         *
1220         * We don't cache full rbios because we're assuming
1221         * the higher layers are unlikely to use this area of
1222         * the disk again soon.  If they do use it again,
1223         * hopefully they will send another full bio.
1224         */
1225        index_rbio_pages(rbio);
1226        if (!rbio_is_full(rbio))
1227                cache_rbio_pages(rbio);
1228        else
1229                clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1230
1231        for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1232                struct page *p;
1233                /* first collect one page from each data stripe */
1234                for (stripe = 0; stripe < nr_data; stripe++) {
1235                        p = page_in_rbio(rbio, stripe, pagenr, 0);
1236                        pointers[stripe] = kmap(p);
1237                }
1238
1239                /* then add the parity stripe */
1240                p = rbio_pstripe_page(rbio, pagenr);
1241                SetPageUptodate(p);
1242                pointers[stripe++] = kmap(p);
1243
1244                if (q_stripe != -1) {
1245
1246                        /*
1247                         * raid6, add the qstripe and call the
1248                         * library function to fill in our p/q
1249                         */
1250                        p = rbio_qstripe_page(rbio, pagenr);
1251                        SetPageUptodate(p);
1252                        pointers[stripe++] = kmap(p);
1253
1254                        raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1255                                                pointers);
1256                } else {
1257                        /* raid5 */
1258                        copy_page(pointers[nr_data], pointers[0]);
1259                        run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1260                }
1261
1262
1263                for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1264                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1265        }
1266
1267        /*
1268         * time to start writing.  Make bios for everything from the
1269         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1270         * everything else.
1271         */
1272        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1273                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1274                        struct page *page;
1275                        if (stripe < rbio->nr_data) {
1276                                page = page_in_rbio(rbio, stripe, pagenr, 1);
1277                                if (!page)
1278                                        continue;
1279                        } else {
1280                               page = rbio_stripe_page(rbio, stripe, pagenr);
1281                        }
1282
1283                        ret = rbio_add_io_page(rbio, &bio_list,
1284                                       page, stripe, pagenr, rbio->stripe_len);
1285                        if (ret)
1286                                goto cleanup;
1287                }
1288        }
1289
1290        if (likely(!bbio->num_tgtdevs))
1291                goto write_data;
1292
1293        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1294                if (!bbio->tgtdev_map[stripe])
1295                        continue;
1296
1297                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1298                        struct page *page;
1299                        if (stripe < rbio->nr_data) {
1300                                page = page_in_rbio(rbio, stripe, pagenr, 1);
1301                                if (!page)
1302                                        continue;
1303                        } else {
1304                               page = rbio_stripe_page(rbio, stripe, pagenr);
1305                        }
1306
1307                        ret = rbio_add_io_page(rbio, &bio_list, page,
1308                                               rbio->bbio->tgtdev_map[stripe],
1309                                               pagenr, rbio->stripe_len);
1310                        if (ret)
1311                                goto cleanup;
1312                }
1313        }
1314
1315write_data:
1316        atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1317        BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1318
1319        while (1) {
1320                bio = bio_list_pop(&bio_list);
1321                if (!bio)
1322                        break;
1323
1324                bio->bi_private = rbio;
1325                bio->bi_end_io = raid_write_end_io;
1326                bio->bi_opf = REQ_OP_WRITE;
1327
1328                submit_bio(bio);
1329        }
1330        return;
1331
1332cleanup:
1333        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1334
1335        while ((bio = bio_list_pop(&bio_list)))
1336                bio_put(bio);
1337}
1338
1339/*
1340 * helper to find the stripe number for a given bio.  Used to figure out which
1341 * stripe has failed.  This expects the bio to correspond to a physical disk,
1342 * so it looks up based on physical sector numbers.
1343 */
1344static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1345                           struct bio *bio)
1346{
1347        u64 physical = bio->bi_iter.bi_sector;
1348        u64 stripe_start;
1349        int i;
1350        struct btrfs_bio_stripe *stripe;
1351
1352        physical <<= 9;
1353
1354        for (i = 0; i < rbio->bbio->num_stripes; i++) {
1355                stripe = &rbio->bbio->stripes[i];
1356                stripe_start = stripe->physical;
1357                if (physical >= stripe_start &&
1358                    physical < stripe_start + rbio->stripe_len &&
1359                    stripe->dev->bdev &&
1360                    bio->bi_disk == stripe->dev->bdev->bd_disk &&
1361                    bio->bi_partno == stripe->dev->bdev->bd_partno) {
1362                        return i;
1363                }
1364        }
1365        return -1;
1366}
1367
1368/*
1369 * helper to find the stripe number for a given
1370 * bio (before mapping).  Used to figure out which stripe has
1371 * failed.  This looks up based on logical block numbers.
1372 */
1373static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1374                                   struct bio *bio)
1375{
1376        u64 logical = bio->bi_iter.bi_sector;
1377        u64 stripe_start;
1378        int i;
1379
1380        logical <<= 9;
1381
1382        for (i = 0; i < rbio->nr_data; i++) {
1383                stripe_start = rbio->bbio->raid_map[i];
1384                if (logical >= stripe_start &&
1385                    logical < stripe_start + rbio->stripe_len) {
1386                        return i;
1387                }
1388        }
1389        return -1;
1390}
1391
1392/*
1393 * returns -EIO if we had too many failures
1394 */
1395static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1396{
1397        unsigned long flags;
1398        int ret = 0;
1399
1400        spin_lock_irqsave(&rbio->bio_list_lock, flags);
1401
1402        /* we already know this stripe is bad, move on */
1403        if (rbio->faila == failed || rbio->failb == failed)
1404                goto out;
1405
1406        if (rbio->faila == -1) {
1407                /* first failure on this rbio */
1408                rbio->faila = failed;
1409                atomic_inc(&rbio->error);
1410        } else if (rbio->failb == -1) {
1411                /* second failure on this rbio */
1412                rbio->failb = failed;
1413                atomic_inc(&rbio->error);
1414        } else {
1415                ret = -EIO;
1416        }
1417out:
1418        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1419
1420        return ret;
1421}
1422
1423/*
1424 * helper to fail a stripe based on a physical disk
1425 * bio.
1426 */
1427static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1428                           struct bio *bio)
1429{
1430        int failed = find_bio_stripe(rbio, bio);
1431
1432        if (failed < 0)
1433                return -EIO;
1434
1435        return fail_rbio_index(rbio, failed);
1436}
1437
1438/*
1439 * this sets each page in the bio uptodate.  It should only be used on private
1440 * rbio pages, nothing that comes in from the higher layers
1441 */
1442static void set_bio_pages_uptodate(struct bio *bio)
1443{
1444        struct bio_vec *bvec;
1445        int i;
1446
1447        ASSERT(!bio_flagged(bio, BIO_CLONED));
1448
1449        bio_for_each_segment_all(bvec, bio, i)
1450                SetPageUptodate(bvec->bv_page);
1451}
1452
1453/*
1454 * end io for the read phase of the rmw cycle.  All the bios here are physical
1455 * stripe bios we've read from the disk so we can recalculate the parity of the
1456 * stripe.
1457 *
1458 * This will usually kick off finish_rmw once all the bios are read in, but it
1459 * may trigger parity reconstruction if we had any errors along the way
1460 */
1461static void raid_rmw_end_io(struct bio *bio)
1462{
1463        struct btrfs_raid_bio *rbio = bio->bi_private;
1464
1465        if (bio->bi_status)
1466                fail_bio_stripe(rbio, bio);
1467        else
1468                set_bio_pages_uptodate(bio);
1469
1470        bio_put(bio);
1471
1472        if (!atomic_dec_and_test(&rbio->stripes_pending))
1473                return;
1474
1475        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1476                goto cleanup;
1477
1478        /*
1479         * this will normally call finish_rmw to start our write
1480         * but if there are any failed stripes we'll reconstruct
1481         * from parity first
1482         */
1483        validate_rbio_for_rmw(rbio);
1484        return;
1485
1486cleanup:
1487
1488        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1489}
1490
1491/*
1492 * the stripe must be locked by the caller.  It will
1493 * unlock after all the writes are done
1494 */
1495static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1496{
1497        int bios_to_read = 0;
1498        struct bio_list bio_list;
1499        int ret;
1500        int pagenr;
1501        int stripe;
1502        struct bio *bio;
1503
1504        bio_list_init(&bio_list);
1505
1506        ret = alloc_rbio_pages(rbio);
1507        if (ret)
1508                goto cleanup;
1509
1510        index_rbio_pages(rbio);
1511
1512        atomic_set(&rbio->error, 0);
1513        /*
1514         * build a list of bios to read all the missing parts of this
1515         * stripe
1516         */
1517        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1518                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1519                        struct page *page;
1520                        /*
1521                         * we want to find all the pages missing from
1522                         * the rbio and read them from the disk.  If
1523                         * page_in_rbio finds a page in the bio list
1524                         * we don't need to read it off the stripe.
1525                         */
1526                        page = page_in_rbio(rbio, stripe, pagenr, 1);
1527                        if (page)
1528                                continue;
1529
1530                        page = rbio_stripe_page(rbio, stripe, pagenr);
1531                        /*
1532                         * the bio cache may have handed us an uptodate
1533                         * page.  If so, be happy and use it
1534                         */
1535                        if (PageUptodate(page))
1536                                continue;
1537
1538                        ret = rbio_add_io_page(rbio, &bio_list, page,
1539                                       stripe, pagenr, rbio->stripe_len);
1540                        if (ret)
1541                                goto cleanup;
1542                }
1543        }
1544
1545        bios_to_read = bio_list_size(&bio_list);
1546        if (!bios_to_read) {
1547                /*
1548                 * us; it means there is nothing left to read.
1549                 * us, it means there is nothing left to read.
1550                 * But if there are missing devices it may not be
1551                 * safe to do the full stripe write yet.
1552                 */
1553                goto finish;
1554        }
1555
1556        /*
1557         * the bbio may be freed once we submit the last bio.  Make sure
1558         * not to touch it after that
1559         */
1560        atomic_set(&rbio->stripes_pending, bios_to_read);
1561        while (1) {
1562                bio = bio_list_pop(&bio_list);
1563                if (!bio)
1564                        break;
1565
1566                bio->bi_private = rbio;
1567                bio->bi_end_io = raid_rmw_end_io;
1568                bio->bi_opf = REQ_OP_READ;
1569
1570                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1571
1572                submit_bio(bio);
1573        }
1574        /* the actual write will happen once the reads are done */
1575        return 0;
1576
1577cleanup:
1578        rbio_orig_end_io(rbio, BLK_STS_IOERR);
1579
1580        while ((bio = bio_list_pop(&bio_list)))
1581                bio_put(bio);
1582
1583        return -EIO;
1584
1585finish:
1586        validate_rbio_for_rmw(rbio);
1587        return 0;
1588}
1589
1590/*
1591 * if the upper layers pass in a full stripe, we thank them by only allocating
1592 * enough pages to hold the parity, and sending it all down quickly.
1593 */
1594static int full_stripe_write(struct btrfs_raid_bio *rbio)
1595{
1596        int ret;
1597
1598        ret = alloc_rbio_parity_pages(rbio);
1599        if (ret) {
1600                __free_raid_bio(rbio);
1601                return ret;
1602        }
1603
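            /*
             * lock_stripe_add() returns 0 when we now own the stripe lock;
             * a nonzero return means this rbio was queued behind (or merged
             * into) the current lock holder, which will process it later, so
             * there is nothing more for us to do here.
             */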
1604        ret = lock_stripe_add(rbio);
1605        if (ret == 0)
1606                finish_rmw(rbio);
1607        return 0;
1608}
1609
1610/*
1611 * partial stripe writes get handed over to async helpers.
1612 * We're really hoping to merge a few more writes into this
1613 * rbio before calculating new parity
1614 */
1615static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1616{
1617        int ret;
1618
1619        ret = lock_stripe_add(rbio);
1620        if (ret == 0)
1621                start_async_work(rbio, rmw_work);
1622        return 0;
1623}
1624
1625/*
1626 * sometimes while we were reading from the drive to
1627 * recalculate parity, enough new bios come in to create
1628 * a full stripe.  So we do a check here to see if we can
1629 * go directly to finish_rmw
1630 */
1631static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1632{
1633        /* head off into rmw land if we don't have a full stripe */
1634        if (!rbio_is_full(rbio))
1635                return partial_stripe_write(rbio);
1636        return full_stripe_write(rbio);
1637}
1638
1639/*
1640 * We use plugging callbacks to collect full stripes.
1641 * Any time we get a partial stripe write while plugged
1642 * we collect it into a list.  When the unplug comes down,
1643 * we sort the list by logical block number and merge
1644 * everything we can into the same rbios
1645 */
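    /*
     * Typical flow: raid56_parity_write() asks blk_check_plugged() for the
     * btrfs_plug_cb attached to the current task's plug, queues partial
     * stripe rbios on rbio_list, and btrfs_raid_unplug() later hands the
     * whole list to run_plug(), either directly or via the rmw workers when
     * the unplug came from schedule().
     */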
1646struct btrfs_plug_cb {
1647        struct blk_plug_cb cb;
1648        struct btrfs_fs_info *info;
1649        struct list_head rbio_list;
1650        struct btrfs_work work;
1651};
1652
1653/*
1654 * rbios on the plug list are sorted for easier merging.
1655 */
1656static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1657{
1658        struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1659                                                 plug_list);
1660        struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1661                                                 plug_list);
1662        u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1663        u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1664
1665        if (a_sector < b_sector)
1666                return -1;
1667        if (a_sector > b_sector)
1668                return 1;
1669        return 0;
1670}
1671
1672static void run_plug(struct btrfs_plug_cb *plug)
1673{
1674        struct btrfs_raid_bio *cur;
1675        struct btrfs_raid_bio *last = NULL;
1676
1677        /*
1678         * sort our plug list then try to merge
1679         * everything we can in hopes of creating full
1680         * stripes.
1681         */
1682        list_sort(NULL, &plug->rbio_list, plug_cmp);
1683        while (!list_empty(&plug->rbio_list)) {
1684                cur = list_entry(plug->rbio_list.next,
1685                                 struct btrfs_raid_bio, plug_list);
1686                list_del_init(&cur->plug_list);
1687
1688                if (rbio_is_full(cur)) {
1689                        int ret;
1690
1691                        /* we have a full stripe, send it down */
1692                        ret = full_stripe_write(cur);
1693                        BUG_ON(ret);
1694                        continue;
1695                }
1696                if (last) {
1697                        if (rbio_can_merge(last, cur)) {
1698                                merge_rbio(last, cur);
1699                                __free_raid_bio(cur);
1700                                continue;
1702                        }
1703                        __raid56_parity_write(last);
1704                }
1705                last = cur;
1706        }
1707        if (last) {
1708                __raid56_parity_write(last);
1709        }
1710        kfree(plug);
1711}
1712
1713/*
1714 * if the unplug comes from schedule, we have to push the
1715 * work off to a helper thread
1716 */
1717static void unplug_work(struct btrfs_work *work)
1718{
1719        struct btrfs_plug_cb *plug;
1720        plug = container_of(work, struct btrfs_plug_cb, work);
1721        run_plug(plug);
1722}
1723
1724static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1725{
1726        struct btrfs_plug_cb *plug;
1727        plug = container_of(cb, struct btrfs_plug_cb, cb);
1728
1729        if (from_schedule) {
1730                btrfs_init_work(&plug->work, btrfs_rmw_helper,
1731                                unplug_work, NULL, NULL);
1732                btrfs_queue_work(plug->info->rmw_workers,
1733                                 &plug->work);
1734                return;
1735        }
1736        run_plug(plug);
1737}
1738
1739/*
1740 * our main entry point for writes from the rest of the FS.
1741 */
1742int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
1743                        struct btrfs_bio *bbio, u64 stripe_len)
1744{
1745        struct btrfs_raid_bio *rbio;
1746        struct btrfs_plug_cb *plug = NULL;
1747        struct blk_plug_cb *cb;
1748        int ret;
1749
1750        rbio = alloc_rbio(fs_info, bbio, stripe_len);
1751        if (IS_ERR(rbio)) {
1752                btrfs_put_bbio(bbio);
1753                return PTR_ERR(rbio);
1754        }
1755        bio_list_add(&rbio->bio_list, bio);
1756        rbio->bio_list_bytes = bio->bi_iter.bi_size;
1757        rbio->operation = BTRFS_RBIO_WRITE;
1758
1759        btrfs_bio_counter_inc_noblocked(fs_info);
1760        rbio->generic_bio_cnt = 1;
1761
1762        /*
1763         * don't plug on full rbios, just get them out the door
1764         * as quickly as we can
1765         */
1766        if (rbio_is_full(rbio)) {
1767                ret = full_stripe_write(rbio);
1768                if (ret)
1769                        btrfs_bio_counter_dec(fs_info);
1770                return ret;
1771        }
1772
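            /*
             * blk_check_plugged() only returns a callback if this task has an
             * active blk_plug (allocating and registering our btrfs_plug_cb
             * on first use); when there is no plug we skip the batching below
             * and write the rbio out directly.
             */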
1773        cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1774        if (cb) {
1775                plug = container_of(cb, struct btrfs_plug_cb, cb);
1776                if (!plug->info) {
1777                        plug->info = fs_info;
1778                        INIT_LIST_HEAD(&plug->rbio_list);
1779                }
1780                list_add_tail(&rbio->plug_list, &plug->rbio_list);
1781                ret = 0;
1782        } else {
1783                ret = __raid56_parity_write(rbio);
1784                if (ret)
1785                        btrfs_bio_counter_dec(fs_info);
1786        }
1787        return ret;
1788}
1789
1790/*
1791 * all parity reconstruction happens here.  We've read in everything
1792 * we can find from the drives and this does the heavy lifting of
1793 * sorting the good from the bad.
1794 */
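    /*
     * For each page column we kmap one page per stripe, pick a recovery
     * strategy (raid6_datap_recov()/raid6_2data_recov() for two failures on
     * RAID6, a parity xor for the single-failure/RAID5 case), and, if this
     * rebuild is part of an rmw, mark the rebuilt stripe pages uptodate so
     * finish_rmw() can trust them.
     */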
1795static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1796{
1797        int pagenr, stripe;
1798        void **pointers;
1799        int faila = -1, failb = -1;
1800        struct page *page;
1801        blk_status_t err;
1802        int i;
1803
1804        pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1805        if (!pointers) {
1806                err = BLK_STS_RESOURCE;
1807                goto cleanup_io;
1808        }
1809
1810        faila = rbio->faila;
1811        failb = rbio->failb;
1812
1813        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1814            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1815                spin_lock_irq(&rbio->bio_list_lock);
1816                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1817                spin_unlock_irq(&rbio->bio_list_lock);
1818        }
1819
1820        index_rbio_pages(rbio);
1821
1822        for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1823                /*
1824                 * Now we just use the bitmap to mark the horizontal stripes in
1825                 * which we have data when doing parity scrub.
1826                 */
1827                if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1828                    !test_bit(pagenr, rbio->dbitmap))
1829                        continue;
1830
1831                /* setup our array of pointers with pages
1832                 * from each stripe
1833                 */
1834                for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1835                        /*
1836                         * if we're rebuilding a read, we have to use
1837                         * pages from the bio list
1838                         */
1839                        if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1840                             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1841                            (stripe == faila || stripe == failb)) {
1842                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1843                        } else {
1844                                page = rbio_stripe_page(rbio, stripe, pagenr);
1845                        }
1846                        pointers[stripe] = kmap(page);
1847                }
1848
1849                /* all raid6 handling here */
1850                if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1851                        /*
1852                         * single failure, rebuild from parity raid5
1853                         * style
1854                         */
1855                        if (failb < 0) {
1856                                if (faila == rbio->nr_data) {
1857                                        /*
1858                                         * Just the P stripe has failed, without
1859                                         * a bad data or Q stripe.
1860                                         * TODO, we should redo the xor here.
1861                                         */
1862                                        err = BLK_STS_IOERR;
1863                                        goto cleanup;
1864                                }
1865                                /*
1866                                 * a single failure in raid6 is rebuilt
1867                                 * in the pstripe code below
1868                                 */
1869                                goto pstripe;
1870                        }
1871
1872                        /* make sure our ps and qs are in order */
1873                        if (faila > failb)
1874                                swap(faila, failb);
1878
1879                        /* if the q stripe has failed, do a pstripe reconstruction
1880                         * from the xors.
1881                         * If both the q stripe and the P stripe have failed, we're
1882                         * here due to a crc mismatch and we can't give them the
1883                         * data they want
1884                         */
1885                        if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1886                                if (rbio->bbio->raid_map[faila] ==
1887                                    RAID5_P_STRIPE) {
1888                                        err = BLK_STS_IOERR;
1889                                        goto cleanup;
1890                                }
1891                                /*
1892                                 * otherwise we have one bad data stripe and
1893                                 * a good P stripe.  raid5!
1894                                 */
1895                                goto pstripe;
1896                        }
1897
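                            /*
                             * At this point failb is either the P stripe or a
                             * data stripe (the failed-Q case was handled above).
                             * If it is P, one data stripe plus P are bad, so
                             * raid6_datap_recov() rebuilds the data from Q and
                             * regenerates P; otherwise two data stripes are bad
                             * and raid6_2data_recov() rebuilds both from P + Q.
                             */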
1898                        if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1899                                raid6_datap_recov(rbio->real_stripes,
1900                                                  PAGE_SIZE, faila, pointers);
1901                        } else {
1902                                raid6_2data_recov(rbio->real_stripes,
1903                                                  PAGE_SIZE, faila, failb,
1904                                                  pointers);
1905                        }
1906                } else {
1907                        void *p;
1908
1909                        /* rebuild from P stripe here (raid5 or raid6) */
1910                        BUG_ON(failb != -1);
1911pstripe:
1912                        /* Copy parity block into failed block to start with */
1913                        copy_page(pointers[faila], pointers[rbio->nr_data]);
1914
1915                        /* rearrange the pointer array */
1916                        p = pointers[faila];
1917                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1918                                pointers[stripe] = pointers[stripe + 1];
1919                        pointers[rbio->nr_data - 1] = p;
1920
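                            /*
                             * The failed slot now holds a copy of P and sits at
                             * the end of the pointer array, so run_xor() below
                             * (which xors the first nr_data - 1 pointers into
                             * the last one) leaves it holding P xor all the
                             * surviving data blocks, i.e. the missing data.
                             */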
1921                        /* xor in the rest */
1922                        run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1923                }
1924                /* if we're doing this rebuild as part of an rmw, go through
1925                 * and set all of our private rbio pages in the
1926                 * failed stripes as uptodate.  This way finish_rmw will
1927                 * know they can be trusted.  If this was a read reconstruction,
1928                 * other endio functions will fiddle the uptodate bits
1929                 */
1930                if (rbio->operation == BTRFS_RBIO_WRITE) {
1931                        for (i = 0;  i < rbio->stripe_npages; i++) {
1932                                if (faila != -1) {
1933                                        page = rbio_stripe_page(rbio, faila, i);
1934                                        SetPageUptodate(page);
1935                                }
1936                                if (failb != -1) {
1937                                        page = rbio_stripe_page(rbio, failb, i);
1938                                        SetPageUptodate(page);
1939                                }
1940                        }
1941                }
1942                for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1943                        /*
1944                         * if we're rebuilding a read, we have to use
1945                         * pages from the bio list
1946                         */
1947                        if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1948                             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1949                            (stripe == faila || stripe == failb)) {
1950                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1951                        } else {
1952                                page = rbio_stripe_page(rbio, stripe, pagenr);
1953                        }
1954                        kunmap(page);
1955                }
1956        }
1957
1958        err = BLK_STS_OK;
1959cleanup:
1960        kfree(pointers);
1961
1962cleanup_io:
1963        /*
1964         * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1965         * valid rbio which is consistent with on-disk content, thus such a
1966         * valid rbio can be cached to avoid further disk reads.
1967         */
1968        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1969            rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1970                /*
1971                 * - In case of two failures, where rbio->failb != -1:
1972                 *
1973                 *   Do not cache this rbio since the above read reconstruction
1974                 *   (raid6_datap_recov() or raid6_2data_recov()) may have
1975                 *   changed some content of stripes which are not identical to
1976                 *   on-disk content any more, otherwise, a later write/recover
1977                 *   may steal stripe_pages from this rbio and end up with
1978                 *   corruptions or rebuild failures.
1979                 *
1980                 * - In case of single failure, where rbio->failb == -1:
1981                 *
1982                 *   Cache this rbio iff the above read reconstruction is
1983                 *   executed without problems.
1984                 */
1985                if (err == BLK_STS_OK && rbio->failb < 0)
1986                        cache_rbio_pages(rbio);
1987                else
1988                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1989
1990                rbio_orig_end_io(rbio, err);
1991        } else if (err == BLK_STS_OK) {
1992                rbio->faila = -1;
1993                rbio->failb = -1;
1994
1995                if (rbio->operation == BTRFS_RBIO_WRITE)
1996                        finish_rmw(rbio);
1997                else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1998                        finish_parity_scrub(rbio, 0);
1999                else
2000                        BUG();
2001        } else {
2002                rbio_orig_end_io(rbio, err);
2003        }
2004}
2005
2006/*
2007 * This is called only for stripes we've read from disk to
2008 * reconstruct the parity.
2009 */
2010static void raid_recover_end_io(struct bio *bio)
2011{
2012        struct btrfs_raid_bio *rbio = bio->bi_private;
2013
2014        /*
2015         * we only read stripe pages off the disk, set them
2016         * up to date if there were no errors
2017         */
2018        if (bio->bi_status)
2019                fail_bio_stripe(rbio, bio);
2020        else
2021                set_bio_pages_uptodate(bio);
2022        bio_put(bio);
2023
2024        if (!atomic_dec_and_test(&rbio->stripes_pending))
2025                return;
2026
2027        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2028                rbio_orig_end_io(rbio, BLK_STS_IOERR);
2029        else
2030                __raid_recover_end_io(rbio);
2031}
2032
2033/*
2034 * reads everything we need off the disk to reconstruct
2035 * the parity. endio handlers trigger final reconstruction
2036 * when the IO is done.
2037 *
2038 * This is used both for reads from the higher layers and for
2039 * parity construction required to finish a rmw cycle.
2040 */
2041static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2042{
2043        int bios_to_read = 0;
2044        struct bio_list bio_list;
2045        int ret;
2046        int pagenr;
2047        int stripe;
2048        struct bio *bio;
2049
2050        bio_list_init(&bio_list);
2051
2052        ret = alloc_rbio_pages(rbio);
2053        if (ret)
2054                goto cleanup;
2055
2056        atomic_set(&rbio->error, 0);
2057
2058        /*
2059         * read everything that hasn't failed.  Thanks to the
2060         * stripe cache, it is possible that some or all of these
2061         * pages are going to be uptodate.
2062         */
2063        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2064                if (rbio->faila == stripe || rbio->failb == stripe) {
2065                        atomic_inc(&rbio->error);
2066                        continue;
2067                }
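                    /*
                     * Counting the known-bad stripes as errors up front lets
                     * the no-bios-to-read case below tell "everything was
                     * cached/uptodate" (error <= max_errors, reconstruct now)
                     * apart from "too many devices are gone" (give up).
                     */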
2068
2069                for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2070                        struct page *p;
2071
2072                        /*
2073                         * the rmw code may have already read this
2074                         * page in
2075                         */
2076                        p = rbio_stripe_page(rbio, stripe, pagenr);
2077                        if (PageUptodate(p))
2078                                continue;
2079
2080                        ret = rbio_add_io_page(rbio, &bio_list,
2081                                       rbio_stripe_page(rbio, stripe, pagenr),
2082                                       stripe, pagenr, rbio->stripe_len);
2083                        if (ret < 0)
2084                                goto cleanup;
2085                }
2086        }
2087
2088        bios_to_read = bio_list_size(&bio_list);
2089        if (!bios_to_read) {
2090                /*
2091                 * we might have no bios to read just because the pages
2092                 * were up to date, or we might have no bios to read because
2093                 * the devices were gone.
2094                 */
2095                if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2096                        __raid_recover_end_io(rbio);
2097                        goto out;
2098                } else {
2099                        goto cleanup;
2100                }
2101        }
2102
2103        /*
2104         * the bbio may be freed once we submit the last bio.  Make sure
2105         * not to touch it after that
2106         */
2107        atomic_set(&rbio->stripes_pending, bios_to_read);
2108        while (1) {
2109                bio = bio_list_pop(&bio_list);
2110                if (!bio)
2111                        break;
2112
2113                bio->bi_private = rbio;
2114                bio->bi_end_io = raid_recover_end_io;
2115                bio->bi_opf = REQ_OP_READ;
2116
2117                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2118
2119                submit_bio(bio);
2120        }
2121out:
2122        return 0;
2123
2124cleanup:
2125        if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2126            rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2127                rbio_orig_end_io(rbio, BLK_STS_IOERR);
2128
2129        while ((bio = bio_list_pop(&bio_list)))
2130                bio_put(bio);
2131
2132        return -EIO;
2133}
2134
2135/*
2136 * the main entry point for reads from the higher layers.  This
2137 * is really only called when the normal read path had a failure,
2138 * so we assume the bio they send down corresponds to a failed part
2139 * of the drive.
2140 */
2141int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
2142                          struct btrfs_bio *bbio, u64 stripe_len,
2143                          int mirror_num, int generic_io)
2144{
2145        struct btrfs_raid_bio *rbio;
2146        int ret;
2147
2148        if (generic_io) {
2149                ASSERT(bbio->mirror_num == mirror_num);
2150                btrfs_io_bio(bio)->mirror_num = mirror_num;
2151        }
2152
2153        rbio = alloc_rbio(fs_info, bbio, stripe_len);
2154        if (IS_ERR(rbio)) {
2155                if (generic_io)
2156                        btrfs_put_bbio(bbio);
2157                return PTR_ERR(rbio);
2158        }
2159
2160        rbio->operation = BTRFS_RBIO_READ_REBUILD;
2161        bio_list_add(&rbio->bio_list, bio);
2162        rbio->bio_list_bytes = bio->bi_iter.bi_size;
2163
2164        rbio->faila = find_logical_bio_stripe(rbio, bio);
2165        if (rbio->faila == -1) {
2166                btrfs_warn(fs_info,
2167        "%s could not find the bad stripe in raid56, so we cannot recover anything more (bio has logical %llu len %llu, bbio has map_type %llu)",
2168                           __func__, (u64)bio->bi_iter.bi_sector << 9,
2169                           (u64)bio->bi_iter.bi_size, bbio->map_type);
2170                if (generic_io)
2171                        btrfs_put_bbio(bbio);
2172                kfree(rbio);
2173                return -EIO;
2174        }
2175
2176        if (generic_io) {
2177                btrfs_bio_counter_inc_noblocked(fs_info);
2178                rbio->generic_bio_cnt = 1;
2179        } else {
2180                btrfs_get_bbio(bbio);
2181        }
2182
2183        /*
2184         * Loop retry:
2185         * for 'mirror_num == 2', reconstruct from all other stripes.
2186         * for 'mirror_num > 2', select a stripe to fail on every retry.
2187         */
2188        if (mirror_num > 2) {
2189                /*
2190                 * 'mirror_num == 3' is to fail the p stripe and
2191                 * reconstruct from the q stripe.  'mirror_num > 3' is to
2192                 * fail a data stripe and reconstruct from the p+q stripes.
2193                 */
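                    /*
                     * A worked example (hypothetical 4-device RAID6: data
                     * stripes at index 0-1, P at 2, Q at 3, real_stripes == 4):
                     * mirror_num 3 gives failb = 4 - 2 = 2, the P stripe;
                     * mirror_num 4 gives failb = 4 - 3 = 1, a data stripe.
                     * The failb-- below skips over faila so the two chosen
                     * failure slots never point at the same stripe.
                     */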
2194                rbio->failb = rbio->real_stripes - (mirror_num - 1);
2195                ASSERT(rbio->failb > 0);
2196                if (rbio->failb <= rbio->faila)
2197                        rbio->failb--;
2198        }
2199
2200        ret = lock_stripe_add(rbio);
2201
2202        /*
2203         * __raid56_parity_recover will end the bio with
2204         * any errors it hits.  We don't want to return
2205         * its error value up the stack because our caller
2206         * will end up calling bio_endio with any nonzero
2207         * return
2208         */
2209        if (ret == 0)
2210                __raid56_parity_recover(rbio);
2211        /*
2212         * our rbio has been added to the list of
2213         * rbios that will be handled after the
2214         * current lock owner is done
2215         */
2216        return 0;
2218}
2219
2220static void rmw_work(struct btrfs_work *work)
2221{
2222        struct btrfs_raid_bio *rbio;
2223
2224        rbio = container_of(work, struct btrfs_raid_bio, work);
2225        raid56_rmw_stripe(rbio);
2226}
2227
2228static void read_rebuild_work(struct btrfs_work *work)
2229{
2230        struct btrfs_raid_bio *rbio;
2231
2232        rbio = container_of(work, struct btrfs_raid_bio, work);
2233        __raid56_parity_recover(rbio);
2234}
2235
2236/*
2237 * The following code is used to scrub/replace the parity stripe
2238 *
2239 * Caller must have already increased bio_counter for getting @bbio.
2240 *
2241 * Note: We need to make sure that all the pages added to the scrub/replace
2242 * raid bio are correct and will not change during the scrub/replace, i.e.
2243 * those pages just hold metadata or file data protected by a checksum.
2244 */
2245
2246struct btrfs_raid_bio *
2247raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2248                               struct btrfs_bio *bbio, u64 stripe_len,
2249                               struct btrfs_device *scrub_dev,
2250                               unsigned long *dbitmap, int stripe_nsectors)
2251{
2252        struct btrfs_raid_bio *rbio;
2253        int i;
2254
2255        rbio = alloc_rbio(fs_info, bbio, stripe_len);
2256        if (IS_ERR(rbio))
2257                return NULL;
2258        bio_list_add(&rbio->bio_list, bio);
2259        /*
2260         * This is a special bio which is used to hold the completion handler
2261         * and make the scrub rbio similar to the other types
2262         */
2263        ASSERT(!bio->bi_iter.bi_size);
2264        rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2265
2266        /*
2267         * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2268         * to the end position, so this search can start from the first parity
2269         * stripe.
2270         */
2271        for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2272                if (bbio->stripes[i].dev == scrub_dev) {
2273                        rbio->scrubp = i;
2274                        break;
2275                }
2276        }
2277        ASSERT(i < rbio->real_stripes);
2278
2279        /* For now we only support sectorsize equal to the page size */
2280        ASSERT(fs_info->sectorsize == PAGE_SIZE);
2281        ASSERT(rbio->stripe_npages == stripe_nsectors);
2282        bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2283
2284        /*
2285         * We have already increased bio_counter when getting the bbio; record it
2286         * so we can free it at rbio_orig_end_io().
2287         */
2288        rbio->generic_bio_cnt = 1;
2289
2290        return rbio;
2291}
2292
2293/* Used for both parity scrub and missing. */
2294void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2295                            u64 logical)
2296{
2297        int stripe_offset;
2298        int index;
2299
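            /*
             * @logical is the raid-map logical address of @page; its byte
             * offset from raid_map[0] (the start of the data portion of the
             * full stripe) divided by PAGE_SIZE gives the slot in
             * rbio->bio_pages[].
             */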
2300        ASSERT(logical >= rbio->bbio->raid_map[0]);
2301        ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2302                                rbio->stripe_len * rbio->nr_data);
2303        stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2304        index = stripe_offset >> PAGE_SHIFT;
2305        rbio->bio_pages[index] = page;
2306}
2307
2308/*
2309 * We only scrub the parity for horizontal stripes where we have correct data,
2310 * so we don't need to allocate pages for every stripe.
2311 */
2312static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2313{
2314        int i;
2315        int bit;
2316        int index;
2317        struct page *page;
2318
2319        for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2320                for (i = 0; i < rbio->real_stripes; i++) {
2321                        index = i * rbio->stripe_npages + bit;
2322                        if (rbio->stripe_pages[index])
2323                                continue;
2324
2325                        page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2326                        if (!page)
2327                                return -ENOMEM;
2328                        rbio->stripe_pages[index] = page;
2329                }
2330        }
2331        return 0;
2332}
2333
2334static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2335                                         int need_check)
2336{
2337        struct btrfs_bio *bbio = rbio->bbio;
2338        void **pointers = rbio->finish_pointers;
2339        unsigned long *pbitmap = rbio->finish_pbitmap;
2340        int nr_data = rbio->nr_data;
2341        int stripe;
2342        int pagenr;
2343        int p_stripe = -1;
2344        int q_stripe = -1;
2345        struct page *p_page = NULL;
2346        struct page *q_page = NULL;
2347        struct bio_list bio_list;
2348        struct bio *bio;
2349        int is_replace = 0;
2350        int ret;
2351
2352        bio_list_init(&bio_list);
2353
2354        if (rbio->real_stripes - rbio->nr_data == 1) {
2355                p_stripe = rbio->real_stripes - 1;
2356        } else if (rbio->real_stripes - rbio->nr_data == 2) {
2357                p_stripe = rbio->real_stripes - 2;
2358                q_stripe = rbio->real_stripes - 1;
2359        } else {
2360                BUG();
2361        }
2362
2363        if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2364                is_replace = 1;
2365                bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2366        }
2367
2368        /*
2369         * The higher layers (the scrubber) are unlikely to use
2370         * this area of the disk again soon, so there is no point
2371         * in caching it.
2372         */
2373        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2374
2375        if (!need_check)
2376                goto writeback;
2377
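            /*
             * p_page (and q_page for RAID6) are scratch pages: the loop below
             * regenerates parity from the data into them and memcmp()s the
             * result against the parity that is on disk.  Only pages that
             * differ keep their dbitmap bit and get written back.
             */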
2378        p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2379        if (!p_page)
2380                goto cleanup;
2381        SetPageUptodate(p_page);
2382
2383        if (q_stripe != -1) {
2384                q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2385                if (!q_page) {
2386                        __free_page(p_page);
2387                        goto cleanup;
2388                }
2389                SetPageUptodate(q_page);
2390        }
2391
2392        atomic_set(&rbio->error, 0);
2393
2394        for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2395                struct page *p;
2396                void *parity;
2397                /* first collect one page from each data stripe */
2398                for (stripe = 0; stripe < nr_data; stripe++) {
2399                        p = page_in_rbio(rbio, stripe, pagenr, 0);
2400                        pointers[stripe] = kmap(p);
2401                }
2402
2403                /* then add the parity stripe */
2404                pointers[stripe++] = kmap(p_page);
2405
2406                if (q_stripe != -1) {
2408                        /*
2409                         * raid6, add the qstripe and call the
2410                         * library function to fill in our p/q
2411                         */
2412                        pointers[stripe++] = kmap(q_page);
2413
2414                        raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2415                                                pointers);
2416                } else {
2417                        /* raid5 */
2418                        copy_page(pointers[nr_data], pointers[0]);
2419                        run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2420                }
2421
2422                /* Check scrubbing parity and repair it */
2423                p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2424                parity = kmap(p);
2425                if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2426                        copy_page(parity, pointers[rbio->scrubp]);
2427                else
2428                        /* Parity is right, no need to write it back */
2429                        bitmap_clear(rbio->dbitmap, pagenr, 1);
2430                kunmap(p);
2431
2432                for (stripe = 0; stripe < rbio->real_stripes; stripe++)
2433                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2434        }
2435
2436        __free_page(p_page);
2437        if (q_page)
2438                __free_page(q_page);
2439
2440writeback:
2441        /*
2442         * time to start writing.  Make bios for everything from the
2443         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2444         * everything else.
2445         */
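            /*
             * Only pages still set in dbitmap (those whose regenerated parity
             * differed, or all of them when need_check was 0) are written back
             * to the scrubbed device; in the replace case they also go to the
             * replacement target via tgtdev_map.  If nothing is left to write,
             * the rbio completes successfully without issuing any writes.
             */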
2446        for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2447                struct page *page;
2448
2449                page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2450                ret = rbio_add_io_page(rbio, &bio_list,
2451                               page, rbio->scrubp, pagenr, rbio->stripe_len);
2452                if (ret)
2453                        goto cleanup;
2454        }
2455
2456        if (!is_replace)
2457                goto submit_write;
2458
2459        for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2460                struct page *page;
2461
2462                page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2463                ret = rbio_add_io_page(rbio, &bio_list, page,
2464                                       bbio->tgtdev_map[rbio->scrubp],
2465                                       pagenr, rbio->stripe_len);
2466                if (ret)
2467                        goto cleanup;
2468        }
2469
2470submit_write:
2471        nr_data = bio_list_size(&bio_list);
2472        if (!nr_data) {
2473                /* Every parity page is correct, nothing to write back */
2474                rbio_orig_end_io(rbio, BLK_STS_OK);
2475                return;
2476        }
2477
2478        atomic_set(&rbio->stripes_pending, nr_data);
2479
2480        while (1) {
2481                bio = bio_list_pop(&bio_list);
2482                if (!bio)
2483                        break;
2484
2485                bio->bi_private = rbio;
2486                bio->bi_end_io = raid_write_end_io;
2487                bio->bi_opf = REQ_OP_WRITE;
2488
2489                submit_bio(bio);
2490        }
2491        return;
2492
2493cleanup:
2494        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2495
2496        while ((bio = bio_list_pop(&bio_list)))
2497                bio_put(bio);
2498}
2499
2500static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2501{
2502        if (stripe >= 0 && stripe < rbio->nr_data)
2503                return 1;
2504        return 0;
2505}
2506
2507/*
2508 * While we're doing the parity check and repair, we could have errors
2509 * in reading pages off the disk.  This checks for errors and if we're
2510 * not able to read the page it'll trigger parity reconstruction.  The
2511 * parity scrub will be finished after we've reconstructed the failed
2512 * stripes
2513 */
2514static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2515{
2516        if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2517                goto cleanup;
2518
2519        if (rbio->faila >= 0 || rbio->failb >= 0) {
2520                int dfail = 0, failp = -1;
2521
2522                if (is_data_stripe(rbio, rbio->faila))
2523                        dfail++;
2524                else if (is_parity_stripe(rbio->faila))
2525                        failp = rbio->faila;
2526
2527                if (is_data_stripe(rbio, rbio->failb))
2528                        dfail++;
2529                else if (is_parity_stripe(rbio->failb))
2530                        failp = rbio->failb;
2531
2532                /*
2533                 * Because we cannot use the parity being scrubbed to
2534                 * repair the data, our repair capability is reduced
2535                 * (in the case of RAID5 we cannot repair anything).
2536                 */
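                    /*
                     * For example, RAID6 has max_errors == 2, but the parity
                     * being scrubbed cannot be used for repair, so at most one
                     * failed data stripe can be tolerated here; RAID5 has
                     * max_errors == 1, so dfail must be 0 and no data stripe
                     * can be repaired at all.
                     */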
2537                if (dfail > rbio->bbio->max_errors - 1)
2538                        goto cleanup;
2539
2540                /*
2541                 * If all the data is good, then only the parity is bad;
2542                 * just repair the parity.
2543                 */
2544                if (dfail == 0) {
2545                        finish_parity_scrub(rbio, 0);
2546                        return;
2547                }
2548
2549                /*
2550                 * At this point we have one corrupted data stripe and one
2551                 * corrupted parity on RAID6.  If the corrupted parity is
2552                 * the one being scrubbed, we can luckily use the other one
2553                 * to repair the data; otherwise we cannot repair the data stripe.
2554                 */
2555                if (failp != rbio->scrubp)
2556                        goto cleanup;
2557
2558                __raid_recover_end_io(rbio);
2559        } else {
2560                finish_parity_scrub(rbio, 1);
2561        }
2562        return;
2563
2564cleanup:
2565        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2566}
2567
2568/*
2569 * end io for the read phase of the parity scrub cycle.  All the bios here are
2570 * physical stripe bios we've read from the disk so we can recalculate the
2571 * parity of the stripe.
2572 *
2573 * This will usually kick off finish_parity_scrub once all the bios are read
2574 * in, but it may trigger parity reconstruction if we had any errors along the way
2575 */
2576static void raid56_parity_scrub_end_io(struct bio *bio)
2577{
2578        struct btrfs_raid_bio *rbio = bio->bi_private;
2579
2580        if (bio->bi_status)
2581                fail_bio_stripe(rbio, bio);
2582        else
2583                set_bio_pages_uptodate(bio);
2584
2585        bio_put(bio);
2586
2587        if (!atomic_dec_and_test(&rbio->stripes_pending))
2588                return;
2589
2590        /*
2591         * this will normally call finish_parity_scrub to start our write
2592         * but if there are any failed stripes we'll reconstruct
2593         * from parity first
2594         */
2595        validate_rbio_for_parity_scrub(rbio);
2596}
2597
2598static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2599{
2600        int bios_to_read = 0;
2601        struct bio_list bio_list;
2602        int ret;
2603        int pagenr;
2604        int stripe;
2605        struct bio *bio;
2606
2607        bio_list_init(&bio_list);
2608
2609        ret = alloc_rbio_essential_pages(rbio);
2610        if (ret)
2611                goto cleanup;
2612
2613        atomic_set(&rbio->error, 0);
2614        /*
2615         * build a list of bios to read all the missing parts of this
2616         * stripe
2617         */
2618        for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2619                for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2620                        struct page *page;
2621                        /*
2622                         * we want to find all the pages missing from
2623                         * the rbio and read them from the disk.  If
2624                         * page_in_rbio finds a page in the bio list
2625                         * we don't need to read it off the stripe.
2626                         */
2627                        page = page_in_rbio(rbio, stripe, pagenr, 1);
2628                        if (page)
2629                                continue;
2630
2631                        page = rbio_stripe_page(rbio, stripe, pagenr);
2632                        /*
2633                         * the bio cache may have handed us an uptodate
2634                         * page.  If so, be happy and use it
2635                         */
2636                        if (PageUptodate(page))
2637                                continue;
2638
2639                        ret = rbio_add_io_page(rbio, &bio_list, page,
2640                                       stripe, pagenr, rbio->stripe_len);
2641                        if (ret)
2642                                goto cleanup;
2643                }
2644        }
2645
2646        bios_to_read = bio_list_size(&bio_list);
2647        if (!bios_to_read) {
2648                /*
2649                 * this can happen if others have merged with
2650                 * us, it means there is nothing left to read.
2651                 * But if there are missing devices it may not be
2652                 * safe to do the full stripe write yet.
2653                 */
2654                goto finish;
2655        }
2656
2657        /*
2658         * the bbio may be freed once we submit the last bio.  Make sure
2659         * not to touch it after that
2660         */
2661        atomic_set(&rbio->stripes_pending, bios_to_read);
2662        while (1) {
2663                bio = bio_list_pop(&bio_list);
2664                if (!bio)
2665                        break;
2666
2667                bio->bi_private = rbio;
2668                bio->bi_end_io = raid56_parity_scrub_end_io;
2669                bio->bi_opf = REQ_OP_READ;
2670
2671                btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2672
2673                submit_bio(bio);
2674        }
2675        /* the actual write will happen once the reads are done */
2676        return;
2677
2678cleanup:
2679        rbio_orig_end_io(rbio, BLK_STS_IOERR);
2680
2681        while ((bio = bio_list_pop(&bio_list)))
2682                bio_put(bio);
2683
2684        return;
2685
2686finish:
2687        validate_rbio_for_parity_scrub(rbio);
2688}
2689
2690static void scrub_parity_work(struct btrfs_work *work)
2691{
2692        struct btrfs_raid_bio *rbio;
2693
2694        rbio = container_of(work, struct btrfs_raid_bio, work);
2695        raid56_parity_scrub_stripe(rbio);
2696}
2697
2698void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2699{
2700        if (!lock_stripe_add(rbio))
2701                start_async_work(rbio, scrub_parity_work);
2702}
2703
2704/* The following code is used for dev replace of a missing RAID 5/6 device. */
2705
2706struct btrfs_raid_bio *
2707raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2708                          struct btrfs_bio *bbio, u64 length)
2709{
2710        struct btrfs_raid_bio *rbio;
2711
2712        rbio = alloc_rbio(fs_info, bbio, length);
2713        if (IS_ERR(rbio))
2714                return NULL;
2715
2716        rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2717        bio_list_add(&rbio->bio_list, bio);
2718        /*
2719         * This is a special bio which is used to hold the completion handler
2720         * and make the scrub rbio similar to the other types
2721         */
2722        ASSERT(!bio->bi_iter.bi_size);
2723
2724        rbio->faila = find_logical_bio_stripe(rbio, bio);
2725        if (rbio->faila == -1) {
2726                BUG();
2727                kfree(rbio);
2728                return NULL;
2729        }
2730
2731        /*
2732         * When we get the bbio we have already increased bio_counter; record it
2733         * so we can free it at rbio_orig_end_io()
2734         */
2735        rbio->generic_bio_cnt = 1;
2736
2737        return rbio;
2738}
2739
2740void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2741{
2742        if (!lock_stripe_add(rbio))
2743                start_async_work(rbio, read_rebuild_work);
2744}
2745