linux/fs/btrfs/raid56.c
   1/*
   2 * Copyright (C) 2012 Fusion-io  All rights reserved.
   3 * Copyright (C) 2012 Intel Corp. All rights reserved.
   4 *
   5 * This program is free software; you can redistribute it and/or
   6 * modify it under the terms of the GNU General Public
   7 * License v2 as published by the Free Software Foundation.
   8 *
   9 * This program is distributed in the hope that it will be useful,
  10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12 * General Public License for more details.
  13 *
  14 * You should have received a copy of the GNU General Public
  15 * License along with this program; if not, write to the
  16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17 * Boston, MA 021110-1307, USA.
  18 */
  19#include <linux/sched.h>
  20#include <linux/wait.h>
  21#include <linux/bio.h>
  22#include <linux/slab.h>
  23#include <linux/buffer_head.h>
  24#include <linux/blkdev.h>
  25#include <linux/random.h>
  26#include <linux/iocontext.h>
  27#include <linux/capability.h>
  28#include <linux/ratelimit.h>
  29#include <linux/kthread.h>
  30#include <linux/raid/pq.h>
  31#include <linux/hash.h>
  32#include <linux/list_sort.h>
  33#include <linux/raid/xor.h>
  34#include <linux/vmalloc.h>
  35#include <asm/div64.h>
  36#include "ctree.h"
  37#include "extent_map.h"
  38#include "disk-io.h"
  39#include "transaction.h"
  40#include "print-tree.h"
  41#include "volumes.h"
  42#include "raid56.h"
  43#include "async-thread.h"
  44#include "check-integrity.h"
  45#include "rcu-string.h"
  46
  47/* set when additional merges to this rbio are not allowed */
  48#define RBIO_RMW_LOCKED_BIT     1
  49
  50/*
  51 * set when this rbio is sitting in the hash, but it is just a cache
  52 * of past RMW
  53 */
  54#define RBIO_CACHE_BIT          2
  55
  56/*
  57 * set when it is safe to trust the stripe_pages for caching
  58 */
  59#define RBIO_CACHE_READY_BIT    3
  60
  61
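     /*
      * max number of rbios kept on the stripe cache LRU; once cache_rbio()
      * pushes the count above this, the oldest entry is pruned
      */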
  62#define RBIO_CACHE_SIZE 1024
  63
  64struct btrfs_raid_bio {
  65        struct btrfs_fs_info *fs_info;
  66        struct btrfs_bio *bbio;
  67
  68        /*
  69         * logical block numbers for the start of each stripe
  70         * The last one or two are p/q.  These are sorted,
  71         * so raid_map[0] is the start of our full stripe
  72         */
  73        u64 *raid_map;
  74
  75        /* while we're doing rmw on a stripe
  76         * we put it into a hash table so we can
  77         * lock the stripe and merge more rbios
  78         * into it.
  79         */
  80        struct list_head hash_list;
  81
  82        /*
  83         * LRU list for the stripe cache
  84         */
  85        struct list_head stripe_cache;
  86
  87        /*
  88         * for scheduling work in the helper threads
  89         */
  90        struct btrfs_work work;
  91
  92        /*
  93         * bio list and bio_list_lock are used
  94         * to add more bios into the stripe
  95         * in hopes of avoiding the full rmw
  96         */
  97        struct bio_list bio_list;
  98        spinlock_t bio_list_lock;
  99
 100        /* also protected by the bio_list_lock, the
 101         * plug list is used by the plugging code
 102         * to collect partial bios while plugged.  The
 103         * stripe locking code also uses it to hand off
 104         * the stripe lock to the next pending IO
 105         */
 106        struct list_head plug_list;
 107
 108        /*
 109         * flags that tell us if it is safe to
 110         * merge with this bio
 111         */
 112        unsigned long flags;
 113
 114        /* size of each individual stripe on disk */
 115        int stripe_len;
 116
 117        /* number of data stripes (no p/q) */
 118        int nr_data;
 119
 120        /*
 121         * set if we're doing a parity rebuild
 122         * for a read from higher up, which is handled
 123         * differently from a parity rebuild as part of
 124         * rmw
 125         */
 126        int read_rebuild;
 127
 128        /* first bad stripe */
 129        int faila;
 130
 131        /* second bad stripe (for raid6 use) */
 132        int failb;
 133
 134        /*
 135         * number of pages needed to represent the full
 136         * stripe
 137         */
 138        int nr_pages;
 139
 140        /*
 141         * size of all the bios in the bio_list.  This
 142         * helps us decide if the rbio maps to a full
 143         * stripe or not
 144         */
 145        int bio_list_bytes;
 146
 147        atomic_t refs;
 148
 149        /*
 150         * these are two arrays of pointers.  We allocate the
 151         * rbio big enough to hold them both and setup their
 152         * locations when the rbio is allocated
 153         */
 154
 155        /* pointers to pages that we allocated for
 156         * reading/writing stripes directly from the disk (including P/Q)
 157         */
 158        struct page **stripe_pages;
 159
 160        /*
 161         * pointers to the pages in the bio_list.  Stored
 162         * here for faster lookup
 163         */
 164        struct page **bio_pages;
 165};
 166
 167static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 168static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 169static void rmw_work(struct btrfs_work *work);
 170static void read_rebuild_work(struct btrfs_work *work);
 171static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 172static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 173static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 174static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 175static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 176static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 177static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 178
 179/*
 180 * the stripe hash table is used for locking, and to collect
 181 * bios in hopes of making a full stripe
 182 */
 183int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 184{
 185        struct btrfs_stripe_hash_table *table;
 186        struct btrfs_stripe_hash_table *x;
 187        struct btrfs_stripe_hash *cur;
 188        struct btrfs_stripe_hash *h;
 189        int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 190        int i;
 191        int table_size;
 192
 193        if (info->stripe_hash_table)
 194                return 0;
 195
 196        /*
 197         * The table is large, starting with order 4 and can go as high as
 198         * order 7 in case lock debugging is turned on.
 199         *
 200         * Try harder to allocate and fallback to vmalloc to lower the chance
 201         * of a failing mount.
 202         */
 203        table_size = sizeof(*table) + sizeof(*h) * num_entries;
 204        table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
 205        if (!table) {
 206                table = vzalloc(table_size);
 207                if (!table)
 208                        return -ENOMEM;
 209        }
 210
 211        spin_lock_init(&table->cache_lock);
 212        INIT_LIST_HEAD(&table->stripe_cache);
 213
 214        h = table->table;
 215
 216        for (i = 0; i < num_entries; i++) {
 217                cur = h + i;
 218                INIT_LIST_HEAD(&cur->hash_list);
 219                spin_lock_init(&cur->lock);
 220                init_waitqueue_head(&cur->wait);
 221        }
 222
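             /*
              * if another CPU raced with us and already installed a table,
              * cmpxchg returns that table; keep theirs and free the one we
              * just built
              */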
  223        x = cmpxchg(&info->stripe_hash_table, NULL, table);
  224        if (x) {
  225                if (is_vmalloc_addr(table))
  226                        vfree(table);
  227                else
  228                        kfree(table);
  229        }
 230        return 0;
 231}
 232
 233/*
 234 * caching an rbio means to copy anything from the
 235 * bio_pages array into the stripe_pages array.  We
 236 * use the page uptodate bit in the stripe cache array
 237 * to indicate if it has valid data
 238 *
 239 * once the caching is done, we set the cache ready
 240 * bit.
 241 */
 242static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 243{
 244        int i;
 245        char *s;
 246        char *d;
 247        int ret;
 248
 249        ret = alloc_rbio_pages(rbio);
 250        if (ret)
 251                return;
 252
 253        for (i = 0; i < rbio->nr_pages; i++) {
 254                if (!rbio->bio_pages[i])
 255                        continue;
 256
 257                s = kmap(rbio->bio_pages[i]);
 258                d = kmap(rbio->stripe_pages[i]);
 259
 260                memcpy(d, s, PAGE_CACHE_SIZE);
 261
 262                kunmap(rbio->bio_pages[i]);
 263                kunmap(rbio->stripe_pages[i]);
 264                SetPageUptodate(rbio->stripe_pages[i]);
 265        }
 266        set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 267}
 268
 269/*
 270 * we hash on the first logical address of the stripe
 271 */
 272static int rbio_bucket(struct btrfs_raid_bio *rbio)
 273{
 274        u64 num = rbio->raid_map[0];
 275
 276        /*
 277         * we shift down quite a bit.  We're using byte
 278         * addressing, and most of the lower bits are zeros.
 279         * This tends to upset hash_64, and it consistently
 280         * returns just one or two different values.
 281         *
 282         * shifting off the lower bits fixes things.
 283         */
 284        return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 285}
 286
 287/*
 288 * stealing an rbio means taking all the uptodate pages from the stripe
 289 * array in the source rbio and putting them into the destination rbio
 290 */
 291static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 292{
 293        int i;
 294        struct page *s;
 295        struct page *d;
 296
 297        if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 298                return;
 299
 300        for (i = 0; i < dest->nr_pages; i++) {
 301                s = src->stripe_pages[i];
 302                if (!s || !PageUptodate(s)) {
 303                        continue;
 304                }
 305
 306                d = dest->stripe_pages[i];
 307                if (d)
 308                        __free_page(d);
 309
 310                dest->stripe_pages[i] = s;
 311                src->stripe_pages[i] = NULL;
 312        }
 313}
 314
 315/*
 316 * merging means we take the bio_list from the victim and
 317 * splice it into the destination.  The victim should
 318 * be discarded afterwards.
 319 *
  320 * must be called with dest->bio_list_lock held
 321 */
 322static void merge_rbio(struct btrfs_raid_bio *dest,
 323                       struct btrfs_raid_bio *victim)
 324{
 325        bio_list_merge(&dest->bio_list, &victim->bio_list);
 326        dest->bio_list_bytes += victim->bio_list_bytes;
 327        bio_list_init(&victim->bio_list);
 328}
 329
 330/*
 331 * used to prune items that are in the cache.  The caller
 332 * must hold the hash table lock.
 333 */
 334static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 335{
 336        int bucket = rbio_bucket(rbio);
 337        struct btrfs_stripe_hash_table *table;
 338        struct btrfs_stripe_hash *h;
 339        int freeit = 0;
 340
 341        /*
 342         * check the bit again under the hash table lock.
 343         */
 344        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 345                return;
 346
 347        table = rbio->fs_info->stripe_hash_table;
 348        h = table->table + bucket;
 349
 350        /* hold the lock for the bucket because we may be
 351         * removing it from the hash table
 352         */
 353        spin_lock(&h->lock);
 354
 355        /*
 356         * hold the lock for the bio list because we need
 357         * to make sure the bio list is empty
 358         */
 359        spin_lock(&rbio->bio_list_lock);
 360
 361        if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 362                list_del_init(&rbio->stripe_cache);
 363                table->cache_size -= 1;
 364                freeit = 1;
 365
 366                /* if the bio list isn't empty, this rbio is
 367                 * still involved in an IO.  We take it out
 368                 * of the cache list, and drop the ref that
 369                 * was held for the list.
 370                 *
 371                 * If the bio_list was empty, we also remove
 372                 * the rbio from the hash_table, and drop
 373                 * the corresponding ref
 374                 */
 375                if (bio_list_empty(&rbio->bio_list)) {
 376                        if (!list_empty(&rbio->hash_list)) {
 377                                list_del_init(&rbio->hash_list);
 378                                atomic_dec(&rbio->refs);
 379                                BUG_ON(!list_empty(&rbio->plug_list));
 380                        }
 381                }
 382        }
 383
 384        spin_unlock(&rbio->bio_list_lock);
 385        spin_unlock(&h->lock);
 386
 387        if (freeit)
 388                __free_raid_bio(rbio);
 389}
 390
 391/*
 392 * prune a given rbio from the cache
 393 */
 394static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 395{
 396        struct btrfs_stripe_hash_table *table;
 397        unsigned long flags;
 398
 399        if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 400                return;
 401
 402        table = rbio->fs_info->stripe_hash_table;
 403
 404        spin_lock_irqsave(&table->cache_lock, flags);
 405        __remove_rbio_from_cache(rbio);
 406        spin_unlock_irqrestore(&table->cache_lock, flags);
 407}
 408
 409/*
 410 * remove everything in the cache
 411 */
 412static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 413{
 414        struct btrfs_stripe_hash_table *table;
 415        unsigned long flags;
 416        struct btrfs_raid_bio *rbio;
 417
 418        table = info->stripe_hash_table;
 419
 420        spin_lock_irqsave(&table->cache_lock, flags);
 421        while (!list_empty(&table->stripe_cache)) {
 422                rbio = list_entry(table->stripe_cache.next,
 423                                  struct btrfs_raid_bio,
 424                                  stripe_cache);
 425                __remove_rbio_from_cache(rbio);
 426        }
 427        spin_unlock_irqrestore(&table->cache_lock, flags);
 428}
 429
 430/*
 431 * remove all cached entries and free the hash table
 432 * used by unmount
 433 */
 434void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 435{
 436        if (!info->stripe_hash_table)
 437                return;
 438        btrfs_clear_rbio_cache(info);
 439        if (is_vmalloc_addr(info->stripe_hash_table))
 440                vfree(info->stripe_hash_table);
 441        else
 442                kfree(info->stripe_hash_table);
 443        info->stripe_hash_table = NULL;
 444}
 445
 446/*
 447 * insert an rbio into the stripe cache.  It
 448 * must have already been prepared by calling
 449 * cache_rbio_pages
 450 *
 451 * If this rbio was already cached, it gets
 452 * moved to the front of the lru.
 453 *
 454 * If the size of the rbio cache is too big, we
 455 * prune an item.
 456 */
 457static void cache_rbio(struct btrfs_raid_bio *rbio)
 458{
 459        struct btrfs_stripe_hash_table *table;
 460        unsigned long flags;
 461
 462        if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 463                return;
 464
 465        table = rbio->fs_info->stripe_hash_table;
 466
 467        spin_lock_irqsave(&table->cache_lock, flags);
 468        spin_lock(&rbio->bio_list_lock);
 469
 470        /* bump our ref if we were not in the list before */
 471        if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 472                atomic_inc(&rbio->refs);
 473
  474        if (!list_empty(&rbio->stripe_cache)) {
 475                list_move(&rbio->stripe_cache, &table->stripe_cache);
 476        } else {
 477                list_add(&rbio->stripe_cache, &table->stripe_cache);
 478                table->cache_size += 1;
 479        }
 480
 481        spin_unlock(&rbio->bio_list_lock);
 482
 483        if (table->cache_size > RBIO_CACHE_SIZE) {
 484                struct btrfs_raid_bio *found;
 485
 486                found = list_entry(table->stripe_cache.prev,
 487                                  struct btrfs_raid_bio,
 488                                  stripe_cache);
 489
 490                if (found != rbio)
 491                        __remove_rbio_from_cache(found);
 492        }
 493
 494        spin_unlock_irqrestore(&table->cache_lock, flags);
 495        return;
 496}
 497
 498/*
 499 * helper function to run the xor_blocks api.  It is only
 500 * able to do MAX_XOR_BLOCKS at a time, so we need to
 501 * loop through.
 502 */
 503static void run_xor(void **pages, int src_cnt, ssize_t len)
 504{
 505        int src_off = 0;
 506        int xor_src_cnt = 0;
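             /* by convention the destination page sits right after the sources, at pages[src_cnt] */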
 507        void *dest = pages[src_cnt];
 508
  509        while (src_cnt > 0) {
 510                xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 511                xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 512
 513                src_cnt -= xor_src_cnt;
 514                src_off += xor_src_cnt;
 515        }
 516}
 517
 518/*
 519 * returns true if the bio list inside this rbio
 520 * covers an entire stripe (no rmw required).
 521 * Must be called with the bio list lock held, or
 522 * at a time when you know it is impossible to add
 523 * new bios into the list
 524 */
 525static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 526{
 527        unsigned long size = rbio->bio_list_bytes;
 528        int ret = 1;
 529
 530        if (size != rbio->nr_data * rbio->stripe_len)
 531                ret = 0;
 532
 533        BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 534        return ret;
 535}
 536
 537static int rbio_is_full(struct btrfs_raid_bio *rbio)
 538{
 539        unsigned long flags;
 540        int ret;
 541
 542        spin_lock_irqsave(&rbio->bio_list_lock, flags);
 543        ret = __rbio_is_full(rbio);
 544        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 545        return ret;
 546}
 547
 548/*
 549 * returns 1 if it is safe to merge two rbios together.
 550 * The merging is safe if the two rbios correspond to
 551 * the same stripe and if they are both going in the same
 552 * direction (read vs write), and if neither one is
 553 * locked for final IO
 554 *
 555 * The caller is responsible for locking such that
 556 * rmw_locked is safe to test
 557 */
 558static int rbio_can_merge(struct btrfs_raid_bio *last,
 559                          struct btrfs_raid_bio *cur)
 560{
 561        if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 562            test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 563                return 0;
 564
 565        /*
 566         * we can't merge with cached rbios, since the
 567         * idea is that when we merge the destination
 568         * rbio is going to run our IO for us.  We can
 569         * steal from cached rbio's though, other functions
 570         * handle that.
 571         */
 572        if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 573            test_bit(RBIO_CACHE_BIT, &cur->flags))
 574                return 0;
 575
 576        if (last->raid_map[0] !=
 577            cur->raid_map[0])
 578                return 0;
 579
 580        /* reads can't merge with writes */
 581        if (last->read_rebuild !=
 582            cur->read_rebuild) {
 583                return 0;
 584        }
 585
 586        return 1;
 587}
 588
 589/*
 590 * helper to index into the pstripe
 591 */
 592static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 593{
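             /* the P stripe pages start right after the pages for the nr_data data stripes */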
 594        index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 595        return rbio->stripe_pages[index];
 596}
 597
 598/*
 599 * helper to index into the qstripe, returns null
 600 * if there is no qstripe
 601 */
 602static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 603{
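             /* raid5 has only a P stripe: nr_data + 1 == num_stripes means there is no Q */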
 604        if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
 605                return NULL;
 606
 607        index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 608                PAGE_CACHE_SHIFT;
 609        return rbio->stripe_pages[index];
 610}
 611
 612/*
 613 * The first stripe in the table for a logical address
 614 * has the lock.  rbios are added in one of three ways:
 615 *
 616 * 1) Nobody has the stripe locked yet.  The rbio is given
 617 * the lock and 0 is returned.  The caller must start the IO
 618 * themselves.
 619 *
 620 * 2) Someone has the stripe locked, but we're able to merge
 621 * with the lock owner.  The rbio is freed and the IO will
 622 * start automatically along with the existing rbio.  1 is returned.
 623 *
 624 * 3) Someone has the stripe locked, but we're not able to merge.
 625 * The rbio is added to the lock owner's plug list, or merged into
 626 * an rbio already on the plug list.  When the lock owner unlocks,
 627 * the next rbio on the list is run and the IO is started automatically.
 628 * 1 is returned
 629 *
 630 * If we return 0, the caller still owns the rbio and must continue with
 631 * IO submission.  If we return 1, the caller must assume the rbio has
 632 * already been freed.
 633 */
 634static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 635{
 636        int bucket = rbio_bucket(rbio);
 637        struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 638        struct btrfs_raid_bio *cur;
 639        struct btrfs_raid_bio *pending;
 640        unsigned long flags;
 641        DEFINE_WAIT(wait);
 642        struct btrfs_raid_bio *freeit = NULL;
 643        struct btrfs_raid_bio *cache_drop = NULL;
 644        int ret = 0;
 645        int walk = 0;
 646
 647        spin_lock_irqsave(&h->lock, flags);
 648        list_for_each_entry(cur, &h->hash_list, hash_list) {
 649                walk++;
 650                if (cur->raid_map[0] == rbio->raid_map[0]) {
 651                        spin_lock(&cur->bio_list_lock);
 652
 653                        /* can we steal this cached rbio's pages? */
 654                        if (bio_list_empty(&cur->bio_list) &&
 655                            list_empty(&cur->plug_list) &&
 656                            test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 657                            !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 658                                list_del_init(&cur->hash_list);
 659                                atomic_dec(&cur->refs);
 660
 661                                steal_rbio(cur, rbio);
 662                                cache_drop = cur;
 663                                spin_unlock(&cur->bio_list_lock);
 664
 665                                goto lockit;
 666                        }
 667
 668                        /* can we merge into the lock owner? */
 669                        if (rbio_can_merge(cur, rbio)) {
 670                                merge_rbio(cur, rbio);
 671                                spin_unlock(&cur->bio_list_lock);
 672                                freeit = rbio;
 673                                ret = 1;
 674                                goto out;
 675                        }
 676
 677
 678                        /*
 679                         * we couldn't merge with the running
 680                         * rbio, see if we can merge with the
 681                         * pending ones.  We don't have to
 682                         * check for rmw_locked because there
 683                         * is no way they are inside finish_rmw
 684                         * right now
 685                         */
 686                        list_for_each_entry(pending, &cur->plug_list,
 687                                            plug_list) {
 688                                if (rbio_can_merge(pending, rbio)) {
 689                                        merge_rbio(pending, rbio);
 690                                        spin_unlock(&cur->bio_list_lock);
 691                                        freeit = rbio;
 692                                        ret = 1;
 693                                        goto out;
 694                                }
 695                        }
 696
 697                        /* no merging, put us on the tail of the plug list,
  698                         * our rbio will be started when the currently
  699                         * running rbio unlocks
 700                         */
 701                        list_add_tail(&rbio->plug_list, &cur->plug_list);
 702                        spin_unlock(&cur->bio_list_lock);
 703                        ret = 1;
 704                        goto out;
 705                }
 706        }
 707lockit:
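             /* nobody owns this stripe yet: take the lock by adding ourselves to the hash bucket */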
 708        atomic_inc(&rbio->refs);
 709        list_add(&rbio->hash_list, &h->hash_list);
 710out:
 711        spin_unlock_irqrestore(&h->lock, flags);
 712        if (cache_drop)
 713                remove_rbio_from_cache(cache_drop);
 714        if (freeit)
 715                __free_raid_bio(freeit);
 716        return ret;
 717}
 718
 719/*
 720 * called as rmw or parity rebuild is completed.  If the plug list has more
 721 * rbios waiting for this stripe, the next one on the list will be started
 722 */
 723static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 724{
 725        int bucket;
 726        struct btrfs_stripe_hash *h;
 727        unsigned long flags;
 728        int keep_cache = 0;
 729
 730        bucket = rbio_bucket(rbio);
 731        h = rbio->fs_info->stripe_hash_table->table + bucket;
 732
 733        if (list_empty(&rbio->plug_list))
 734                cache_rbio(rbio);
 735
 736        spin_lock_irqsave(&h->lock, flags);
 737        spin_lock(&rbio->bio_list_lock);
 738
 739        if (!list_empty(&rbio->hash_list)) {
 740                /*
 741                 * if we're still cached and there is no other IO
 742                 * to perform, just leave this rbio here for others
 743                 * to steal from later
 744                 */
 745                if (list_empty(&rbio->plug_list) &&
 746                    test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 747                        keep_cache = 1;
 748                        clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 749                        BUG_ON(!bio_list_empty(&rbio->bio_list));
 750                        goto done;
 751                }
 752
 753                list_del_init(&rbio->hash_list);
 754                atomic_dec(&rbio->refs);
 755
 756                /*
 757                 * we use the plug list to hold all the rbios
 758                 * waiting for the chance to lock this stripe.
 759                 * hand the lock over to one of them.
 760                 */
 761                if (!list_empty(&rbio->plug_list)) {
 762                        struct btrfs_raid_bio *next;
 763                        struct list_head *head = rbio->plug_list.next;
 764
 765                        next = list_entry(head, struct btrfs_raid_bio,
 766                                          plug_list);
 767
 768                        list_del_init(&rbio->plug_list);
 769
 770                        list_add(&next->hash_list, &h->hash_list);
 771                        atomic_inc(&next->refs);
 772                        spin_unlock(&rbio->bio_list_lock);
 773                        spin_unlock_irqrestore(&h->lock, flags);
 774
 775                        if (next->read_rebuild)
 776                                async_read_rebuild(next);
 777                        else {
 778                                steal_rbio(rbio, next);
 779                                async_rmw_stripe(next);
 780                        }
 781
 782                        goto done_nolock;
  783                } else if (waitqueue_active(&h->wait)) {
 784                        spin_unlock(&rbio->bio_list_lock);
 785                        spin_unlock_irqrestore(&h->lock, flags);
 786                        wake_up(&h->wait);
 787                        goto done_nolock;
 788                }
 789        }
 790done:
 791        spin_unlock(&rbio->bio_list_lock);
 792        spin_unlock_irqrestore(&h->lock, flags);
 793
 794done_nolock:
 795        if (!keep_cache)
 796                remove_rbio_from_cache(rbio);
 797}
 798
 799static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 800{
 801        int i;
 802
 803        WARN_ON(atomic_read(&rbio->refs) < 0);
 804        if (!atomic_dec_and_test(&rbio->refs))
 805                return;
 806
 807        WARN_ON(!list_empty(&rbio->stripe_cache));
 808        WARN_ON(!list_empty(&rbio->hash_list));
 809        WARN_ON(!bio_list_empty(&rbio->bio_list));
 810
 811        for (i = 0; i < rbio->nr_pages; i++) {
 812                if (rbio->stripe_pages[i]) {
 813                        __free_page(rbio->stripe_pages[i]);
 814                        rbio->stripe_pages[i] = NULL;
 815                }
 816        }
 817        kfree(rbio->raid_map);
 818        kfree(rbio->bbio);
 819        kfree(rbio);
 820}
 821
 822static void free_raid_bio(struct btrfs_raid_bio *rbio)
 823{
 824        unlock_stripe(rbio);
 825        __free_raid_bio(rbio);
 826}
 827
 828/*
 829 * this frees the rbio and runs through all the bios in the
 830 * bio_list and calls end_io on them
 831 */
 832static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 833{
 834        struct bio *cur = bio_list_get(&rbio->bio_list);
 835        struct bio *next;
 836        free_raid_bio(rbio);
 837
 838        while (cur) {
 839                next = cur->bi_next;
 840                cur->bi_next = NULL;
 841                if (uptodate)
 842                        set_bit(BIO_UPTODATE, &cur->bi_flags);
 843                bio_endio(cur, err);
 844                cur = next;
 845        }
 846}
 847
 848/*
 849 * end io function used by finish_rmw.  When we finally
 850 * get here, we've written a full stripe
 851 */
 852static void raid_write_end_io(struct bio *bio, int err)
 853{
 854        struct btrfs_raid_bio *rbio = bio->bi_private;
 855
 856        if (err)
 857                fail_bio_stripe(rbio, bio);
 858
 859        bio_put(bio);
 860
 861        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
 862                return;
 863
 864        err = 0;
 865
  866        /* OK, we have written all the stripes we need to. */
 867        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
 868                err = -EIO;
 869
 870        rbio_orig_end_io(rbio, err, 0);
 871        return;
 872}
 873
 874/*
 875 * the read/modify/write code wants to use the original bio for
 876 * any pages it included, and then use the rbio for everything
 877 * else.  This function decides if a given index (stripe number)
 878 * and page number in that stripe fall inside the original bio
 879 * or the rbio.
 880 *
 881 * if you set bio_list_only, you'll get a NULL back for any ranges
 882 * that are outside the bio_list
 883 *
 884 * This doesn't take any refs on anything, you get a bare page pointer
 885 * and the caller must bump refs as required.
 886 *
 887 * You must call index_rbio_pages once before you can trust
 888 * the answers from this function.
 889 */
 890static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 891                                 int index, int pagenr, int bio_list_only)
 892{
 893        int chunk_page;
 894        struct page *p = NULL;
 895
  896        chunk_page = index * (rbio->stripe_len >> PAGE_CACHE_SHIFT) + pagenr;
 897
 898        spin_lock_irq(&rbio->bio_list_lock);
 899        p = rbio->bio_pages[chunk_page];
 900        spin_unlock_irq(&rbio->bio_list_lock);
 901
 902        if (p || bio_list_only)
 903                return p;
 904
 905        return rbio->stripe_pages[chunk_page];
 906}
 907
 908/*
 909 * number of pages we need for the entire stripe across all the
 910 * drives
 911 */
 912static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 913{
 914        unsigned long nr = stripe_len * nr_stripes;
 915        return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 916}
 917
 918/*
  919 * allocation and initial setup for the btrfs_raid_bio.  Note that
  920 * this does not allocate any pages for rbio->stripe_pages.
 921 */
 922static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 923                          struct btrfs_bio *bbio, u64 *raid_map,
 924                          u64 stripe_len)
 925{
 926        struct btrfs_raid_bio *rbio;
 927        int nr_data = 0;
 928        int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
 929        void *p;
 930
 931        rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 932                        GFP_NOFS);
 933        if (!rbio) {
 934                kfree(raid_map);
 935                kfree(bbio);
 936                return ERR_PTR(-ENOMEM);
 937        }
 938
 939        bio_list_init(&rbio->bio_list);
 940        INIT_LIST_HEAD(&rbio->plug_list);
 941        spin_lock_init(&rbio->bio_list_lock);
 942        INIT_LIST_HEAD(&rbio->stripe_cache);
 943        INIT_LIST_HEAD(&rbio->hash_list);
 944        rbio->bbio = bbio;
 945        rbio->raid_map = raid_map;
 946        rbio->fs_info = root->fs_info;
 947        rbio->stripe_len = stripe_len;
 948        rbio->nr_pages = num_pages;
 949        rbio->faila = -1;
 950        rbio->failb = -1;
 951        atomic_set(&rbio->refs, 1);
 952
 953        /*
 954         * the stripe_pages and bio_pages array point to the extra
 955         * memory we allocated past the end of the rbio
 956         */
 957        p = rbio + 1;
 958        rbio->stripe_pages = p;
 959        rbio->bio_pages = p + sizeof(struct page *) * num_pages;
 960
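             /*
              * raid_map ends with RAID6_Q_STRIPE only when a Q stripe exists,
              * so raid6 has two parity stripes to subtract and raid5 has one
              */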
 961        if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
 962                nr_data = bbio->num_stripes - 2;
 963        else
 964                nr_data = bbio->num_stripes - 1;
 965
 966        rbio->nr_data = nr_data;
 967        return rbio;
 968}
 969
 970/* allocate pages for all the stripes in the bio, including parity */
 971static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 972{
 973        int i;
 974        struct page *page;
 975
 976        for (i = 0; i < rbio->nr_pages; i++) {
 977                if (rbio->stripe_pages[i])
 978                        continue;
 979                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 980                if (!page)
 981                        return -ENOMEM;
 982                rbio->stripe_pages[i] = page;
 983                ClearPageUptodate(page);
 984        }
 985        return 0;
 986}
 987
 988/* allocate pages for just the p/q stripes */
 989static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 990{
 991        int i;
 992        struct page *page;
 993
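             /* skip the pages that back the data stripes; the p/q pages start here */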
 994        i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 995
 996        for (; i < rbio->nr_pages; i++) {
 997                if (rbio->stripe_pages[i])
 998                        continue;
 999                page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1000                if (!page)
1001                        return -ENOMEM;
1002                rbio->stripe_pages[i] = page;
1003        }
1004        return 0;
1005}
1006
1007/*
1008 * add a single page from a specific stripe into our list of bios for IO
1009 * this will try to merge into existing bios if possible, and returns
1010 * zero if all went well.
1011 */
1012static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1013                            struct bio_list *bio_list,
1014                            struct page *page,
1015                            int stripe_nr,
1016                            unsigned long page_index,
1017                            unsigned long bio_max_len)
1018{
1019        struct bio *last = bio_list->tail;
1020        u64 last_end = 0;
1021        int ret;
1022        struct bio *bio;
1023        struct btrfs_bio_stripe *stripe;
1024        u64 disk_start;
1025
1026        stripe = &rbio->bbio->stripes[stripe_nr];
1027        disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1028
1029        /* if the device is missing, just fail this stripe */
1030        if (!stripe->dev->bdev)
1031                return fail_rbio_index(rbio, stripe_nr);
1032
1033        /* see if we can add this page onto our existing bio */
1034        if (last) {
1035                last_end = (u64)last->bi_sector << 9;
1036                last_end += last->bi_size;
1037
1038                /*
1039                 * we can't merge these if they are from different
1040                 * devices or if they are not contiguous
1041                 */
1042                if (last_end == disk_start && stripe->dev->bdev &&
1043                    test_bit(BIO_UPTODATE, &last->bi_flags) &&
1044                    last->bi_bdev == stripe->dev->bdev) {
1045                        ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1046                        if (ret == PAGE_CACHE_SIZE)
1047                                return 0;
1048                }
1049        }
1050
1051        /* put a new bio on the list */
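             /* the ?: makes sure we ask for at least one bio_vec even when bio_max_len is under a page */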
 1052        bio = btrfs_io_bio_alloc(GFP_NOFS, bio_max_len >> PAGE_CACHE_SHIFT ?: 1);
1053        if (!bio)
1054                return -ENOMEM;
1055
1056        bio->bi_size = 0;
1057        bio->bi_bdev = stripe->dev->bdev;
1058        bio->bi_sector = disk_start >> 9;
1059        set_bit(BIO_UPTODATE, &bio->bi_flags);
1060
1061        bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1062        bio_list_add(bio_list, bio);
1063        return 0;
1064}
1065
1066/*
1067 * while we're doing the read/modify/write cycle, we could
1068 * have errors in reading pages off the disk.  This checks
1069 * for errors and if we're not able to read the page it'll
1070 * trigger parity reconstruction.  The rmw will be finished
1071 * after we've reconstructed the failed stripes
1072 */
1073static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1074{
1075        if (rbio->faila >= 0 || rbio->failb >= 0) {
1076                BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1077                __raid56_parity_recover(rbio);
1078        } else {
1079                finish_rmw(rbio);
1080        }
1081}
1082
1083/*
1084 * these are just the pages from the rbio array, not from anything
1085 * the FS sent down to us
1086 */
1087static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1088{
1089        int index;
1090        index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1091        index += page;
1092        return rbio->stripe_pages[index];
1093}
1094
1095/*
1096 * helper function to walk our bio list and populate the bio_pages array with
1097 * the result.  This seems expensive, but it is faster than constantly
1098 * searching through the bio list as we setup the IO in finish_rmw or stripe
1099 * reconstruction.
1100 *
1101 * This must be called before you trust the answers from page_in_rbio
1102 */
1103static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1104{
1105        struct bio *bio;
1106        u64 start;
1107        unsigned long stripe_offset;
1108        unsigned long page_index;
1109        struct page *p;
1110        int i;
1111
1112        spin_lock_irq(&rbio->bio_list_lock);
1113        bio_list_for_each(bio, &rbio->bio_list) {
1114                start = (u64)bio->bi_sector << 9;
1115                stripe_offset = start - rbio->raid_map[0];
1116                page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1117
1118                for (i = 0; i < bio->bi_vcnt; i++) {
1119                        p = bio->bi_io_vec[i].bv_page;
1120                        rbio->bio_pages[page_index + i] = p;
1121                }
1122        }
1123        spin_unlock_irq(&rbio->bio_list_lock);
1124}
1125
1126/*
 1127 * this is called in one of two situations.  We either
1128 * have a full stripe from the higher layers, or we've read all
1129 * the missing bits off disk.
1130 *
1131 * This will calculate the parity and then send down any
1132 * changed blocks.
1133 */
1134static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1135{
1136        struct btrfs_bio *bbio = rbio->bbio;
1137        void *pointers[bbio->num_stripes];
1138        int stripe_len = rbio->stripe_len;
1139        int nr_data = rbio->nr_data;
1140        int stripe;
1141        int pagenr;
1142        int p_stripe = -1;
1143        int q_stripe = -1;
1144        struct bio_list bio_list;
1145        struct bio *bio;
1146        int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1147        int ret;
1148
1149        bio_list_init(&bio_list);
1150
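             /* one stripe beyond the data means raid5 (P only), two means raid6 (P and Q) */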
1151        if (bbio->num_stripes - rbio->nr_data == 1) {
1152                p_stripe = bbio->num_stripes - 1;
1153        } else if (bbio->num_stripes - rbio->nr_data == 2) {
1154                p_stripe = bbio->num_stripes - 2;
1155                q_stripe = bbio->num_stripes - 1;
1156        } else {
1157                BUG();
1158        }
1159
1160        /* at this point we either have a full stripe,
1161         * or we've read the full stripe from the drive.
1162         * recalculate the parity and write the new results.
1163         *
1164         * We're not allowed to add any new bios to the
1165         * bio list here, anyone else that wants to
1166         * change this stripe needs to do their own rmw.
1167         */
1168        spin_lock_irq(&rbio->bio_list_lock);
1169        set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1170        spin_unlock_irq(&rbio->bio_list_lock);
1171
1172        atomic_set(&rbio->bbio->error, 0);
1173
1174        /*
1175         * now that we've set rmw_locked, run through the
1176         * bio list one last time and map the page pointers
1177         *
1178         * We don't cache full rbios because we're assuming
1179         * the higher layers are unlikely to use this area of
1180         * the disk again soon.  If they do use it again,
1181         * hopefully they will send another full bio.
1182         */
1183        index_rbio_pages(rbio);
1184        if (!rbio_is_full(rbio))
1185                cache_rbio_pages(rbio);
1186        else
1187                clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1188
1189        for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1190                struct page *p;
1191                /* first collect one page from each data stripe */
1192                for (stripe = 0; stripe < nr_data; stripe++) {
1193                        p = page_in_rbio(rbio, stripe, pagenr, 0);
1194                        pointers[stripe] = kmap(p);
1195                }
1196
1197                /* then add the parity stripe */
1198                p = rbio_pstripe_page(rbio, pagenr);
1199                SetPageUptodate(p);
1200                pointers[stripe++] = kmap(p);
1201
1202                if (q_stripe != -1) {
1203
1204                        /*
1205                         * raid6, add the qstripe and call the
1206                         * library function to fill in our p/q
1207                         */
1208                        p = rbio_qstripe_page(rbio, pagenr);
1209                        SetPageUptodate(p);
1210                        pointers[stripe++] = kmap(p);
1211
1212                        raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1213                                                pointers);
1214                } else {
1215                        /* raid5 */
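                             /*
                              * P is the xor of all the data pages: seed the
                              * parity page with the first data page, then xor
                              * in the remaining nr_data - 1 pages
                              */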
1216                        memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1217                        run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1218                }
1219
1220
1221                for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1222                        kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1223        }
1224
1225        /*
1226         * time to start writing.  Make bios for everything from the
1227         * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1228         * everything else.
1229         */
1230        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1231                for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1232                        struct page *page;
1233                        if (stripe < rbio->nr_data) {
1234                                page = page_in_rbio(rbio, stripe, pagenr, 1);
1235                                if (!page)
1236                                        continue;
1237                        } else {
 1238                                page = rbio_stripe_page(rbio, stripe, pagenr);
1239                        }
1240
1241                        ret = rbio_add_io_page(rbio, &bio_list,
1242                                       page, stripe, pagenr, rbio->stripe_len);
1243                        if (ret)
1244                                goto cleanup;
1245                }
1246        }
1247
1248        atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1249        BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1250
1251        while (1) {
1252                bio = bio_list_pop(&bio_list);
1253                if (!bio)
1254                        break;
1255
1256                bio->bi_private = rbio;
1257                bio->bi_end_io = raid_write_end_io;
1258                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1259                submit_bio(WRITE, bio);
1260        }
1261        return;
1262
1263cleanup:
1264        rbio_orig_end_io(rbio, -EIO, 0);
1265}
1266
1267/*
1268 * helper to find the stripe number for a given bio.  Used to figure out which
1269 * stripe has failed.  This expects the bio to correspond to a physical disk,
1270 * so it looks up based on physical sector numbers.
1271 */
1272static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1273                           struct bio *bio)
1274{
1275        u64 physical = bio->bi_sector;
1276        u64 stripe_start;
1277        int i;
1278        struct btrfs_bio_stripe *stripe;
1279
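             /* bi_sector counts 512 byte sectors; shift to get a byte address */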
1280        physical <<= 9;
1281
1282        for (i = 0; i < rbio->bbio->num_stripes; i++) {
1283                stripe = &rbio->bbio->stripes[i];
1284                stripe_start = stripe->physical;
1285                if (physical >= stripe_start &&
1286                    physical < stripe_start + rbio->stripe_len) {
1287                        return i;
1288                }
1289        }
1290        return -1;
1291}
1292
1293/*
1294 * helper to find the stripe number for a given
1295 * bio (before mapping).  Used to figure out which stripe has
1296 * failed.  This looks up based on logical block numbers.
1297 */
1298static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1299                                   struct bio *bio)
1300{
1301        u64 logical = bio->bi_sector;
1302        u64 stripe_start;
1303        int i;
1304
1305        logical <<= 9;
1306
1307        for (i = 0; i < rbio->nr_data; i++) {
1308                stripe_start = rbio->raid_map[i];
1309                if (logical >= stripe_start &&
1310                    logical < stripe_start + rbio->stripe_len) {
1311                        return i;
1312                }
1313        }
1314        return -1;
1315}
1316
1317/*
1318 * returns -EIO if we had too many failures
1319 */
1320static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1321{
1322        unsigned long flags;
1323        int ret = 0;
1324
1325        spin_lock_irqsave(&rbio->bio_list_lock, flags);
1326
1327        /* we already know this stripe is bad, move on */
1328        if (rbio->faila == failed || rbio->failb == failed)
1329                goto out;
1330
1331        if (rbio->faila == -1) {
1332                /* first failure on this rbio */
1333                rbio->faila = failed;
1334                atomic_inc(&rbio->bbio->error);
1335        } else if (rbio->failb == -1) {
1336                /* second failure on this rbio */
1337                rbio->failb = failed;
1338                atomic_inc(&rbio->bbio->error);
1339        } else {
1340                ret = -EIO;
1341        }
1342out:
1343        spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1344
1345        return ret;
1346}
1347
1348/*
1349 * helper to fail a stripe based on a physical disk
1350 * bio.
1351 */
1352static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1353                           struct bio *bio)
1354{
1355        int failed = find_bio_stripe(rbio, bio);
1356
1357        if (failed < 0)
1358                return -EIO;
1359
1360        return fail_rbio_index(rbio, failed);
1361}
1362
1363/*
1364 * this sets each page in the bio uptodate.  It should only be used on private
1365 * rbio pages, nothing that comes in from the higher layers
1366 */
1367static void set_bio_pages_uptodate(struct bio *bio)
1368{
1369        int i;
1370        struct page *p;
1371
1372        for (i = 0; i < bio->bi_vcnt; i++) {
1373                p = bio->bi_io_vec[i].bv_page;
1374                SetPageUptodate(p);
1375        }
1376}
1377
1378/*
1379 * end io for the read phase of the rmw cycle.  All the bios here are physical
1380 * stripe bios we've read from the disk so we can recalculate the parity of the
1381 * stripe.
1382 *
1383 * This will usually kick off finish_rmw once all the bios are read in, but it
1384 * may trigger parity reconstruction if we had any errors along the way
1385 */
1386static void raid_rmw_end_io(struct bio *bio, int err)
1387{
1388        struct btrfs_raid_bio *rbio = bio->bi_private;
1389
1390        if (err)
1391                fail_bio_stripe(rbio, bio);
1392        else
1393                set_bio_pages_uptodate(bio);
1394
1395        bio_put(bio);
1396
1397        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1398                return;
1399
1400        err = 0;
1401        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1402                goto cleanup;
1403
1404        /*
1405         * this will normally call finish_rmw to start our write
1406         * but if there are any failed stripes we'll reconstruct
1407         * from parity first
1408         */
1409        validate_rbio_for_rmw(rbio);
1410        return;
1411
1412cleanup:
1413
1414        rbio_orig_end_io(rbio, -EIO, 0);
1415}
1416
1417static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1418{
1419        rbio->work.flags = 0;
1420        rbio->work.func = rmw_work;
1421
1422        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1423                           &rbio->work);
1424}
1425
1426static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1427{
1428        rbio->work.flags = 0;
1429        rbio->work.func = read_rebuild_work;
1430
1431        btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1432                           &rbio->work);
1433}
1434
1435/*
1436 * the stripe must be locked by the caller.  It will
1437 * unlock after all the writes are done
1438 */
1439static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1440{
1441        int bios_to_read = 0;
1442        struct btrfs_bio *bbio = rbio->bbio;
1443        struct bio_list bio_list;
1444        int ret;
1445        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1446        int pagenr;
1447        int stripe;
1448        struct bio *bio;
1449
1450        bio_list_init(&bio_list);
1451
1452        ret = alloc_rbio_pages(rbio);
1453        if (ret)
1454                goto cleanup;
1455
1456        index_rbio_pages(rbio);
1457
1458        atomic_set(&rbio->bbio->error, 0);
1459        /*
1460         * build a list of bios to read all the missing parts of this
1461         * stripe
1462         */
1463        for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1464                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1465                        struct page *page;
1466                        /*
1467                         * we want to find all the pages missing from
1468                         * the rbio and read them from the disk.  If
1469                         * page_in_rbio finds a page in the bio list
1470                         * we don't need to read it off the stripe.
1471                         */
1472                        page = page_in_rbio(rbio, stripe, pagenr, 1);
1473                        if (page)
1474                                continue;
1475
1476                        page = rbio_stripe_page(rbio, stripe, pagenr);
1477                        /*
1478                         * the bio cache may have handed us an uptodate
1479                         * page.  If so, be happy and use it
1480                         */
1481                        if (PageUptodate(page))
1482                                continue;
1483
1484                        ret = rbio_add_io_page(rbio, &bio_list, page,
1485                                       stripe, pagenr, rbio->stripe_len);
1486                        if (ret)
1487                                goto cleanup;
1488                }
1489        }
1490
1491        bios_to_read = bio_list_size(&bio_list);
1492        if (!bios_to_read) {
1493                /*
1494                 * this can happen if others have merged with
1495                 * us, it means there is nothing left to read.
1496                 * But if there are missing devices it may not be
1497                 * safe to do the full stripe write yet.
1498                 */
1499                goto finish;
1500        }
1501
1502        /*
1503         * the bbio may be freed once we submit the last bio.  Make sure
1504         * not to touch it after that
1505         */
1506        atomic_set(&bbio->stripes_pending, bios_to_read);
1507        while (1) {
1508                bio = bio_list_pop(&bio_list);
1509                if (!bio)
1510                        break;
1511
1512                bio->bi_private = rbio;
1513                bio->bi_end_io = raid_rmw_end_io;
1514
1515                btrfs_bio_wq_end_io(rbio->fs_info, bio,
1516                                    BTRFS_WQ_ENDIO_RAID56);
1517
1518                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1519                submit_bio(READ, bio);
1520        }
1521        /* the actual write will happen once the reads are done */
1522        return 0;
1523
1524cleanup:
1525        rbio_orig_end_io(rbio, -EIO, 0);
1526        return -EIO;
1527
1528finish:
1529        validate_rbio_for_rmw(rbio);
1530        return 0;
1531}
1532
1533/*
1534 * if the upper layers pass in a full stripe, we thank them by only allocating
1535 * enough pages to hold the parity, and sending it all down quickly.
1536 */
1537static int full_stripe_write(struct btrfs_raid_bio *rbio)
1538{
1539        int ret;
1540
1541        ret = alloc_rbio_parity_pages(rbio);
1542        if (ret) {
1543                __free_raid_bio(rbio);
1544                return ret;
1545        }
1546
1547        ret = lock_stripe_add(rbio);
1548        if (ret == 0)
1549                finish_rmw(rbio);
1550        return 0;
1551}
1552
1553/*
1554 * partial stripe writes get handed over to async helpers.
1555 * We're really hoping to merge a few more writes into this
1556 * rbio before calculating new parity
1557 */
1558static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1559{
1560        int ret;
1561
1562        ret = lock_stripe_add(rbio);
1563        if (ret == 0)
1564                async_rmw_stripe(rbio);
1565        return 0;
1566}
1567
1568/*
1569 * sometimes while we were reading from the drive to
 1570 * recalculate parity, enough new bios come in to create
1571 * a full stripe.  So we do a check here to see if we can
1572 * go directly to finish_rmw
1573 */
1574static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1575{
1576        /* head off into rmw land if we don't have a full stripe */
1577        if (!rbio_is_full(rbio))
1578                return partial_stripe_write(rbio);
1579        return full_stripe_write(rbio);
1580}
1581
1582/*
1583 * We use plugging call backs to collect full stripes.
1584 * Any time we get a partial stripe write while plugged
1585 * we collect it into a list.  When the unplug comes down,
1586 * we sort the list by logical block number and merge
1587 * everything we can into the same rbios
1588 */
1589struct btrfs_plug_cb {
1590        struct blk_plug_cb cb;
1591        struct btrfs_fs_info *info;
1592        struct list_head rbio_list;
1593        struct btrfs_work work;
1594};
1595
1596/*
1597 * rbios on the plug list are sorted for easier merging.
1598 */
1599static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1600{
1601        struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1602                                                 plug_list);
1603        struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1604                                                 plug_list);
1605        u64 a_sector = ra->bio_list.head->bi_sector;
1606        u64 b_sector = rb->bio_list.head->bi_sector;
1607
1608        if (a_sector < b_sector)
1609                return -1;
1610        if (a_sector > b_sector)
1611                return 1;
1612        return 0;
1613}
1614
1615static void run_plug(struct btrfs_plug_cb *plug)
1616{
1617        struct btrfs_raid_bio *cur;
1618        struct btrfs_raid_bio *last = NULL;
1619
1620        /*
1621         * sort our plug list then try to merge
1622         * everything we can in hopes of creating full
1623         * stripes.
1624         */
1625        list_sort(NULL, &plug->rbio_list, plug_cmp);
1626        while (!list_empty(&plug->rbio_list)) {
1627                cur = list_entry(plug->rbio_list.next,
1628                                 struct btrfs_raid_bio, plug_list);
1629                list_del_init(&cur->plug_list);
1630
1631                if (rbio_is_full(cur)) {
1632                        /* we have a full stripe, send it down */
1633                        full_stripe_write(cur);
1634                        continue;
1635                }
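                /*
                 * try to fold this rbio into the previous one.  If they
                 * can't merge, the previous rbio is as full as it is going
                 * to get, so send it down now.
                 */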
1636                if (last) {
1637                        if (rbio_can_merge(last, cur)) {
1638                                merge_rbio(last, cur);
1639                                __free_raid_bio(cur);
1640                                continue;
1641
1642                        }
1643                        __raid56_parity_write(last);
1644                }
1645                last = cur;
1646        }
1647        if (last) {
1648                __raid56_parity_write(last);
1649        }
1650        kfree(plug);
1651}
1652
1653/*
1654 * if the unplug comes from schedule, we have to push the
1655 * work off to a helper thread
1656 */
1657static void unplug_work(struct btrfs_work *work)
1658{
1659        struct btrfs_plug_cb *plug;
1660        plug = container_of(work, struct btrfs_plug_cb, work);
1661        run_plug(plug);
1662}
1663
1664static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1665{
1666        struct btrfs_plug_cb *plug;
1667        plug = container_of(cb, struct btrfs_plug_cb, cb);
1668
1669        if (from_schedule) {
1670                plug->work.flags = 0;
1671                plug->work.func = unplug_work;
1672                btrfs_queue_worker(&plug->info->rmw_workers,
1673                                   &plug->work);
1674                return;
1675        }
1676        run_plug(plug);
1677}
1678
1679/*
1680 * our main entry point for writes from the rest of the FS.
1681 */
1682int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1683                        struct btrfs_bio *bbio, u64 *raid_map,
1684                        u64 stripe_len)
1685{
1686        struct btrfs_raid_bio *rbio;
1687        struct btrfs_plug_cb *plug = NULL;
1688        struct blk_plug_cb *cb;
1689
1690        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1691        if (IS_ERR(rbio))
1692                return PTR_ERR(rbio);
1693        bio_list_add(&rbio->bio_list, bio);
1694        rbio->bio_list_bytes = bio->bi_size;
1695
1696        /*
1697         * don't plug on full rbios, just get them out the door
1698         * as quickly as we can
1699         */
1700        if (rbio_is_full(rbio))
1701                return full_stripe_write(rbio);
1702
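        /*
         * blk_check_plugged attaches (or finds) our unplug callback on the
         * submitting task's blk_plug.  A NULL info pointer marks a freshly
         * allocated callback whose rbio list still needs to be set up.
         */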
1703        cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1704                               sizeof(*plug));
1705        if (cb) {
1706                plug = container_of(cb, struct btrfs_plug_cb, cb);
1707                if (!plug->info) {
1708                        plug->info = root->fs_info;
1709                        INIT_LIST_HEAD(&plug->rbio_list);
1710                }
1711                list_add_tail(&rbio->plug_list, &plug->rbio_list);
1712        } else {
1713                return __raid56_parity_write(rbio);
1714        }
1715        return 0;
1716}
1717
1718/*
1719 * all parity reconstruction happens here.  We've read in everything
1720 * we can find from the drives and this does the heavy lifting of
1721 * sorting the good from the bad.
1722 */
1723static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1724{
1725        int pagenr, stripe;
1726        void **pointers;
1727        int faila = -1, failb = -1;
1728        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1729        struct page *page;
1730        int err;
1731        int i;
1732
1733        pointers = kcalloc(rbio->bbio->num_stripes, sizeof(void *),
1734                           GFP_NOFS);
1735        if (!pointers) {
1736                err = -ENOMEM;
1737                goto cleanup_io;
1738        }
1739
1740        faila = rbio->faila;
1741        failb = rbio->failb;
1742
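        /*
         * a read rebuild answers the bios already on our bio_list, so
         * make sure nobody can merge more bios into this rbio while we
         * are using those pages for the reconstruction
         */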
1743        if (rbio->read_rebuild) {
1744                spin_lock_irq(&rbio->bio_list_lock);
1745                set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1746                spin_unlock_irq(&rbio->bio_list_lock);
1747        }
1748
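        /* map the pages on our bio_list so page_in_rbio can find them */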
1749        index_rbio_pages(rbio);
1750
1751        for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1752                /* setup our array of pointers with pages
1753                 * from each stripe
1754                 */
1755                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1756                        /*
1757                         * if we're rebuilding a read, we have to use
1758                         * pages from the bio list
1759                         */
1760                        if (rbio->read_rebuild &&
1761                            (stripe == faila || stripe == failb)) {
1762                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1763                        } else {
1764                                page = rbio_stripe_page(rbio, stripe, pagenr);
1765                        }
1766                        pointers[stripe] = kmap(page);
1767                }
1768
1769                /* all raid6 handling here; only raid6 ends raid_map with a Q stripe */
1770                if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1771                    RAID6_Q_STRIPE) {
1772
1773                        /*
1774                         * single failure, rebuild from parity raid5
1775                         * style
1776                         */
1777                        if (failb < 0) {
1778                                if (faila == rbio->nr_data) {
1779                                        /*
1780                                         * Just the P stripe has failed, without
1781                                         * a bad data or Q stripe.
1782                                         * TODO, we should redo the xor here.
1783                                         */
1784                                        err = -EIO;
1785                                        goto cleanup;
1786                                }
1787                                /*
1788                                 * a single failure in raid6 is rebuilt
1789                                 * in the pstripe code below
1790                                 */
1791                                goto pstripe;
1792                        }
1793
1794                        /* make sure our ps and qs are in order */
1795                        if (faila > failb) {
1796                                int tmp = failb;
1797                                failb = faila;
1798                                faila = tmp;
1799                        }
1800
1801                        /* if the q stripe has failed, do a pstripe reconstruction
1802                         * from the xors.
1803                         * If both the q stripe and the P stripe have failed, we're
1804                         * here due to a crc mismatch and we can't give them the
1805                         * data they want.
1806                         */
1807                        if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1808                                if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1809                                        err = -EIO;
1810                                        goto cleanup;
1811                                }
1812                                /*
1813                                 * otherwise we have one bad data stripe and
1814                                 * a good P stripe.  raid5!
1815                                 */
1816                                goto pstripe;
1817                        }
1818
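                        /*
                         * one data stripe and P have failed: raid6_datap_recov
                         * rebuilds the data block from Q and regenerates P.
                         * Otherwise two data stripes have failed and
                         * raid6_2data_recov rebuilds both from P and Q.
                         */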
1819                        if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1820                                raid6_datap_recov(rbio->bbio->num_stripes,
1821                                                  PAGE_SIZE, faila, pointers);
1822                        } else {
1823                                raid6_2data_recov(rbio->bbio->num_stripes,
1824                                                  PAGE_SIZE, faila, failb,
1825                                                  pointers);
1826                        }
1827                } else {
1828                        void *p;
1829
1830                        /* rebuild from P stripe here (raid5 or raid6) */
1831                        BUG_ON(failb != -1);
1832pstripe:
1833                        /* Copy parity block into failed block to start with */
1834                        memcpy(pointers[faila],
1835                               pointers[rbio->nr_data],
1836                               PAGE_CACHE_SIZE);
1837
1838                        /* rearrange the pointer array */
1839                        p = pointers[faila];
1840                        for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1841                                pointers[stripe] = pointers[stripe + 1];
1842                        pointers[rbio->nr_data - 1] = p;
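                        /*
                         * e.g. with nr_data == 3 and faila == 1 the array went
                         * from [D0, P-copy, D2, P] to [D0, D2, P-copy, P], so
                         * the xor below folds D0 and D2 into the parity copy
                         * and leaves the rebuilt D1 in the failed stripe's page
                         */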
1843
1844                        /* xor in the rest */
1845                        run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1846                }
1847                /* if we're doing this rebuild as part of an rmw, go through
1848                 * and set all of our private rbio pages in the
1849                 * failed stripes as uptodate.  This way finish_rmw will
1850                 * know they can be trusted.  If this was a read reconstruction,
1851                 * other endio functions will fiddle the uptodate bits
1852                 */
1853                if (!rbio->read_rebuild) {
1854                        for (i = 0;  i < nr_pages; i++) {
1855                                if (faila != -1) {
1856                                        page = rbio_stripe_page(rbio, faila, i);
1857                                        SetPageUptodate(page);
1858                                }
1859                                if (failb != -1) {
1860                                        page = rbio_stripe_page(rbio, failb, i);
1861                                        SetPageUptodate(page);
1862                                }
1863                        }
1864                }
1865                for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1866                        /*
1867                         * if we're rebuilding a read, we have to use
1868                         * pages from the bio list
1869                         */
1870                        if (rbio->read_rebuild &&
1871                            (stripe == faila || stripe == failb)) {
1872                                page = page_in_rbio(rbio, stripe, pagenr, 0);
1873                        } else {
1874                                page = rbio_stripe_page(rbio, stripe, pagenr);
1875                        }
1876                        kunmap(page);
1877                }
1878        }
1879
1880        err = 0;
1881cleanup:
1882        kfree(pointers);
1883
1884cleanup_io:
1885
1886        if (rbio->read_rebuild) {
1887                if (err == 0)
1888                        cache_rbio_pages(rbio);
1889                else
1890                        clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1891
1892                rbio_orig_end_io(rbio, err, err == 0);
1893        } else if (err == 0) {
1894                rbio->faila = -1;
1895                rbio->failb = -1;
1896                finish_rmw(rbio);
1897        } else {
1898                rbio_orig_end_io(rbio, err, 0);
1899        }
1900}
1901
1902/*
1903 * This is called only for stripes we've read from disk to
1904 * reconstruct the parity.
1905 */
1906static void raid_recover_end_io(struct bio *bio, int err)
1907{
1908        struct btrfs_raid_bio *rbio = bio->bi_private;
1909
1910        /*
1911         * we only read stripe pages off the disk; set them
1912         * up to date if there were no errors
1913         */
1914        if (err)
1915                fail_bio_stripe(rbio, bio);
1916        else
1917                set_bio_pages_uptodate(bio);
1918        bio_put(bio);
1919
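        /* the last read to complete kicks off the actual reconstruction */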
1920        if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1921                return;
1922
1923        if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1924                rbio_orig_end_io(rbio, -EIO, 0);
1925        else
1926                __raid_recover_end_io(rbio);
1927}
1928
1929/*
1930 * reads everything we need off the disk to reconstruct
1931 * the parity. endio handlers trigger final reconstruction
1932 * when the IO is done.
1933 *
1934 * This is used both for reads from the higher layers and for
1935 * parity construction required to finish an rmw cycle.
1936 */
1937static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1938{
1939        int bios_to_read = 0;
1940        struct btrfs_bio *bbio = rbio->bbio;
1941        struct bio_list bio_list;
1942        int ret;
1943        int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1944        int pagenr;
1945        int stripe;
1946        struct bio *bio;
1947
1948        bio_list_init(&bio_list);
1949
1950        ret = alloc_rbio_pages(rbio);
1951        if (ret)
1952                goto cleanup;
1953
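        /*
         * failed reads bump bbio->error, and the total is checked against
         * bbio->max_errors before we try the reconstruction; start this
         * pass from zero
         */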
1954        atomic_set(&rbio->bbio->error, 0);
1955
1956        /*
1957         * read everything that hasn't failed.  Thanks to the
1958         * stripe cache, it is possible that some or all of these
1959         * pages are going to be uptodate.
1960         */
1961        for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1962                if (rbio->faila == stripe ||
1963                    rbio->failb == stripe)
1964                        continue;
1965
1966                for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1967                        struct page *p;
1968
1969                        /*
1970                         * the rmw code may have already read this
1971                         * page in
1972                         */
1973                        p = rbio_stripe_page(rbio, stripe, pagenr);
1974                        if (PageUptodate(p))
1975                                continue;
1976
1977                        ret = rbio_add_io_page(rbio, &bio_list,
1978                                       rbio_stripe_page(rbio, stripe, pagenr),
1979                                       stripe, pagenr, rbio->stripe_len);
1980                        if (ret < 0)
1981                                goto cleanup;
1982                }
1983        }
1984
1985        bios_to_read = bio_list_size(&bio_list);
1986        if (!bios_to_read) {
1987                /*
1988                 * we might have no bios to read just because the pages
1989                 * were up to date, or we might have no bios to read because
1990                 * the devices were gone.
1991                 */
1992                if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1993                        __raid_recover_end_io(rbio);
1994                        goto out;
1995                } else {
1996                        goto cleanup;
1997                }
1998        }
1999
2000        /*
2001         * the bbio may be freed once we submit the last bio.  Make sure
2002         * not to touch it after that
2003         */
2004        atomic_set(&bbio->stripes_pending, bios_to_read);
2005        while (1) {
2006                bio = bio_list_pop(&bio_list);
2007                if (!bio)
2008                        break;
2009
2010                bio->bi_private = rbio;
2011                bio->bi_end_io = raid_recover_end_io;
2012
2013                btrfs_bio_wq_end_io(rbio->fs_info, bio,
2014                                    BTRFS_WQ_ENDIO_RAID56);
2015
2016                BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
2017                submit_bio(READ, bio);
2018        }
2019out:
2020        return 0;
2021
2022cleanup:
2023        if (rbio->read_rebuild)
2024                rbio_orig_end_io(rbio, -EIO, 0);
2025        return -EIO;
2026}
2027
2028/*
2029 * the main entry point for reads from the higher layers.  This
2030 * is really only called when the normal read path had a failure,
2031 * so we assume the bio they send down corresponds to a failed part
2032 * of the drive.
2033 */
2034int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2035                          struct btrfs_bio *bbio, u64 *raid_map,
2036                          u64 stripe_len, int mirror_num)
2037{
2038        struct btrfs_raid_bio *rbio;
2039        int ret;
2040
2041        rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2042        if (IS_ERR(rbio))
2043                return PTR_ERR(rbio);
2044
2045        rbio->read_rebuild = 1;
2046        bio_list_add(&rbio->bio_list, bio);
2047        rbio->bio_list_bytes = bio->bi_size;
2048
2049        rbio->faila = find_logical_bio_stripe(rbio, bio);
2050        if (rbio->faila == -1) {
2051                BUG();
2052                kfree(raid_map);
2053                kfree(bbio);
2054                kfree(rbio);
2055                return -EIO;
2056        }
2057
2058        /* reconstruct from the q stripe if they are asking for mirror 3.
2059         * num_stripes - 2 is the P stripe, so marking it failed below
2060         * forces the recovery code to rebuild the data from Q instead.
2061         */
2062        if (mirror_num == 3)
2063                rbio->failb = bbio->num_stripes - 2;
2064
2065        ret = lock_stripe_add(rbio);
2066
2067        /*
2068         * __raid56_parity_recover will end the bio with
2069         * any errors it hits.  We don't want to return
2070         * its error value up the stack because our caller
2071         * will end up calling bio_endio with any nonzero
2072         * return
2073         */
2074        if (ret == 0)
2075                __raid56_parity_recover(rbio);
2076        /*
2077         * our rbio has been added to the list of
2078         * rbios that will be handled after the
2079         * current lock owner is done
2080         */
2081        return 0;
2082
2083}
2084
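/*
 * btrfs_work entry points for the helper threads.  The async helpers hand
 * the rbio's embedded work struct to the worker threads, and these thunks
 * pull the rbio back out and continue the rmw or the read rebuild.
 */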
2085static void rmw_work(struct btrfs_work *work)
2086{
2087        struct btrfs_raid_bio *rbio;
2088
2089        rbio = container_of(work, struct btrfs_raid_bio, work);
2090        raid56_rmw_stripe(rbio);
2091}
2092
2093static void read_rebuild_work(struct btrfs_work *work)
2094{
2095        struct btrfs_raid_bio *rbio;
2096
2097        rbio = container_of(work, struct btrfs_raid_bio, work);
2098        __raid56_parity_recover(rbio);
2099}
2100