linux/fs/btrfs/extent_io.c
   1#include <linux/bitops.h>
   2#include <linux/slab.h>
   3#include <linux/bio.h>
   4#include <linux/mm.h>
   5#include <linux/pagemap.h>
   6#include <linux/page-flags.h>
   7#include <linux/spinlock.h>
   8#include <linux/blkdev.h>
   9#include <linux/swap.h>
  10#include <linux/writeback.h>
  11#include <linux/pagevec.h>
  12#include <linux/prefetch.h>
  13#include <linux/cleancache.h>
  14#include "extent_io.h"
  15#include "extent_map.h"
  16#include "compat.h"
  17#include "ctree.h"
  18#include "btrfs_inode.h"
  19#include "volumes.h"
  20#include "check-integrity.h"
  21#include "locking.h"
  22#include "rcu-string.h"
  23
  24static struct kmem_cache *extent_state_cache;
  25static struct kmem_cache *extent_buffer_cache;
  26
  27static LIST_HEAD(buffers);
  28static LIST_HEAD(states);
  29
  30#define LEAK_DEBUG 0
  31#if LEAK_DEBUG
  32static DEFINE_SPINLOCK(leak_lock);
  33#endif
  34
  35#define BUFFER_LRU_MAX 64
  36
  37struct tree_entry {
  38        u64 start;
  39        u64 end;
  40        struct rb_node rb_node;
  41};
  42
  43struct extent_page_data {
  44        struct bio *bio;
  45        struct extent_io_tree *tree;
  46        get_extent_t *get_extent;
  47        unsigned long bio_flags;
  48
   49        /* tells writepage not to lock the state bits for this range;
   50         * it still does the unlocking
  51         */
  52        unsigned int extent_locked:1;
  53
  54        /* tells the submit_bio code to use a WRITE_SYNC */
  55        unsigned int sync_io:1;
  56};
  57
  58static noinline void flush_write_bio(void *data);
  59static inline struct btrfs_fs_info *
  60tree_fs_info(struct extent_io_tree *tree)
  61{
  62        return btrfs_sb(tree->mapping->host->i_sb);
  63}
  64
  65int __init extent_io_init(void)
  66{
  67        extent_state_cache = kmem_cache_create("btrfs_extent_state",
  68                        sizeof(struct extent_state), 0,
  69                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  70        if (!extent_state_cache)
  71                return -ENOMEM;
  72
  73        extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
  74                        sizeof(struct extent_buffer), 0,
  75                        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
  76        if (!extent_buffer_cache)
  77                goto free_state_cache;
  78        return 0;
  79
  80free_state_cache:
  81        kmem_cache_destroy(extent_state_cache);
  82        return -ENOMEM;
  83}
  84
  85void extent_io_exit(void)
  86{
  87        struct extent_state *state;
  88        struct extent_buffer *eb;
  89
  90        while (!list_empty(&states)) {
  91                state = list_entry(states.next, struct extent_state, leak_list);
  92                printk(KERN_ERR "btrfs state leak: start %llu end %llu "
  93                       "state %lu in tree %p refs %d\n",
  94                       (unsigned long long)state->start,
  95                       (unsigned long long)state->end,
  96                       state->state, state->tree, atomic_read(&state->refs));
  97                list_del(&state->leak_list);
  98                kmem_cache_free(extent_state_cache, state);
  99
 100        }
 101
 102        while (!list_empty(&buffers)) {
 103                eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 104                printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
 105                       "refs %d\n", (unsigned long long)eb->start,
 106                       eb->len, atomic_read(&eb->refs));
 107                list_del(&eb->leak_list);
 108                kmem_cache_free(extent_buffer_cache, eb);
 109        }
 110
 111        /*
 112         * Make sure all delayed rcu free are flushed before we
 113         * destroy caches.
 114         */
 115        rcu_barrier();
 116        if (extent_state_cache)
 117                kmem_cache_destroy(extent_state_cache);
 118        if (extent_buffer_cache)
 119                kmem_cache_destroy(extent_buffer_cache);
 120}
 121
 122void extent_io_tree_init(struct extent_io_tree *tree,
 123                         struct address_space *mapping)
 124{
 125        tree->state = RB_ROOT;
 126        INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
 127        tree->ops = NULL;
 128        tree->dirty_bytes = 0;
 129        spin_lock_init(&tree->lock);
 130        spin_lock_init(&tree->buffer_lock);
 131        tree->mapping = mapping;
 132}
 133
 134static struct extent_state *alloc_extent_state(gfp_t mask)
 135{
 136        struct extent_state *state;
 137#if LEAK_DEBUG
 138        unsigned long flags;
 139#endif
 140
 141        state = kmem_cache_alloc(extent_state_cache, mask);
 142        if (!state)
 143                return state;
 144        state->state = 0;
 145        state->private = 0;
 146        state->tree = NULL;
 147#if LEAK_DEBUG
 148        spin_lock_irqsave(&leak_lock, flags);
 149        list_add(&state->leak_list, &states);
 150        spin_unlock_irqrestore(&leak_lock, flags);
 151#endif
 152        atomic_set(&state->refs, 1);
 153        init_waitqueue_head(&state->wq);
 154        trace_alloc_extent_state(state, mask, _RET_IP_);
 155        return state;
 156}
 157
 158void free_extent_state(struct extent_state *state)
 159{
 160        if (!state)
 161                return;
 162        if (atomic_dec_and_test(&state->refs)) {
 163#if LEAK_DEBUG
 164                unsigned long flags;
 165#endif
 166                WARN_ON(state->tree);
 167#if LEAK_DEBUG
 168                spin_lock_irqsave(&leak_lock, flags);
 169                list_del(&state->leak_list);
 170                spin_unlock_irqrestore(&leak_lock, flags);
 171#endif
 172                trace_free_extent_state(state, _RET_IP_);
 173                kmem_cache_free(extent_state_cache, state);
 174        }
 175}
 176
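/*
 * Insert 'node' into the rb-tree of states, keyed by 'offset'.  If an
 * existing entry already covers 'offset', that entry is returned and
 * nothing is inserted; a NULL return means the insert succeeded.
 */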
 177static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
 178                                   struct rb_node *node)
 179{
 180        struct rb_node **p = &root->rb_node;
 181        struct rb_node *parent = NULL;
 182        struct tree_entry *entry;
 183
 184        while (*p) {
 185                parent = *p;
 186                entry = rb_entry(parent, struct tree_entry, rb_node);
 187
 188                if (offset < entry->start)
 189                        p = &(*p)->rb_left;
 190                else if (offset > entry->end)
 191                        p = &(*p)->rb_right;
 192                else
 193                        return parent;
 194        }
 195
 196        rb_link_node(node, parent, p);
 197        rb_insert_color(node, root);
 198        return NULL;
 199}
 200
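/*
 * Search the state tree for the entry whose [start, end] range contains
 * 'offset' and return it.  When there is no exact hit, NULL is returned
 * and, if the pointers are non-NULL, *prev_ret is set to the first entry
 * ending at or after 'offset' while *next_ret is set to the last entry
 * starting at or before it, giving callers the nearest neighbours of the
 * missing range.
 */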
 201static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
 202                                     struct rb_node **prev_ret,
 203                                     struct rb_node **next_ret)
 204{
 205        struct rb_root *root = &tree->state;
 206        struct rb_node *n = root->rb_node;
 207        struct rb_node *prev = NULL;
 208        struct rb_node *orig_prev = NULL;
 209        struct tree_entry *entry;
 210        struct tree_entry *prev_entry = NULL;
 211
 212        while (n) {
 213                entry = rb_entry(n, struct tree_entry, rb_node);
 214                prev = n;
 215                prev_entry = entry;
 216
 217                if (offset < entry->start)
 218                        n = n->rb_left;
 219                else if (offset > entry->end)
 220                        n = n->rb_right;
 221                else
 222                        return n;
 223        }
 224
 225        if (prev_ret) {
 226                orig_prev = prev;
 227                while (prev && offset > prev_entry->end) {
 228                        prev = rb_next(prev);
 229                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 230                }
 231                *prev_ret = prev;
 232                prev = orig_prev;
 233        }
 234
 235        if (next_ret) {
 236                prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 237                while (prev && offset < prev_entry->start) {
 238                        prev = rb_prev(prev);
 239                        prev_entry = rb_entry(prev, struct tree_entry, rb_node);
 240                }
 241                *next_ret = prev;
 242        }
 243        return NULL;
 244}
 245
 246static inline struct rb_node *tree_search(struct extent_io_tree *tree,
 247                                          u64 offset)
 248{
 249        struct rb_node *prev = NULL;
 250        struct rb_node *ret;
 251
 252        ret = __etree_search(tree, offset, &prev, NULL);
 253        if (!ret)
 254                return prev;
 255        return ret;
 256}
 257
 258static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
 259                     struct extent_state *other)
 260{
 261        if (tree->ops && tree->ops->merge_extent_hook)
 262                tree->ops->merge_extent_hook(tree->mapping->host, new,
 263                                             other);
 264}
 265
 266/*
 267 * utility function to look for merge candidates inside a given range.
 268 * Any extents with matching state are merged together into a single
  269 * extent in the tree.  Extents with EXTENT_IOBITS in their state field
 270 * are not merged because the end_io handlers need to be able to do
 271 * operations on them without sleeping (or doing allocations/splits).
 272 *
 273 * This should be called with the tree lock held.
 274 */
 275static void merge_state(struct extent_io_tree *tree,
 276                        struct extent_state *state)
 277{
 278        struct extent_state *other;
 279        struct rb_node *other_node;
 280
 281        if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 282                return;
 283
 284        other_node = rb_prev(&state->rb_node);
 285        if (other_node) {
 286                other = rb_entry(other_node, struct extent_state, rb_node);
 287                if (other->end == state->start - 1 &&
 288                    other->state == state->state) {
 289                        merge_cb(tree, state, other);
 290                        state->start = other->start;
 291                        other->tree = NULL;
 292                        rb_erase(&other->rb_node, &tree->state);
 293                        free_extent_state(other);
 294                }
 295        }
 296        other_node = rb_next(&state->rb_node);
 297        if (other_node) {
 298                other = rb_entry(other_node, struct extent_state, rb_node);
 299                if (other->start == state->end + 1 &&
 300                    other->state == state->state) {
 301                        merge_cb(tree, state, other);
 302                        state->end = other->end;
 303                        other->tree = NULL;
 304                        rb_erase(&other->rb_node, &tree->state);
 305                        free_extent_state(other);
 306                }
 307        }
 308}
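
/*
 * Worked example (illustrative, not part of the original file): if the tree
 * holds two adjacent states [0, 4095] and [4096, 8191] that both carry only
 * EXTENT_DIRTY, calling merge_state() on either of them collapses the pair
 * into a single [0, 8191] state and frees the other.  States carrying
 * EXTENT_IOBITS or EXTENT_BOUNDARY are never merged, per the check above.
 */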
 309
 310static void set_state_cb(struct extent_io_tree *tree,
 311                         struct extent_state *state, int *bits)
 312{
 313        if (tree->ops && tree->ops->set_bit_hook)
 314                tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 315}
 316
 317static void clear_state_cb(struct extent_io_tree *tree,
 318                           struct extent_state *state, int *bits)
 319{
 320        if (tree->ops && tree->ops->clear_bit_hook)
 321                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 322}
 323
 324static void set_state_bits(struct extent_io_tree *tree,
 325                           struct extent_state *state, int *bits);
 326
 327/*
 328 * insert an extent_state struct into the tree.  'bits' are set on the
 329 * struct before it is inserted.
 330 *
 331 * This may return -EEXIST if the extent is already there, in which case the
 332 * state struct is freed.
 333 *
 334 * The tree lock is not taken internally.  This is a utility function and
 335 * probably isn't what you want to call (see set/clear_extent_bit).
 336 */
 337static int insert_state(struct extent_io_tree *tree,
 338                        struct extent_state *state, u64 start, u64 end,
 339                        int *bits)
 340{
 341        struct rb_node *node;
 342
 343        if (end < start)
 344                WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
 345                       (unsigned long long)end,
 346                       (unsigned long long)start);
 347        state->start = start;
 348        state->end = end;
 349
 350        set_state_bits(tree, state, bits);
 351
 352        node = tree_insert(&tree->state, end, &state->rb_node);
 353        if (node) {
 354                struct extent_state *found;
 355                found = rb_entry(node, struct extent_state, rb_node);
 356                printk(KERN_ERR "btrfs found node %llu %llu on insert of "
 357                       "%llu %llu\n", (unsigned long long)found->start,
 358                       (unsigned long long)found->end,
 359                       (unsigned long long)start, (unsigned long long)end);
 360                return -EEXIST;
 361        }
 362        state->tree = tree;
 363        merge_state(tree, state);
 364        return 0;
 365}
 366
 367static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
 368                     u64 split)
 369{
 370        if (tree->ops && tree->ops->split_extent_hook)
 371                tree->ops->split_extent_hook(tree->mapping->host, orig, split);
 372}
 373
 374/*
 375 * split a given extent state struct in two, inserting the preallocated
 376 * struct 'prealloc' as the newly created second half.  'split' indicates an
 377 * offset inside 'orig' where it should be split.
 378 *
 379 * Before calling,
 380 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 381 * are two extent state structs in the tree:
 382 * prealloc: [orig->start, split - 1]
 383 * orig: [ split, orig->end ]
 384 *
 385 * The tree locks are not taken by this function. They need to be held
 386 * by the caller.
 387 */
 388static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 389                       struct extent_state *prealloc, u64 split)
 390{
 391        struct rb_node *node;
 392
 393        split_cb(tree, orig, split);
 394
 395        prealloc->start = orig->start;
 396        prealloc->end = split - 1;
 397        prealloc->state = orig->state;
 398        orig->start = split;
 399
 400        node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
 401        if (node) {
 402                free_extent_state(prealloc);
 403                return -EEXIST;
 404        }
 405        prealloc->tree = tree;
 406        return 0;
 407}
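
/*
 * Worked example (illustrative): splitting an 'orig' state that covers
 * [0, 8191] at split == 4096 leaves the tree with
 *	prealloc: [0, 4095]
 *	orig:     [4096, 8191]
 * where both entries carry the bits 'orig' had before the call.
 */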
 408
 409static struct extent_state *next_state(struct extent_state *state)
 410{
 411        struct rb_node *next = rb_next(&state->rb_node);
 412        if (next)
 413                return rb_entry(next, struct extent_state, rb_node);
 414        else
 415                return NULL;
 416}
 417
 418/*
 419 * utility function to clear some bits in an extent state struct.
  420 * it will optionally wake up anyone waiting on this state (wake == 1).
 421 *
 422 * If no bits are set on the state struct after clearing things, the
 423 * struct is freed and removed from the tree
 424 */
 425static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
 426                                            struct extent_state *state,
 427                                            int *bits, int wake)
 428{
 429        struct extent_state *next;
 430        int bits_to_clear = *bits & ~EXTENT_CTLBITS;
 431
 432        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 433                u64 range = state->end - state->start + 1;
 434                WARN_ON(range > tree->dirty_bytes);
 435                tree->dirty_bytes -= range;
 436        }
 437        clear_state_cb(tree, state, bits);
 438        state->state &= ~bits_to_clear;
 439        if (wake)
 440                wake_up(&state->wq);
 441        if (state->state == 0) {
 442                next = next_state(state);
 443                if (state->tree) {
 444                        rb_erase(&state->rb_node, &tree->state);
 445                        state->tree = NULL;
 446                        free_extent_state(state);
 447                } else {
 448                        WARN_ON(1);
 449                }
 450        } else {
 451                merge_state(tree, state);
 452                next = next_state(state);
 453        }
 454        return next;
 455}
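
/*
 * Worked example (illustrative): for a state covering [0, 4095] with
 * EXTENT_DIRTY | EXTENT_UPTODATE set, clearing only EXTENT_DIRTY leaves the
 * state in the tree and tries to merge it with its neighbours, while
 * clearing both bits drives state->state to 0, so the state is erased from
 * the tree and freed.  Either way the next state to visit is returned.
 */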
 456
 457static struct extent_state *
 458alloc_extent_state_atomic(struct extent_state *prealloc)
 459{
 460        if (!prealloc)
 461                prealloc = alloc_extent_state(GFP_ATOMIC);
 462
 463        return prealloc;
 464}
 465
 466void extent_io_tree_panic(struct extent_io_tree *tree, int err)
 467{
 468        btrfs_panic(tree_fs_info(tree), err, "Locking error: "
 469                    "Extent tree was modified by another "
 470                    "thread while locked.");
 471}
 472
 473/*
 474 * clear some bits on a range in the tree.  This may require splitting
 475 * or inserting elements in the tree, so the gfp mask is used to
 476 * indicate which allocations or sleeping are allowed.
 477 *
 478 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 479 * the given range from the tree regardless of state (ie for truncate).
 480 *
 481 * the range [start, end] is inclusive.
 482 *
 483 * This takes the tree lock, and returns 0 on success and < 0 on error.
 484 */
 485int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 486                     int bits, int wake, int delete,
 487                     struct extent_state **cached_state,
 488                     gfp_t mask)
 489{
 490        struct extent_state *state;
 491        struct extent_state *cached;
 492        struct extent_state *prealloc = NULL;
 493        struct rb_node *node;
 494        u64 last_end;
 495        int err;
 496        int clear = 0;
 497
 498        if (delete)
 499                bits |= ~EXTENT_CTLBITS;
 500        bits |= EXTENT_FIRST_DELALLOC;
 501
 502        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
 503                clear = 1;
 504again:
 505        if (!prealloc && (mask & __GFP_WAIT)) {
 506                prealloc = alloc_extent_state(mask);
 507                if (!prealloc)
 508                        return -ENOMEM;
 509        }
 510
 511        spin_lock(&tree->lock);
 512        if (cached_state) {
 513                cached = *cached_state;
 514
 515                if (clear) {
 516                        *cached_state = NULL;
 517                        cached_state = NULL;
 518                }
 519
 520                if (cached && cached->tree && cached->start <= start &&
 521                    cached->end > start) {
 522                        if (clear)
 523                                atomic_dec(&cached->refs);
 524                        state = cached;
 525                        goto hit_next;
 526                }
 527                if (clear)
 528                        free_extent_state(cached);
 529        }
 530        /*
 531         * this search will find the extents that end after
 532         * our range starts
 533         */
 534        node = tree_search(tree, start);
 535        if (!node)
 536                goto out;
 537        state = rb_entry(node, struct extent_state, rb_node);
 538hit_next:
 539        if (state->start > end)
 540                goto out;
 541        WARN_ON(state->end < start);
 542        last_end = state->end;
 543
 544        /* the state doesn't have the wanted bits, go ahead */
 545        if (!(state->state & bits)) {
 546                state = next_state(state);
 547                goto next;
 548        }
 549
 550        /*
 551         *     | ---- desired range ---- |
 552         *  | state | or
 553         *  | ------------- state -------------- |
 554         *
 555         * We need to split the extent we found, and may flip
 556         * bits on second half.
 557         *
 558         * If the extent we found extends past our range, we
 559         * just split and search again.  It'll get split again
 560         * the next time though.
 561         *
 562         * If the extent we found is inside our range, we clear
 563         * the desired bit on it.
 564         */
 565
 566        if (state->start < start) {
 567                prealloc = alloc_extent_state_atomic(prealloc);
 568                BUG_ON(!prealloc);
 569                err = split_state(tree, state, prealloc, start);
 570                if (err)
 571                        extent_io_tree_panic(tree, err);
 572
 573                prealloc = NULL;
 574                if (err)
 575                        goto out;
 576                if (state->end <= end) {
 577                        state = clear_state_bit(tree, state, &bits, wake);
 578                        goto next;
 579                }
 580                goto search_again;
 581        }
 582        /*
 583         * | ---- desired range ---- |
 584         *                        | state |
 585         * We need to split the extent, and clear the bit
 586         * on the first half
 587         */
 588        if (state->start <= end && state->end > end) {
 589                prealloc = alloc_extent_state_atomic(prealloc);
 590                BUG_ON(!prealloc);
 591                err = split_state(tree, state, prealloc, end + 1);
 592                if (err)
 593                        extent_io_tree_panic(tree, err);
 594
 595                if (wake)
 596                        wake_up(&state->wq);
 597
 598                clear_state_bit(tree, prealloc, &bits, wake);
 599
 600                prealloc = NULL;
 601                goto out;
 602        }
 603
 604        state = clear_state_bit(tree, state, &bits, wake);
 605next:
 606        if (last_end == (u64)-1)
 607                goto out;
 608        start = last_end + 1;
 609        if (start <= end && state && !need_resched())
 610                goto hit_next;
 611        goto search_again;
 612
 613out:
 614        spin_unlock(&tree->lock);
 615        if (prealloc)
 616                free_extent_state(prealloc);
 617
 618        return 0;
 619
 620search_again:
 621        if (start > end)
 622                goto out;
 623        spin_unlock(&tree->lock);
 624        if (mask & __GFP_WAIT)
 625                cond_resched();
 626        goto again;
 627}
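
/*
 * Usage sketch (illustrative, assuming 'tree', 'start' and 'end' describe a
 * byte range that was previously populated via set_extent_bit()):
 *
 *	clear_extent_bit(tree, start, end,
 *			 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS);
 *
 * clears both bits across the range without waking waiters.  Passing
 * wake == 1 and delete == 1 instead drops the range from the tree no matter
 * which bits are set, which is what truncate wants.
 */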
 628
 629static void wait_on_state(struct extent_io_tree *tree,
 630                          struct extent_state *state)
 631                __releases(tree->lock)
 632                __acquires(tree->lock)
 633{
 634        DEFINE_WAIT(wait);
 635        prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
 636        spin_unlock(&tree->lock);
 637        schedule();
 638        spin_lock(&tree->lock);
 639        finish_wait(&state->wq, &wait);
 640}
 641
 642/*
 643 * waits for one or more bits to clear on a range in the state tree.
 644 * The range [start, end] is inclusive.
 645 * The tree lock is taken by this function
 646 */
 647void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
 648{
 649        struct extent_state *state;
 650        struct rb_node *node;
 651
 652        spin_lock(&tree->lock);
 653again:
 654        while (1) {
 655                /*
 656                 * this search will find all the extents that end after
 657                 * our range starts
 658                 */
 659                node = tree_search(tree, start);
 660                if (!node)
 661                        break;
 662
 663                state = rb_entry(node, struct extent_state, rb_node);
 664
 665                if (state->start > end)
 666                        goto out;
 667
 668                if (state->state & bits) {
 669                        start = state->start;
 670                        atomic_inc(&state->refs);
 671                        wait_on_state(tree, state);
 672                        free_extent_state(state);
 673                        goto again;
 674                }
 675                start = state->end + 1;
 676
 677                if (start > end)
 678                        break;
 679
 680                cond_resched_lock(&tree->lock);
 681        }
 682out:
 683        spin_unlock(&tree->lock);
 684}
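
/*
 * Usage sketch (illustrative): a caller that must not proceed while part of
 * a range is still locked by another thread can block on the bit directly:
 *
 *	wait_extent_bit(tree, start, end, EXTENT_LOCKED);
 *
 * The helper drops and retakes tree->lock around each schedule(), so it may
 * only be used from a context that is allowed to sleep.
 */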
 685
 686static void set_state_bits(struct extent_io_tree *tree,
 687                           struct extent_state *state,
 688                           int *bits)
 689{
 690        int bits_to_set = *bits & ~EXTENT_CTLBITS;
 691
 692        set_state_cb(tree, state, bits);
 693        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
 694                u64 range = state->end - state->start + 1;
 695                tree->dirty_bytes += range;
 696        }
 697        state->state |= bits_to_set;
 698}
 699
 700static void cache_state(struct extent_state *state,
 701                        struct extent_state **cached_ptr)
 702{
 703        if (cached_ptr && !(*cached_ptr)) {
 704                if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
 705                        *cached_ptr = state;
 706                        atomic_inc(&state->refs);
 707                }
 708        }
 709}
 710
 711static void uncache_state(struct extent_state **cached_ptr)
 712{
 713        if (cached_ptr && (*cached_ptr)) {
 714                struct extent_state *state = *cached_ptr;
 715                *cached_ptr = NULL;
 716                free_extent_state(state);
 717        }
 718}
 719
 720/*
 721 * set some bits on a range in the tree.  This may require allocations or
 722 * sleeping, so the gfp mask is used to indicate what is allowed.
 723 *
 724 * If any of the exclusive bits are set, this will fail with -EEXIST if some
 725 * part of the range already has the desired bits set.  The start of the
 726 * existing range is returned in failed_start in this case.
 727 *
  728 * [start, end] is inclusive.  This takes the tree lock.
 729 */
 730
 731static int __must_check
 732__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 733                 int bits, int exclusive_bits, u64 *failed_start,
 734                 struct extent_state **cached_state, gfp_t mask)
 735{
 736        struct extent_state *state;
 737        struct extent_state *prealloc = NULL;
 738        struct rb_node *node;
 739        int err = 0;
 740        u64 last_start;
 741        u64 last_end;
 742
 743        bits |= EXTENT_FIRST_DELALLOC;
 744again:
 745        if (!prealloc && (mask & __GFP_WAIT)) {
 746                prealloc = alloc_extent_state(mask);
 747                BUG_ON(!prealloc);
 748        }
 749
 750        spin_lock(&tree->lock);
 751        if (cached_state && *cached_state) {
 752                state = *cached_state;
 753                if (state->start <= start && state->end > start &&
 754                    state->tree) {
 755                        node = &state->rb_node;
 756                        goto hit_next;
 757                }
 758        }
 759        /*
 760         * this search will find all the extents that end after
 761         * our range starts.
 762         */
 763        node = tree_search(tree, start);
 764        if (!node) {
 765                prealloc = alloc_extent_state_atomic(prealloc);
 766                BUG_ON(!prealloc);
 767                err = insert_state(tree, prealloc, start, end, &bits);
 768                if (err)
 769                        extent_io_tree_panic(tree, err);
 770
 771                prealloc = NULL;
 772                goto out;
 773        }
 774        state = rb_entry(node, struct extent_state, rb_node);
 775hit_next:
 776        last_start = state->start;
 777        last_end = state->end;
 778
 779        /*
 780         * | ---- desired range ---- |
 781         * | state |
 782         *
 783         * Just lock what we found and keep going
 784         */
 785        if (state->start == start && state->end <= end) {
 786                if (state->state & exclusive_bits) {
 787                        *failed_start = state->start;
 788                        err = -EEXIST;
 789                        goto out;
 790                }
 791
 792                set_state_bits(tree, state, &bits);
 793                cache_state(state, cached_state);
 794                merge_state(tree, state);
 795                if (last_end == (u64)-1)
 796                        goto out;
 797                start = last_end + 1;
 798                state = next_state(state);
 799                if (start < end && state && state->start == start &&
 800                    !need_resched())
 801                        goto hit_next;
 802                goto search_again;
 803        }
 804
 805        /*
 806         *     | ---- desired range ---- |
 807         * | state |
 808         *   or
 809         * | ------------- state -------------- |
 810         *
 811         * We need to split the extent we found, and may flip bits on
 812         * second half.
 813         *
 814         * If the extent we found extends past our
 815         * range, we just split and search again.  It'll get split
 816         * again the next time though.
 817         *
 818         * If the extent we found is inside our range, we set the
 819         * desired bit on it.
 820         */
 821        if (state->start < start) {
 822                if (state->state & exclusive_bits) {
 823                        *failed_start = start;
 824                        err = -EEXIST;
 825                        goto out;
 826                }
 827
 828                prealloc = alloc_extent_state_atomic(prealloc);
 829                BUG_ON(!prealloc);
 830                err = split_state(tree, state, prealloc, start);
 831                if (err)
 832                        extent_io_tree_panic(tree, err);
 833
 834                prealloc = NULL;
 835                if (err)
 836                        goto out;
 837                if (state->end <= end) {
 838                        set_state_bits(tree, state, &bits);
 839                        cache_state(state, cached_state);
 840                        merge_state(tree, state);
 841                        if (last_end == (u64)-1)
 842                                goto out;
 843                        start = last_end + 1;
 844                        state = next_state(state);
 845                        if (start < end && state && state->start == start &&
 846                            !need_resched())
 847                                goto hit_next;
 848                }
 849                goto search_again;
 850        }
 851        /*
 852         * | ---- desired range ---- |
 853         *     | state | or               | state |
 854         *
 855         * There's a hole, we need to insert something in it and
 856         * ignore the extent we found.
 857         */
 858        if (state->start > start) {
 859                u64 this_end;
 860                if (end < last_start)
 861                        this_end = end;
 862                else
 863                        this_end = last_start - 1;
 864
 865                prealloc = alloc_extent_state_atomic(prealloc);
 866                BUG_ON(!prealloc);
 867
 868                /*
 869                 * Avoid to free 'prealloc' if it can be merged with
 870                 * the later extent.
 871                 */
 872                err = insert_state(tree, prealloc, start, this_end,
 873                                   &bits);
 874                if (err)
 875                        extent_io_tree_panic(tree, err);
 876
 877                cache_state(prealloc, cached_state);
 878                prealloc = NULL;
 879                start = this_end + 1;
 880                goto search_again;
 881        }
 882        /*
 883         * | ---- desired range ---- |
 884         *                        | state |
 885         * We need to split the extent, and set the bit
 886         * on the first half
 887         */
 888        if (state->start <= end && state->end > end) {
 889                if (state->state & exclusive_bits) {
 890                        *failed_start = start;
 891                        err = -EEXIST;
 892                        goto out;
 893                }
 894
 895                prealloc = alloc_extent_state_atomic(prealloc);
 896                BUG_ON(!prealloc);
 897                err = split_state(tree, state, prealloc, end + 1);
 898                if (err)
 899                        extent_io_tree_panic(tree, err);
 900
 901                set_state_bits(tree, prealloc, &bits);
 902                cache_state(prealloc, cached_state);
 903                merge_state(tree, prealloc);
 904                prealloc = NULL;
 905                goto out;
 906        }
 907
 908        goto search_again;
 909
 910out:
 911        spin_unlock(&tree->lock);
 912        if (prealloc)
 913                free_extent_state(prealloc);
 914
 915        return err;
 916
 917search_again:
 918        if (start > end)
 919                goto out;
 920        spin_unlock(&tree->lock);
 921        if (mask & __GFP_WAIT)
 922                cond_resched();
 923        goto again;
 924}
 925
 926int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
 927                   u64 *failed_start, struct extent_state **cached_state,
 928                   gfp_t mask)
 929{
 930        return __set_extent_bit(tree, start, end, bits, 0, failed_start,
 931                                cached_state, mask);
 932}
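
/*
 * Usage sketch (illustrative): most callers go through the wrappers below,
 * but a direct call to mark a byte range dirty and later clear it again
 * could look like:
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, NULL, GFP_NOFS);
 *	...
 *	clear_extent_bit(tree, start, end, EXTENT_DIRTY, 0, 0, NULL, GFP_NOFS);
 *
 * failed_start is only filled in when exclusive bits are requested through
 * __set_extent_bit(), and cached_state is only populated for states that
 * carry EXTENT_IOBITS or EXTENT_BOUNDARY (see cache_state()), so plain
 * callers can pass NULL for both.
 */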
 933
 934
 935/**
 936 * convert_extent_bit - convert all bits in a given range from one bit to
 937 *                      another
 938 * @tree:       the io tree to search
 939 * @start:      the start offset in bytes
 940 * @end:        the end offset in bytes (inclusive)
 941 * @bits:       the bits to set in this range
 942 * @clear_bits: the bits to clear in this range
 943 * @cached_state:       state that we're going to cache
 944 * @mask:       the allocation mask
 945 *
 946 * This will go through and set bits for the given range.  If any states exist
 947 * already in this range they are set with the given bit and cleared of the
 948 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 949 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 950 * boundary bits like LOCK.
 951 */
 952int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 953                       int bits, int clear_bits,
 954                       struct extent_state **cached_state, gfp_t mask)
 955{
 956        struct extent_state *state;
 957        struct extent_state *prealloc = NULL;
 958        struct rb_node *node;
 959        int err = 0;
 960        u64 last_start;
 961        u64 last_end;
 962
 963again:
 964        if (!prealloc && (mask & __GFP_WAIT)) {
 965                prealloc = alloc_extent_state(mask);
 966                if (!prealloc)
 967                        return -ENOMEM;
 968        }
 969
 970        spin_lock(&tree->lock);
 971        if (cached_state && *cached_state) {
 972                state = *cached_state;
 973                if (state->start <= start && state->end > start &&
 974                    state->tree) {
 975                        node = &state->rb_node;
 976                        goto hit_next;
 977                }
 978        }
 979
 980        /*
 981         * this search will find all the extents that end after
 982         * our range starts.
 983         */
 984        node = tree_search(tree, start);
 985        if (!node) {
 986                prealloc = alloc_extent_state_atomic(prealloc);
 987                if (!prealloc) {
 988                        err = -ENOMEM;
 989                        goto out;
 990                }
 991                err = insert_state(tree, prealloc, start, end, &bits);
 992                prealloc = NULL;
 993                if (err)
 994                        extent_io_tree_panic(tree, err);
 995                goto out;
 996        }
 997        state = rb_entry(node, struct extent_state, rb_node);
 998hit_next:
 999        last_start = state->start;
1000        last_end = state->end;
1001
1002        /*
1003         * | ---- desired range ---- |
1004         * | state |
1005         *
1006         * Just lock what we found and keep going
1007         */
1008        if (state->start == start && state->end <= end) {
1009                set_state_bits(tree, state, &bits);
1010                cache_state(state, cached_state);
1011                state = clear_state_bit(tree, state, &clear_bits, 0);
1012                if (last_end == (u64)-1)
1013                        goto out;
1014                start = last_end + 1;
1015                if (start < end && state && state->start == start &&
1016                    !need_resched())
1017                        goto hit_next;
1018                goto search_again;
1019        }
1020
1021        /*
1022         *     | ---- desired range ---- |
1023         * | state |
1024         *   or
1025         * | ------------- state -------------- |
1026         *
1027         * We need to split the extent we found, and may flip bits on
1028         * second half.
1029         *
1030         * If the extent we found extends past our
1031         * range, we just split and search again.  It'll get split
1032         * again the next time though.
1033         *
1034         * If the extent we found is inside our range, we set the
1035         * desired bit on it.
1036         */
1037        if (state->start < start) {
1038                prealloc = alloc_extent_state_atomic(prealloc);
1039                if (!prealloc) {
1040                        err = -ENOMEM;
1041                        goto out;
1042                }
1043                err = split_state(tree, state, prealloc, start);
1044                if (err)
1045                        extent_io_tree_panic(tree, err);
1046                prealloc = NULL;
1047                if (err)
1048                        goto out;
1049                if (state->end <= end) {
1050                        set_state_bits(tree, state, &bits);
1051                        cache_state(state, cached_state);
1052                        state = clear_state_bit(tree, state, &clear_bits, 0);
1053                        if (last_end == (u64)-1)
1054                                goto out;
1055                        start = last_end + 1;
1056                        if (start < end && state && state->start == start &&
1057                            !need_resched())
1058                                goto hit_next;
1059                }
1060                goto search_again;
1061        }
1062        /*
1063         * | ---- desired range ---- |
1064         *     | state | or               | state |
1065         *
1066         * There's a hole, we need to insert something in it and
1067         * ignore the extent we found.
1068         */
1069        if (state->start > start) {
1070                u64 this_end;
1071                if (end < last_start)
1072                        this_end = end;
1073                else
1074                        this_end = last_start - 1;
1075
1076                prealloc = alloc_extent_state_atomic(prealloc);
1077                if (!prealloc) {
1078                        err = -ENOMEM;
1079                        goto out;
1080                }
1081
1082                /*
1083                 * Avoid to free 'prealloc' if it can be merged with
1084                 * the later extent.
1085                 */
1086                err = insert_state(tree, prealloc, start, this_end,
1087                                   &bits);
1088                if (err)
1089                        extent_io_tree_panic(tree, err);
1090                cache_state(prealloc, cached_state);
1091                prealloc = NULL;
1092                start = this_end + 1;
1093                goto search_again;
1094        }
1095        /*
1096         * | ---- desired range ---- |
1097         *                        | state |
1098         * We need to split the extent, and set the bit
1099         * on the first half
1100         */
1101        if (state->start <= end && state->end > end) {
1102                prealloc = alloc_extent_state_atomic(prealloc);
1103                if (!prealloc) {
1104                        err = -ENOMEM;
1105                        goto out;
1106                }
1107
1108                err = split_state(tree, state, prealloc, end + 1);
1109                if (err)
1110                        extent_io_tree_panic(tree, err);
1111
1112                set_state_bits(tree, prealloc, &bits);
1113                cache_state(prealloc, cached_state);
1114                clear_state_bit(tree, prealloc, &clear_bits, 0);
1115                prealloc = NULL;
1116                goto out;
1117        }
1118
1119        goto search_again;
1120
1121out:
1122        spin_unlock(&tree->lock);
1123        if (prealloc)
1124                free_extent_state(prealloc);
1125
1126        return err;
1127
1128search_again:
1129        if (start > end)
1130                goto out;
1131        spin_unlock(&tree->lock);
1132        if (mask & __GFP_WAIT)
1133                cond_resched();
1134        goto again;
1135}
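
/*
 * Usage sketch (illustrative): atomically turning a delalloc range into a
 * dirty one can be written as
 *
 *	convert_extent_bit(tree, start, end, EXTENT_DIRTY, EXTENT_DELALLOC,
 *			   NULL, GFP_NOFS);
 *
 * which sets EXTENT_DIRTY and clears EXTENT_DELALLOC on every state in the
 * range, splitting states at the range boundaries as needed.  Only mergeable
 * bits belong here; boundary bits such as EXTENT_LOCKED defeat the merging
 * this function relies on.
 */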
1136
1137/* wrappers around set/clear extent bit */
1138int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1139                     gfp_t mask)
1140{
1141        return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1142                              NULL, mask);
1143}
1144
1145int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1146                    int bits, gfp_t mask)
1147{
1148        return set_extent_bit(tree, start, end, bits, NULL,
1149                              NULL, mask);
1150}
1151
1152int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1153                      int bits, gfp_t mask)
1154{
1155        return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1156}
1157
1158int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1159                        struct extent_state **cached_state, gfp_t mask)
1160{
1161        return set_extent_bit(tree, start, end,
1162                              EXTENT_DELALLOC | EXTENT_UPTODATE,
1163                              NULL, cached_state, mask);
1164}
1165
1166int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1167                      struct extent_state **cached_state, gfp_t mask)
1168{
1169        return set_extent_bit(tree, start, end,
1170                              EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1171                              NULL, cached_state, mask);
1172}
1173
1174int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1175                       gfp_t mask)
1176{
1177        return clear_extent_bit(tree, start, end,
1178                                EXTENT_DIRTY | EXTENT_DELALLOC |
1179                                EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1180}
1181
1182int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1183                     gfp_t mask)
1184{
1185        return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1186                              NULL, mask);
1187}
1188
1189int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1190                        struct extent_state **cached_state, gfp_t mask)
1191{
 1192        return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
1193                              cached_state, mask);
1194}
1195
1196int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1197                          struct extent_state **cached_state, gfp_t mask)
1198{
1199        return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1200                                cached_state, mask);
1201}
1202
1203/*
 1204 * either insert or lock a state struct between start and end.  If part of
 1205 * the range is already locked, wait for it to be unlocked and retry.
1206 */
1207int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1208                     int bits, struct extent_state **cached_state)
1209{
1210        int err;
1211        u64 failed_start;
1212        while (1) {
1213                err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1214                                       EXTENT_LOCKED, &failed_start,
1215                                       cached_state, GFP_NOFS);
1216                if (err == -EEXIST) {
1217                        wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1218                        start = failed_start;
1219                } else
1220                        break;
1221                WARN_ON(start > end);
1222        }
1223        return err;
1224}
1225
1226int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1227{
1228        return lock_extent_bits(tree, start, end, 0, NULL);
1229}
1230
1231int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1232{
1233        int err;
1234        u64 failed_start;
1235
1236        err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1237                               &failed_start, NULL, GFP_NOFS);
1238        if (err == -EEXIST) {
1239                if (failed_start > start)
1240                        clear_extent_bit(tree, start, failed_start - 1,
1241                                         EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1242                return 0;
1243        }
1244        return 1;
1245}
1246
1247int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1248                         struct extent_state **cached, gfp_t mask)
1249{
1250        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1251                                mask);
1252}
1253
1254int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1255{
1256        return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1257                                GFP_NOFS);
1258}
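
/*
 * Usage sketch (illustrative): the usual pattern is to lock a range, do the
 * work, and unlock through the cached state so the final clear does not have
 * to search the tree again:
 *
 *	struct extent_state *cached = NULL;
 *
 *	lock_extent_bits(tree, start, end, 0, &cached);
 *	... operate on [start, end] ...
 *	unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
 *
 * lock_extent()/unlock_extent() provide the same thing without the cached
 * state.
 */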
1259
1260int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1261{
1262        unsigned long index = start >> PAGE_CACHE_SHIFT;
1263        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1264        struct page *page;
1265
1266        while (index <= end_index) {
1267                page = find_get_page(inode->i_mapping, index);
1268                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1269                clear_page_dirty_for_io(page);
1270                page_cache_release(page);
1271                index++;
1272        }
1273        return 0;
1274}
1275
1276int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1277{
1278        unsigned long index = start >> PAGE_CACHE_SHIFT;
1279        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1280        struct page *page;
1281
1282        while (index <= end_index) {
1283                page = find_get_page(inode->i_mapping, index);
1284                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1285                account_page_redirty(page);
1286                __set_page_dirty_nobuffers(page);
1287                page_cache_release(page);
1288                index++;
1289        }
1290        return 0;
1291}
1292
1293/*
1294 * helper function to set both pages and extents in the tree writeback
1295 */
1296static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1297{
1298        unsigned long index = start >> PAGE_CACHE_SHIFT;
1299        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1300        struct page *page;
1301
1302        while (index <= end_index) {
1303                page = find_get_page(tree->mapping, index);
1304                BUG_ON(!page); /* Pages should be in the extent_io_tree */
1305                set_page_writeback(page);
1306                page_cache_release(page);
1307                index++;
1308        }
1309        return 0;
1310}
1311
1312/* find the first state struct with 'bits' set after 'start', and
 1313 * return it.  tree->lock must be held.  NULL will be returned if
1314 * nothing was found after 'start'
1315 */
1316struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1317                                                 u64 start, int bits)
1318{
1319        struct rb_node *node;
1320        struct extent_state *state;
1321
1322        /*
1323         * this search will find all the extents that end after
1324         * our range starts.
1325         */
1326        node = tree_search(tree, start);
1327        if (!node)
1328                goto out;
1329
1330        while (1) {
1331                state = rb_entry(node, struct extent_state, rb_node);
1332                if (state->end >= start && (state->state & bits))
1333                        return state;
1334
1335                node = rb_next(node);
1336                if (!node)
1337                        break;
1338        }
1339out:
1340        return NULL;
1341}
1342
1343/*
1344 * find the first offset in the io tree with 'bits' set. zero is
1345 * returned if we find something, and *start_ret and *end_ret are
1346 * set to reflect the state struct that was found.
1347 *
 1348 * Returns 1 if nothing was found and 0 otherwise.
1349 */
1350int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1351                          u64 *start_ret, u64 *end_ret, int bits,
1352                          struct extent_state **cached_state)
1353{
1354        struct extent_state *state;
1355        struct rb_node *n;
1356        int ret = 1;
1357
1358        spin_lock(&tree->lock);
1359        if (cached_state && *cached_state) {
1360                state = *cached_state;
1361                if (state->end == start - 1 && state->tree) {
1362                        n = rb_next(&state->rb_node);
1363                        while (n) {
1364                                state = rb_entry(n, struct extent_state,
1365                                                 rb_node);
1366                                if (state->state & bits)
1367                                        goto got_it;
1368                                n = rb_next(n);
1369                        }
1370                        free_extent_state(*cached_state);
1371                        *cached_state = NULL;
1372                        goto out;
1373                }
1374                free_extent_state(*cached_state);
1375                *cached_state = NULL;
1376        }
1377
1378        state = find_first_extent_bit_state(tree, start, bits);
1379got_it:
1380        if (state) {
1381                cache_state(state, cached_state);
1382                *start_ret = state->start;
1383                *end_ret = state->end;
1384                ret = 0;
1385        }
1386out:
1387        spin_unlock(&tree->lock);
1388        return ret;
1389}
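
/*
 * Usage sketch (illustrative): every range with a given bit set can be
 * visited by advancing the search offset past each hit until the function
 * returns 1:
 *
 *	u64 found_start, found_end;
 *	u64 cur = 0;
 *
 *	while (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				      EXTENT_DIRTY, NULL)) {
 *		... handle [found_start, found_end] ...
 *		cur = found_end + 1;
 *	}
 */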
1390
1391/*
1392 * find a contiguous range of bytes in the file marked as delalloc, not
1393 * more than 'max_bytes'.  start and end are used to return the range,
1394 *
1395 * 1 is returned if we find something, 0 if nothing was in the tree
1396 */
1397static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1398                                        u64 *start, u64 *end, u64 max_bytes,
1399                                        struct extent_state **cached_state)
1400{
1401        struct rb_node *node;
1402        struct extent_state *state;
1403        u64 cur_start = *start;
1404        u64 found = 0;
1405        u64 total_bytes = 0;
1406
1407        spin_lock(&tree->lock);
1408
1409        /*
1410         * this search will find all the extents that end after
1411         * our range starts.
1412         */
1413        node = tree_search(tree, cur_start);
1414        if (!node) {
1415                if (!found)
1416                        *end = (u64)-1;
1417                goto out;
1418        }
1419
1420        while (1) {
1421                state = rb_entry(node, struct extent_state, rb_node);
1422                if (found && (state->start != cur_start ||
1423                              (state->state & EXTENT_BOUNDARY))) {
1424                        goto out;
1425                }
1426                if (!(state->state & EXTENT_DELALLOC)) {
1427                        if (!found)
1428                                *end = state->end;
1429                        goto out;
1430                }
1431                if (!found) {
1432                        *start = state->start;
1433                        *cached_state = state;
1434                        atomic_inc(&state->refs);
1435                }
1436                found++;
1437                *end = state->end;
1438                cur_start = state->end + 1;
1439                node = rb_next(node);
1440                if (!node)
1441                        break;
1442                total_bytes += state->end - state->start + 1;
1443                if (total_bytes >= max_bytes)
1444                        break;
1445        }
1446out:
1447        spin_unlock(&tree->lock);
1448        return found;
1449}
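
/*
 * Worked example (illustrative): with delalloc states at [4096, 8191] and
 * [8192, 12287] and *start == 4096, the function returns nonzero, sets
 * *start to 4096 and *end to 12287, and stores an extra reference to the
 * first state in *cached_state for the caller to drop later.  The walk
 * stops early once max_bytes worth of extents have been accumulated, the
 * next state is not contiguous, or a state carrying EXTENT_BOUNDARY is hit.
 */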
1450
1451static noinline void __unlock_for_delalloc(struct inode *inode,
1452                                           struct page *locked_page,
1453                                           u64 start, u64 end)
1454{
1455        int ret;
1456        struct page *pages[16];
1457        unsigned long index = start >> PAGE_CACHE_SHIFT;
1458        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1459        unsigned long nr_pages = end_index - index + 1;
1460        int i;
1461
1462        if (index == locked_page->index && end_index == index)
1463                return;
1464
1465        while (nr_pages > 0) {
1466                ret = find_get_pages_contig(inode->i_mapping, index,
1467                                     min_t(unsigned long, nr_pages,
1468                                     ARRAY_SIZE(pages)), pages);
1469                for (i = 0; i < ret; i++) {
1470                        if (pages[i] != locked_page)
1471                                unlock_page(pages[i]);
1472                        page_cache_release(pages[i]);
1473                }
1474                nr_pages -= ret;
1475                index += ret;
1476                cond_resched();
1477        }
1478}
1479
1480static noinline int lock_delalloc_pages(struct inode *inode,
1481                                        struct page *locked_page,
1482                                        u64 delalloc_start,
1483                                        u64 delalloc_end)
1484{
1485        unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1486        unsigned long start_index = index;
1487        unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1488        unsigned long pages_locked = 0;
1489        struct page *pages[16];
1490        unsigned long nrpages;
1491        int ret;
1492        int i;
1493
1494        /* the caller is responsible for locking the start index */
1495        if (index == locked_page->index && index == end_index)
1496                return 0;
1497
1498        /* skip the page at the start index */
1499        nrpages = end_index - index + 1;
1500        while (nrpages > 0) {
1501                ret = find_get_pages_contig(inode->i_mapping, index,
1502                                     min_t(unsigned long,
1503                                     nrpages, ARRAY_SIZE(pages)), pages);
1504                if (ret == 0) {
1505                        ret = -EAGAIN;
1506                        goto done;
1507                }
1508                /* now we have an array of pages, lock them all */
1509                for (i = 0; i < ret; i++) {
1510                        /*
1511                         * the caller is taking responsibility for
1512                         * locked_page
1513                         */
1514                        if (pages[i] != locked_page) {
1515                                lock_page(pages[i]);
1516                                if (!PageDirty(pages[i]) ||
1517                                    pages[i]->mapping != inode->i_mapping) {
1518                                        ret = -EAGAIN;
1519                                        unlock_page(pages[i]);
1520                                        page_cache_release(pages[i]);
1521                                        goto done;
1522                                }
1523                        }
1524                        page_cache_release(pages[i]);
1525                        pages_locked++;
1526                }
1527                nrpages -= ret;
1528                index += ret;
1529                cond_resched();
1530        }
1531        ret = 0;
1532done:
1533        if (ret && pages_locked) {
1534                __unlock_for_delalloc(inode, locked_page,
1535                              delalloc_start,
1536                              ((u64)(start_index + pages_locked - 1)) <<
1537                              PAGE_CACHE_SHIFT);
1538        }
1539        return ret;
1540}
1541
1542/*
1543 * find a contiguous range of bytes in the file marked as delalloc, not
1544 * more than 'max_bytes'.  start and end are used to return the range;
1545 * the pages and the extent bits covering it are locked before returning.
1546 * 1 is returned if we find something, 0 if nothing was in the tree
1547 */
1548static noinline u64 find_lock_delalloc_range(struct inode *inode,
1549                                             struct extent_io_tree *tree,
1550                                             struct page *locked_page,
1551                                             u64 *start, u64 *end,
1552                                             u64 max_bytes)
1553{
1554        u64 delalloc_start;
1555        u64 delalloc_end;
1556        u64 found;
1557        struct extent_state *cached_state = NULL;
1558        int ret;
1559        int loops = 0;
1560
1561again:
1562        /* step one, find a bunch of delalloc bytes starting at start */
1563        delalloc_start = *start;
1564        delalloc_end = 0;
1565        found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1566                                    max_bytes, &cached_state);
1567        if (!found || delalloc_end <= *start) {
1568                *start = delalloc_start;
1569                *end = delalloc_end;
1570                free_extent_state(cached_state);
1571                return found;
1572        }
1573
1574        /*
1575         * start comes from the offset of locked_page.  We have to lock
1576         * pages in order, so we can't process delalloc bytes before
1577         * locked_page
1578         */
1579        if (delalloc_start < *start)
1580                delalloc_start = *start;
1581
1582        /*
1583         * make sure to limit the number of pages we try to lock down
1584         * if we're looping.
1585         */
1586        if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1587                delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1588
1589        /* step two, lock all the pages after the page that has start */
1590        ret = lock_delalloc_pages(inode, locked_page,
1591                                  delalloc_start, delalloc_end);
1592        if (ret == -EAGAIN) {
1593                /* some of the pages are gone, let's avoid looping by
1594                 * shortening the size of the delalloc range we're searching
1595                 */
1596                free_extent_state(cached_state);
1597                if (!loops) {
1598                        unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1599                        max_bytes = PAGE_CACHE_SIZE - offset;
1600                        loops = 1;
1601                        goto again;
1602                } else {
1603                        found = 0;
1604                        goto out_failed;
1605                }
1606        }
1607        BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1608
1609        /* step three, lock the state bits for the whole range */
1610        lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1611
1612        /* then test to make sure it is all still delalloc */
1613        ret = test_range_bit(tree, delalloc_start, delalloc_end,
1614                             EXTENT_DELALLOC, 1, cached_state);
1615        if (!ret) {
1616                unlock_extent_cached(tree, delalloc_start, delalloc_end,
1617                                     &cached_state, GFP_NOFS);
1618                __unlock_for_delalloc(inode, locked_page,
1619                              delalloc_start, delalloc_end);
1620                cond_resched();
1621                goto again;
1622        }
1623        free_extent_state(cached_state);
1624        *start = delalloc_start;
1625        *end = delalloc_end;
1626out_failed:
1627        return found;
1628}
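/*
 * Illustrative usage of find_lock_delalloc_range() (a sketch, not part of
 * the original source): a writepage style caller typically walks the whole
 * page, asking for one locked delalloc range at a time, much like the loop
 * in __extent_writepage() below:
 *
 *      u64 dstart = page_offset(page);
 *      u64 dend = 0;
 *
 *      while (dend < page_offset(page) + PAGE_CACHE_SIZE - 1) {
 *              if (!find_lock_delalloc_range(inode, tree, page,
 *                                            &dstart, &dend, max_bytes)) {
 *                      dstart = dend + 1;
 *                      continue;
 *              }
 *              handle the locked range [dstart, dend] here
 *              dstart = dend + 1;
 *      }
 *
 * 'dstart', 'dend' and 'max_bytes' are local names used only for this sketch.
 */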
1629
1630int extent_clear_unlock_delalloc(struct inode *inode,
1631                                struct extent_io_tree *tree,
1632                                u64 start, u64 end, struct page *locked_page,
1633                                unsigned long op)
1634{
1635        int ret;
1636        struct page *pages[16];
1637        unsigned long index = start >> PAGE_CACHE_SHIFT;
1638        unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1639        unsigned long nr_pages = end_index - index + 1;
1640        int i;
1641        int clear_bits = 0;
1642
1643        if (op & EXTENT_CLEAR_UNLOCK)
1644                clear_bits |= EXTENT_LOCKED;
1645        if (op & EXTENT_CLEAR_DIRTY)
1646                clear_bits |= EXTENT_DIRTY;
1647
1648        if (op & EXTENT_CLEAR_DELALLOC)
1649                clear_bits |= EXTENT_DELALLOC;
1650
1651        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1652        if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1653                    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1654                    EXTENT_SET_PRIVATE2)))
1655                return 0;
1656
1657        while (nr_pages > 0) {
1658                ret = find_get_pages_contig(inode->i_mapping, index,
1659                                     min_t(unsigned long,
1660                                     nr_pages, ARRAY_SIZE(pages)), pages);
1661                for (i = 0; i < ret; i++) {
1662
1663                        if (op & EXTENT_SET_PRIVATE2)
1664                                SetPagePrivate2(pages[i]);
1665
1666                        if (pages[i] == locked_page) {
1667                                page_cache_release(pages[i]);
1668                                continue;
1669                        }
1670                        if (op & EXTENT_CLEAR_DIRTY)
1671                                clear_page_dirty_for_io(pages[i]);
1672                        if (op & EXTENT_SET_WRITEBACK)
1673                                set_page_writeback(pages[i]);
1674                        if (op & EXTENT_END_WRITEBACK)
1675                                end_page_writeback(pages[i]);
1676                        if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1677                                unlock_page(pages[i]);
1678                        page_cache_release(pages[i]);
1679                }
1680                nr_pages -= ret;
1681                index += ret;
1682                cond_resched();
1683        }
1684        return 0;
1685}
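/*
 * Illustrative only (not from the original source): delalloc write-out
 * callers combine the EXTENT_CLEAR_* / EXTENT_SET_* op flags so the extent
 * bits and the page state are transitioned in a single pass, roughly:
 *
 *      extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 *                                   start, end, locked_page,
 *                                   EXTENT_CLEAR_UNLOCK_PAGE |
 *                                   EXTENT_CLEAR_UNLOCK |
 *                                   EXTENT_CLEAR_DELALLOC |
 *                                   EXTENT_CLEAR_DIRTY |
 *                                   EXTENT_SET_WRITEBACK |
 *                                   EXTENT_END_WRITEBACK);
 *
 * The exact flag combination depends on the caller and on its error path.
 */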
1686
1687/*
1688 * count the number of bytes in the tree that have the given bit(s)
1689 * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1690 * cached.  The total number of bytes found is returned.
1691 */
1692u64 count_range_bits(struct extent_io_tree *tree,
1693                     u64 *start, u64 search_end, u64 max_bytes,
1694                     unsigned long bits, int contig)
1695{
1696        struct rb_node *node;
1697        struct extent_state *state;
1698        u64 cur_start = *start;
1699        u64 total_bytes = 0;
1700        u64 last = 0;
1701        int found = 0;
1702
1703        if (search_end <= cur_start) {
1704                WARN_ON(1);
1705                return 0;
1706        }
1707
1708        spin_lock(&tree->lock);
1709        if (cur_start == 0 && bits == EXTENT_DIRTY) {
1710                total_bytes = tree->dirty_bytes;
1711                goto out;
1712        }
1713        /*
1714         * this search will find all the extents that end after
1715         * our range starts.
1716         */
1717        node = tree_search(tree, cur_start);
1718        if (!node)
1719                goto out;
1720
1721        while (1) {
1722                state = rb_entry(node, struct extent_state, rb_node);
1723                if (state->start > search_end)
1724                        break;
1725                if (contig && found && state->start > last + 1)
1726                        break;
1727                if (state->end >= cur_start && (state->state & bits) == bits) {
1728                        total_bytes += min(search_end, state->end) + 1 -
1729                                       max(cur_start, state->start);
1730                        if (total_bytes >= max_bytes)
1731                                break;
1732                        if (!found) {
1733                                *start = max(cur_start, state->start);
1734                                found = 1;
1735                        }
1736                        last = state->end;
1737                } else if (contig && found) {
1738                        break;
1739                }
1740                node = rb_next(node);
1741                if (!node)
1742                        break;
1743        }
1744out:
1745        spin_unlock(&tree->lock);
1746        return total_bytes;
1747}
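/*
 * Usage sketch for count_range_bits() (illustrative, not part of the
 * original source): the read repair code below uses it as a cheap
 * "anything pending?" check before doing an exact lookup:
 *
 *      u64 first = 0;
 *      u64 found = count_range_bits(&BTRFS_I(inode)->io_failure_tree,
 *                                   &first, (u64)-1, 1, EXTENT_DIRTY, 0);
 *
 * found == 0 means no io_failure_record is tracked for this inode, so the
 * more expensive get_state_private() lookup can be skipped entirely.
 */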
1748
1749/*
1750 * set the private field for a given byte offset in the tree.  If there isn't
1751 * an extent_state starting exactly at that offset, -ENOENT is returned instead.
1752 */
1753int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1754{
1755        struct rb_node *node;
1756        struct extent_state *state;
1757        int ret = 0;
1758
1759        spin_lock(&tree->lock);
1760        /*
1761         * this search will find all the extents that end after
1762         * our range starts.
1763         */
1764        node = tree_search(tree, start);
1765        if (!node) {
1766                ret = -ENOENT;
1767                goto out;
1768        }
1769        state = rb_entry(node, struct extent_state, rb_node);
1770        if (state->start != start) {
1771                ret = -ENOENT;
1772                goto out;
1773        }
1774        state->private = private;
1775out:
1776        spin_unlock(&tree->lock);
1777        return ret;
1778}
1779
1780int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1781{
1782        struct rb_node *node;
1783        struct extent_state *state;
1784        int ret = 0;
1785
1786        spin_lock(&tree->lock);
1787        /*
1788         * this search will find all the extents that end after
1789         * our range starts.
1790         */
1791        node = tree_search(tree, start);
1792        if (!node) {
1793                ret = -ENOENT;
1794                goto out;
1795        }
1796        state = rb_entry(node, struct extent_state, rb_node);
1797        if (state->start != start) {
1798                ret = -ENOENT;
1799                goto out;
1800        }
1801        *private = state->private;
1802out:
1803        spin_unlock(&tree->lock);
1804        return ret;
1805}
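/*
 * Illustrative round trip (not part of the original source): the failure
 * tracking code below stashes a pointer to an io_failure_record as the
 * private value of the extent_state that starts at the failed offset:
 *
 *      set_state_private(failure_tree, start, (u64)(unsigned long)failrec);
 *      ...
 *      if (!get_state_private(failure_tree, start, &private))
 *              failrec = (struct io_failure_record *)(unsigned long)private;
 *
 * Both calls require an extent_state that begins exactly at 'start';
 * otherwise they return -ENOENT and leave everything untouched.
 */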
1806
1807/*
1808 * searches a range in the state tree for a given mask.
1809 * If 'filled' == 1, this returns 1 only if the whole range is covered by
1810 * extents that all have the bits set.  Otherwise, 1 is returned if any bit in the
1811 * range is found set.
1812 */
1813int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1814                   int bits, int filled, struct extent_state *cached)
1815{
1816        struct extent_state *state = NULL;
1817        struct rb_node *node;
1818        int bitset = 0;
1819
1820        spin_lock(&tree->lock);
1821        if (cached && cached->tree && cached->start <= start &&
1822            cached->end > start)
1823                node = &cached->rb_node;
1824        else
1825                node = tree_search(tree, start);
1826        while (node && start <= end) {
1827                state = rb_entry(node, struct extent_state, rb_node);
1828
1829                if (filled && state->start > start) {
1830                        bitset = 0;
1831                        break;
1832                }
1833
1834                if (state->start > end)
1835                        break;
1836
1837                if (state->state & bits) {
1838                        bitset = 1;
1839                        if (!filled)
1840                                break;
1841                } else if (filled) {
1842                        bitset = 0;
1843                        break;
1844                }
1845
1846                if (state->end == (u64)-1)
1847                        break;
1848
1849                start = state->end + 1;
1850                if (start > end)
1851                        break;
1852                node = rb_next(node);
1853                if (!node) {
1854                        if (filled)
1855                                bitset = 0;
1856                        break;
1857                }
1858        }
1859        spin_unlock(&tree->lock);
1860        return bitset;
1861}
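/*
 * Illustrative examples (not part of the original source) of the 'filled'
 * semantics, matching the helpers right below:
 *
 *      whole page must be uptodate:
 *      test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
 *
 *      any part of the page still locked:
 *      test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
 */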
1862
1863/*
1864 * helper function to set a given page up to date if all the
1865 * extents in the tree for that page are up to date
1866 */
1867static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1868{
1869        u64 start = page_offset(page);
1870        u64 end = start + PAGE_CACHE_SIZE - 1;
1871        if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1872                SetPageUptodate(page);
1873}
1874
1875/*
1876 * helper function to unlock a page if all the extents in the tree
1877 * for that page are unlocked
1878 */
1879static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1880{
1881        u64 start = page_offset(page);
1882        u64 end = start + PAGE_CACHE_SIZE - 1;
1883        if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1884                unlock_page(page);
1885}
1886
1887/*
1888 * helper function to end page writeback; the extent tree is not
1889 * consulted, writeback is simply ended on the page
1890 */
1891static void check_page_writeback(struct extent_io_tree *tree,
1892                                 struct page *page)
1893{
1894        end_page_writeback(page);
1895}
1896
1897/*
1898 * When IO fails, either with EIO or csum verification fails, we
1899 * try other mirrors that might have a good copy of the data.  This
1900 * io_failure_record is used to record state as we go through all the
1901 * mirrors.  If another mirror has good data, the page is set up to date
1902 * and things continue.  If a good mirror can't be found, the original
1903 * bio end_io callback is called to indicate things have failed.
1904 */
1905struct io_failure_record {
1906        struct page *page;
1907        u64 start;
1908        u64 len;
1909        u64 logical;
1910        unsigned long bio_flags;
1911        int this_mirror;
1912        int failed_mirror;
1913        int in_validation;
1914};
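/*
 * Rough lifecycle of an io_failure_record (summary added for illustration,
 * not part of the original source): bio_readpage_error() allocates the
 * record, stores it with set_state_private() in the inode's
 * io_failure_tree, marks the range EXTENT_DAMAGED and resubmits the read
 * against another mirror.  When a later read of the same range succeeds,
 * clean_io_failure() looks the record up again, rewrites the bad mirror via
 * repair_io_failure() if possible, and finally drops it with
 * free_io_failure().
 */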
1915
1916static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1917                                int did_repair)
1918{
1919        int ret;
1920        int err = 0;
1921        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1922
1923        set_state_private(failure_tree, rec->start, 0);
1924        ret = clear_extent_bits(failure_tree, rec->start,
1925                                rec->start + rec->len - 1,
1926                                EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1927        if (ret)
1928                err = ret;
1929
1930        ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1931                                rec->start + rec->len - 1,
1932                                EXTENT_DAMAGED, GFP_NOFS);
1933        if (ret && !err)
1934                err = ret;
1935
1936        kfree(rec);
1937        return err;
1938}
1939
1940static void repair_io_failure_callback(struct bio *bio, int err)
1941{
1942        complete(bio->bi_private);
1943}
1944
1945/*
1946 * this bypasses the standard btrfs submit functions deliberately, as
1947 * the standard behavior is to write all copies in a raid setup. here we only
1948 * want to write the one bad copy, so we do the mapping ourselves and issue
1949 * submit_bio directly.
1950 * to avoid any synchronization issues, we wait for the write to complete
1951 * before allowing the read that triggered the error to finish.
1952 * currently, there can be no more than two copies of every data bit, so
1953 * exactly one rewrite is required.
1954 */
1955int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1956                        u64 length, u64 logical, struct page *page,
1957                        int mirror_num)
1958{
1959        struct bio *bio;
1960        struct btrfs_device *dev;
1961        DECLARE_COMPLETION_ONSTACK(compl);
1962        u64 map_length = 0;
1963        u64 sector;
1964        struct btrfs_bio *bbio = NULL;
1965        struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
1966        int ret;
1967
1968        BUG_ON(!mirror_num);
1969
1970        /* we can't repair anything in raid56 yet */
1971        if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
1972                return 0;
1973
1974        bio = bio_alloc(GFP_NOFS, 1);
1975        if (!bio)
1976                return -EIO;
1977        bio->bi_private = &compl;
1978        bio->bi_end_io = repair_io_failure_callback;
1979        bio->bi_size = 0;
1980        map_length = length;
1981
1982        ret = btrfs_map_block(fs_info, WRITE, logical,
1983                              &map_length, &bbio, mirror_num);
1984        if (ret) {
1985                bio_put(bio);
1986                return -EIO;
1987        }
1988        BUG_ON(mirror_num != bbio->mirror_num);
1989        sector = bbio->stripes[mirror_num-1].physical >> 9;
1990        bio->bi_sector = sector;
1991        dev = bbio->stripes[mirror_num-1].dev;
1992        kfree(bbio);
1993        if (!dev || !dev->bdev || !dev->writeable) {
1994                bio_put(bio);
1995                return -EIO;
1996        }
1997        bio->bi_bdev = dev->bdev;
1998        bio_add_page(bio, page, length, start - page_offset(page));
1999        btrfsic_submit_bio(WRITE_SYNC, bio);
2000        wait_for_completion(&compl);
2001
2002        if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2003                /* try to remap that extent elsewhere? */
2004                bio_put(bio);
2005                btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2006                return -EIO;
2007        }
2008
2009        printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
2010                      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
2011                      start, rcu_str_deref(dev->name), sector);
2012
2013        bio_put(bio);
2014        return 0;
2015}
2016
2017int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2018                         int mirror_num)
2019{
2020        u64 start = eb->start;
2021        unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2022        int ret = 0;
2023
2024        for (i = 0; i < num_pages; i++) {
2025                struct page *p = extent_buffer_page(eb, i);
2026                ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
2027                                        start, p, mirror_num);
2028                if (ret)
2029                        break;
2030                start += PAGE_CACHE_SIZE;
2031        }
2032
2033        return ret;
2034}
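/*
 * Illustrative call (not part of the original source): a metadata read path
 * that eventually succeeded on some mirror, after an earlier mirror returned
 * bad data, could rewrite every page of the buffer on the bad mirror with:
 *
 *      if (failed_mirror)
 *              repair_eb_io_failure(root, eb, failed_mirror);
 *
 * 'root', 'eb' and 'failed_mirror' stand for caller state and are only
 * placeholders in this sketch.
 */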
2035
2036/*
2037 * each time an IO finishes, we do a fast check in the IO failure tree
2038 * to see if we need to process or clean up an io_failure_record
2039 */
2040static int clean_io_failure(u64 start, struct page *page)
2041{
2042        u64 private;
2043        u64 private_failure;
2044        struct io_failure_record *failrec;
2045        struct btrfs_fs_info *fs_info;
2046        struct extent_state *state;
2047        int num_copies;
2048        int did_repair = 0;
2049        int ret;
2050        struct inode *inode = page->mapping->host;
2051
2052        private = 0;
2053        ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2054                                (u64)-1, 1, EXTENT_DIRTY, 0);
2055        if (!ret)
2056                return 0;
2057
2058        ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2059                                &private_failure);
2060        if (ret)
2061                return 0;
2062
2063        failrec = (struct io_failure_record *)(unsigned long) private_failure;
2064        BUG_ON(!failrec->this_mirror);
2065
2066        if (failrec->in_validation) {
2067                /* there was no real error, just free the record */
2068                pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2069                         failrec->start);
2070                did_repair = 1;
2071                goto out;
2072        }
2073
2074        spin_lock(&BTRFS_I(inode)->io_tree.lock);
2075        state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2076                                            failrec->start,
2077                                            EXTENT_LOCKED);
2078        spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2079
2080        if (state && state->start == failrec->start) {
2081                fs_info = BTRFS_I(inode)->root->fs_info;
2082                num_copies = btrfs_num_copies(fs_info, failrec->logical,
2083                                              failrec->len);
2084                if (num_copies > 1)  {
2085                        ret = repair_io_failure(fs_info, start, failrec->len,
2086                                                failrec->logical, page,
2087                                                failrec->failed_mirror);
2088                        did_repair = !ret;
2089                }
2090                ret = 0;
2091        }
2092
2093out:
2094        if (!ret)
2095                ret = free_io_failure(inode, failrec, did_repair);
2096
2097        return ret;
2098}
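/*
 * Note (added for illustration, not part of the original source):
 * clean_io_failure() relies on the range still being EXTENT_LOCKED in the
 * inode's io_tree; that locked extent_state is what ties the completed IO
 * back to the recorded failure.  The call site in end_bio_extent_readpage()
 * below amounts to:
 *
 *      ret = tree->ops->readpage_end_io_hook(page, start, end, state, mirror);
 *      if (!ret)
 *              clean_io_failure(start, page);
 */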
2099
2100/*
2101 * this is a generic handler for readpage errors (default
2102 * readpage_io_failed_hook). if other copies exist, read those and write back
2103 * good data to the failed position. it does not attempt to remap the failed
2104 * extent elsewhere, hoping the device will be smart enough to do this as
2105 * needed.
2106 */
2107
2108static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2109                                u64 start, u64 end, int failed_mirror,
2110                                struct extent_state *state)
2111{
2112        struct io_failure_record *failrec = NULL;
2113        u64 private;
2114        struct extent_map *em;
2115        struct inode *inode = page->mapping->host;
2116        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2117        struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2118        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2119        struct bio *bio;
2120        int num_copies;
2121        int ret;
2122        int read_mode;
2123        u64 logical;
2124
2125        BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2126
2127        ret = get_state_private(failure_tree, start, &private);
2128        if (ret) {
2129                failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2130                if (!failrec)
2131                        return -ENOMEM;
2132                failrec->start = start;
2133                failrec->len = end - start + 1;
2134                failrec->this_mirror = 0;
2135                failrec->bio_flags = 0;
2136                failrec->in_validation = 0;
2137
2138                read_lock(&em_tree->lock);
2139                em = lookup_extent_mapping(em_tree, start, failrec->len);
2140                if (!em) {
2141                        read_unlock(&em_tree->lock);
2142                        kfree(failrec);
2143                        return -EIO;
2144                }
2145
2146                if (em->start > start || em->start + em->len < start) {
2147                        free_extent_map(em);
2148                        em = NULL;
2149                }
2150                read_unlock(&em_tree->lock);
2151
2152                if (!em) {
2153                        kfree(failrec);
2154                        return -EIO;
2155                }
2156                logical = start - em->start;
2157                logical = em->block_start + logical;
2158                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2159                        logical = em->block_start;
2160                        failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2161                        extent_set_compress_type(&failrec->bio_flags,
2162                                                 em->compress_type);
2163                }
2164                pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2165                         "len=%llu\n", logical, start, failrec->len);
2166                failrec->logical = logical;
2167                free_extent_map(em);
2168
2169                /* set the bits in the private failure tree */
2170                ret = set_extent_bits(failure_tree, start, end,
2171                                        EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2172                if (ret >= 0)
2173                        ret = set_state_private(failure_tree, start,
2174                                                (u64)(unsigned long)failrec);
2175                /* set the bits in the inode's tree */
2176                if (ret >= 0)
2177                        ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2178                                                GFP_NOFS);
2179                if (ret < 0) {
2180                        kfree(failrec);
2181                        return ret;
2182                }
2183        } else {
2184                failrec = (struct io_failure_record *)(unsigned long)private;
2185                pr_debug("bio_readpage_error: (found) logical=%llu, "
2186                         "start=%llu, len=%llu, validation=%d\n",
2187                         failrec->logical, failrec->start, failrec->len,
2188                         failrec->in_validation);
2189                /*
2190                 * when data can be on disk more than twice, add to failrec here
2191                 * (e.g. with a list for failed_mirror) to make
2192                 * clean_io_failure() clean all those errors at once.
2193                 */
2194        }
2195        num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2196                                      failrec->logical, failrec->len);
2197        if (num_copies == 1) {
2198                /*
2199                 * we only have a single copy of the data, so don't bother with
2200                 * all the retry and error correction code that follows. no
2201                 * matter what the error is, it is very likely to persist.
2202                 */
2203                pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2204                         "state=%p, num_copies=%d, next_mirror %d, "
2205                         "failed_mirror %d\n", state, num_copies,
2206                         failrec->this_mirror, failed_mirror);
2207                free_io_failure(inode, failrec, 0);
2208                return -EIO;
2209        }
2210
2211        if (!state) {
2212                spin_lock(&tree->lock);
2213                state = find_first_extent_bit_state(tree, failrec->start,
2214                                                    EXTENT_LOCKED);
2215                if (state && state->start != failrec->start)
2216                        state = NULL;
2217                spin_unlock(&tree->lock);
2218        }
2219
2220        /*
2221         * there are two premises:
2222         *      a) deliver good data to the caller
2223         *      b) correct the bad sectors on disk
2224         */
2225        if (failed_bio->bi_vcnt > 1) {
2226                /*
2227                 * to fulfill b), we need to know the exact failing sectors, as
2228                 * we don't want to rewrite any more than the failed ones. thus,
2229                 * we need separate read requests for the failed bio
2230                 *
2231                 * if the following BUG_ON triggers, our validation request got
2232                 * merged. we need separate requests for our algorithm to work.
2233                 */
2234                BUG_ON(failrec->in_validation);
2235                failrec->in_validation = 1;
2236                failrec->this_mirror = failed_mirror;
2237                read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2238        } else {
2239                /*
2240                 * we're ready to fulfill a) and b) alongside. get a good copy
2241                 * of the failed sector and if we succeed, we have setup
2242                 * everything for repair_io_failure to do the rest for us.
2243                 */
2244                if (failrec->in_validation) {
2245                        BUG_ON(failrec->this_mirror != failed_mirror);
2246                        failrec->in_validation = 0;
2247                        failrec->this_mirror = 0;
2248                }
2249                failrec->failed_mirror = failed_mirror;
2250                failrec->this_mirror++;
2251                if (failrec->this_mirror == failed_mirror)
2252                        failrec->this_mirror++;
2253                read_mode = READ_SYNC;
2254        }
2255
2256        if (!state || failrec->this_mirror > num_copies) {
2257                pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2258                         "next_mirror %d, failed_mirror %d\n", state,
2259                         num_copies, failrec->this_mirror, failed_mirror);
2260                free_io_failure(inode, failrec, 0);
2261                return -EIO;
2262        }
2263
2264        bio = bio_alloc(GFP_NOFS, 1);
2265        if (!bio) {
2266                free_io_failure(inode, failrec, 0);
2267                return -EIO;
2268        }
2269        bio->bi_private = state;
2270        bio->bi_end_io = failed_bio->bi_end_io;
2271        bio->bi_sector = failrec->logical >> 9;
2272        bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2273        bio->bi_size = 0;
2274
2275        bio_add_page(bio, page, failrec->len, start - page_offset(page));
2276
2277        pr_debug("bio_readpage_error: submitting new read[%#x] to "
2278                 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2279                 failrec->this_mirror, num_copies, failrec->in_validation);
2280
2281        ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2282                                         failrec->this_mirror,
2283                                         failrec->bio_flags, 0);
2284        return ret;
2285}
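/*
 * Summary of the retry scheme in bio_readpage_error() above (added for
 * illustration, not part of the original source): if the failed bio spanned
 * more than one sector, a first "validation" read of just this page is
 * resubmitted to the same mirror (in_validation = 1, REQ_FAILFAST_DEV) to
 * pin down exactly which sector is bad.  Once the failing bio covers a
 * single sector, this_mirror is advanced past failed_mirror and the page is
 * re-read from the next copy; if a good copy is found, clean_io_failure()
 * and repair_io_failure() later rewrite the bad mirror.
 */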
2286
2287/* lots and lots of room for performance fixes in the end_bio funcs */
2288
2289int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2290{
2291        int uptodate = (err == 0);
2292        struct extent_io_tree *tree;
2293        int ret;
2294
2295        tree = &BTRFS_I(page->mapping->host)->io_tree;
2296
2297        if (tree->ops && tree->ops->writepage_end_io_hook) {
2298                ret = tree->ops->writepage_end_io_hook(page, start,
2299                                               end, NULL, uptodate);
2300                if (ret)
2301                        uptodate = 0;
2302        }
2303
2304        if (!uptodate) {
2305                ClearPageUptodate(page);
2306                SetPageError(page);
2307        }
2308        return 0;
2309}
2310
2311/*
2312 * after a writepage IO is done, we need to:
2313 * clear the uptodate bits on error
2314 * clear the writeback bits in the extent tree for this IO
2315 * end_page_writeback if the page has no more pending IO
2316 *
2317 * Scheduling is not allowed, so the extent state tree is expected
2318 * to have one and only one object corresponding to this IO.
2319 */
2320static void end_bio_extent_writepage(struct bio *bio, int err)
2321{
2322        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2323        struct extent_io_tree *tree;
2324        u64 start;
2325        u64 end;
2326        int whole_page;
2327
2328        do {
2329                struct page *page = bvec->bv_page;
2330                tree = &BTRFS_I(page->mapping->host)->io_tree;
2331
2332                start = page_offset(page) + bvec->bv_offset;
2333                end = start + bvec->bv_len - 1;
2334
2335                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2336                        whole_page = 1;
2337                else
2338                        whole_page = 0;
2339
2340                if (--bvec >= bio->bi_io_vec)
2341                        prefetchw(&bvec->bv_page->flags);
2342
2343                if (end_extent_writepage(page, err, start, end))
2344                        continue;
2345
2346                if (whole_page)
2347                        end_page_writeback(page);
2348                else
2349                        check_page_writeback(tree, page);
2350        } while (bvec >= bio->bi_io_vec);
2351
2352        bio_put(bio);
2353}
2354
2355/*
2356 * after a readpage IO is done, we need to:
2357 * clear the uptodate bits on error
2358 * set the uptodate bits if things worked
2359 * set the page up to date if all extents in the tree are uptodate
2360 * clear the lock bit in the extent tree
2361 * unlock the page if there are no other extents locked for it
2362 *
2363 * Scheduling is not allowed, so the extent state tree is expected
2364 * to have one and only one object corresponding to this IO.
2365 */
2366static void end_bio_extent_readpage(struct bio *bio, int err)
2367{
2368        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2369        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2370        struct bio_vec *bvec = bio->bi_io_vec;
2371        struct extent_io_tree *tree;
2372        u64 start;
2373        u64 end;
2374        int whole_page;
2375        int mirror;
2376        int ret;
2377
2378        if (err)
2379                uptodate = 0;
2380
2381        do {
2382                struct page *page = bvec->bv_page;
2383                struct extent_state *cached = NULL;
2384                struct extent_state *state;
2385
2386                pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2387                         "mirror=%ld\n", (u64)bio->bi_sector, err,
2388                         (long int)bio->bi_bdev);
2389                tree = &BTRFS_I(page->mapping->host)->io_tree;
2390
2391                start = page_offset(page) + bvec->bv_offset;
2392                end = start + bvec->bv_len - 1;
2393
2394                if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2395                        whole_page = 1;
2396                else
2397                        whole_page = 0;
2398
2399                if (++bvec <= bvec_end)
2400                        prefetchw(&bvec->bv_page->flags);
2401
2402                spin_lock(&tree->lock);
2403                state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2404                if (state && state->start == start) {
2405                        /*
2406                         * take a reference on the state, unlock will drop
2407                         * the ref
2408                         */
2409                        cache_state(state, &cached);
2410                }
2411                spin_unlock(&tree->lock);
2412
2413                mirror = (int)(unsigned long)bio->bi_bdev;
2414                if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2415                        ret = tree->ops->readpage_end_io_hook(page, start, end,
2416                                                              state, mirror);
2417                        if (ret)
2418                                uptodate = 0;
2419                        else
2420                                clean_io_failure(start, page);
2421                }
2422
2423                if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2424                        ret = tree->ops->readpage_io_failed_hook(page, mirror);
2425                        if (!ret && !err &&
2426                            test_bit(BIO_UPTODATE, &bio->bi_flags))
2427                                uptodate = 1;
2428                } else if (!uptodate) {
2429                        /*
2430                         * The generic bio_readpage_error handles errors the
2431                         * following way: If possible, new read requests are
2432                         * created and submitted and will end up in
2433                         * end_bio_extent_readpage as well (if we're lucky, not
2434                         * in the !uptodate case). In that case it returns 0 and
2435                         * we just go on with the next page in our bio. If it
2436                         * can't handle the error it will return -EIO and we
2437                         * remain responsible for that page.
2438                         */
2439                        ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2440                        if (ret == 0) {
2441                                uptodate =
2442                                        test_bit(BIO_UPTODATE, &bio->bi_flags);
2443                                if (err)
2444                                        uptodate = 0;
2445                                uncache_state(&cached);
2446                                continue;
2447                        }
2448                }
2449
2450                if (uptodate && tree->track_uptodate) {
2451                        set_extent_uptodate(tree, start, end, &cached,
2452                                            GFP_ATOMIC);
2453                }
2454                unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2455
2456                if (whole_page) {
2457                        if (uptodate) {
2458                                SetPageUptodate(page);
2459                        } else {
2460                                ClearPageUptodate(page);
2461                                SetPageError(page);
2462                        }
2463                        unlock_page(page);
2464                } else {
2465                        if (uptodate) {
2466                                check_page_uptodate(tree, page);
2467                        } else {
2468                                ClearPageUptodate(page);
2469                                SetPageError(page);
2470                        }
2471                        check_page_locked(tree, page);
2472                }
2473        } while (bvec <= bvec_end);
2474
2475        bio_put(bio);
2476}
2477
2478struct bio *
2479btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2480                gfp_t gfp_flags)
2481{
2482        struct bio *bio;
2483
2484        bio = bio_alloc(gfp_flags, nr_vecs);
2485
2486        if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2487                while (!bio && (nr_vecs /= 2))
2488                        bio = bio_alloc(gfp_flags, nr_vecs);
2489        }
2490
2491        if (bio) {
2492                bio->bi_size = 0;
2493                bio->bi_bdev = bdev;
2494                bio->bi_sector = first_sector;
2495        }
2496        return bio;
2497}
2498
2499static int __must_check submit_one_bio(int rw, struct bio *bio,
2500                                       int mirror_num, unsigned long bio_flags)
2501{
2502        int ret = 0;
2503        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2504        struct page *page = bvec->bv_page;
2505        struct extent_io_tree *tree = bio->bi_private;
2506        u64 start;
2507
2508        start = page_offset(page) + bvec->bv_offset;
2509
2510        bio->bi_private = NULL;
2511
2512        bio_get(bio);
2513
2514        if (tree->ops && tree->ops->submit_bio_hook)
2515                ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2516                                           mirror_num, bio_flags, start);
2517        else
2518                btrfsic_submit_bio(rw, bio);
2519
2520        if (bio_flagged(bio, BIO_EOPNOTSUPP))
2521                ret = -EOPNOTSUPP;
2522        bio_put(bio);
2523        return ret;
2524}
2525
2526static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2527                     unsigned long offset, size_t size, struct bio *bio,
2528                     unsigned long bio_flags)
2529{
2530        int ret = 0;
2531        if (tree->ops && tree->ops->merge_bio_hook)
2532                ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2533                                                bio_flags);
2534        BUG_ON(ret < 0);
2535        return ret;
2536
2537}
2538
2539static int submit_extent_page(int rw, struct extent_io_tree *tree,
2540                              struct page *page, sector_t sector,
2541                              size_t size, unsigned long offset,
2542                              struct block_device *bdev,
2543                              struct bio **bio_ret,
2544                              unsigned long max_pages,
2545                              bio_end_io_t end_io_func,
2546                              int mirror_num,
2547                              unsigned long prev_bio_flags,
2548                              unsigned long bio_flags)
2549{
2550        int ret = 0;
2551        struct bio *bio;
2552        int nr;
2553        int contig = 0;
2554        int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2555        int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2556        size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2557
2558        if (bio_ret && *bio_ret) {
2559                bio = *bio_ret;
2560                if (old_compressed)
2561                        contig = bio->bi_sector == sector;
2562                else
2563                        contig = bio->bi_sector + (bio->bi_size >> 9) ==
2564                                sector;
2565
2566                if (prev_bio_flags != bio_flags || !contig ||
2567                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2568                    bio_add_page(bio, page, page_size, offset) < page_size) {
2569                        ret = submit_one_bio(rw, bio, mirror_num,
2570                                             prev_bio_flags);
2571                        if (ret < 0)
2572                                return ret;
2573                        bio = NULL;
2574                } else {
2575                        return 0;
2576                }
2577        }
2578        if (this_compressed)
2579                nr = BIO_MAX_PAGES;
2580        else
2581                nr = bio_get_nr_vecs(bdev);
2582
2583        bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2584        if (!bio)
2585                return -ENOMEM;
2586
2587        bio_add_page(bio, page, page_size, offset);
2588        bio->bi_end_io = end_io_func;
2589        bio->bi_private = tree;
2590
2591        if (bio_ret)
2592                *bio_ret = bio;
2593        else
2594                ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2595
2596        return ret;
2597}
2598
2599void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
2600{
2601        if (!PagePrivate(page)) {
2602                SetPagePrivate(page);
2603                page_cache_get(page);
2604                set_page_private(page, (unsigned long)eb);
2605        } else {
2606                WARN_ON(page->private != (unsigned long)eb);
2607        }
2608}
2609
2610void set_page_extent_mapped(struct page *page)
2611{
2612        if (!PagePrivate(page)) {
2613                SetPagePrivate(page);
2614                page_cache_get(page);
2615                set_page_private(page, EXTENT_PAGE_PRIVATE);
2616        }
2617}
2618
2619/*
2620 * basic readpage implementation.  Locked extent state structs are inserted
2621 * into the tree and removed again when the IO is done (by the end_io
2622 * handlers)
2623 * XXX JDM: This needs looking at to ensure proper page locking
2624 */
2625static int __extent_read_full_page(struct extent_io_tree *tree,
2626                                   struct page *page,
2627                                   get_extent_t *get_extent,
2628                                   struct bio **bio, int mirror_num,
2629                                   unsigned long *bio_flags)
2630{
2631        struct inode *inode = page->mapping->host;
2632        u64 start = page_offset(page);
2633        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2634        u64 end;
2635        u64 cur = start;
2636        u64 extent_offset;
2637        u64 last_byte = i_size_read(inode);
2638        u64 block_start;
2639        u64 cur_end;
2640        sector_t sector;
2641        struct extent_map *em;
2642        struct block_device *bdev;
2643        struct btrfs_ordered_extent *ordered;
2644        int ret;
2645        int nr = 0;
2646        size_t pg_offset = 0;
2647        size_t iosize;
2648        size_t disk_io_size;
2649        size_t blocksize = inode->i_sb->s_blocksize;
2650        unsigned long this_bio_flag = 0;
2651
2652        set_page_extent_mapped(page);
2653
2654        if (!PageUptodate(page)) {
2655                if (cleancache_get_page(page) == 0) {
2656                        BUG_ON(blocksize != PAGE_SIZE);
2657                        goto out;
2658                }
2659        }
2660
2661        end = page_end;
2662        while (1) {
2663                lock_extent(tree, start, end);
2664                ordered = btrfs_lookup_ordered_extent(inode, start);
2665                if (!ordered)
2666                        break;
2667                unlock_extent(tree, start, end);
2668                btrfs_start_ordered_extent(inode, ordered, 1);
2669                btrfs_put_ordered_extent(ordered);
2670        }
2671
2672        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2673                char *userpage;
2674                size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2675
2676                if (zero_offset) {
2677                        iosize = PAGE_CACHE_SIZE - zero_offset;
2678                        userpage = kmap_atomic(page);
2679                        memset(userpage + zero_offset, 0, iosize);
2680                        flush_dcache_page(page);
2681                        kunmap_atomic(userpage);
2682                }
2683        }
2684        while (cur <= end) {
2685                unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2686
2687                if (cur >= last_byte) {
2688                        char *userpage;
2689                        struct extent_state *cached = NULL;
2690
2691                        iosize = PAGE_CACHE_SIZE - pg_offset;
2692                        userpage = kmap_atomic(page);
2693                        memset(userpage + pg_offset, 0, iosize);
2694                        flush_dcache_page(page);
2695                        kunmap_atomic(userpage);
2696                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2697                                            &cached, GFP_NOFS);
2698                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2699                                             &cached, GFP_NOFS);
2700                        break;
2701                }
2702                em = get_extent(inode, page, pg_offset, cur,
2703                                end - cur + 1, 0);
2704                if (IS_ERR_OR_NULL(em)) {
2705                        SetPageError(page);
2706                        unlock_extent(tree, cur, end);
2707                        break;
2708                }
2709                extent_offset = cur - em->start;
2710                BUG_ON(extent_map_end(em) <= cur);
2711                BUG_ON(end < cur);
2712
2713                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2714                        this_bio_flag = EXTENT_BIO_COMPRESSED;
2715                        extent_set_compress_type(&this_bio_flag,
2716                                                 em->compress_type);
2717                }
2718
2719                iosize = min(extent_map_end(em) - cur, end - cur + 1);
2720                cur_end = min(extent_map_end(em) - 1, end);
2721                iosize = ALIGN(iosize, blocksize);
2722                if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2723                        disk_io_size = em->block_len;
2724                        sector = em->block_start >> 9;
2725                } else {
2726                        sector = (em->block_start + extent_offset) >> 9;
2727                        disk_io_size = iosize;
2728                }
2729                bdev = em->bdev;
2730                block_start = em->block_start;
2731                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2732                        block_start = EXTENT_MAP_HOLE;
2733                free_extent_map(em);
2734                em = NULL;
2735
2736                /* we've found a hole, just zero and go on */
2737                if (block_start == EXTENT_MAP_HOLE) {
2738                        char *userpage;
2739                        struct extent_state *cached = NULL;
2740
2741                        userpage = kmap_atomic(page);
2742                        memset(userpage + pg_offset, 0, iosize);
2743                        flush_dcache_page(page);
2744                        kunmap_atomic(userpage);
2745
2746                        set_extent_uptodate(tree, cur, cur + iosize - 1,
2747                                            &cached, GFP_NOFS);
2748                        unlock_extent_cached(tree, cur, cur + iosize - 1,
2749                                             &cached, GFP_NOFS);
2750                        cur = cur + iosize;
2751                        pg_offset += iosize;
2752                        continue;
2753                }
2754                /* the get_extent function already copied into the page */
2755                if (test_range_bit(tree, cur, cur_end,
2756                                   EXTENT_UPTODATE, 1, NULL)) {
2757                        check_page_uptodate(tree, page);
2758                        unlock_extent(tree, cur, cur + iosize - 1);
2759                        cur = cur + iosize;
2760                        pg_offset += iosize;
2761                        continue;
2762                }
2763                /* we have an inline extent but it didn't get marked up
2764                 * to date.  Error out
2765                 */
2766                if (block_start == EXTENT_MAP_INLINE) {
2767                        SetPageError(page);
2768                        unlock_extent(tree, cur, cur + iosize - 1);
2769                        cur = cur + iosize;
2770                        pg_offset += iosize;
2771                        continue;
2772                }
2773
2774                pnr -= page->index;
2775                ret = submit_extent_page(READ, tree, page,
2776                                         sector, disk_io_size, pg_offset,
2777                                         bdev, bio, pnr,
2778                                         end_bio_extent_readpage, mirror_num,
2779                                         *bio_flags,
2780                                         this_bio_flag);
2781                if (!ret) {
2782                        nr++;
2783                        *bio_flags = this_bio_flag;
2784                } else {
2785                        SetPageError(page);
2786                        unlock_extent(tree, cur, cur + iosize - 1);
2787                }
2788                cur = cur + iosize;
2789                pg_offset += iosize;
2790        }
2791out:
2792        if (!nr) {
2793                if (!PageError(page))
2794                        SetPageUptodate(page);
2795                unlock_page(page);
2796        }
2797        return 0;
2798}
2799
2800int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2801                            get_extent_t *get_extent, int mirror_num)
2802{
2803        struct bio *bio = NULL;
2804        unsigned long bio_flags = 0;
2805        int ret;
2806
2807        ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2808                                      &bio_flags);
2809        if (bio)
2810                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2811        return ret;
2812}
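/*
 * Illustrative caller (not part of the original source): the address_space
 * readpage hook in the btrfs inode code boils down to something like
 *
 *      static int btrfs_readpage(struct file *file, struct page *page)
 *      {
 *              struct extent_io_tree *tree;
 *
 *              tree = &BTRFS_I(page->mapping->host)->io_tree;
 *              return extent_read_full_page(tree, page, btrfs_get_extent, 0);
 *      }
 *
 * btrfs_get_extent is the get_extent_t hook from inode.c; the exact shape
 * of the real function may differ, this is only a sketch.
 */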
2813
2814static noinline void update_nr_written(struct page *page,
2815                                      struct writeback_control *wbc,
2816                                      unsigned long nr_written)
2817{
2818        wbc->nr_to_write -= nr_written;
2819        if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2820            wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2821                page->mapping->writeback_index = page->index + nr_written;
2822}
2823
2824/*
2825 * the writepage semantics are similar to regular writepage.  extent
2826 * records are inserted to lock ranges in the tree, and as dirty areas
2827 * are found, they are marked writeback.  Then the lock bits are removed
2828 * and the end_io handler clears the writeback ranges
2829 */
2830static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2831                              void *data)
2832{
2833        struct inode *inode = page->mapping->host;
2834        struct extent_page_data *epd = data;
2835        struct extent_io_tree *tree = epd->tree;
2836        u64 start = page_offset(page);
2837        u64 delalloc_start;
2838        u64 page_end = start + PAGE_CACHE_SIZE - 1;
2839        u64 end;
2840        u64 cur = start;
2841        u64 extent_offset;
2842        u64 last_byte = i_size_read(inode);
2843        u64 block_start;
2844        u64 iosize;
2845        sector_t sector;
2846        struct extent_state *cached_state = NULL;
2847        struct extent_map *em;
2848        struct block_device *bdev;
2849        int ret;
2850        int nr = 0;
2851        size_t pg_offset = 0;
2852        size_t blocksize;
2853        loff_t i_size = i_size_read(inode);
2854        unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2855        u64 nr_delalloc;
2856        u64 delalloc_end;
2857        int page_started;
2858        int compressed;
2859        int write_flags;
2860        unsigned long nr_written = 0;
2861        bool fill_delalloc = true;
2862
2863        if (wbc->sync_mode == WB_SYNC_ALL)
2864                write_flags = WRITE_SYNC;
2865        else
2866                write_flags = WRITE;
2867
2868        trace___extent_writepage(page, inode, wbc);
2869
2870        WARN_ON(!PageLocked(page));
2871
2872        ClearPageError(page);
2873
2874        pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2875        if (page->index > end_index ||
2876           (page->index == end_index && !pg_offset)) {
2877                page->mapping->a_ops->invalidatepage(page, 0);
2878                unlock_page(page);
2879                return 0;
2880        }
2881
2882        if (page->index == end_index) {
2883                char *userpage;
2884
2885                userpage = kmap_atomic(page);
2886                memset(userpage + pg_offset, 0,
2887                       PAGE_CACHE_SIZE - pg_offset);
2888                kunmap_atomic(userpage);
2889                flush_dcache_page(page);
2890        }
2891        pg_offset = 0;
2892
2893        set_page_extent_mapped(page);
2894
2895        if (!tree->ops || !tree->ops->fill_delalloc)
2896                fill_delalloc = false;
2897
2898        delalloc_start = start;
2899        delalloc_end = 0;
2900        page_started = 0;
2901        if (!epd->extent_locked && fill_delalloc) {
2902                u64 delalloc_to_write = 0;
2903                /*
2904                 * make sure the wbc mapping index is at least updated
2905                 * to this page.
2906                 */
2907                update_nr_written(page, wbc, 0);
2908
2909                while (delalloc_end < page_end) {
2910                        nr_delalloc = find_lock_delalloc_range(inode, tree,
2911                                                       page,
2912                                                       &delalloc_start,
2913                                                       &delalloc_end,
2914                                                       128 * 1024 * 1024);
2915                        if (nr_delalloc == 0) {
2916                                delalloc_start = delalloc_end + 1;
2917                                continue;
2918                        }
2919                        ret = tree->ops->fill_delalloc(inode, page,
2920                                                       delalloc_start,
2921                                                       delalloc_end,
2922                                                       &page_started,
2923                                                       &nr_written);
2924                        /* File system has been set read-only */
2925                        if (ret) {
2926                                SetPageError(page);
2927                                goto done;
2928                        }
2929                        /*
2930                         * delalloc_end is already one less than the total
2931                         * length, so we don't subtract one from
2932                         * PAGE_CACHE_SIZE
2933                         */
2934                        delalloc_to_write += (delalloc_end - delalloc_start +
2935                                              PAGE_CACHE_SIZE) >>
2936                                              PAGE_CACHE_SHIFT;
2937                        delalloc_start = delalloc_end + 1;
2938                }
2939                if (wbc->nr_to_write < delalloc_to_write) {
2940                        int thresh = 8192;
2941
2942                        if (delalloc_to_write < thresh * 2)
2943                                thresh = delalloc_to_write;
2944                        wbc->nr_to_write = min_t(u64, delalloc_to_write,
2945                                                 thresh);
2946                }
2947
2948                /* did the fill delalloc function already unlock and start
2949                 * the IO?
2950                 */
2951                if (page_started) {
2952                        ret = 0;
2953                        /*
2954                         * we've unlocked the page, so we can't update
2955                         * the mapping's writeback index, just update
2956                         * nr_to_write.
2957                         */
2958                        wbc->nr_to_write -= nr_written;
2959                        goto done_unlocked;
2960                }
2961        }
2962        if (tree->ops && tree->ops->writepage_start_hook) {
2963                ret = tree->ops->writepage_start_hook(page, start,
2964                                                      page_end);
2965                if (ret) {
2966                        /* Fixup worker will requeue */
2967                        if (ret == -EBUSY)
2968                                wbc->pages_skipped++;
2969                        else
2970                                redirty_page_for_writepage(wbc, page);
2971                        update_nr_written(page, wbc, nr_written);
2972                        unlock_page(page);
2973                        ret = 0;
2974                        goto done_unlocked;
2975                }
2976        }
2977
2978        /*
2979         * we don't want to touch the inode after unlocking the page,
2980         * so we update the mapping writeback index now
2981         */
2982        update_nr_written(page, wbc, nr_written + 1);
2983
2984        end = page_end;
2985        if (last_byte <= start) {
2986                if (tree->ops && tree->ops->writepage_end_io_hook)
2987                        tree->ops->writepage_end_io_hook(page, start,
2988                                                         page_end, NULL, 1);
2989                goto done;
2990        }
2991
2992        blocksize = inode->i_sb->s_blocksize;
2993
2994        while (cur <= end) {
2995                if (cur >= last_byte) {
2996                        if (tree->ops && tree->ops->writepage_end_io_hook)
2997                                tree->ops->writepage_end_io_hook(page, cur,
2998                                                         page_end, NULL, 1);
2999                        break;
3000                }
3001                em = epd->get_extent(inode, page, pg_offset, cur,
3002                                     end - cur + 1, 1);
3003                if (IS_ERR_OR_NULL(em)) {
3004                        SetPageError(page);
3005                        break;
3006                }
3007
3008                extent_offset = cur - em->start;
3009                BUG_ON(extent_map_end(em) <= cur);
3010                BUG_ON(end < cur);
3011                iosize = min(extent_map_end(em) - cur, end - cur + 1);
3012                iosize = ALIGN(iosize, blocksize);
3013                sector = (em->block_start + extent_offset) >> 9;
3014                bdev = em->bdev;
3015                block_start = em->block_start;
3016                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3017                free_extent_map(em);
3018                em = NULL;
3019
3020                /*
3021                 * compressed and inline extents are written through other
3022                 * paths in the FS
3023                 */
3024                if (compressed || block_start == EXTENT_MAP_HOLE ||
3025                    block_start == EXTENT_MAP_INLINE) {
3026                        /*
3027                         * end_io notification does not happen here for
3028                         * compressed extents
3029                         */
3030                        if (!compressed && tree->ops &&
3031                            tree->ops->writepage_end_io_hook)
3032                                tree->ops->writepage_end_io_hook(page, cur,
3033                                                         cur + iosize - 1,
3034                                                         NULL, 1);
3035                        else if (compressed) {
3036                                /* we don't want to end_page_writeback on
3037                                 * a compressed extent.  this happens
3038                                 * elsewhere
3039                                 */
3040                                nr++;
3041                        }
3042
3043                        cur += iosize;
3044                        pg_offset += iosize;
3045                        continue;
3046                }
3047                /* leave this out until we have a page_mkwrite call */
3048                if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3049                                   EXTENT_DIRTY, 0, NULL)) {
3050                        cur = cur + iosize;
3051                        pg_offset += iosize;
3052                        continue;
3053                }
3054
3055                if (tree->ops && tree->ops->writepage_io_hook) {
3056                        ret = tree->ops->writepage_io_hook(page, cur,
3057                                                cur + iosize - 1);
3058                } else {
3059                        ret = 0;
3060                }
3061                if (ret) {
3062                        SetPageError(page);
3063                } else {
3064                        unsigned long max_nr = end_index + 1;
3065
3066                        set_range_writeback(tree, cur, cur + iosize - 1);
3067                        if (!PageWriteback(page)) {
3068                                printk(KERN_ERR "btrfs warning page %lu not "
3069                                       "writeback, cur %llu end %llu\n",
3070                                       page->index, (unsigned long long)cur,
3071                                       (unsigned long long)end);
3072                        }
3073
3074                        ret = submit_extent_page(write_flags, tree, page,
3075                                                 sector, iosize, pg_offset,
3076                                                 bdev, &epd->bio, max_nr,
3077                                                 end_bio_extent_writepage,
3078                                                 0, 0, 0);
3079                        if (ret)
3080                                SetPageError(page);
3081                }
3082                cur = cur + iosize;
3083                pg_offset += iosize;
3084                nr++;
3085        }
3086done:
3087        if (nr == 0) {
3088                /* make sure the mapping tag for page dirty gets cleared */
3089                set_page_writeback(page);
3090                end_page_writeback(page);
3091        }
3092        unlock_page(page);
3093
3094done_unlocked:
3095
3096        /* drop our reference on any cached states */
3097        free_extent_state(cached_state);
3098        return 0;
3099}
3100
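/*
 * wait_on_bit action: just give up the cpu until the writeback bit we
 * are waiting on is cleared
 */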
3101static int eb_wait(void *word)
3102{
3103        io_schedule();
3104        return 0;
3105}
3106
3107static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3108{
3109        wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3110                    TASK_UNINTERRUPTIBLE);
3111}
3112
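/*
 * get an extent buffer ready for writeback: take the tree write lock
 * (flushing our pending bio if we have to block for it), wait for any
 * running writeback when this is a sync write, move the buffer from
 * dirty to writeback and lock all of its pages.
 *
 * returns 1 if the buffer was dirty and should be submitted (its pages
 * are locked on return), 0 if there is nothing to write
 */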
3113static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3114                                     struct btrfs_fs_info *fs_info,
3115                                     struct extent_page_data *epd)
3116{
3117        unsigned long i, num_pages;
3118        int flush = 0;
3119        int ret = 0;
3120
3121        if (!btrfs_try_tree_write_lock(eb)) {
3122                flush = 1;
3123                flush_write_bio(epd);
3124                btrfs_tree_lock(eb);
3125        }
3126
3127        if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3128                btrfs_tree_unlock(eb);
3129                if (!epd->sync_io)
3130                        return 0;
3131                if (!flush) {
3132                        flush_write_bio(epd);
3133                        flush = 1;
3134                }
3135                while (1) {
3136                        wait_on_extent_buffer_writeback(eb);
3137                        btrfs_tree_lock(eb);
3138                        if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3139                                break;
3140                        btrfs_tree_unlock(eb);
3141                }
3142        }
3143
3144        /*
3145         * We need to do this to prevent races with anyone who checks whether
3146         * the eb is under IO, since we can end up with no IO bits set for a
3147         * short period of time.
3148         */
3149        spin_lock(&eb->refs_lock);
3150        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3151                set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3152                spin_unlock(&eb->refs_lock);
3153                btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3154                __percpu_counter_add(&fs_info->dirty_metadata_bytes,
3155                                     -eb->len,
3156                                     fs_info->dirty_metadata_batch);
3157                ret = 1;
3158        } else {
3159                spin_unlock(&eb->refs_lock);
3160        }
3161
3162        btrfs_tree_unlock(eb);
3163
3164        if (!ret)
3165                return ret;
3166
3167        num_pages = num_extent_pages(eb->start, eb->len);
3168        for (i = 0; i < num_pages; i++) {
3169                struct page *p = extent_buffer_page(eb, i);
3170
3171                if (!trylock_page(p)) {
3172                        if (!flush) {
3173                                flush_write_bio(epd);
3174                                flush = 1;
3175                        }
3176                        lock_page(p);
3177                }
3178        }
3179
3180        return ret;
3181}
3182
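/*
 * clear the writeback bit on an extent buffer and wake up anyone
 * sleeping in wait_on_extent_buffer_writeback()
 */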
3183static void end_extent_buffer_writeback(struct extent_buffer *eb)
3184{
3185        clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3186        smp_mb__after_clear_bit();
3187        wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3188}
3189
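/*
 * bio end_io for metadata writeback: walk the pages in the bio, record
 * any IO error on the owning extent buffer, end page writeback and
 * finish the buffer once its last page completes
 */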
3190static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3191{
3192        int uptodate = err == 0;
3193        struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3194        struct extent_buffer *eb;
3195        int done;
3196
3197        do {
3198                struct page *page = bvec->bv_page;
3199
3200                bvec--;
3201                eb = (struct extent_buffer *)page->private;
3202                BUG_ON(!eb);
3203                done = atomic_dec_and_test(&eb->io_pages);
3204
3205                if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3206                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3207                        ClearPageUptodate(page);
3208                        SetPageError(page);
3209                }
3210
3211                end_page_writeback(page);
3212
3213                if (!done)
3214                        continue;
3215
3216                end_extent_buffer_writeback(eb);
3217        } while (bvec >= bio->bi_io_vec);
3218
3219        bio_put(bio);
3220
3221}
3222
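/*
 * submit all the pages of a single extent buffer for writeback.  If a
 * submission fails, the buffer is flagged with an IO error and the
 * remaining pages are unlocked.
 */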
3223static int write_one_eb(struct extent_buffer *eb,
3224                        struct btrfs_fs_info *fs_info,
3225                        struct writeback_control *wbc,
3226                        struct extent_page_data *epd)
3227{
3228        struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3229        u64 offset = eb->start;
3230        unsigned long i, num_pages;
3231        unsigned long bio_flags = 0;
3232        int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3233        int ret = 0;
3234
3235        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3236        num_pages = num_extent_pages(eb->start, eb->len);
3237        atomic_set(&eb->io_pages, num_pages);
3238        if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3239                bio_flags = EXTENT_BIO_TREE_LOG;
3240
3241        for (i = 0; i < num_pages; i++) {
3242                struct page *p = extent_buffer_page(eb, i);
3243
3244                clear_page_dirty_for_io(p);
3245                set_page_writeback(p);
3246                ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3247                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3248                                         -1, end_bio_extent_buffer_writepage,
3249                                         0, epd->bio_flags, bio_flags);
3250                epd->bio_flags = bio_flags;
3251                if (ret) {
3252                        set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3253                        SetPageError(p);
3254                        if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3255                                end_extent_buffer_writeback(eb);
3256                        ret = -EIO;
3257                        break;
3258                }
3259                offset += PAGE_CACHE_SIZE;
3260                update_nr_written(p, wbc, 1);
3261                unlock_page(p);
3262        }
3263
3264        if (unlikely(ret)) {
3265                for (; i < num_pages; i++) {
3266                        struct page *p = extent_buffer_page(eb, i);
3267                        unlock_page(p);
3268                }
3269        }
3270
3271        return ret;
3272}
3273
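/*
 * write back dirty btree pages: walk the tagged pages of the btree
 * inode's mapping, take a reference on each extent buffer hanging off
 * page->private and write it out with write_one_eb()
 */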
3274int btree_write_cache_pages(struct address_space *mapping,
3275                                   struct writeback_control *wbc)
3276{
3277        struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3278        struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3279        struct extent_buffer *eb, *prev_eb = NULL;
3280        struct extent_page_data epd = {
3281                .bio = NULL,
3282                .tree = tree,
3283                .extent_locked = 0,
3284                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3285                .bio_flags = 0,
3286        };
3287        int ret = 0;
3288        int done = 0;
3289        int nr_to_write_done = 0;
3290        struct pagevec pvec;
3291        int nr_pages;
3292        pgoff_t index;
3293        pgoff_t end;            /* Inclusive */
3294        int scanned = 0;
3295        int tag;
3296
3297        pagevec_init(&pvec, 0);
3298        if (wbc->range_cyclic) {
3299                index = mapping->writeback_index; /* Start from prev offset */
3300                end = -1;
3301        } else {
3302                index = wbc->range_start >> PAGE_CACHE_SHIFT;
3303                end = wbc->range_end >> PAGE_CACHE_SHIFT;
3304                scanned = 1;
3305        }
3306        if (wbc->sync_mode == WB_SYNC_ALL)
3307                tag = PAGECACHE_TAG_TOWRITE;
3308        else
3309                tag = PAGECACHE_TAG_DIRTY;
3310retry:
3311        if (wbc->sync_mode == WB_SYNC_ALL)
3312                tag_pages_for_writeback(mapping, index, end);
3313        while (!done && !nr_to_write_done && (index <= end) &&
3314               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3315                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3316                unsigned i;
3317
3318                scanned = 1;
3319                for (i = 0; i < nr_pages; i++) {
3320                        struct page *page = pvec.pages[i];
3321
3322                        if (!PagePrivate(page))
3323                                continue;
3324
3325                        if (!wbc->range_cyclic && page->index > end) {
3326                                done = 1;
3327                                break;
3328                        }
3329
3330                        spin_lock(&mapping->private_lock);
3331                        if (!PagePrivate(page)) {
3332                                spin_unlock(&mapping->private_lock);
3333                                continue;
3334                        }
3335
3336                        eb = (struct extent_buffer *)page->private;
3337
3338                        /*
3339                         * Shouldn't happen and normally this would be a BUG_ON
3340                         * but no sense in crashing the user's box for something
3341                         * we can survive anyway.
3342                         */
3343                        if (!eb) {
3344                                spin_unlock(&mapping->private_lock);
3345                                WARN_ON(1);
3346                                continue;
3347                        }
3348
3349                        if (eb == prev_eb) {
3350                                spin_unlock(&mapping->private_lock);
3351                                continue;
3352                        }
3353
3354                        ret = atomic_inc_not_zero(&eb->refs);
3355                        spin_unlock(&mapping->private_lock);
3356                        if (!ret)
3357                                continue;
3358
3359                        prev_eb = eb;
3360                        ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3361                        if (!ret) {
3362                                free_extent_buffer(eb);
3363                                continue;
3364                        }
3365
3366                        ret = write_one_eb(eb, fs_info, wbc, &epd);
3367                        if (ret) {
3368                                done = 1;
3369                                free_extent_buffer(eb);
3370                                break;
3371                        }
3372                        free_extent_buffer(eb);
3373
3374                        /*
3375                         * the filesystem may choose to bump up nr_to_write.
3376                         * We have to make sure to honor the new nr_to_write
3377                         * at any time
3378                         */
3379                        nr_to_write_done = wbc->nr_to_write <= 0;
3380                }
3381                pagevec_release(&pvec);
3382                cond_resched();
3383        }
3384        if (!scanned && !done) {
3385                /*
3386                 * We hit the last page and there is more work to be done: wrap
3387                 * back to the start of the file
3388                 */
3389                scanned = 1;
3390                index = 0;
3391                goto retry;
3392        }
3393        flush_write_bio(&epd);
3394        return ret;
3395}
3396
3397/**
3398 * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3399 * @mapping: address space structure to write
3400 * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3401 * @writepage: function called for each page
3402 * @data: data passed to writepage function
3403 *
3404 * If a page is already under I/O, write_cache_pages() skips it, even
3405 * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3406 * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3407 * and msync() need to guarantee that all the data which was dirty at the time
3408 * the call was made get new I/O started against them.  If wbc->sync_mode is
3409 * WB_SYNC_ALL then we were called for data integrity and we must wait for
3410 * existing IO to complete.
3411 */
3412static int extent_write_cache_pages(struct extent_io_tree *tree,
3413                             struct address_space *mapping,
3414                             struct writeback_control *wbc,
3415                             writepage_t writepage, void *data,
3416                             void (*flush_fn)(void *))
3417{
3418        struct inode *inode = mapping->host;
3419        int ret = 0;
3420        int done = 0;
3421        int nr_to_write_done = 0;
3422        struct pagevec pvec;
3423        int nr_pages;
3424        pgoff_t index;
3425        pgoff_t end;            /* Inclusive */
3426        int scanned = 0;
3427        int tag;
3428
3429        /*
3430         * We have to hold onto the inode so that ordered extents can do their
3431         * work when the IO finishes.  The alternative to this is failing to add
3432         * an ordered extent if the igrab() fails there and that is a huge pain
3433         * to deal with, so instead just hold onto the inode throughout the
3434         * writepages operation.  If it fails here we are freeing up the inode
3435         * anyway and we'd rather not waste our time writing out stuff that is
3436         * just going to be truncated.
3437         */
3438        if (!igrab(inode))
3439                return 0;
3440
3441        pagevec_init(&pvec, 0);
3442        if (wbc->range_cyclic) {
3443                index = mapping->writeback_index; /* Start from prev offset */
3444                end = -1;
3445        } else {
3446                index = wbc->range_start >> PAGE_CACHE_SHIFT;
3447                end = wbc->range_end >> PAGE_CACHE_SHIFT;
3448                scanned = 1;
3449        }
3450        if (wbc->sync_mode == WB_SYNC_ALL)
3451                tag = PAGECACHE_TAG_TOWRITE;
3452        else
3453                tag = PAGECACHE_TAG_DIRTY;
3454retry:
3455        if (wbc->sync_mode == WB_SYNC_ALL)
3456                tag_pages_for_writeback(mapping, index, end);
3457        while (!done && !nr_to_write_done && (index <= end) &&
3458               (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3459                        min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3460                unsigned i;
3461
3462                scanned = 1;
3463                for (i = 0; i < nr_pages; i++) {
3464                        struct page *page = pvec.pages[i];
3465
3466                        /*
3467                         * At this point we hold neither mapping->tree_lock nor
3468                         * lock on the page itself: the page may be truncated or
3469                         * invalidated (changing page->mapping to NULL), or even
3470                         * swizzled back from swapper_space to tmpfs file
3471                         * mapping
3472                         */
3473                        if (!trylock_page(page)) {
3474                                flush_fn(data);
3475                                lock_page(page);
3476                        }
3477
3478                        if (unlikely(page->mapping != mapping)) {
3479                                unlock_page(page);
3480                                continue;
3481                        }
3482
3483                        if (!wbc->range_cyclic && page->index > end) {
3484                                done = 1;
3485                                unlock_page(page);
3486                                continue;
3487                        }
3488
3489                        if (wbc->sync_mode != WB_SYNC_NONE) {
3490                                if (PageWriteback(page))
3491                                        flush_fn(data);
3492                                wait_on_page_writeback(page);
3493                        }
3494
3495                        if (PageWriteback(page) ||
3496                            !clear_page_dirty_for_io(page)) {
3497                                unlock_page(page);
3498                                continue;
3499                        }
3500
3501                        ret = (*writepage)(page, wbc, data);
3502
3503                        if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3504                                unlock_page(page);
3505                                ret = 0;
3506                        }
3507                        if (ret)
3508                                done = 1;
3509
3510                        /*
3511                         * the filesystem may choose to bump up nr_to_write.
3512                         * We have to make sure to honor the new nr_to_write
3513                         * at any time
3514                         */
3515                        nr_to_write_done = wbc->nr_to_write <= 0;
3516                }
3517                pagevec_release(&pvec);
3518                cond_resched();
3519        }
3520        if (!scanned && !done) {
3521                /*
3522                 * We hit the last page and there is more work to be done: wrap
3523                 * back to the start of the file
3524                 */
3525                scanned = 1;
3526                index = 0;
3527                goto retry;
3528        }
3529        btrfs_add_delayed_iput(inode);
3530        return ret;
3531}
3532
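/*
 * submit any bio that has been built up in the extent_page_data,
 * using WRITE_SYNC when the caller asked for synchronous IO
 */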
3533static void flush_epd_write_bio(struct extent_page_data *epd)
3534{
3535        if (epd->bio) {
3536                int rw = WRITE;
3537                int ret;
3538
3539                if (epd->sync_io)
3540                        rw = WRITE_SYNC;
3541
3542                ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3543                BUG_ON(ret < 0); /* -ENOMEM */
3544                epd->bio = NULL;
3545        }
3546}
3547
3548static noinline void flush_write_bio(void *data)
3549{
3550        struct extent_page_data *epd = data;
3551        flush_epd_write_bio(epd);
3552}
3553
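/*
 * write a single page through __extent_writepage() and submit the
 * resulting bio
 */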
3554int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3555                          get_extent_t *get_extent,
3556                          struct writeback_control *wbc)
3557{
3558        int ret;
3559        struct extent_page_data epd = {
3560                .bio = NULL,
3561                .tree = tree,
3562                .get_extent = get_extent,
3563                .extent_locked = 0,
3564                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3565                .bio_flags = 0,
3566        };
3567
3568        ret = __extent_writepage(page, wbc, &epd);
3569
3570        flush_epd_write_bio(&epd);
3571        return ret;
3572}
3573
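/*
 * write out a range of pages whose extent state is already locked by
 * the caller (extent_locked is set so __extent_writepage skips the
 * delalloc fill).  Pages that are no longer dirty just get their
 * end_io hook called and are unlocked.
 */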
3574int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3575                              u64 start, u64 end, get_extent_t *get_extent,
3576                              int mode)
3577{
3578        int ret = 0;
3579        struct address_space *mapping = inode->i_mapping;
3580        struct page *page;
3581        unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3582                PAGE_CACHE_SHIFT;
3583
3584        struct extent_page_data epd = {
3585                .bio = NULL,
3586                .tree = tree,
3587                .get_extent = get_extent,
3588                .extent_locked = 1,
3589                .sync_io = mode == WB_SYNC_ALL,
3590                .bio_flags = 0,
3591        };
3592        struct writeback_control wbc_writepages = {
3593                .sync_mode      = mode,
3594                .nr_to_write    = nr_pages * 2,
3595                .range_start    = start,
3596                .range_end      = end + 1,
3597        };
3598
3599        while (start <= end) {
3600                page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3601                if (clear_page_dirty_for_io(page))
3602                        ret = __extent_writepage(page, &wbc_writepages, &epd);
3603                else {
3604                        if (tree->ops && tree->ops->writepage_end_io_hook)
3605                                tree->ops->writepage_end_io_hook(page, start,
3606                                                 start + PAGE_CACHE_SIZE - 1,
3607                                                 NULL, 1);
3608                        unlock_page(page);
3609                }
3610                page_cache_release(page);
3611                start += PAGE_CACHE_SIZE;
3612        }
3613
3614        flush_epd_write_bio(&epd);
3615        return ret;
3616}
3617
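/*
 * writepages entry point for data pages: write everything dirty in the
 * mapping via extent_write_cache_pages() and submit the final bio
 */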
3618int extent_writepages(struct extent_io_tree *tree,
3619                      struct address_space *mapping,
3620                      get_extent_t *get_extent,
3621                      struct writeback_control *wbc)
3622{
3623        int ret = 0;
3624        struct extent_page_data epd = {
3625                .bio = NULL,
3626                .tree = tree,
3627                .get_extent = get_extent,
3628                .extent_locked = 0,
3629                .sync_io = wbc->sync_mode == WB_SYNC_ALL,
3630                .bio_flags = 0,
3631        };
3632
3633        ret = extent_write_cache_pages(tree, mapping, wbc,
3634                                       __extent_writepage, &epd,
3635                                       flush_write_bio);
3636        flush_epd_write_bio(&epd);
3637        return ret;
3638}
3639
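/*
 * readpages entry point: move pages from the readahead list into the
 * page cache in small batches, read them with
 * __extent_read_full_page() and submit one merged bio at the end
 */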
3640int extent_readpages(struct extent_io_tree *tree,
3641                     struct address_space *mapping,
3642                     struct list_head *pages, unsigned nr_pages,
3643                     get_extent_t get_extent)
3644{
3645        struct bio *bio = NULL;
3646        unsigned page_idx;
3647        unsigned long bio_flags = 0;
3648        struct page *pagepool[16];
3649        struct page *page;
3650        int i = 0;
3651        int nr = 0;
3652
3653        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3654                page = list_entry(pages->prev, struct page, lru);
3655
3656                prefetchw(&page->flags);
3657                list_del(&page->lru);
3658                if (add_to_page_cache_lru(page, mapping,
3659                                        page->index, GFP_NOFS)) {
3660                        page_cache_release(page);
3661                        continue;
3662                }
3663
3664                pagepool[nr++] = page;
3665                if (nr < ARRAY_SIZE(pagepool))
3666                        continue;
3667                for (i = 0; i < nr; i++) {
3668                        __extent_read_full_page(tree, pagepool[i], get_extent,
3669                                        &bio, 0, &bio_flags);
3670                        page_cache_release(pagepool[i]);
3671                }
3672                nr = 0;
3673        }
3674        for (i = 0; i < nr; i++) {
3675                __extent_read_full_page(tree, pagepool[i], get_extent,
3676                                        &bio, 0, &bio_flags);
3677                page_cache_release(pagepool[i]);
3678        }
3679
3680        BUG_ON(!list_empty(pages));
3681        if (bio)
3682                return submit_one_bio(READ, bio, 0, bio_flags);
3683        return 0;
3684}
3685
3686/*
3687 * basic invalidatepage code, this waits on any locked or writeback
3688 * ranges corresponding to the page, and then deletes any extent state
3689 * records from the tree
3690 */
3691int extent_invalidatepage(struct extent_io_tree *tree,
3692                          struct page *page, unsigned long offset)
3693{
3694        struct extent_state *cached_state = NULL;
3695        u64 start = page_offset(page);
3696        u64 end = start + PAGE_CACHE_SIZE - 1;
3697        size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3698
3699        start += ALIGN(offset, blocksize);
3700        if (start > end)
3701                return 0;
3702
3703        lock_extent_bits(tree, start, end, 0, &cached_state);
3704        wait_on_page_writeback(page);
3705        clear_extent_bit(tree, start, end,
3706                         EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3707                         EXTENT_DO_ACCOUNTING,
3708                         1, 1, &cached_state, GFP_NOFS);
3709        return 0;
3710}
3711
3712/*
3713 * a helper for releasepage, this tests for areas of the page that
3714 * are locked or under IO and drops the related state bits if it is safe
3715 * to drop the page.
3716 */
3717int try_release_extent_state(struct extent_map_tree *map,
3718                             struct extent_io_tree *tree, struct page *page,
3719                             gfp_t mask)
3720{
3721        u64 start = page_offset(page);
3722        u64 end = start + PAGE_CACHE_SIZE - 1;
3723        int ret = 1;
3724
3725        if (test_range_bit(tree, start, end,
3726                           EXTENT_IOBITS, 0, NULL))
3727                ret = 0;
3728        else {
3729                if ((mask & GFP_NOFS) == GFP_NOFS)
3730                        mask = GFP_NOFS;
3731                /*
3732                 * at this point we can safely clear everything except the
3733                 * locked bit and the nodatasum bit
3734                 */
3735                ret = clear_extent_bit(tree, start, end,
3736                                 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3737                                 0, 0, NULL, mask);
3738
3739                /* if clear_extent_bit failed for enomem reasons,
3740                 * we can't allow the release to continue.
3741                 */
3742                if (ret < 0)
3743                        ret = 0;
3744                else
3745                        ret = 1;
3746        }
3747        return ret;
3748}
3749
3750/*
3751 * a helper for releasepage.  As long as there are no locked extents
3752 * in the range corresponding to the page, both state records and extent
3753 * map records are removed
3754 */
3755int try_release_extent_mapping(struct extent_map_tree *map,
3756                               struct extent_io_tree *tree, struct page *page,
3757                               gfp_t mask)
3758{
3759        struct extent_map *em;
3760        u64 start = page_offset(page);
3761        u64 end = start + PAGE_CACHE_SIZE - 1;
3762
3763        if ((mask & __GFP_WAIT) &&
3764            page->mapping->host->i_size > 16 * 1024 * 1024) {
3765                u64 len;
3766                while (start <= end) {
3767                        len = end - start + 1;
3768                        write_lock(&map->lock);
3769                        em = lookup_extent_mapping(map, start, len);
3770                        if (!em) {
3771                                write_unlock(&map->lock);
3772                                break;
3773                        }
3774                        if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3775                            em->start != start) {
3776                                write_unlock(&map->lock);
3777                                free_extent_map(em);
3778                                break;
3779                        }
3780                        if (!test_range_bit(tree, em->start,
3781                                            extent_map_end(em) - 1,
3782                                            EXTENT_LOCKED | EXTENT_WRITEBACK,
3783                                            0, NULL)) {
3784                                remove_extent_mapping(map, em);
3785                                /* once for the rb tree */
3786                                free_extent_map(em);
3787                        }
3788                        start = extent_map_end(em);
3789                        write_unlock(&map->lock);
3790
3791                        /* once for us */
3792                        free_extent_map(em);
3793                }
3794        }
3795        return try_release_extent_state(map, tree, page, mask);
3796}
3797
3798/*
3799 * helper function for fiemap, which doesn't want to see any holes.
3800 * This maps until we find something past 'last'
3801 */
3802static struct extent_map *get_extent_skip_holes(struct inode *inode,
3803                                                u64 offset,
3804                                                u64 last,
3805                                                get_extent_t *get_extent)
3806{
3807        u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3808        struct extent_map *em;
3809        u64 len;
3810
3811        if (offset >= last)
3812                return NULL;
3813
3814        while (1) {
3815                len = last - offset;
3816                if (len == 0)
3817                        break;
3818                len = ALIGN(len, sectorsize);
3819                em = get_extent(inode, NULL, 0, offset, len, 0);
3820                if (IS_ERR_OR_NULL(em))
3821                        return em;
3822
3823                /* if this isn't a hole return it */
3824                if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3825                    em->block_start != EXTENT_MAP_HOLE) {
3826                        return em;
3827                }
3828
3829                /* this is a hole, advance to the next extent */
3830                offset = extent_map_end(em);
3831                free_extent_map(em);
3832                if (offset >= last)
3833                        break;
3834        }
3835        return NULL;
3836}
3837
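/*
 * fill in fiemap data for the requested range: find the last file
 * extent to decide where to stop, then walk the extent maps with
 * get_extent_skip_holes() and report each one through
 * fiemap_fill_next_extent()
 */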
3838int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3839                __u64 start, __u64 len, get_extent_t *get_extent)
3840{
3841        int ret = 0;
3842        u64 off = start;
3843        u64 max = start + len;
3844        u32 flags = 0;
3845        u32 found_type;
3846        u64 last;
3847        u64 last_for_get_extent = 0;
3848        u64 disko = 0;
3849        u64 isize = i_size_read(inode);
3850        struct btrfs_key found_key;
3851        struct extent_map *em = NULL;
3852        struct extent_state *cached_state = NULL;
3853        struct btrfs_path *path;
3854        struct btrfs_file_extent_item *item;
3855        int end = 0;
3856        u64 em_start = 0;
3857        u64 em_len = 0;
3858        u64 em_end = 0;
3859        unsigned long emflags;
3860
3861        if (len == 0)
3862                return -EINVAL;
3863
3864        path = btrfs_alloc_path();
3865        if (!path)
3866                return -ENOMEM;
3867        path->leave_spinning = 1;
3868
3869        start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3870        len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3871
3872        /*
3873         * lookup the last file extent.  We're not using i_size here
3874         * because there might be preallocation past i_size
3875         */
3876        ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3877                                       path, btrfs_ino(inode), -1, 0);
3878        if (ret < 0) {
3879                btrfs_free_path(path);
3880                return ret;
3881        }
3882        WARN_ON(!ret);
3883        path->slots[0]--;
3884        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3885                              struct btrfs_file_extent_item);
3886        btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3887        found_type = btrfs_key_type(&found_key);
3888
3889        /* No extents, but there might be delalloc bits */
3890        if (found_key.objectid != btrfs_ino(inode) ||
3891            found_type != BTRFS_EXTENT_DATA_KEY) {
3892                /* have to trust i_size as the end */
3893                last = (u64)-1;
3894                last_for_get_extent = isize;
3895        } else {
3896                /*
3897                 * remember the start of the last extent.  There are a
3898                 * bunch of different factors that go into the length of the
3899                 * extent, so its much less complex to remember where it started
3900                 * extent, so it's much less complex to remember where it started
3901                last = found_key.offset;
3902                last_for_get_extent = last + 1;
3903        }
3904        btrfs_free_path(path);
3905
3906        /*
3907         * we might have some extents allocated but more delalloc past those
3908         * extents.  so, we trust isize unless the start of the last extent is
3909         * beyond isize
3910         */
3911        if (last < isize) {
3912                last = (u64)-1;
3913                last_for_get_extent = isize;
3914        }
3915
3916        lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3917                         &cached_state);
3918
3919        em = get_extent_skip_holes(inode, start, last_for_get_extent,
3920                                   get_extent);
3921        if (!em)
3922                goto out;
3923        if (IS_ERR(em)) {
3924                ret = PTR_ERR(em);
3925                goto out;
3926        }
3927
3928        while (!end) {
3929                u64 offset_in_extent;
3930
3931                /* break if the extent we found is outside the range */
3932                if (em->start >= max || extent_map_end(em) < off)
3933                        break;
3934
3935                /*
3936                 * get_extent may return an extent that starts before our
3937                 * requested range.  We have to make sure the ranges
3938                 * we return to fiemap always move forward and don't
3939                 * overlap, so adjust the offsets here
3940                 */
3941                em_start = max(em->start, off);
3942
3943                /*
3944                 * record the offset from the start of the extent
3945                 * for adjusting the disk offset below
3946                 */
3947                offset_in_extent = em_start - em->start;
3948                em_end = extent_map_end(em);
3949                em_len = em_end - em_start;
3950                emflags = em->flags;
3951                disko = 0;
3952                flags = 0;
3953
3954                /*
3955                 * bump off for our next call to get_extent
3956                 */
3957                off = extent_map_end(em);
3958                if (off >= max)
3959                        end = 1;
3960
3961                if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3962                        end = 1;
3963                        flags |= FIEMAP_EXTENT_LAST;
3964                } else if (em->block_start == EXTENT_MAP_INLINE) {
3965                        flags |= (FIEMAP_EXTENT_DATA_INLINE |
3966                                  FIEMAP_EXTENT_NOT_ALIGNED);
3967                } else if (em->block_start == EXTENT_MAP_DELALLOC) {
3968                        flags |= (FIEMAP_EXTENT_DELALLOC |
3969                                  FIEMAP_EXTENT_UNKNOWN);
3970                } else {
3971                        disko = em->block_start + offset_in_extent;
3972                }
3973                if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3974                        flags |= FIEMAP_EXTENT_ENCODED;
3975
3976                free_extent_map(em);
3977                em = NULL;
3978                if ((em_start >= last) || em_len == (u64)-1 ||
3979                   (last == (u64)-1 && isize <= em_end)) {
3980                        flags |= FIEMAP_EXTENT_LAST;
3981                        end = 1;
3982                }
3983
3984                /* now scan forward to see if this is really the last extent. */
3985                em = get_extent_skip_holes(inode, off, last_for_get_extent,
3986                                           get_extent);
3987                if (IS_ERR(em)) {
3988                        ret = PTR_ERR(em);
3989                        goto out;
3990                }
3991                if (!em) {
3992                        flags |= FIEMAP_EXTENT_LAST;
3993                        end = 1;
3994                }
3995                ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3996                                              em_len, flags);
3997                if (ret)
3998                        goto out_free;
3999        }
4000out_free:
4001        free_extent_map(em);
4002out:
4003        unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
4004                             &cached_state, GFP_NOFS);
4005        return ret;
4006}
4007
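/*
 * free the extent_buffer struct itself; the backing pages are released
 * separately by the callers
 */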
4008static void __free_extent_buffer(struct extent_buffer *eb)
4009{
4010#if LEAK_DEBUG
4011        unsigned long flags;
4012        spin_lock_irqsave(&leak_lock, flags);
4013        list_del(&eb->leak_list);
4014        spin_unlock_irqrestore(&leak_lock, flags);
4015#endif
4016        kmem_cache_free(extent_buffer_cache, eb);
4017}
4018
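/*
 * allocate and initialize a bare extent_buffer: the locks, wait queues
 * and reference counts are set up here, the backing pages are attached
 * by the caller
 */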
4019static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4020                                                   u64 start,
4021                                                   unsigned long len,
4022                                                   gfp_t mask)
4023{
4024        struct extent_buffer *eb = NULL;
4025#if LEAK_DEBUG
4026        unsigned long flags;
4027#endif
4028
4029        eb = kmem_cache_zalloc(extent_buffer_cache, mask);
4030        if (eb == NULL)
4031                return NULL;
4032        eb->start = start;
4033        eb->len = len;
4034        eb->tree = tree;
4035        eb->bflags = 0;
4036        rwlock_init(&eb->lock);
4037        atomic_set(&eb->write_locks, 0);
4038        atomic_set(&eb->read_locks, 0);
4039        atomic_set(&eb->blocking_readers, 0);
4040        atomic_set(&eb->blocking_writers, 0);
4041        atomic_set(&eb->spinning_readers, 0);
4042        atomic_set(&eb->spinning_writers, 0);
4043        eb->lock_nested = 0;
4044        init_waitqueue_head(&eb->write_lock_wq);
4045        init_waitqueue_head(&eb->read_lock_wq);
4046
4047#if LEAK_DEBUG
4048        spin_lock_irqsave(&leak_lock, flags);
4049        list_add(&eb->leak_list, &buffers);
4050        spin_unlock_irqrestore(&leak_lock, flags);
4051#endif
4052        spin_lock_init(&eb->refs_lock);
4053        atomic_set(&eb->refs, 1);
4054        atomic_set(&eb->io_pages, 0);
4055
4056        /*
4057         * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4058         */
4059        BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4060                > MAX_INLINE_EXTENT_BUFFER_SIZE);
4061        BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4062
4063        return eb;
4064}
4065
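/*
 * make a private copy of an extent buffer's contents, backed by newly
 * allocated pages that are not attached to any mapping
 * (EXTENT_BUFFER_DUMMY is set on the copy)
 */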
4066struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4067{
4068        unsigned long i;
4069        struct page *p;
4070        struct extent_buffer *new;
4071        unsigned long num_pages = num_extent_pages(src->start, src->len);
4072
4073        new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
4074        if (new == NULL)
4075                return NULL;
4076
4077        for (i = 0; i < num_pages; i++) {
4078                p = alloc_page(GFP_ATOMIC);
4079                BUG_ON(!p);
4080                attach_extent_buffer_page(new, p);
4081                WARN_ON(PageDirty(p));
4082                SetPageUptodate(p);
4083                new->pages[i] = p;
4084        }
4085
4086        copy_extent_buffer(new, src, 0, 0, src->len);
4087        set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4088        set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4089
4090        return new;
4091}
4092
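/*
 * allocate an extent buffer that is not attached to any tree or
 * mapping, backed by newly allocated pages and marked uptodate
 */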
4093struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4094{
4095        struct extent_buffer *eb;
4096        unsigned long num_pages = num_extent_pages(0, len);
4097        unsigned long i;
4098
4099        eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4100        if (!eb)
4101                return NULL;
4102
4103        for (i = 0; i < num_pages; i++) {
4104                eb->pages[i] = alloc_page(GFP_ATOMIC);
4105                if (!eb->pages[i])
4106                        goto err;
4107        }
4108        set_extent_buffer_uptodate(eb);
4109        btrfs_set_header_nritems(eb, 0);
4110        set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4111
4112        return eb;
4113err:
4114        for (; i > 0; i--)
4115                __free_page(eb->pages[i - 1]);
4116        __free_extent_buffer(eb);
4117        return NULL;
4118}
4119
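/* true if the extent buffer still has IO in flight or is dirty */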
4120static int extent_buffer_under_io(struct extent_buffer *eb)
4121{
4122        return (atomic_read(&eb->io_pages) ||
4123                test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4124                test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4125}
4126
4127/*
4128 * Helper for releasing extent buffer page.
4129 */
4130static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4131                                                unsigned long start_idx)
4132{
4133        unsigned long index;
4134        unsigned long num_pages;
4135        struct page *page;
4136        int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4137
4138        BUG_ON(extent_buffer_under_io(eb));
4139
4140        num_pages = num_extent_pages(eb->start, eb->len);
4141        index = start_idx + num_pages;
4142        if (start_idx >= index)
4143                return;
4144
4145        do {
4146                index--;
4147                page = extent_buffer_page(eb, index);
4148                if (page && mapped) {
4149                        spin_lock(&page->mapping->private_lock);
4150                        /*
4151                         * We do this since we'll remove the pages after we've
4152                         * removed the eb from the radix tree, so we could race
4153                         * and have this page now attached to the new eb.  So
4154                         * only clear page_private if it's still connected to
4155                         * this eb.
4156                         */
4157                        if (PagePrivate(page) &&
4158                            page->private == (unsigned long)eb) {
4159                                BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4160                                BUG_ON(PageDirty(page));
4161                                BUG_ON(PageWriteback(page));
4162                                /*
4163                                 * We need to make sure we haven't been attached
4164                                 * to a new eb.
4165                                 */
4166                                ClearPagePrivate(page);
4167                                set_page_private(page, 0);
4168                                /* One for the page private */
4169                                page_cache_release(page);
4170                        }
4171                        spin_unlock(&page->mapping->private_lock);
4172
4173                }
4174                if (page) {
4175                        /* One for when we allocated the page */
4176                        page_cache_release(page);
4177                }
4178        } while (index != start_idx);
4179}
4180
4181/*
4182 * Helper for releasing the extent buffer.
4183 */
4184static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4185{
4186        btrfs_release_extent_buffer_page(eb, 0);
4187        __free_extent_buffer(eb);
4188}
4189
4190static void check_buffer_tree_ref(struct extent_buffer *eb)
4191{
4192        int refs;
4193        /* the ref bit is tricky.  We have to make sure it is set
4194         * if we have the buffer dirty.   Otherwise the
4195         * code to free a buffer can end up dropping a dirty
4196         * page
4197         *
4198         * Once the ref bit is set, it won't go away while the
4199         * buffer is dirty or in writeback, and it also won't
4200         * go away while we have the reference count on the
4201         * eb bumped.
4202         *
4203         * We can't just set the ref bit without bumping the
4204         * ref on the eb because free_extent_buffer might
4205         * see the ref bit and try to clear it.  If this happens
4206         * free_extent_buffer might end up dropping our original
4207         * ref by mistake and freeing the page before we are able
4208         * to add one more ref.
4209         *
4210         * So bump the ref count first, then set the bit.  If someone
4211         * beat us to it, drop the ref we added.
4212         */
4213        refs = atomic_read(&eb->refs);
4214        if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4215                return;
4216
4217        spin_lock(&eb->refs_lock);
4218        if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4219                atomic_inc(&eb->refs);
4220        spin_unlock(&eb->refs_lock);
4221}
4222
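/*
 * note that an extent buffer was just used: make sure the tree ref is
 * held and tell the VM its pages were recently accessed
 */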
4223static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4224{
4225        unsigned long num_pages, i;
4226
4227        check_buffer_tree_ref(eb);
4228
4229        num_pages = num_extent_pages(eb->start, eb->len);
4230        for (i = 0; i < num_pages; i++) {
4231                struct page *p = extent_buffer_page(eb, i);
4232                mark_page_accessed(p);
4233        }
4234}
4235
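/*
 * find or create the extent buffer for a given range: try the radix
 * tree first, otherwise allocate a new buffer, attach page cache pages
 * to it and insert it into the radix tree, falling back to an existing
 * buffer if we lose a race with another allocation
 */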
4236struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4237                                          u64 start, unsigned long len)
4238{
4239        unsigned long num_pages = num_extent_pages(start, len);
4240        unsigned long i;
4241        unsigned long index = start >> PAGE_CACHE_SHIFT;
4242        struct extent_buffer *eb;
4243        struct extent_buffer *exists = NULL;
4244        struct page *p;
4245        struct address_space *mapping = tree->mapping;
4246        int uptodate = 1;
4247        int ret;
4248
4249        rcu_read_lock();
4250        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4251        if (eb && atomic_inc_not_zero(&eb->refs)) {
4252                rcu_read_unlock();
4253                mark_extent_buffer_accessed(eb);
4254                return eb;
4255        }
4256        rcu_read_unlock();
4257
4258        eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4259        if (!eb)
4260                return NULL;
4261
4262        for (i = 0; i < num_pages; i++, index++) {
4263                p = find_or_create_page(mapping, index, GFP_NOFS);
4264                if (!p)
4265                        goto free_eb;
4266
4267                spin_lock(&mapping->private_lock);
4268                if (PagePrivate(p)) {
4269                        /*
4270                         * We could have already allocated an eb for this page
4271                         * and attached one so lets see if we can get a ref on
4272                         * the existing eb, and if we can we know it's good and
4273                         * we can just return that one, else we know we can just
4274                         * overwrite page->private.
4275                         */
4276                        exists = (struct extent_buffer *)p->private;
4277                        if (atomic_inc_not_zero(&exists->refs)) {
4278                                spin_unlock(&mapping->private_lock);
4279                                unlock_page(p);
4280                                page_cache_release(p);
4281                                mark_extent_buffer_accessed(exists);
4282                                goto free_eb;
4283                        }
4284
4285                        /*
4286                         * Do this so attach doesn't complain, and drop the page
4287                         * reference that the old eb held.
4288                         */
4289                        ClearPagePrivate(p);
4290                        WARN_ON(PageDirty(p));
4291                        page_cache_release(p);
4292                }
4293                attach_extent_buffer_page(eb, p);
4294                spin_unlock(&mapping->private_lock);
4295                WARN_ON(PageDirty(p));
4296                mark_page_accessed(p);
4297                eb->pages[i] = p;
4298                if (!PageUptodate(p))
4299                        uptodate = 0;
4300
4301                /*
4302                 * See the comment below for how we avoid a nasty race with
4303                 * releasepage and why we unlock the pages later.
4304                 */
4305        }
4306        if (uptodate)
4307                set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4308again:
4309        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4310        if (ret)
4311                goto free_eb;
4312
4313        spin_lock(&tree->buffer_lock);
4314        ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4315        if (ret == -EEXIST) {
4316                exists = radix_tree_lookup(&tree->buffer,
4317                                                start >> PAGE_CACHE_SHIFT);
4318                if (!atomic_inc_not_zero(&exists->refs)) {
4319                        spin_unlock(&tree->buffer_lock);
4320                        radix_tree_preload_end();
4321                        exists = NULL;
4322                        goto again;
4323                }
4324                spin_unlock(&tree->buffer_lock);
4325                radix_tree_preload_end();
4326                mark_extent_buffer_accessed(exists);
4327                goto free_eb;
4328        }
4329        /* add one reference for the tree */
4330        check_buffer_tree_ref(eb);
4331        spin_unlock(&tree->buffer_lock);
4332        radix_tree_preload_end();
4333
4334        /*
4335         * There is a race where releasepage may have tried
4336         * to find this extent buffer in the radix tree but
4337         * failed.  It will tell the VM it is safe to reclaim
4338         * the page and will clear the page private bit.
4339         * We must make sure to set the page private bit
4340         * properly after the extent buffer is in the radix
4341         * tree so it doesn't get lost.
4342         */
4343        SetPageChecked(eb->pages[0]);
4344        for (i = 1; i < num_pages; i++) {
4345                p = extent_buffer_page(eb, i);
4346                ClearPageChecked(p);
4347                unlock_page(p);
4348        }
4349        unlock_page(eb->pages[0]);
4350        return eb;
4351
4352free_eb:
4353        for (i = 0; i < num_pages; i++) {
4354                if (eb->pages[i])
4355                        unlock_page(eb->pages[i]);
4356        }
4357
4358        WARN_ON(!atomic_dec_and_test(&eb->refs));
4359        btrfs_release_extent_buffer(eb);
4360        return exists;
4361}
4362
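    /*
     * Look up an existing extent buffer starting at start.  On success an
     * extra reference is taken and the eb is returned; otherwise NULL.
     */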
4363struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4364                                         u64 start, unsigned long len)
4365{
4366        struct extent_buffer *eb;
4367
4368        rcu_read_lock();
4369        eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4370        if (eb && atomic_inc_not_zero(&eb->refs)) {
4371                rcu_read_unlock();
4372                mark_extent_buffer_accessed(eb);
4373                return eb;
4374        }
4375        rcu_read_unlock();
4376
4377        return NULL;
4378}
4379
4380static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4381{
4382        struct extent_buffer *eb =
4383                        container_of(head, struct extent_buffer, rcu_head);
4384
4385        __free_extent_buffer(eb);
4386}
4387
4388/* Expects to have eb->refs_lock already held */
4389static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4390{
4391        WARN_ON(atomic_read(&eb->refs) == 0);
4392        if (atomic_dec_and_test(&eb->refs)) {
4393                if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4394                        spin_unlock(&eb->refs_lock);
4395                } else {
4396                        struct extent_io_tree *tree = eb->tree;
4397
4398                        spin_unlock(&eb->refs_lock);
4399
4400                        spin_lock(&tree->buffer_lock);
4401                        radix_tree_delete(&tree->buffer,
4402                                          eb->start >> PAGE_CACHE_SHIFT);
4403                        spin_unlock(&tree->buffer_lock);
4404                }
4405
4406                /* Should be safe to release our pages at this point */
4407                btrfs_release_extent_buffer_page(eb, 0);
4408                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4409                return 1;
4410        }
4411        spin_unlock(&eb->refs_lock);
4412
4413        return 0;
4414}
4415
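    /*
     * Drop one reference on the extent buffer.  Refcounts above 3 are
     * dropped locklessly with cmpxchg; the last few are dropped under
     * refs_lock so the checks against the DUMMY/STALE/TREE_REF state stay
     * consistent, and release_extent_buffer() does the final teardown once
     * the count reaches zero.
     */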
4416void free_extent_buffer(struct extent_buffer *eb)
4417{
4418        int refs;
4419        int old;
4420        if (!eb)
4421                return;
4422
4423        while (1) {
4424                refs = atomic_read(&eb->refs);
4425                if (refs <= 3)
4426                        break;
4427                old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4428                if (old == refs)
4429                        return;
4430        }
4431
4432        spin_lock(&eb->refs_lock);
4433        if (atomic_read(&eb->refs) == 2 &&
4434            test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4435                atomic_dec(&eb->refs);
4436
4437        if (atomic_read(&eb->refs) == 2 &&
4438            test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4439            !extent_buffer_under_io(eb) &&
4440            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4441                atomic_dec(&eb->refs);
4442
4443        /*
4444         * I know this is terrible, but it's temporary until we stop tracking
4445         * the uptodate bits and such for the extent buffers.
4446         */
4447        release_extent_buffer(eb, GFP_ATOMIC);
4448}
4449
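    /*
     * Like free_extent_buffer(), but also marks the buffer stale and, if the
     * caller holds the only reference besides the tree's, drops the tree ref
     * as well so the eb can actually go away.
     */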
4450void free_extent_buffer_stale(struct extent_buffer *eb)
4451{
4452        if (!eb)
4453                return;
4454
4455        spin_lock(&eb->refs_lock);
4456        set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4457
4458        if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4459            test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4460                atomic_dec(&eb->refs);
4461        release_extent_buffer(eb, GFP_NOFS);
4462}
4463
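    /*
     * Clear the dirty state on every page backing the extent buffer: each
     * page is cleaned with clear_page_dirty_for_io() and, if it is no longer
     * dirty, the PAGECACHE_TAG_DIRTY tag is dropped from the mapping's radix
     * tree so writeback won't pick it up again.
     */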
4464void clear_extent_buffer_dirty(struct extent_buffer *eb)
4465{
4466        unsigned long i;
4467        unsigned long num_pages;
4468        struct page *page;
4469
4470        num_pages = num_extent_pages(eb->start, eb->len);
4471
4472        for (i = 0; i < num_pages; i++) {
4473                page = extent_buffer_page(eb, i);
4474                if (!PageDirty(page))
4475                        continue;
4476
4477                lock_page(page);
4478                WARN_ON(!PagePrivate(page));
4479
4480                clear_page_dirty_for_io(page);
4481                spin_lock_irq(&page->mapping->tree_lock);
4482                if (!PageDirty(page)) {
4483                        radix_tree_tag_clear(&page->mapping->page_tree,
4484                                                page_index(page),
4485                                                PAGECACHE_TAG_DIRTY);
4486                }
4487                spin_unlock_irq(&page->mapping->tree_lock);
4488                ClearPageError(page);
4489                unlock_page(page);
4490        }
4491        WARN_ON(atomic_read(&eb->refs) == 0);
4492}
4493
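    /*
     * Mark the extent buffer and all of its pages dirty.  Returns nonzero if
     * the buffer was already dirty before this call.
     */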
4494int set_extent_buffer_dirty(struct extent_buffer *eb)
4495{
4496        unsigned long i;
4497        unsigned long num_pages;
4498        int was_dirty = 0;
4499
4500        check_buffer_tree_ref(eb);
4501
4502        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4503
4504        num_pages = num_extent_pages(eb->start, eb->len);
4505        WARN_ON(atomic_read(&eb->refs) == 0);
4506        WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4507
4508        for (i = 0; i < num_pages; i++)
4509                set_page_dirty(extent_buffer_page(eb, i));
4510        return was_dirty;
4511}
4512
4513static int range_straddles_pages(u64 start, u64 len)
4514{
4515        if (len < PAGE_CACHE_SIZE)
4516                return 1;
4517        if (start & (PAGE_CACHE_SIZE - 1))
4518                return 1;
4519        if ((start + len) & (PAGE_CACHE_SIZE - 1))
4520                return 1;
4521        return 0;
4522}
4523
4524int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4525{
4526        unsigned long i;
4527        struct page *page;
4528        unsigned long num_pages;
4529
4530        clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4531        num_pages = num_extent_pages(eb->start, eb->len);
4532        for (i = 0; i < num_pages; i++) {
4533                page = extent_buffer_page(eb, i);
4534                if (page)
4535                        ClearPageUptodate(page);
4536        }
4537        return 0;
4538}
4539
4540int set_extent_buffer_uptodate(struct extent_buffer *eb)
4541{
4542        unsigned long i;
4543        struct page *page;
4544        unsigned long num_pages;
4545
4546        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4547        num_pages = num_extent_pages(eb->start, eb->len);
4548        for (i = 0; i < num_pages; i++) {
4549                page = extent_buffer_page(eb, i);
4550                SetPageUptodate(page);
4551        }
4552        return 0;
4553}
4554
4555int extent_range_uptodate(struct extent_io_tree *tree,
4556                          u64 start, u64 end)
4557{
4558        struct page *page;
4559        int ret;
4560        int pg_uptodate = 1;
4561        int uptodate;
4562        unsigned long index;
4563
4564        if (range_straddles_pages(start, end - start + 1)) {
4565                ret = test_range_bit(tree, start, end,
4566                                     EXTENT_UPTODATE, 1, NULL);
4567                if (ret)
4568                        return 1;
4569        }
4570        while (start <= end) {
4571                index = start >> PAGE_CACHE_SHIFT;
4572                page = find_get_page(tree->mapping, index);
4573                if (!page)
4574                        return 1;
4575                uptodate = PageUptodate(page);
4576                page_cache_release(page);
4577                if (!uptodate) {
4578                        pg_uptodate = 0;
4579                        break;
4580                }
4581                start += PAGE_CACHE_SIZE;
4582        }
4583        return pg_uptodate;
4584}
4585
4586int extent_buffer_uptodate(struct extent_buffer *eb)
4587{
4588        return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4589}
4590
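    /*
     * Read the pages backing an extent buffer from disk if they are not
     * already uptodate.  With wait == WAIT_NONE the pages are only trylocked
     * and we bail out on contention; with wait == WAIT_COMPLETE we wait for
     * the reads to finish and return -EIO if any page failed.
     */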
4591int read_extent_buffer_pages(struct extent_io_tree *tree,
4592                             struct extent_buffer *eb, u64 start, int wait,
4593                             get_extent_t *get_extent, int mirror_num)
4594{
4595        unsigned long i;
4596        unsigned long start_i;
4597        struct page *page;
4598        int err;
4599        int ret = 0;
4600        int locked_pages = 0;
4601        int all_uptodate = 1;
4602        unsigned long num_pages;
4603        unsigned long num_reads = 0;
4604        struct bio *bio = NULL;
4605        unsigned long bio_flags = 0;
4606
4607        if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4608                return 0;
4609
4610        if (start) {
4611                WARN_ON(start < eb->start);
4612                start_i = (start >> PAGE_CACHE_SHIFT) -
4613                        (eb->start >> PAGE_CACHE_SHIFT);
4614        } else {
4615                start_i = 0;
4616        }
4617
4618        num_pages = num_extent_pages(eb->start, eb->len);
4619        for (i = start_i; i < num_pages; i++) {
4620                page = extent_buffer_page(eb, i);
4621                if (wait == WAIT_NONE) {
4622                        if (!trylock_page(page))
4623                                goto unlock_exit;
4624                } else {
4625                        lock_page(page);
4626                }
4627                locked_pages++;
4628                if (!PageUptodate(page)) {
4629                        num_reads++;
4630                        all_uptodate = 0;
4631                }
4632        }
4633        if (all_uptodate) {
4634                if (start_i == 0)
4635                        set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4636                goto unlock_exit;
4637        }
4638
4639        clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4640        eb->read_mirror = 0;
4641        atomic_set(&eb->io_pages, num_reads);
4642        for (i = start_i; i < num_pages; i++) {
4643                page = extent_buffer_page(eb, i);
4644                if (!PageUptodate(page)) {
4645                        ClearPageError(page);
4646                        err = __extent_read_full_page(tree, page,
4647                                                      get_extent, &bio,
4648                                                      mirror_num, &bio_flags);
4649                        if (err)
4650                                ret = err;
4651                } else {
4652                        unlock_page(page);
4653                }
4654        }
4655
4656        if (bio) {
4657                err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4658                if (err)
4659                        return err;
4660        }
4661
4662        if (ret || wait != WAIT_COMPLETE)
4663                return ret;
4664
4665        for (i = start_i; i < num_pages; i++) {
4666                page = extent_buffer_page(eb, i);
4667                wait_on_page_locked(page);
4668                if (!PageUptodate(page))
4669                        ret = -EIO;
4670        }
4671
4672        return ret;
4673
4674unlock_exit:
4675        i = start_i;
4676        while (locked_pages > 0) {
4677                page = extent_buffer_page(eb, i);
4678                i++;
4679                unlock_page(page);
4680                locked_pages--;
4681        }
4682        return ret;
4683}
4684
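    /*
     * Copy len bytes starting at offset start of the extent buffer into the
     * buffer dstv, crossing page boundaries as needed.  A typical use looks
     * roughly like the following sketch (key_offset is illustrative only):
     *
     *     struct btrfs_disk_key disk_key;
     *
     *     read_extent_buffer(eb, &disk_key, key_offset, sizeof(disk_key));
     */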
4685void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4686                        unsigned long start,
4687                        unsigned long len)
4688{
4689        size_t cur;
4690        size_t offset;
4691        struct page *page;
4692        char *kaddr;
4693        char *dst = (char *)dstv;
4694        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4695        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4696
4697        WARN_ON(start > eb->len);
4698        WARN_ON(start + len > eb->start + eb->len);
4699
4700        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4701
4702        while (len > 0) {
4703                page = extent_buffer_page(eb, i);
4704
4705                cur = min(len, (PAGE_CACHE_SIZE - offset));
4706                kaddr = page_address(page);
4707                memcpy(dst, kaddr + offset, cur);
4708
4709                dst += cur;
4710                len -= cur;
4711                offset = 0;
4712                i++;
4713        }
4714}
4715
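    /*
     * Map min_len bytes at offset start of the extent buffer when they fit
     * inside a single page.  On success *map points at eb byte *map_start
     * and *map_len bytes are addressable from there; -EINVAL means the range
     * crosses a page boundary (or runs past the eb) and the caller should
     * fall back to read_extent_buffer().  A sketch of the usual pattern
     * (variable names here are illustrative only):
     *
     *     if (!map_private_extent_buffer(eb, off, sizeof(u64), &kaddr,
     *                                    &map_start, &map_len))
     *             val = get_unaligned_le64(kaddr + off - map_start);
     *     else
     *             read_extent_buffer(eb, &val, off, sizeof(val));
     */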
4716int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4717                               unsigned long min_len, char **map,
4718                               unsigned long *map_start,
4719                               unsigned long *map_len)
4720{
4721        size_t offset = start & (PAGE_CACHE_SIZE - 1);
4722        char *kaddr;
4723        struct page *p;
4724        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4725        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4726        unsigned long end_i = (start_offset + start + min_len - 1) >>
4727                PAGE_CACHE_SHIFT;
4728
4729        if (i != end_i)
4730                return -EINVAL;
4731
4732        if (i == 0) {
4733                offset = start_offset;
4734                *map_start = 0;
4735        } else {
4736                offset = 0;
4737                *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4738        }
4739
4740        if (start + min_len > eb->len) {
4741                WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4742                       "wanted %lu %lu\n", (unsigned long long)eb->start,
4743                       eb->len, start, min_len);
4744                return -EINVAL;
4745        }
4746
4747        p = extent_buffer_page(eb, i);
4748        kaddr = page_address(p);
4749        *map = kaddr + offset;
4750        *map_len = PAGE_CACHE_SIZE - offset;
4751        return 0;
4752}
4753
4754int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4755                          unsigned long start,
4756                          unsigned long len)
4757{
4758        size_t cur;
4759        size_t offset;
4760        struct page *page;
4761        char *kaddr;
4762        char *ptr = (char *)ptrv;
4763        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4764        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4765        int ret = 0;
4766
4767        WARN_ON(start > eb->len);
4768        WARN_ON(start + len > eb->start + eb->len);
4769
4770        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4771
4772        while (len > 0) {
4773                page = extent_buffer_page(eb, i);
4774
4775                cur = min(len, (PAGE_CACHE_SIZE - offset));
4776
4777                kaddr = page_address(page);
4778                ret = memcmp(ptr, kaddr + offset, cur);
4779                if (ret)
4780                        break;
4781
4782                ptr += cur;
4783                len -= cur;
4784                offset = 0;
4785                i++;
4786        }
4787        return ret;
4788}
4789
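    /*
     * Copy len bytes from srcv into the extent buffer at offset start.  The
     * pages must already be uptodate; dirtying the buffer is left to the
     * caller (see set_extent_buffer_dirty()).
     */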
4790void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4791                         unsigned long start, unsigned long len)
4792{
4793        size_t cur;
4794        size_t offset;
4795        struct page *page;
4796        char *kaddr;
4797        char *src = (char *)srcv;
4798        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4799        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4800
4801        WARN_ON(start > eb->len);
4802        WARN_ON(start + len > eb->start + eb->len);
4803
4804        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4805
4806        while (len > 0) {
4807                page = extent_buffer_page(eb, i);
4808                WARN_ON(!PageUptodate(page));
4809
4810                cur = min(len, PAGE_CACHE_SIZE - offset);
4811                kaddr = page_address(page);
4812                memcpy(kaddr + offset, src, cur);
4813
4814                src += cur;
4815                len -= cur;
4816                offset = 0;
4817                i++;
4818        }
4819}
4820
4821void memset_extent_buffer(struct extent_buffer *eb, char c,
4822                          unsigned long start, unsigned long len)
4823{
4824        size_t cur;
4825        size_t offset;
4826        struct page *page;
4827        char *kaddr;
4828        size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4829        unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4830
4831        WARN_ON(start > eb->len);
4832        WARN_ON(start + len > eb->start + eb->len);
4833
4834        offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4835
4836        while (len > 0) {
4837                page = extent_buffer_page(eb, i);
4838                WARN_ON(!PageUptodate(page));
4839
4840                cur = min(len, PAGE_CACHE_SIZE - offset);
4841                kaddr = page_address(page);
4842                memset(kaddr + offset, c, cur);
4843
4844                len -= cur;
4845                offset = 0;
4846                i++;
4847        }
4848}
4849
4850void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4851                        unsigned long dst_offset, unsigned long src_offset,
4852                        unsigned long len)
4853{
4854        u64 dst_len = dst->len;
4855        size_t cur;
4856        size_t offset;
4857        struct page *page;
4858        char *kaddr;
4859        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4860        unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4861
4862        WARN_ON(src->len != dst_len);
4863
4864        offset = (start_offset + dst_offset) &
4865                ((unsigned long)PAGE_CACHE_SIZE - 1);
4866
4867        while (len > 0) {
4868                page = extent_buffer_page(dst, i);
4869                WARN_ON(!PageUptodate(page));
4870
4871                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4872
4873                kaddr = page_address(page);
4874                read_extent_buffer(src, kaddr + offset, src_offset, cur);
4875
4876                src_offset += cur;
4877                len -= cur;
4878                offset = 0;
4879                i++;
4880        }
4881}
4882
4883static void move_pages(struct page *dst_page, struct page *src_page,
4884                       unsigned long dst_off, unsigned long src_off,
4885                       unsigned long len)
4886{
4887        char *dst_kaddr = page_address(dst_page);
4888        if (dst_page == src_page) {
4889                memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4890        } else {
4891                char *src_kaddr = page_address(src_page);
4892                char *p = dst_kaddr + dst_off + len;
4893                char *s = src_kaddr + src_off + len;
4894
4895                while (len--)
4896                        *--p = *--s;
4897        }
4898}
4899
4900static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4901{
4902        unsigned long distance = (src > dst) ? src - dst : dst - src;
4903        return distance < len;
4904}
4905
4906static void copy_pages(struct page *dst_page, struct page *src_page,
4907                       unsigned long dst_off, unsigned long src_off,
4908                       unsigned long len)
4909{
4910        char *dst_kaddr = page_address(dst_page);
4911        char *src_kaddr;
4912        int must_memmove = 0;
4913
4914        if (dst_page != src_page) {
4915                src_kaddr = page_address(src_page);
4916        } else {
4917                src_kaddr = dst_kaddr;
4918                if (areas_overlap(src_off, dst_off, len))
4919                        must_memmove = 1;
4920        }
4921
4922        if (must_memmove)
4923                memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4924        else
4925                memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4926}
4927
4928void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4929                           unsigned long src_offset, unsigned long len)
4930{
4931        size_t cur;
4932        size_t dst_off_in_page;
4933        size_t src_off_in_page;
4934        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4935        unsigned long dst_i;
4936        unsigned long src_i;
4937
4938        if (src_offset + len > dst->len) {
4939                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4940                       "len %lu dst len %lu\n", src_offset, len, dst->len);
4941                BUG_ON(1);
4942        }
4943        if (dst_offset + len > dst->len) {
4944                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4945                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4946                BUG_ON(1);
4947        }
4948
4949        while (len > 0) {
4950                dst_off_in_page = (start_offset + dst_offset) &
4951                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4952                src_off_in_page = (start_offset + src_offset) &
4953                        ((unsigned long)PAGE_CACHE_SIZE - 1);
4954
4955                dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4956                src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4957
4958                cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4959                                               src_off_in_page));
4960                cur = min_t(unsigned long, cur,
4961                        (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4962
4963                copy_pages(extent_buffer_page(dst, dst_i),
4964                           extent_buffer_page(dst, src_i),
4965                           dst_off_in_page, src_off_in_page, cur);
4966
4967                src_offset += cur;
4968                dst_offset += cur;
4969                len -= cur;
4970        }
4971}
4972
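    /*
     * memmove() equivalent within a single extent buffer: the source and
     * destination ranges may overlap.  Forward moves (dst < src) are handed
     * to memcpy_extent_buffer(); backward moves are copied page by page from
     * the end so overlapping bytes are not clobbered.
     */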
4973void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4974                           unsigned long src_offset, unsigned long len)
4975{
4976        size_t cur;
4977        size_t dst_off_in_page;
4978        size_t src_off_in_page;
4979        unsigned long dst_end = dst_offset + len - 1;
4980        unsigned long src_end = src_offset + len - 1;
4981        size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4982        unsigned long dst_i;
4983        unsigned long src_i;
4984
4985        if (src_offset + len > dst->len) {
4986                printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4987                       "len %lu dst len %lu\n", src_offset, len, dst->len);
4988                BUG_ON(1);
4989        }
4990        if (dst_offset + len > dst->len) {
4991                printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4992                       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4993                BUG_ON(1);
4994        }
4995        if (dst_offset < src_offset) {
4996                memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4997                return;
4998        }
4999        while (len > 0) {
5000                dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
5001                src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
5002
5003                dst_off_in_page = (start_offset + dst_end) &
5004                        ((unsigned long)PAGE_CACHE_SIZE - 1);
5005                src_off_in_page = (start_offset + src_end) &
5006                        ((unsigned long)PAGE_CACHE_SIZE - 1);
5007
5008                cur = min_t(unsigned long, len, src_off_in_page + 1);
5009                cur = min(cur, dst_off_in_page + 1);
5010                move_pages(extent_buffer_page(dst, dst_i),
5011                           extent_buffer_page(dst, src_i),
5012                           dst_off_in_page - cur + 1,
5013                           src_off_in_page - cur + 1, cur);
5014
5015                dst_end -= cur;
5016                src_end -= cur;
5017                len -= cur;
5018        }
5019}
5020
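    /*
     * Called when the VM wants to release a page that may back an extent
     * buffer.  Returns 1 if the page had no eb attached or the eb had no
     * other users and could be torn down (so the page can be freed), and 0
     * if the buffer is still in use.
     */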
5021int try_release_extent_buffer(struct page *page, gfp_t mask)
5022{
5023        struct extent_buffer *eb;
5024
5025        /*
5026         * We need to make sure nobody is attaching this page to an eb
5027         * right now.
5028         */
5029        spin_lock(&page->mapping->private_lock);
5030        if (!PagePrivate(page)) {
5031                spin_unlock(&page->mapping->private_lock);
5032                return 1;
5033        }
5034
5035        eb = (struct extent_buffer *)page->private;
5036        BUG_ON(!eb);
5037
5038        /*
5039         * This is a little awful but should be OK; we need to make sure
5040         * that the eb doesn't disappear out from under us while we're
5041         * looking at this page.
5042         */
5043        spin_lock(&eb->refs_lock);
5044        if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5045                spin_unlock(&eb->refs_lock);
5046                spin_unlock(&page->mapping->private_lock);
5047                return 0;
5048        }
5049        spin_unlock(&page->mapping->private_lock);
5050
5051        if ((mask & GFP_NOFS) == GFP_NOFS)
5052                mask = GFP_NOFS;
5053
5054        /*
5055         * If the tree ref isn't set then we know the ref on this eb is a real
5056         * ref, so just return; this page will likely be freed soon anyway.
5057         */
5058        if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5059                spin_unlock(&eb->refs_lock);
5060                return 0;
5061        }
5062
5063        return release_extent_buffer(eb, mask);
5064}
5065